XOR with ReLU activation function
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D   # registers the '3d' projection used for the surface plot
%matplotlib inline

input = [[0,0,1],[0,1,1],[1,0,1],[1,1,1]]
output = [0,1,1,0]

N = np.size(input,0)    # number of samples
Ni = np.size(input,1)   # dimension of the samples of input
No = 1                  # dimension of the sample of output
Nh = 10                 # number of hidden units

Ws = 1/4*np.random.rand(Nh,Ni+1)
print(Ws)
Wo = 1/4*np.random.rand(No,Nh)
print(Wo)

alpha = 0.05   # learning rate

t_ = []
loss_ = []

def ReLU(x):
    return np.maximum(0,x)

def sigmoid(x):
    return 1/(1+np.exp(-x))

## train the model ====================================================================
for epoch in range(0,3000):
    loss = 0
    for id_ in range(0,N):
        dWs = 0*Ws
        dWo = 0*Wo

        x = np.append(input[id_],1)
        Z_1 = np.dot(Ws,x)
        Z_2 = np.dot(Wo,ReLU(Z_1))
        y = sigmoid(Z_2)
        d = output[id_]

        for j in range(0,Nh):
            for i in range(0,No):
                if Z_1[j] >= 0:
                    dWo[i,j] = dWo[i,j] + (y[i]-d)*Z_1[j]
                    #dWo[i,j] = dWo[i,j] + sigmoid(Z_1[j])*(y[i]-d)
                else:
                    dWo[i,j] += 0
        Wo = Wo - alpha*dWo

        for k in range(0,Ni+1):
            for j in range(0,Nh):
                for i in range(0,No):
                    if Z_1[j] >= 0:
                        dWs[j,k] = dWs[j,k] + x[k]*Wo[i,j]*(y[i]-d)
                        #dWs[j,k] = dWs[j,k] + x[k]*Wo[i,j]*sigmoid(Z_1[j])*(1-sigmoid(Z_1[j]))*(y[i]-d)
                    else:
                        dWs[j,k] += 0
        Ws = Ws - alpha*dWs

        loss = loss + 1/2*np.linalg.norm(y-d)

    if np.mod(epoch,50) == 0:
        print(epoch,"-th epoch trained")
    t_ = np.append(t_,epoch)
    loss_ = np.append(loss_,loss)

fig = plt.figure(num=0,figsize=[10,5])
plt.plot(t_,loss_,marker="")
plt.title('Loss decay')
plt.xlabel('epoch',fontsize=20)
plt.ylabel('Loss',fontsize=20)
plt.show()

## figure out the function shape the model learned ===================================
xn = np.linspace(0,1,20)
yn = np.linspace(0,1,20)
xm, ym = np.meshgrid(xn, yn)
xx = np.reshape(xm,np.size(xm,0)*np.size(xm,1))
yy = np.reshape(ym,np.size(xm,0)*np.size(xm,1))

Z = []
for id__ in range(0,np.size(xm)):
    x = np.append([xx[id__],yy[id__]],[1,1])
    Z_1 = np.dot(Ws,x)
    y_ = sigmoid(np.dot(Wo,ReLU(Z_1)))
    Z = np.append(Z,y_)

fig = plt.figure(num=1,figsize=[10,5])
ax = fig.gca(projection='3d')
surf = ax.plot_surface(xm,ym,np.reshape(Z,(np.size(xm,0),np.size(xm,1))),cmap='coolwarm',linewidth=0,antialiased=False)
print("====================================================================")
plt.show()

## test the trained model ====================================================================
for id_ in range(0,N):
    x = np.append(input[id_],1)
    Z_1 = np.dot(Ws,x)
    y = sigmoid(np.dot(Wo,ReLU(Z_1)))
    print(y)
If I train this with the sigmoid activation function it works fine, but when the ReLU activation function is used in the hidden layer, the program doesn't learn anything.
The NN consists of three layers (input, hidden, output), and the sigmoid activation function is used at the output. My hand calculation of the gradients seems fine, but I can't find the flaw.
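For reference, here is a minimal vectorized NumPy sketch of the chain-rule gradients for this architecture (squared-error loss, sigmoid output, ReLU hidden layer), written only as a comparison point for the hand calculation. The function names are mine, not part of the code above, and this sketch keeps the output-sigmoid derivative y*(1-y) explicitly, which the explicit loops above leave out.

import numpy as np

def relu(x):
    return np.maximum(0, x)

def d_relu(z):
    return (z >= 0).astype(float)      # subgradient of ReLU: 1 where z >= 0, else 0

def sigmoid(x):
    return 1/(1 + np.exp(-x))

def grads(Ws, Wo, x, d):
    # forward pass for one sample x (bias already appended) and target d
    z1 = np.dot(Ws, x)                 # hidden pre-activation, shape (Nh,)
    a1 = relu(z1)                      # hidden activation
    z2 = np.dot(Wo, a1)                # output pre-activation, shape (No,)
    y = sigmoid(z2)                    # prediction
    # backward pass for L = 1/2*||y - d||^2
    delta_out = (y - d) * y * (1 - y)                 # dL/dz2
    dWo = np.outer(delta_out, a1)                     # dL/dWo, shape (No, Nh)
    delta_hid = np.dot(Wo.T, delta_out) * d_relu(z1)  # dL/dz1
    dWs = np.outer(delta_hid, x)                      # dL/dWs, shape (Nh, Ni+1)
    return dWs, dWo

# example usage with the variables defined above:
# dWs, dWo = grads(Ws, Wo, np.append(input[0], 1), output[0])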
The code below with the sigmoid activation function works just fine:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D   # registers the '3d' projection used for the surface plot
%matplotlib inline

input = [[0,0,1],[0,1,1],[1,0,1],[1,1,1]]
output = [0,1,1,0]

N = np.size(input,0)    # number of samples
Ni = np.size(input,1)   # dimension of the samples of input
No = 1                  # dimension of the sample of output
Nh = 5                  # number of hidden units

Ws = 1/4*np.random.rand(Nh,Ni+1)
#print(Ws)
Wo = 1/4*np.random.rand(No,Nh)
#print(Wo)

alpha = 0.1   # learning rate

t_ = []
loss_ = []

def sigmoid(x):
    return 1/(1+np.exp(-x))

## train the model ====================================================================
for epoch in range(0,5000):
    loss = 0
    for id_ in range(0,N):
        dWs = 0*Ws
        dWo = 0*Wo

        x = np.append(input[id_],1)
        Z_1 = np.dot(Ws,x)
        A_1 = sigmoid(Z_1)
        Z_2 = np.dot(Wo,A_1)
        y = sigmoid(Z_2)
        d = output[id_]

        for j in range(0,Nh):
            for i in range(0,No):
                dWo[i,j] = dWo[i,j] + sigmoid(Z_1[j])*(y[i]-d)
        Wo = Wo - alpha*dWo

        for k in range(0,Ni+1):
            for j in range(0,Nh):
                for i in range(0,No):
                    dWs[j,k] = dWs[j,k] + x[k]*Wo[i,j]*sigmoid(Z_1[j])*(1-sigmoid(Z_1[j]))*(y[i]-d)
        Ws = Ws - alpha*dWs

        loss = loss + 1/2*np.linalg.norm(y-d)

    if np.mod(epoch,50) == 0:
        print(epoch,"-th epoch trained")
    t_ = np.append(t_,epoch)
    loss_ = np.append(loss_,loss)

fig = plt.figure(num=0,figsize=[10,5])
plt.plot(t_,loss_,marker="")
plt.title('Loss decay')
plt.xlabel('epoch',fontsize=20)
plt.ylabel('Loss',fontsize=20)
plt.show()

## figure out the function shape the model learned ===================================
xn = np.linspace(0,1,20)
yn = np.linspace(0,1,20)
xm, ym = np.meshgrid(xn, yn)
xx = np.reshape(xm,np.size(xm,0)*np.size(xm,1))
yy = np.reshape(ym,np.size(xm,0)*np.size(xm,1))

Z = []
for id__ in range(0,np.size(xm)):
    x = np.append([xx[id__],yy[id__]],[1,1])
    Z_1 = np.dot(Ws,x)
    y_ = sigmoid(np.dot(Wo,sigmoid(Z_1)))
    Z = np.append(Z,y_)

fig = plt.figure(num=1,figsize=[10,5])
ax = fig.gca(projection='3d')
surf = ax.plot_surface(xm,ym,np.reshape(Z,(np.size(xm,0),np.size(xm,1))),cmap='coolwarm',linewidth=0,antialiased=False)
print("====================================================================")
plt.show()

## test the trained model ====================================================================
for id_ in range(0,N):
    x = np.append(input[id_],1)
    Z_1 = np.dot(Ws,x)
    y = sigmoid(np.dot(Wo,sigmoid(Z_1)))
    print(y)
I found a similar case on Quora, and I have tested it in my own networks that model logic functions against a noisy cost function.
I found that ReLU outputs are usually blasted all over: by the 3rd layer of an MLP, the values feeding the output have accumulated into the thousands, if not millions.
Because of that, I prefer sigmoid with MLPs. Don't forget that sigmoid limits its output to 1, whereas ReLU does not.
The intuition behind ReLU is that it filters out unneeded information via the max(0, x) function before it is forwarded to the next layer of processing, which is the same reason you see it used in convolution problems. Note that a normalization layer is used in those cases so that the output values of the nodes do not blast all over.
But in the case of your MLP, you did not implement any normalization layer after the ReLU, and for that reason it is difficult to model even a simple function such as XOR. In short, without a normalization layer I don't recommend using ReLU, although in some cases it can still function properly.
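As a quick, self-contained illustration of that last point, the snippet below pushes the same random input through a few randomly initialized layers and compares activation sizes under ReLU and sigmoid. This is not the network from the question; the 1/4*np.random.rand initialization is borrowed from it, while the width of 100 and depth of 4 are arbitrary choices for the sketch.

import numpy as np

rng = np.random.default_rng(0)
width, depth = 100, 4                           # arbitrary sizes for illustration
Ws = [1/4*rng.random((width, width)) for _ in range(depth)]
x = rng.random(width)

a_relu, a_sig = x, x
for W in Ws:
    a_relu = np.maximum(0, W @ a_relu)          # ReLU: unbounded above
    a_sig = 1/(1 + np.exp(-(W @ a_sig)))        # sigmoid: squashed into (0, 1)
    print(a_relu.max(), a_sig.max())

# with these sizes the ReLU activations grow into the thousands within a few layers,
# while every sigmoid activation stays below 1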
Related
Modelling membrane evolution over time
I am trying to model the time evolution of a membrane based on the following code in MATLAB. The basic outline is that the evolution follows a differential equation where j = 0, 1 and x^0 = x, x^1 = y and x^j(s_i) = x^j_i. My code is the following.

import numpy as np
from matplotlib import pyplot as plt

R0 = 5     #radius
N = 360    #number of intervals
x0 = 2*np.pi*R0/(N/2)   #resting membrane lengths
phi = np.linspace(0,2*np.pi, num=360, dtype=float)
R1 = R0 + 0.5*np.sin(20*phi)
X = R1*np.cos(phi)
Y = R1*np.sin(phi)

L = np.linspace(-1,358, num=360, dtype=int)
R = np.linspace(1,360, num=360, dtype=int)   #right and left indexing vectors
R[359] = 0

X = R1*np.cos(phi)
Y = R1*np.sin(phi)
plt.plot(X,Y)
plt.axis("equal")
plt.show()

ds = 1/N
ds2 = ds**2
k = 1/10
w = 10**6
for i in range(0,20000):
    lengths = np.sqrt( (X[R]-X)**2 + (Y[R]-Y)**2 )
    Ex = k/ds2*(X[R] - 2*X + X[L] - x0*( (X[R]-X)/lengths - (X-X[L])/lengths[L]) )
    Ey = k/ds2*(Y[R] - 2*Y + Y[L] - x0*( (Y[R]-Y)/lengths - (Y-Y[L])/lengths[L]) )
    X = X + 1/w*Ex
    Y = Y + 1/w*Ey

plt.plot(X,Y)
plt.axis("equal")
plt.show()

The model is supposed to evolve into a circular membrane, as in the expected plot, but this is what mine does instead.
Your definition of x0 is wrong. In the MATLAB code it is

x0 = 2*pi*R/N/2         # which is pi*R/N

while in your Python code it is

x0 = 2*np.pi*R0/(N/2)   # which is 4*np.pi*R0/N

Correcting that, the end result is a circular shape, but with a different radius. I'm assuming this is because of the reduced number of iterations (20000 instead of 1000000).
Edit: As expected, using the correct number of iterations results in a plot similar to your expected one.
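For concreteness, a minimal sketch of how that fix looks in the question's Python script (only these lines change, and the 1000000 is the original iteration count mentioned above):

x0 = 2*np.pi*R0/N/2           # equals np.pi*R0/N, matching the MATLAB x0 = 2*pi*R/N/2

for i in range(0, 1000000):   # the full iteration count, instead of 20000
    ...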
FastICA implementation in MATLAB
I have been working on a FastICA algorithm implementation using MATLAB. Currently the code does not separate the signals as well as I'd like. I was wondering if anyone here could give me some advice on what I could do to fix this problem?

disp('*****Importing Signals*****');
s = [1,30000];
[m1,Fs1] = audioread('OSR_us_000_0034_8k.wav', s);
[f1,Fs2] = audioread('OSR_us_000_0017_8k.wav', s);
ss = size(f1,1);
n = 2;

disp('*****Mixing Signals*****');
A = randn(n,n);                %developing mixing matrix
x = A*[m1';f1'];               %A*x
m_x = sum(x, n)/ss;            %mean of x
xx = x - repmat(m_x, 1, ss);   %centering the matrix
c = cov(x');
sq = inv(sqrtm(c));            %whitening the data
x = c*xx;
D = diff(tanh(x));             %setting up newtons method
SD = diff(D);

disp('*****Generating Weighted Matrix*****');
w = randn(n,1);                %Random weight vector
w = w/norm(w,2);               %unit vector
w0 = randn(n,1);
w0 = w0/norm(w0,2);            %unit vector

disp('*****Unmixing Signals*****');
while abs(abs(w0'*w)-1) > size(w,1)
    w0 = w;
    w = x*D(w'*x) - sum(SD'*(w'*x))*w;   %perform ICA
    w = w/norm(w, 2);
end

disp('*****Output After ICA*****');
sound(w'*x);   % Supposed to be one of the original signals
subplot(4,1,1); plot(m1); title('Original Male Voice');
subplot(4,1,2); plot(f1); title('Original Female Voice');
subplot(4,1,4); plot(w'*x); title('Post ICA: Estimated Signal');
%figure;
%plot(z); title('Random Mixed Signal');
%figure;
%plot(100*(w'*x)); title('Post ICA: Estimated Signal');
Your covariance matrix c is 2 by 2; you cannot work with that. You have to mix your signal multiple times with random numbers to get anywhere, because you must have some signal (m1) common to different channels. I was unable to follow through your code for FastICA, but here is a PCA example:

url = {'https://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0034_8k.wav';...
       'https://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0017_8k.wav'};
%fs = 8000;
m1 = webread(url{1});
m1 = m1(1:30000);
f1 = webread(url{2});
f1 = f1(1:30000);
ss = size(f1,1);
n = 2;
disp('*****Mixing Signals*****');
A = randn(50,n);    %developing mixing matrix
x = A*[m1';f1'];    %A*x
[www,comp] = pca(x');
sound(comp(:,1)',8000)
BNN with regression using Pymc3
I'm trying to build a BNN for a regression task, and I get a result that doesn't seem right. My code:

First, build the toy data:

#Toy model
def build_toy_dataset(N=50, noise_std=0.2):
    x = np.linspace(-3, 3, num=N)
    y = np.cos(x) + np.random.normal(0, noise_std, size=N)
    x = x.reshape((N, 1))
    x = scale(x)
    x = x.astype(floatX)
    y = y.astype(floatX)
    return x, y

N = 50   # number of data points
D = 1    # number of features

X_train, Y_train = build_toy_dataset(N)
X_test, Y_test = build_toy_dataset(N)

fig, ax = plt.subplots()
ax.plot(X_test,Y_test,'ro',X_train,Y_train,'bx',alpha=0.2)
ax.legend(['Y_test','Y_train'])
ax.set(xlabel='X', ylabel='Y', title='Toy Regression data set');

X = scale(X)
X = X.astype(floatX)
Y = Y.astype(floatX)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.5)

Then, define the BNN:

#2 layers with 5 nodes each
def construct_nn_2Layers(ann_input, ann_output):
    n_hidden = 5
    n_features = ann_input.get_value().shape[1]

    # Initialize random weights between each layer
    init_1 = np.random.randn(n_features, n_hidden).astype(floatX)
    init_2 = np.random.randn(n_hidden, n_hidden).astype(floatX)
    init_out = np.random.randn(n_hidden).astype(floatX)

    # Initialize random biases in each layer
    init_b_1 = np.random.randn(n_hidden).astype(floatX)
    init_b_2 = np.random.randn(n_hidden).astype(floatX)
    init_b_out = np.random.randn(1).astype(floatX)

    with pm.Model() as neural_network:
        # Weights from input to hidden layer
        weights_in_1 = pm.Normal('w_in_1', 0, sd=1, shape=(n_features, n_hidden), testval=init_1)
        bias_1 = pm.Normal('b_1', mu=0, sd=1, shape=(n_hidden), testval=init_b_1)

        # Weights from 1st to 2nd layer
        weights_1_2 = pm.Normal('w_1_2', 0, sd=1, shape=(n_hidden, n_hidden), testval=init_2)
        bias_2 = pm.Normal('b_2', mu=0, sd=1, shape=(n_hidden), testval=init_b_2)

        # Weights from hidden layer to output
        weights_2_out = pm.Normal('w_2_out', 0, sd=1, shape=(n_hidden,), testval=init_out)
        bias_out = pm.Normal('b_out', mu=0, sd=1, shape=(1), testval=init_b_out)

        # Build neural-network using tanh activation function
        act_1 = pm.math.tanh(pm.math.dot(ann_input, weights_in_1)+bias_1)
        act_2 = pm.math.tanh(pm.math.dot(act_1, weights_1_2)+bias_2)
        act_out = pm.math.dot(act_2, weights_2_out)+bias_out
        sd = pm.HalfNormal('sd', sd=1)

        out = pm.Normal('out', mu=act_out, sd=sd, observed=ann_output)
    return neural_network

Then construct it:

ann_input = theano.shared(X_train)
ann_output = theano.shared(Y_train)
neural_network = construct_nn_2Layers(ann_input, ann_output)

Run ADVI:

with neural_network:
    inference_no_s = pm.ADVI()
    # Checking convergence - Tracking parameters
    tracker = pm.callbacks.Tracker(
        mean=inference_no_s.approx.mean.eval,  # callable that returns mean
        std=inference_no_s.approx.std.eval     # callable that returns std
    )
    approx_no_s = pm.fit(n=30000, method=inference_no_s, callbacks=[tracker])

Predict on the test set:

ann_input.set_value(X_test)
ann_output.set_value(Y_test)
with neural_network:
    ppc = pm.sample_posterior_predictive(trace, samples=500, progressbar=False)

and this is what I get, which doesn't seem right. What am I doing wrong?
How to use a neural network for non-binary input and output
I tried to use the modified version of the NN backpropagation code by Phil Brierley (www.philbrierley.com). When I try to solve the XOR problem it works perfectly, but when I try to solve a problem of the form output = x1^2 + x2^2 (output = sum of squares of the inputs), the results are not accurate. I have scaled the input and output between -1 and 1. I get different results every time I run the same program (I understand this is due to random weight initialization), but the results are very different. I tried changing the learning rate, but the results still converge the same way. I have given the code below.

%---------------------------------------------------------
% MATLAB neural network backprop code
% by Phil Brierley
%--------------------------------------------------------
clear; clc; close all;

%user specified values
hidden_neurons = 4;
epochs = 20000;

input = [];
for i = -10:2.5:10
    for j = -10:2.5:10
        input = [input; i j];
    end
end
output = (input(:,1).^2 + input(:,2).^2);
output1 = output;

% Maximum input and output limit and scaling factors
m1 = -10; m2 = 10;
m3 = 0; m4 = 250;
c = -1; d = 1;

%Scale input and output
for i = 1:size(input,2)
    I = input(:,i);
    scaledI = ((d-c)*(I-m1) ./ (m2-m1)) + c;
    input(:,i) = scaledI;
end
for i = 1:size(output,2)
    I = output(:,i);
    scaledI = ((d-c)*(I-m3) ./ (m4-m3)) + c;
    output(:,i) = scaledI;
end

train_inp = input;
train_out = output;

%read how many patterns and add bias
patterns = size(train_inp,1);
train_inp = [train_inp ones(patterns,1)];

%read how many inputs and initialize learning rate
inputs = size(train_inp,2);
hlr = 0.1;

%set initial random weights
weight_input_hidden = (randn(inputs,hidden_neurons) - 0.5)/10;
weight_hidden_output = (randn(1,hidden_neurons) - 0.5)/10;

%Training
err = zeros(1,epochs);
for iter = 1:epochs
    alr = hlr;
    blr = alr / 10;

    %loop through the patterns, selecting randomly
    for j = 1:patterns
        %select a random pattern
        patnum = round((rand * patterns) + 0.5);
        if patnum > patterns
            patnum = patterns;
        elseif patnum < 1
            patnum = 1;
        end

        %set the current pattern
        this_pat = train_inp(patnum,:);
        act = train_out(patnum,1);

        %calculate the current error for this pattern
        hval = (tanh(this_pat*weight_input_hidden))';
        pred = hval'*weight_hidden_output';
        error = pred - act;

        % adjust weight hidden - output
        delta_HO = error.*blr .*hval;
        weight_hidden_output = weight_hidden_output - delta_HO';

        % adjust the weights input - hidden
        delta_IH = alr.*error.*weight_hidden_output'.*(1-(hval.^2))*this_pat;
        weight_input_hidden = weight_input_hidden - delta_IH';
    end
    % -- another epoch finished

    %compute overall network error at end of each epoch
    pred = weight_hidden_output*tanh(train_inp*weight_input_hidden)';
    error = pred' - train_out;
    err(iter) = ((sum(error.^2))^0.5);

    %stop if error is small
    if err(iter) < 0.001
        fprintf('converged at epoch: %d\n',iter);
        break
    end
end

%Output after training
pred = weight_hidden_output*tanh(train_inp*weight_input_hidden)';
Y = m3 + (m4-m3)*(pred-c)./(d-c);

% Testing for a new set of input
input_test = [6 -3.1; 0.5 1; -2 3; 3 -2; -4 5; 0.5 4; 6 1.5];
output_test = (input_test(:,1).^2 + input_test(:,2).^2);
input1 = input_test;

%Scale input
for i = 1:size(input1,2)
    I = input1(:,i);
    scaledI = ((d-c)*(I-m1) ./ (m2-m1)) + c;
    input1(:,i) = scaledI;
end

%Predict output
train_inp1 = input1;
patterns = size(train_inp1,1);
bias = ones(patterns,1);
train_inp1 = [train_inp1 bias];
pred1 = weight_hidden_output*tanh(train_inp1*weight_input_hidden)';

%Rescale
Y1 = m3 + (m4-m3)*(pred1-c)./(d-c);
analy_numer = [output_test Y1']
plot(err)

This is the sample output I get for the problem (state after 20000 epochs):

analy_numer =
   45.6100   46.3174
    1.2500   -2.9457
   13.0000   11.9958
   13.0000    9.7097
   41.0000   44.9447
   16.2500   17.1100
   38.2500   43.9815

If I run it once more I get different results. As can be observed, for small values of the input I get totally wrong answers (negative answers are not possible), and for the other values the accuracy is still poor. Can someone tell me what I am doing wrong and how to correct it? Thanks, raman
Simple Linear Neural Network Weights from Training are not compatible with training results
The weights that I get from training, when applied directly to the input, return different results! I'll show it on a very simple example. Let's say we have an input vector x = 0:0.01:1 and a target vector t = x^2 (I know it would be better to use a nonlinear network). After training a 2-layer linear network with one neuron at each layer, we get:

sim(net,0.95) = 0.7850   (some error in training - that's OK and should be there)

weights from net.IW, net.LW, net.b:

IW = 0.4547
LW = 2.1993
b  = 0.3328   -1.0620

If I use the weights directly:

Out = purelin(purelin(0.95*IW+b(1))*LW+b(2)) = 0.6200

I get a different result from the result of sim! How can that be? What's wrong?

The code:

%Main_TestWeights
close all
clear all
clc

t1 = 0:0.01:1;
x = t1.^2;

hiddenSizes = 1;
net = feedforwardnet(hiddenSizes);

[Xs,Xi,Ai,Ts,EWs,shift] = preparets(net,con2seq(t1),con2seq(x));
net.layers{1,1}.transferFcn = 'purelin';
[net,tr,Y,E,Pf,Af] = train(net,Xs,Ts,Xi,Ai);
view(net);

IW = cat(2,net.IW{1});
LW = cat(2,net.LW{2,1});
b = cat(2,[net.b{1,1},net.b{2,1}]);

%Result from Sim
t2 = 0.95;
Yk = sim(net,t2)

%Result from Weights
x1 = IW*t2'+b(1)
x1out = purelin(x1)
x2 = purelin(x1out*(LW)+b(2))
The Neural Network Toolbox rescales inputs and outputs to the [-1,1] range. You must therefore rescale and unscale them so that your hand-computed output matches sim()'s output:

%Result from Weights
x1 = 2*t2 - 1;                   % rescale
x1 = IW*x1+b(1);
x1out = purelin(x1);
x2 = purelin(x1out*(LW)+b(2));
x2 = (x2+1)/2                    % unscale

then

>> x2 == Yk
ans =
     1
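More generally, the same mapping can be written out explicitly. Below is a small Python/NumPy sketch (kept in the same language as the rest of this page) of the [-1, 1] min-max scaling and its inverse; the [0, 1] bounds in the example call are the assumed original range of the inputs and targets in this question.

import numpy as np

def to_pm1(v, vmin, vmax):
    # map [vmin, vmax] -> [-1, 1], which is what the toolbox's default preprocessing does
    return 2.0*(np.asarray(v) - vmin)/(vmax - vmin) - 1.0

def from_pm1(v, vmin, vmax):
    # inverse map [-1, 1] -> [vmin, vmax]
    return (np.asarray(v) + 1.0)/2.0*(vmax - vmin) + vmin

# with vmin = 0 and vmax = 1 these reduce to the 2*t2 - 1 and (x2 + 1)/2 used above
print(to_pm1(0.95, 0.0, 1.0), from_pm1(0.9, 0.0, 1.0))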