I am very new to MATLAB. What I am trying to do is classify the iris dataset using cross-validation, which means I have to split the dataset in three: a training set, a validation set, and a test set. In my mind everything I write here is OK (being a beginner is hard sometimes), so I could use a little help...
This is the script that splits the data: for each class, the first 35 samples (70% of the data) are the training set, the next 8 (about 15%) are the validation set, and the last 7 (about 15%) I will use later as the test set.
close all; clear ;
load fisheriris;
for i = 1:35
    for j = 1:4
        trainSeto(i,j) = meas(i,j);
    end
end
for i = 51:85
    for j = 1:4
        trainVers(i-50,j) = meas(i,j);
    end
end
for i = 101:135
    for j = 1:4
        trainVirg(i-100,j) = meas(i,j);
    end
end
for i = 36:43
    for j = 1:4
        valSeto(i-35,j) = meas(i,j);
    end
end
for i = 86:93
    for j = 1:4
        valVers(i-85,j) = meas(i,j);
    end
end
for i = 136:143
    for j = 1:4
        valVirg(i-135,j) = meas(i,j);
    end
end
for i = 44:50
    for j = 1:4
        testSeto(i-43,j) = meas(i,j);
    end
end
for i = 94:100
    for j = 1:4
        testVers(i-93,j) = meas(i,j);
    end
end
for i = 144:150
    for j = 1:4
        testVirg(i-143,j) = meas(i,j);
    end
end
And this is the main script:
close all; clear;
%% the 3 types of iris
run divinp
% the representation of the 3 classes (their coding)
a = [-1 -1 +1]';
b = [-1 +1 -1]';
c = [+1 -1 -1]';
%training set
trainInp = [trainSeto trainVers trainVirg];
%the targets
T = [repmat(a,1,length(trainSeto)) repmat(b,1,length(trainVers)) repmat(c,1,length(trainVirg))];
%%the training
trainCor = zeros(10,10);
valCor = zeros(10,10);
Xn = zeros(1,10);
Yn = zeros(1,10);
for k = 1:10
    Yn(1,k) = k;
    for n = 1:10
        Xn(1,n) = n;
        net = newff(trainInp,T,[k n],{},'trainbfg');
        net = init(net);
        net.divideParam.trainRatio = 1;
        net.divideParam.valRatio = 0;
        net.divideParam.testRatio = 0;
        net.trainParam.max_fail = 2;
        valInp = [valSeto valVers valVirg];
        valT = [repmat(a,1,length(valSeto)) repmat(b,1,length(valVers)) repmat(c,1,length(valVirg))];
        [net,tr] = train(net,trainInp,T);
        Y = sim(net,trainInp);
        [Yval,Pfval,Afval,Eval,perfval] = sim(net,valInp,[],[],valT);
        % calculate [%] of correct classifications
        trainCor(k,n) = 100 * length(find(T.*Y > 0)) / length(T);
        valCor(k,n) = 100 * length(find(valT.*Yval > 0)) / length(valT);
    end
end
figure
surf(Xn,Yn,trainCor/3);
view(2)
figure
surf(Xn,Yn,valCor/3);
view(2)
I get this error:
Error using trainbfg (line 120)
Inputs and targets have different numbers of samples.

Error in network/train (line 106)
[net,tr] = feval(net.trainFcn,net,X,T,Xi,Ai,EW,net.trainParam);

Error in ClassIris (line 38)
[net,tr] = train(net,trainInp,T);
close all; clear ;
load fisheriris;
trainSetoIndx = 1:35;
trainVersIndx = 51:85; % or: trainVersIndx = trainSetoIndx + 50;
trainVirgIndx = 101:135;
colIndx = 1:4;
trainSeto = meas(trainSetoIndx, colIndx);
trainVers = meas(trainVersIndx, colIndx);
trainVirg = meas(trainVirgIndx, colIndx);
valSetoIndx = 36:43;
valVersIndx = 86:93;
valVirgIndx = 136:143;
valSeto = meas(valSetoIndx, colIndx);
valVers = meas(valVersIndx, colIndx);
valVirg = meas(valVirgIndx, colIndx);
testSetoIndx = 44:50;
testVersIndx = 94:100;
testVirgIndx = 144:150;
testSeto = meas(testSetoIndx, colIndx);
testVers = meas(testVersIndx, colIndx);
testVirg = meas(testVirgIndx, colIndx);
I have written it with ":" as well, and I still get the same problem. It's something with repmat... I don't know how to use it properly, or newff :D
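An aside on repmat, since it is mentioned: repmat(a,1,n) tiles the column vector a into n columns (a small illustration of my own, not from the original post):

a = [-1 -1 +1]';
repmat(a,1,3)
% ans =
%     -1    -1    -1
%     -1    -1    -1
%      1     1     1

So repmat(a,1,length(trainSeto)) produces one 3-element target column per setosa training sample (the length of a 35x4 matrix is 35).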
Just to get you started, you can rewrite your code loops as follows:
trainSetoIndx = 1:35;
trainVersIndx = 51:85; % or: trainVersIndx = trainSetoIndx + 50;
trainVirgIndx = 101:135; % or: trainVirgIndx = trainSetoIndx + 100;
colIndx = 1:4; % can't tell if this is all the columns in meas
trainSeto = meas(trainSetoIndx, colIndx);
trainVers = meas(trainVersIndx, colIndx);
trainVirg = meas(trainVirgIndx, colIndx);
Then do the same thing for all the others:
valSetoIndx = 36:43;
etc.
Next, simply type whos at the command prompt and you will see the sizes of all the arrays you have created. See whether the ones that need to be the same size have, in fact, the same dimensions.
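If the dimensions do not line up, note that newff and train in the (old) Neural Network Toolbox expect one column per sample. Since each train* block is 35 x 4 with rows as samples, the concatenation has to be done on the transposes; a minimal sketch of what I mean, reusing the question's variable names:

trainInp = [trainSeto' trainVers' trainVirg'];    % 4 x 105: one column per sample
T = [repmat(a,1,size(trainSeto,1)) ...
     repmat(b,1,size(trainVers,1)) ...
     repmat(c,1,size(trainVirg,1))];              % 3 x 105: one target column per sample

With 105 samples in both trainInp and T, trainbfg should no longer complain that inputs and targets have different numbers of samples.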
I'm trying to generate a power curve which is Gaussian, but in the generated plot I need to remove the marginal values. Could someone please guide me how? Thanks.
Following is the code I've written for the power curve:
function [xgrid,ygrid,Z] = biVariateContourPlotsGMMCopula(givenData,gmmObject,~,numMeshPoints,x_dim,y_dim)
d = 2;
if nargin < 5
    x_dim = 1;
    y_dim = 2;
end
if x_dim == y_dim
    hist(givenData(:,x_dim),10);
    return;
end
numMeshPoints = min(numMeshPoints,256);
givenData = givenData(:,[x_dim y_dim]);
alpha = gmmObject.alpha;
mu = gmmObject.mu(:,[x_dim y_dim]);
sigma = gmmObject.sigma([x_dim y_dim],[x_dim y_dim],:) + 0.005*repmat(eye(d),[1 1 numel(alpha)]);
gmmObject = gmdistribution(mu,sigma,alpha);
bin_num = 256;
for j = 1:2
    l_limit = min(gmmObject.mu(:,j))-3*(max(gmmObject.Sigma(j,j,:))^0.5);
    u_limit = max(gmmObject.mu(:,j))+3*(max(gmmObject.Sigma(j,j,:))^0.5);
    xmesh_inverse_space{j} = (l_limit:(u_limit-l_limit)/(bin_num-1):u_limit);
end
%if isempty(xmesh)||isempty(pdensity)||isempty(cdensity)
% Following for loop does the non-parameteric estimation of marginal
% densities if not provided
for i = 1:d
    currentVar = givenData(:,i);
    [bandwidth,pdensity{i},xmesh{i}] = kde(currentVar,numMeshPoints);
    pdensity{i}(pdensity{i}<0) = 0;   % clip negative density estimates to zero
    cdensity{i} = cumsum(pdensity{i});
    cdensity{i} = (cdensity{i}-min(cdensity{i}))/(max(cdensity{i})-min(cdensity{i}));
end
[xgrid,ygrid] = meshgrid(xmesh{1}(2:end-1),xmesh{2}(2:end-1));
for k = 1:d
    marginalLogLikelihood_grid{k} = log(pdensity{k}(2:end-1)+eps);
    marginalCDFValues_grid{k} = cdensity{k}(2:end-1);
end
[marg1,marg2] = meshgrid(marginalLogLikelihood_grid{1},marginalLogLikelihood_grid{2});
[xg,yg] = meshgrid(marginalCDFValues_grid{1},marginalCDFValues_grid{2});
inputMatrix = [reshape(xg,numel(xg),1) reshape(yg,numel(yg),1)];
clear xg yg;
copulaLogLikelihoodVals = gmmCopulaPDF(inputMatrix,gmmObject,xmesh_inverse_space);
Z = reshape(copulaLogLikelihoodVals,size(marg1,1),size(marg1,2));
Z = Z+marg1+marg2;
Z = exp(Z);
plot(givenData(:,1),givenData(:,2),'k.','MarkerSize',3); hold on;
contour(xgrid,ygrid,Z,40);
%title_string = ['GMCM fit (Log-Likelihood = ',num2str(logLikelihoodVal), ')'];
%title(title_string,'FontSize',12,'FontWeight','demi');
axis tight;
I'm trying to find the optimal hyperparameters for my SVM model using a grid search, but it simply returns 1 for both hyperparameters.
function evaluations = inner_kfold_trainer(C,q,k,features_xy,labels)
features_xy_flds = kdivide(features_xy, k);
labels_flds = kdivide(labels, k);
evaluations = zeros(k,3);
for i = 1:k
    fprintf('Fold %i of %i\n',i,k);
    train_data = cell2mat(features_xy_flds(1:end ~= i));
    train_labels = cell2mat(labels_flds(1:end ~= i));
    test_data = cell2mat(features_xy_flds(i));
    test_labels = cell2mat(labels_flds(i));
    %AU1
    train_labels = train_labels(:,1);
    test_labels = test_labels(:,1);
    [k,~] = size(test_labels); % note: this overwrites the fold count k
    %train
    sv = fitcsvm(train_data,train_labels, 'KernelFunction','polynomial', 'PolynomialOrder',q,'BoxConstraint',C);
    %Calculate evaluative measures
    sv_predictions = sv.predict(test_data);
    [precision,recall,F1] = evaluation(sv_predictions,test_labels);
    evaluations(i,1) = precision;
    evaluations(i,2) = recall;
    evaluations(i,3) = F1;
end
save('eval.mat', 'evaluations');
end
This is an inner k-fold cross-validation function. Below is the grid search function, where something seems to be going wrong:
function [q,C] = grid_search(features_xy,labels,k)
% n x n grid
n = 3;
q_grid = linspace(1,19,n);
C_grid = linspace(1,59,n);
tic
evals = zeros(n,n,3);
for i = 1:n
    for j = 1:n
        fprintf('## i=%i, j=%i ##\n', i, j);
        svm_results = inner_kfold_trainer(C_grid(i), q_grid(j),k,features_xy,labels);
        evals(i,j,:) = mean(svm_results(:,:));
        % precision only
        %evals(i,j,:) = max(svm_results(:,1));
        toc
    end
end
f = evals;
% retrieving the best value of the hyperparameters, to use in the outer fold
[M1,I1] = max(f);
[~,I2] = max(M1(1,1,:));
index = I1(:,:,I2);
C = C_grid(index(1))
q = q_grid(index(2))
end
When I run grid_search(features_xy,labels,8), for example, I get C=1 and q=1 for any value of k (the number of folds). Also, features_xy is a 500*98 matrix.
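For what it's worth, a common pattern for recovering the row/column subscripts of the largest element in a 2-D slice looks like this (a sketch that assumes you want the grid cell with the best mean precision; it is not from the original post):

[~, linIdx] = max(reshape(evals(:,:,1), [], 1));  % linear index of the best cell
[iBest, jBest] = ind2sub([n n], linIdx);          % convert to row/column subscripts
C = C_grid(iBest);
q = q_grid(jBest);

By contrast, index = I1(:,:,I2) holds the best row index for every column, so index(1) and index(2) are both row indices; using index(2) as a column index mixes up the two grid dimensions.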
I have been playing around with parallelization, both using OpenACC and OpenMP in Fortran. I am now trying to do the same in MATLAB. I find it very interesting that it seems to be very hard to parallelize a loop using GPUs in MATLAB; apparently the only way to do it is by using the arrayfun function, but I might be wrong.
At a conceptual level, I am wondering why GPU usage in MATLAB is not more straightforward than in Fortran. At a more practical level, I am wondering how to use GPUs on the simple code below.
Below, I am sharing three codes and benchmarks:
Fortran OpenMP code
Fortran ACC code
Matlab parfor code
Matlab CUDA (?): this is the one I don't know how to do.
Fortran OpenMP:
program rbc
use omp_lib ! For timing
use tools
implicit none
real, parameter :: beta = 0.984, eta = 2, alpha = 0.35, delta = 0.01, &
rho = 0.95, sigma = 0.005, zmin=-0.0480384, zmax=0.0480384;
integer, parameter :: nz = 4, nk=4800;
real :: zgrid(nz), kgrid(nk), t_tran_z(nz,nz), tran_z(nz,nz);
real :: kmax, kmin, tol, dif, c(nk), r(nk), w(nk);
real, dimension(nk,nz) :: v=0., v0=0., ev=0., c0=0.;
integer :: i, iz, ik, cnt;
logical :: ind(nk);
real(kind=8) :: start, finish ! For timing
real :: tmpmax, c1
call omp_set_num_threads(12)
!Grid for productivity z
! [1 x 4] grid of values for z
call linspace(zmin,zmax,nz,zgrid)
zgrid = exp(zgrid)
! [4 x 4] Markov transition matrix of z
tran_z(1,1) = 0.996757
tran_z(1,2) = 0.00324265
tran_z(1,3) = 0
tran_z(1,4) = 0
tran_z(2,1) = 0.000385933
tran_z(2,2) = 0.998441
tran_z(2,3) = 0.00117336
tran_z(2,4) = 0
tran_z(3,1) = 0
tran_z(3,2) = 0.00117336
tran_z(3,3) = 0.998441
tran_z(3,4) = 0.000385933
tran_z(4,1) = 0
tran_z(4,2) = 0
tran_z(4,3) = 0.00324265
tran_z(4,4) = 0.996757
! Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)**(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)**(1/(alpha-1));
! [1 x 4800] grid of possible values of k
call linspace(kmin, kmax, nk, kgrid)
! Compute initial wealth c0(k,z)
do iz=1,nz
c0(:,iz) = zgrid(iz)*kgrid**alpha + (1-delta)*kgrid;
end do
dif = 10000
tol = 1e-8
cnt = 1
do while(dif>tol)
    !$omp parallel do default(shared) private(ik,iz,i,tmpmax,c1)
    do ik=1,nk
        do iz = 1,nz
            tmpmax = -huge(0.)
            do i = 1,nk
                c1 = c0(ik,iz) - kgrid(i)
                if(c1<0) exit
                c1 = c1**(1-eta)/(1-eta)+ev(i,iz)
                if(tmpmax<c1) tmpmax = c1
            end do
            v(ik,iz) = tmpmax
        end do
    end do
    !$omp end parallel do
    ev = beta*matmul(v,tran_z)
    dif = maxval(abs(v-v0))
    v0 = v
    if(mod(cnt,1)==0) write(*,*) cnt, ':', dif
    cnt = cnt+1
end do
end program
Fortran ACC:
Just replace the main loop syntax in the above code with:
do while(dif>tol)
    !$acc kernels
    !$acc loop gang
    do ik=1,nk
        !$acc loop gang
        do iz = 1,nz
            tmpmax = -huge(0.)
            do i = 1,nk
                c1 = c0(ik,iz) - kgrid(i)
                if(c1<0) exit
                c1 = c1**(1-eta)/(1-eta)+ev(i,iz)
                if(tmpmax<c1) tmpmax = c1
            end do
            v(ik,iz) = tmpmax
        end do
    end do
    !$acc end kernels
    ev = beta*matmul(v,tran_z)
    dif = maxval(abs(v-v0))
    v0 = v
    if(mod(cnt,1)==0) write(*,*) cnt, ':', dif
    cnt = cnt+1
end do
Matlab parfor:
(I know the code below could be made faster by using vectorized syntax, but the whole point of the exercise is to compare loop speeds.)
tic;
beta = 0.984;
eta = 2;
alpha = 0.35;
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;
v=zeros(nk,nz);
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);
%Grid for productivity z
%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;
% Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));
% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);
% Compute initial wealth c0(k,z)
for iz=1:nz
c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end
dif = 10000;
tol = 1e-8;
cnt = 1;
while dif>tol
    parfor ik=1:nk
        for iz = 1:nz
            tmpmax = -intmax;
            for i = 1:nk
                c1 = c0(ik,iz) - kgrid(i);
                if (c1<0)
                    continue
                end
                c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
                if tmpmax<c1
                    tmpmax = c1;
                end
            end
            v(ik,iz) = tmpmax;
        end
    end
    ev = beta*v*tran_z;
    dif = max(max(abs(v-v0)));
    v0 = v;
    if mod(cnt,1)==0
        fprintf('%1.5f : %1.5f \n', [cnt dif])
    end
    cnt = cnt+1;
end
toc
Matlab CUDA:
This is the one I have no clue how to code. Is using arrayfun the only way of doing this? In Fortran it is so simple to move from OpenMP to OpenACC. Isn't there an easy way in MATLAB of going from parfor to GPU loops?
The time comparison between codes:
Fortran OpenMP: 83.1 seconds
Fortran ACC: 2.4 seconds
Matlab parfor: 1182 seconds
As a final remark, I should say the codes above solve a simple Real Business Cycle model and were written based on this.
Matlab Coder
First, as Dev-iL already mentioned, you can use GPU Coder.
It would only require minor changes in your code (I use R2019a):
function cdapted()
beta = 0.984;
eta = 2;
alpha = 0.35;
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;
v=zeros(nk,nz);
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);
%Grid for productivity z
%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z = zeros([4,4]);
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;
% Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));
% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);
% Compute initial wealth c0(k,z)
for iz=1:nz
c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end
dif = 10000;
tol = 1e-8;
cnt = 1;
while dif>tol
    for ik=1:nk
        for iz = 1:nz
            tmpmax = double(intmin);
            for i = 1:nk
                c1 = c0(ik,iz) - kgrid(i);
                if (c1<0)
                    continue
                end
                c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
                if tmpmax<c1
                    tmpmax = c1;
                end
            end
            v(ik,iz) = tmpmax;
        end
    end
    ev = beta*v*tran_z;
    dif = max(max(abs(v-v0)));
    v0 = v;
    % I've commented out fprintf because double2single cannot handle it
    % (could be manually uncommented in the converted version if needed)
    % ------------
    % if mod(cnt,1)==0
    %     fprintf('%1.5f : %1.5f \n', cnt, dif);
    % end
    cnt = cnt+1;
end
end
The script to build this is:
% unload mex files
clear mex
%% Build for gpu, float64
% Produces ".\codegen\mex\cdapted" folder and "cdapted_mex.mexw64"
cfg = coder.gpuConfig('mex');
codegen -config cfg cdapted
% benchmark it (~7.14s on my GTX1080Ti)
timeit(@() cdapted_mex,0)
%% Build for gpu, float32:
% Produces ".\codegen\cdapted\single" folder
scfg = coder.config('single');
codegen -double2single scfg cdapted
% Produces ".\codegen\mex\cdapted_single" folder and "cdapted_single_mex.mexw64"
cfg = coder.gpuConfig('mex');
codegen -config cfg .\codegen\cdapted\single\cdapted_single.m
% benchmark it (~2.09s on my GTX1080Ti)
timeit(@() cdapted_single_mex,0)
So, if your Fortran binary is using float32 precision (I suspect so), this Matlab Coder result is on par with it. That does not mean that both are highly efficient, though. The code generated by Matlab Coder is still far from efficient, and it does not fully utilize the GPU (even the TDP is only ~50%).
Vectorization and gpuArray
Next, I agree with user10597469 and Nicky Mattsson that your Matlab code does not look like normal "native" vectorized Matlab code.
There are many things to adjust (though arrayfun is hardly better than for). Firstly, let's remove the for loops:
function vertorized1()
t_tot = tic();
beta = 0.984;
eta = 2;
alpha = 0.35;
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;
v=zeros(nk,nz);
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);
%Grid for productivity z
%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z = zeros([4,4]);
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;
% Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));
% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);
% Compute initial wealth c0(k,z)
for iz=1:nz
c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end
dif = 10000;
tol = 0.4;
tol = 1e-8;
cnt = 1;
t_acc=zeros([1,2]);
while dif>tol
    %% orig-noparfor:
    t=tic();
    for ik=1:nk
        for iz = 1:nz
            tmpmax = -intmax;
            for i = 1:nk
                c1 = c0(ik,iz) - kgrid(i);
                if (c1<0)
                    continue
                end
                c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
                if tmpmax<c1
                    tmpmax = c1;
                end
            end
            v(ik,iz) = tmpmax;
        end
    end
    t_acc(1) = t_acc(1) + toc(t);
    %% better:
    t=tic();
    kgrid_ = reshape(kgrid,[1 1 numel(kgrid)]);
    c1_ = c0 - kgrid_;
    c1_x = c1_.^(1-eta)/(1-eta);
    c2 = c1_x + reshape(ev', [1 nz nk]);
    c2(c1_<0) = -Inf;
    v_ = max(c2,[],3);
    t_acc(2) = t_acc(2) + toc(t);
    %% compare
    assert(isequal(v_,v));
    v=v_;
    %% other
    ev = beta*v*tran_z;
    dif = max(max(abs(v-v0)));
    v0 = v;
    if mod(cnt,1)==0
        fprintf('%1.5f : %1.5f \n', cnt, dif);
    end
    cnt = cnt+1;
end
disp(t_acc);
disp(toc(t_tot));
end
% toc result:
% tol = 0.4 -> 12 iterations :: t_acc = [ 17.7 9.8]
% tol = 1e-8 -> 1124 iterations :: t_acc = [1758.6 972.0]
%
% (all 1124 iterations) with commented-out orig :: t_tot = 931.7443
Now it is strikingly evident that most of the computationally intense calculations inside the while loop (e.g. ^(1-eta)/(1-eta)) actually produce constants that could be pre-calculated. Once we fix that, the result is already a bit faster than the original parfor-based version (on my 2x E5-2630v3):
function vertorized2()
t_tot = tic();
beta = 0.984;
eta = 2;
alpha = 0.35;
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;
v=zeros(nk,nz);
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);
%Grid for productivity z
%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z = zeros([4,4]);
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;
% Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));
% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);
% Compute initial wealth c0(k,z)
for iz=1:nz
c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end
dif = 10000;
tol = 0.4;
tol = 1e-8;
cnt = 1;
t_acc=zeros([1,2]);
%% constants:
kgrid_ = reshape(kgrid,[1 1 numel(kgrid)]);
c1_ = c0 - kgrid_;
mask=zeros(size(c1_));
mask(c1_<0)=-Inf;
c1_x = c1_.^(1-eta)/(1-eta);
while dif>tol
    %% orig:
    t=tic();
    parfor ik=1:nk
        for iz = 1:nz
            tmpmax = -intmax;
            for i = 1:nk
                c1 = c0(ik,iz) - kgrid(i);
                if (c1<0)
                    continue
                end
                c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
                if tmpmax<c1
                    tmpmax = c1;
                end
            end
            v(ik,iz) = tmpmax;
        end
    end
    t_acc(1) = t_acc(1) + toc(t);
    %% better:
    t=tic();
    c2 = c1_x + reshape(ev', [1 nz nk]);
    c2 = c2 + mask;
    v_ = max(c2,[],3);
    t_acc(2) = t_acc(2) + toc(t);
    %% compare
    assert(isequal(v_,v));
    v=v_;
    %% other
    ev = beta*v*tran_z;
    dif = max(max(abs(v-v0)));
    v0 = v;
    if mod(cnt,1)==0
        fprintf('%1.5f : %1.5f \n', cnt, dif);
    end
    cnt = cnt+1;
end
disp(t_acc);
disp(toc(t_tot));
end
% toc result:
% tol = 0.4 -> 12 iterations :: t_acc = [ 2.4 1.7]
% tol = 1e-8 -> 1124 iterations :: t_acc = [188.3 115.9]
%
% (all 1124 iterations) with commented-out orig :: t_tot = 117.6217
This vectorized code is still inefficient (e.g. the reshape(ev',...), which eats ~60% of the time, could easily be avoided by re-ordering dimensions; see the sketch at the end of this answer), but it is somewhat suitable for gpuArray():
function vectorized3g()
t0 = tic();
beta = 0.984;
eta = 2;
alpha = 0.35;
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;
v=zeros(nk,nz);
v0=zeros(nk,nz);
ev=gpuArray(zeros(nk,nz,'single'));
c0=zeros(nk,nz);
%Grid for productivity z
%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z = zeros([4,4]);
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;
% Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));
% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);
% Compute initial wealth c0(k,z)
for iz=1:nz
c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end
dif = 10000;
tol = 1e-8;
cnt = 1;
t_acc=zeros([1,2]);
%% constants:
kgrid_ = reshape(kgrid,[1 1 numel(kgrid)]);
c1_ = c0 - kgrid_;
mask=gpuArray(zeros(size(c1_),'single'));
mask(c1_<0)=-Inf;
c1_x = c1_.^(1-eta)/(1-eta);
c1_x = gpuArray(single(c1_x));
while dif>tol
    %% orig:
    % t=tic();
    % parfor ik=1:nk
    %     for iz = 1:nz
    %         tmpmax = -intmax;
    %
    %         for i = 1:nk
    %             c1 = c0(ik,iz) - kgrid(i);
    %             if (c1<0)
    %                 continue
    %             end
    %             c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
    %             if tmpmax<c1
    %                 tmpmax = c1;
    %             end
    %         end
    %         v(ik,iz) = tmpmax;
    %     end
    %
    % end
    % t_acc(1) = t_acc(1) + toc(t);
    %% better:
    t=tic();
    c2 = c1_x + reshape(ev', [1 nz nk]);
    c2 = c2 + mask;
    v_ = max(c2,[],3);
    t_acc(2) = t_acc(2) + toc(t);
    %% compare
    % assert(isequal(v_,v));
    v = v_;
    %% other
    ev = beta*v*tran_z;
    dif = max(max(abs(v-v0)));
    v0 = v;
    if mod(cnt,1)==0
        fprintf('%1.5f : %1.5f \n', cnt, dif);
    end
    cnt = cnt+1;
end
disp(t_acc);
disp(toc(t0));
end
% (all 849 iterations) with commented-out orig :: t_tot = 14.9040
This ~15 sec result is ~7x worse than the ~2 sec we got from Matlab Coder, but this option requires fewer toolboxes. In practice, gpuArray is most handy when you start from writing "native" Matlab code, including interactive use.
Finally, if you build this final vectorized version with Matlab Coder (you would have to make some trivial adjustments), it won't be faster than the first one; it would be 2x-3x slower.
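As a side note, here is roughly how the reshape(ev',...) mentioned above could be re-ordered away (an untested sketch that reuses the variables of the second vectorized version):

% precompute once, outside the while loop, with the choice index i first:
c1_xp = permute(c1_x + mask, [3 2 1]);   % nk x nz x nk, indexed (i, iz, ik)
% inside the loop, ev (nk x nz) then broadcasts without any reshape:
c2 = c1_xp + ev;
v_ = squeeze(max(c2, [], 1))';           % back to nk x nz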
So, this bit is what is going to mess you up on this project. MATLAB stands for Matrix Laboratory; vectors and matrices are kind of its thing. The number 1 way to optimize anything in MATLAB is to vectorize it. For this reason, when using performance-enhancing tools like CUDA, MATLAB assumes that you are going to vectorize your inputs if possible. Given the primacy of vectorizing inputs in the MATLAB coding style, it is not a fair comparison to assess its performance using only loops. It would be like assessing the performance of C++ while refusing to use pointers. If you want to use CUDA with MATLAB, the main way to go about it is to vectorize your inputs and use gpuArray. Honestly, I haven't looked too hard at your code, but it kind of looks like your inputs are already mostly vectorized. You may be able to get away with something as simple as gpuArray(1:nk) or kgrid = gpuArray(linspace(...)).
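As a minimal illustration of that approach (my own sketch; it assumes the Parallel Computing Toolbox and a supported GPU):

kgrid = gpuArray(linspace(0, 1, 4800));   % the array now lives in GPU memory
y = kgrid.^2 + 1;                         % elementwise operations run on the GPU
y_host = gather(y);                       % copy the result back to host memory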
I have implemented 3 functions for neural network regression:
1) a forward propagation function that, given the training inputs and the net structure, calculates the predicted output
function [y_predicted] = forwardProp(Theta,Baias,Inputs,NumberOfLayers,RegressionSwitch)
for i = 1:size(Inputs{1},2)
    Activation = (Inputs{1}(:,i))';
    for j = 2:NumberOfLayers - RegressionSwitch
        Activation = 1./(1+exp(-(Activation*Theta{j-1} + Baias{j-1})));
    end
    if RegressionSwitch == 1
        y_predicted(:,i) = Activation*Theta{end} + Baias{end};
    else
        y_predicted(:,i) = Activation;
    end
end
end
2) a cost function that, given the predicted and the desired output, calculates the cost of the network
function [Cost] = costFunction(y_predicted, y, Theta, Baias, Lambda)
Cost = 0;
for j = 1:size(y,2)
    for i = 1:size(y,1)
        Cost = Cost + (((y(i,j) - y_predicted(i,j))^2)/size(y,2));
    end
end
Reg = 0;
for i = 1:size(Theta, 2)
    for j = 1:size(Theta{i}, 1)
        for k = 1:size(Theta{i}, 2)
            Reg = Reg + (Theta{i}(j,k))^2;
        end
    end
end
for i = 1:size(Baias, 2)
    for j = 1:length(Baias{i})
        Reg = Reg + (Baias{i}(j))^2;
    end
end
Cost = Cost + (Lambda/(2*size(y,2)))*Reg;
end
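(For reference, my reading of the code above: with m = size(y,2) samples, it computes Cost = (1/m) * sum of (y - y_predicted).^2 over all outputs and samples, plus the regularization term (Lambda/(2*m)) * (sum of all Theta.^2 + sum of all Baias.^2).)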
3) a back propagation function that calculates the partial derivative of the cost function for each weight in the network
function [dTheta, dBaias] = Deltas(Theta,Baias,Inputs,NumberOfLayers,RegressionSwitch, Epsilon, Lambda, y)
for i = 1:size(Theta,2)
    for j = 1:size(Theta{i},1)
        for k = 1:size(Theta{i},2)
            dTp = Theta;
            dTm = Theta;
            dTp{i}(j,k) = dTp{i}(j,k) + Epsilon;
            dTm{i}(j,k) = dTm{i}(j,k) - Epsilon;
            y_predicted_p = forwardProp(dTp,Baias,Inputs,NumberOfLayers,RegressionSwitch);
            y_predicted_m = forwardProp(dTm,Baias,Inputs,NumberOfLayers,RegressionSwitch);
            Cost_p = costFunction(y_predicted_p, y, dTp, Baias, Lambda);
            Cost_m = costFunction(y_predicted_m, y, dTm, Baias, Lambda);
            dTheta{i}(j,k) = (Cost_p - Cost_m)/(2*Epsilon);
        end
    end
end
for i = 1:size(Baias,2)
    for j = 1:length(Baias{i})
        dBp = Baias;
        dBm = Baias;
        dBp{i}(j) = dTp{i}(j) + Epsilon;
        dBm{i}(j) = dTm{i}(j) - Epsilon;
        y_predicted_p = forwardProp(Theta,dBp,Inputs,NumberOfLayers,RegressionSwitch);
        y_predicted_m = forwardProp(Theta,dBm,Inputs,NumberOfLayers,RegressionSwitch);
        Cost_p = costFunction(y_predicted_p, y, Theta, dBp, Lambda);
        Cost_m = costFunction(y_predicted_m, y, Theta, dBm, Lambda);
        dBaias{i}(j) = (Cost_p - Cost_m)/(2*Epsilon);
    end
end
end
I train the neural network with data from an exact mathematical function of the inputs.
The gradient descent seems to work, as the cost decreases each iteration, but when I test the trained network the regression is terrible.
The functions are not meant to be efficient, but they should work, so I am really frustrated to see they don't... The main function and the data are OK, so the problem should be here. Can you please help me to spot it?
Here is the "main":
clear;
clc;
Nodes_X = 5;
Training_Data = 1000;
x = rand(Nodes_X, Training_Data)*3;
y = zeros(2,Training_Data);
for j = 1:Nodes_X
    for i = 1:Training_Data
        y(1,i) = (x(1,i)^2)+x(2,i)-x(3,i)+2*x(4,i)/x(5,i);
        y(2,i) = (x(5,i)^2)+x(2,i)-x(3,i)+2*x(4,i)/x(1,i);
    end
end
vx = rand(Nodes_X, Training_Data)*3;
vy = zeros(2,Training_Data);
for j = 1:Nodes_X
    for i = 1:Training_Data
        vy(1,i) = (vx(1,i)^2)+vx(2,i)-vx(3,i)+2*vx(4,i)/vx(5,i);
        vy(2,i) = (vx(5,i)^2)+vx(2,i)-vx(3,i)+2*vx(4,i)/vx(1,i);
    end
end
%%%%%%%%%%%%%%%%%%%%%%ASSIGN NODES TO EACH LAYER%%%%%%%%%%%%%%%%%%%%%%%%%%%
NumberOfLayers = 4;
Nodes(1) = 5;
Nodes(2) = 10;
Nodes(3) = 10;
Nodes(4) = 2;
if length(Nodes) ~= NumberOfLayers || (Nodes(1)) ~= size(x, 1)
WARNING = msgbox('Nodes assigned incorrectly!');
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%INITIALIZATION%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
for i = 1:NumberOfLayers-1
Theta{i} = rand(Nodes(i),Nodes(i+1));
Baias{i} = rand(1,Nodes(i+1));
end
Inputs{1} = x;
Outputs{1} = y;
RegressionSwitch = 1;
Lambda = 10;
Epsilon = 0.00001;
Alpha = 0.01;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%TRAINING%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Epoch = 0;
figure;
hold on;
while Epoch <= 20
    %%%%%%%%%%%%%%%%%%%%FORWARD PROPAGATION%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    y_predicted = forwardProp(Theta,Baias,Inputs,NumberOfLayers,RegressionSwitch);
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%COST%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    Cost = costFunction(y_predicted, y, Theta, Baias, Lambda);
    scatter(Epoch,Cost);
    pause(0.01);
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%BACK PROPAGATION%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    [dTheta, dBaias] = Deltas(Theta,Baias,Inputs,NumberOfLayers,RegressionSwitch, Epsilon, Lambda, y);
    %%%%%%%%%%%%%%%GRADIENT DESCENT%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    for i = 1:size(Theta,2)
        Theta{i} = Theta{i}-Alpha*dTheta{i};
    end
    for i = 1:size(Baias,2)
        Baias{i} = Baias{i}-Alpha*dBaias{i};
    end
    Epoch = Epoch + 1;
end
hold off;
V_Inputs{1} = vx;
V_y_predicted = forwardProp(Theta,Baias,V_Inputs,NumberOfLayers,RegressionSwitch);
figure;
hold on;
for i = 1:size(vy,2)
scatter(vy(1,i),V_y_predicted(1,i));
pause(0.01);
end
hold off;
figure;
hold on;
for i = 1:size(vy,2)
scatter(vy(2,i),V_y_predicted(2,i));
pause(0.01);
end
hold off;
I currently have an error that I can't get past. This is the short code and everything needed in order to get a general idea of my problem:
close all; clear;
load fisheriris;
m = meas;
d = num2cell(m);
d(:,5) = species(:,1);
c = cvpartition(d(:,5),'kfold',10);
CeDam = cell(10,1);
CeVrem = cell(10,1);
for i=1:10
    CeDam{i} = [d(test(c,i),1) d(test(c,i),2) d(test(c,i),3) d(test(c,i),4)]';
end
for i=1:10
    CeVrem{i} = d(test(c,i),5)';
end
for i = 1:10
    a = CeVrem{i};
    [n,m] = size(a);
    for j = 1:n
        for k = 1:m
            if isequal(a(j,k),'setosa')
                a{n,m} = [1 0 0];
            elseif isequal(a(j,k),'versicolor')
                a{n,m} = [0 1 0];
            else
                a{j,k} = [0,0,1];
            end
        end
    end
    CeVrem{i} = a;
end
net = newff(cell2mat(minmax(CeDam{1})),[3 3 3],{'logsig','logsig','logsig',},'trainlm');
net.LW{2,1} = net.LW{2,1}*0.5;
net.b{2} = net.b{2}*2;
net.performFcn = 'mse';
net.trainParam.epochs = 100;
err = 0;
i = 1;
j = 1;
while i <= 10
    while j <= 10
        if i~=j
            net = train(net,CeDam{j},CeVrem{j});
        end
        j=j+1;
    end
end
In the train part of the algorithm it gives me an input mismatch, which is very odd to me.
The error messages:
Error using trainlm (line 109)
Number of inputs does not match net.numInputs.

Error in network/train (line 106)
[net,tr] = feval(net.trainFcn,net,X,T,Xi,Ai,EW,net.trainParam);
I managed to fix everything after much work. Here is the code that works, for anyone having the same problem in the future. GL :D :)
close all; clear;
load fisheriris;
m = meas;
d = num2cell(m);
d(:,5) = species(:,1);
c = cvpartition(d(:,5),'kfold',10);
CeDam = cell(10,1);
CeVrem = cell(10,1);
for i=1:10
    CeDam{i} = [m(test(c,i),1) m(test(c,i),2) m(test(c,i),3) m(test(c,i),4)]';
end
for i=1:10
    CeVrem{i} = d(test(c,i),5);
end
for i = 1:10
    a = CeVrem{i}';
    [n,m] = size(a);
    b = zeros(3,m);
    for j = 1:n
        for k = 1:m
            if isequal(a(j,k),{'setosa'})
                b(1,k) = 1; b(2,k) = 0; b(3,k) = 0;
            elseif isequal(a(j,k),{'versicolor'})
                b(1,k) = 0; b(2,k) = 1; b(3,k) = 0;
            else
                b(1,k) = 0; b(2,k) = 0; b(3,k) = 1;
            end
        end
    end
    CC{i} = b;
end
CC = CC';
net = newff(minmax(CeDam{1}),[3 3 3],{'logsig','logsig','logsig'},'trainlm');
net.LW{2,1} = net.LW{2,1}*0.6;
net.b{2} = net.b{2}*2;
net.performFcn = 'mse';
net.trainParam.epochs = 100;
errglob = 0;
i = 1;
j = 1;
while i <= 10
    while j <= 10
        if i~=j
            net = train(net,CeDam{j},CC{j});
        end
        j=j+1;
    end
    y=sim(net,CeDam{i});
    y=round(y);
    e = y - CC{i};
    errcur=mse(net,CC{i},y);
    errglob = errglob + mse(net,CC{i},y);
    fprintf('We have an error of %.2f on fold %d \n',errcur,i)
    i=i+1;
end
errglob/10
This thread can be closed. Thx :)
I think you've got some problems with mixing up cell and array formats...
Try to replace:
net = train(net,CeDam{j},CeVrem{j});
by:
net = train(net,cell2mat(CeDam{j}),cell2mat(CeVrem{j}')');
AND: please remove your infinite loops over i by adding i=i+1;, or replace the while loops with more natural for loops, e.g.
for i = 1:10
    for j = 1:10
        if i~=j
            net = train(net,cell2mat(CeDam{j}),cell2mat(CeVrem{j}')');
        end
    end
end
AND: Where are you using your i inside the loop? I guess something is missing there...