Optimizing bilinear interpolation / Vectorizing - matlab

I'm trying to reduce the processing time of bilinear interpolation of an image over a set of points. First I wrote a normal for loop. I can't use parfor, since I'm already using it in the function that calls the interpolation. I vectorized the function and it works, but there is almost no reduction in processing time.
Any ideas on how I can improve the vectorized version?
Some results:
Edit: the results were varying a lot before. I don't know why, but now they seem consistent and the values don't change much.
I added the mirt2D_mexinterp timing, as @Dev-iL pointed out. His code is a lot faster (~3x) than mine, and it uses C++.
Interp 2D tester
Time for each point and speed up in '%' based on the reference time.
Array size: 10000. Repeat: 10. Loops: 20
Normal for loop interp: 145.6354 ns (reference time)
Vect interp (in bounds): 68.6679 ns (112.09%)
Vect interp (out bounds): 66.5793 ns (118.74%)
Vect interp nearest (in bounds): 28.2870 ns (414.85%)
Vect interp nearest (out bounds): 26.3854 ns (451.95%)
tmirt2D from File Exchange: 22.6089 ns (544.15%)
Matlab interp2 linear: 921.0408 ns (-84.19%)
Matlab interp2 nearest: 911.7492 ns (-84.03%)
Testing code:
clearvars; clc; close all;
mustCompareMatlab = 1;
tm1 = 0;
tm2 = 0;
tm3 = 0;
tm4 = 0;
tm5 = 0;
tm6 = 0;
tm7 = 0;
tmirt2D = 0;
baseLoopN = 10;
for j=1:baseLoopN
%% Create data
dataSize = [1200 1600];
data = rand(dataSize)*255;
N = 1e4;
X = rand(1,N)*dataSize(2);
Y = rand(1,N)*dataSize(1);
% Only inside bounds-1 coordinates
Xin = X(X<dataSize(2)-1);
Yin = Y(Y<dataSize(1)-1);
% make X and Y same size
Xin(end+1:length(X)) = ones(1,length(X)-length(Xin));
Yin(end+1:length(X)) = ones(1,length(X)-length(Yin));
% Make sure X and Y contain out-of-bounds values
X(1) = dataSize(2) + 1;
Y(1) = dataSize(1) + 1;
loops = 20;
% Warm-up call (exclude first-call overhead from the timing)
interpData = Interp2D(data,X,Y);
tic
for i=1:loops
interpData = Interp2D(data,X,Y);
end
tm1 = tm1 + toc;
%% Only inside bounds coordinates
% Warm-up call (not timed)
interpData = Interp2DVect(data,Xin,Yin);
tic
for i=1:loops
interpData = Interp2DVect(data,Xin,Yin);
end
tm2 = tm2 + toc;
%% inside and outside bounds
% Warm-up call (not timed)
interpData = Interp2DVect(data,X,Y);
tic
for i=1:loops
interpDataInterp2DVect = Interp2DVect(data,X,Y);
end
tm3 = tm3 + toc;
%% inside bounds nearest
% Warm-up call (not timed)
interpData = Interp2DNearest(data,Xin,Yin);
tic
for i=1:loops
interpData = Interp2DNearest(data,Xin,Yin);
end
tm4 = tm4 + toc;
%% inside and outside bounds nearest
% Warm-up call (not timed)
interpData = Interp2DNearest(data,X,Y);
tic
for i=1:loops
interpData = Interp2DNearest(data,X,Y);
end
tm5 = tm5 + toc;
% mirt2D_mexinterp
% www.mathworks.com/matlabcentral/fileexchange/24183-2d-interpolation
interpData = mirt2D_mexinterp(data,X,Y);
tic
for i=1:loops
interpDataMirt2D = mirt2D_mexinterp(data,X,Y);
end
tmirt2D = tmirt2D + toc;
%% Matlab interp
if mustCompareMatlab
interpData = interp2(data,X,Y,'linear'); %#ok<*NASGU>
tic
for i=1:loops
interpData = interp2(data,Xin,Yin,'linear');
end
tm6 = tm6 + toc;
interpData = interp2(data,X,Y,'nearest');
tic
for i=1:loops
interpData = interp2(data,Xin,Yin,'nearest');
end
tm7 = tm7 + toc;
end
%%
end
A = interpDataInterp2DVect;
B = interpDataMirt2D;
C = A-B;
fprintf('Interp 2D tester\n');
fprintf('Array size: %d. Repeat: %d. Loops: %d\n',N,baseLoopN,loops);
tm1 = tm1*1e9/(baseLoopN*loops*N);
fprintf('Normal for loop interp: %.4f ns\n',tm1);
tm2 = tm2*1e9/(baseLoopN*loops*N);
fprintf('Vect interp (in bounds): %.4f ns (%.2f%%)\n',tm2,100*(tm1/tm2-1));
tm3 = tm3*1e9/(baseLoopN*loops*N);
fprintf('Vect interp (out bounds): %.4f ns (%.2f%%)\n',tm3,100*(tm1/tm3-1));
tm4 = tm4*1e9/(baseLoopN*loops*N);
fprintf('Vect interp nearest (in bounds): %.4f ns (%.2f%%)\n',tm4,100*(tm1/tm4-1));
tm5 = tm5*1e9/(baseLoopN*loops*N);
fprintf('Vect interp nearest (out bounds): %.4f ns (%.2f%%)\n',tm5,100*(tm1/tm5-1));
tmirt2D = tmirt2D*1e9/(baseLoopN*loops*N);
fprintf('tmirt2D from File Exchange: %.4f ns (%.2f%%)\n',tmirt2D,100*(tm1/tmirt2D-1));
if mustCompareMatlab
tm6 = tm6*1e9/(baseLoopN*loops*N);
fprintf('Matlab interp2 linear: %.4f ns (%.2f%%)\n',tm6,100*(tm1/tm6-1));
tm7 = tm7*1e9/(baseLoopN*loops*N);
fprintf('Matlab interp2 nearest: %.4f ns (%.2f%%)\n',tm7,100*(tm1/tm7-1));
end
Normal version:
function [interpData] = Interp2D(data,X,Y)
[sizY, sizX] = size(data);
interpData =zeros(1,length(X),'like',X);
for item = 1:length(X)
valX=floor(X(item));
valY=floor(Y(item));
valYp1=valY+1;
valXp1=valX+1;
if (sizY < valYp1)||(sizX < valXp1)|| valX<=0 || valY<=0
interpData(item)=NaN;
else
Inten00=data(valY,valX);
Inten10=data(valY,valXp1);
Inten01=data(valYp1,valX);
Inten11=data(valYp1,valXp1);
px=(X(item))-valX;
py=(Y(item))-valY;
interpData(item)=((1-px)*Inten00+px*Inten10)*(1-py)+((1-px)*Inten01+px*Inten11)*py;
end
end
Vectorized version:
% Image borders are disregarded for simplicity
function [interpData,outBounds] = Interp2DVect(data,X,Y)
[sizeY, sizeX] = size(data);
fX = floor(X);
fY = floor(Y);
sizeOkFlag = ~((sizeY-1 < fY)|(sizeX-1 < fX)| fX <= 0 | fY <= 0);
sizeAllOk = min(sizeOkFlag) >= 1;
outBounds = ~sizeAllOk;
% If we have some invalid points, let's set them to NaN
if ~sizeAllOk
interpData = nan(1,length(X),'like',X);
% Prevent invalid data indexes
fX = fX(sizeOkFlag);
fY = fY(sizeOkFlag);
X = X(sizeOkFlag);
Y = Y(sizeOkFlag);
end
pX = X - fX;
pY = Y - fY;
% We need to get the linear indexes
% www.mathworks.com/company/newsletters/articles/matrix-indexing-in-matlab.html
%ids00 = sub2ind(dataSize, fY, fX);
% A fast replacement for sub2ind
% http://tipstrickshowtos.blogspot.com.br/2010/02/fast-replacement-for-sub2ind.html
%idsYX
ids00 = fY + (fX-1).*sizeY;
ids01 = fY + (fX).*sizeY;
ids10 = fY+1 + (fX-1).*sizeY;
ids11 = fY+1 + fX.*sizeY;
if sizeAllOk
interpData = ((1-pX).*data(ids00) + pX.*data(ids01)).*(1-pY) + ...
((1-pX).*data(ids10) + pX.*data(ids11)).*pY;
else
interpData(sizeOkFlag) = ((1-pX).*data(ids00) + pX.*data(ids01)).*(1-pY) + ...
((1-pX).*data(ids10) + pX.*data(ids11)).*pY;
if (size(X,1) > 1)
interpData = interpData';
end
end
Nearest version (just to compare):
function [interpData] = Interp2DNearest(data,X,Y)
[sizeY, sizeX] = size(data);
X = round(X);
Y = round(Y);
sizeOkFlag = ~((sizeY-1 < Y)|(sizeX-1 < X)| X <= 0 | Y <= 0);
% Slower if using this:
%sizeAllOk = min(sizeOkFlag) >= 1;
% if sizeAllOk
% ids = Y + (X-1).*sizeY;
% interpData = data(ids);
% else
interpData = nan(1,length(X),'like',X);
X = X(sizeOkFlag);
Y = Y(sizeOkFlag);
ids = Y + (X-1).*sizeY;
interpData(sizeOkFlag) = data(ids);
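As an extra idea (my own suggestion, not benchmarked in the tester above): when many queries are made against the same image, MATLAB's griddedInterpolant object can be set up once and then called repeatedly, avoiding the per-call input validation that makes interp2 slow. A minimal sketch, assuming the same data, X and Y as in the tester:
% Build the interpolant once; 'none' returns NaN for out-of-bounds queries,
% mimicking the NaN behaviour of Interp2D above.
F = griddedInterpolant(data, 'linear', 'none');
interpData = F(Y, X); % ndgrid convention: rows (Y) first, columns (X) second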

Related

Steepest Descent using Armijo rule

I want to determine the steepest descent of the Rosenbrock function using the Armijo step length, where x = [-1.2, 1]' is the initial column vector.
The problem is that the code runs for a very long time. I think an infinite loop is created here, but I could not figure out where the problem is.
Could anyone help me?
n=input('enter the number of variables n ');
% Armijo stepsize rule parameters
x = [-1.2 1]';
s = 10;
m = 0;
sigma = .1;
beta = .5;
obj=func(x);
g=grad(x);
k_max = 10^5;
k=0; % k = # iterations
nf=1; % nf = # function eval.
x_new = zeros([],1); % empty vector to be filled (length not known in advance)
[X,Y]=meshgrid(-2:0.5:2);
fx = 100*(X.^2 - Y).^2 + (X-1).^2;
contour(X, Y, fx, 20)
while (norm(g)>10^(-3)) && (k<k_max)
d = -g./abs(g); % steepest descent direction
s = 1;
newobj = func(x + beta.^m*s*d);
m = m+1;
if obj > newobj - (sigma*beta.^m*s*g'*d)
t = beta^m *s;
x = x + t*d;
m_new = m;
newobj = func(x + t*d);
nf = nf+1;
else
m = m+1;
end
obj=newobj;
g=grad(x);
k = k + 1;
x_new = [x_new, x];
end
% Output x and k
x_new, k, nf
fprintf('Optimal Solution x = [%f, %f]\n', x(1), x(2))
plot(x_new)
function y = func(x)
y = 100*(x(1)^2 - x(2))^2 + (x(1)-1)^2;
end
function y = grad(x)
y(1) = 100*(2*(x(1)^2-x(2))*2*x(1)) + 2*(x(1)-1);
end
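One observation (my guess at the cause, not verified): grad only fills y(1), so the loop never sees the second component of the gradient. For the Rosenbrock-type objective defined in func, the full gradient would be:
function y = grad(x)
y = zeros(2,1);
y(1) = 100*(2*(x(1)^2 - x(2))*2*x(1)) + 2*(x(1)-1); % df/dx(1)
y(2) = -200*(x(1)^2 - x(2));                        % df/dx(2)
end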

How to run the k-means algorithm on SIFT keypoints in MATLAB

I need to run the k-means algorithm on the keypoints produced by the SIFT algorithm in MATLAB. I want to cluster the keypoints in the image, but I do not know how to do it.
First, put the keypoints into X, with the x coordinates in the first column and the y coordinates in the second column, like this:
X = [reshape(keypxcoord,numel(keypxcoord),1), reshape(keypycoord,numel(keypycoord),1)];
Then, if you have the Statistics Toolbox, you can just use the built-in kmeans function like this:
output = kmeans(X,num_clusters)
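For instance, a minimal sketch (keypxcoord and keypycoord are assumed to hold the keypoint coordinates; the cluster count is an arbitrary example value):
X = [keypxcoord(:), keypycoord(:)]; % one keypoint per row
num_clusters = 5;                   % hypothetical value
[idx, C] = kmeans(X, num_clusters); % idx = cluster label per keypoint
gscatter(X(:,1), X(:,2), idx)       % keypoints coloured by cluster
hold on
plot(C(:,1), C(:,2), 'kx', 'MarkerSize', 12) % cluster centroids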
Otherwise, write your own kmeans function:
function [ min_group, mu ] = mykmeans( X,K )
%MYKMEANS
% X = N observations of D-element vectors (one observation per column)
% K = number of centroids
assert(K > 0);
D = size(X,1); %No. of r.v.
N = size(X,2); %No. of observations
group_size = zeros(1,K);
min_group = zeros(1,N);
step = 0;
%% init centroids
mu = kpp(X,K);
%% 2-phase iterative approach (local then global)
while step < 400
%% phase 1, batch update
old_group = min_group;
% computing distances
d2 = distances2(X,mu);
% reassignment all points to closest centroids
[~, min_group] = min(d2,[],1);
% recomputing centroids (K number of means)
for k = 1 : K
group_size(k) = sum(min_group==k);
% check empty group
%if group_size(k) == 0
assert(group_size(k)>0);
%else
mu(:,k) = sum(X(:,min_group==k),2)/group_size(k);
%end
end
changed = sum(min_group ~= old_group);
p1_converged = changed <= N*0.001;
%% phase 2, individual update
changed = 0;
for n = 1 : N
d2 = distances2(X(:,n),mu);
[~, new_group] = min(d2,[],1);
% recomputing centroids of affected groups
k = min_group(n);
if (new_group ~= k)
mu(:,k)=(mu(:,k)*group_size(k)-X(:,n));
group_size(k) = group_size(k) - 1;
mu(:,k)=mu(:,k)/group_size(k);
mu(:,new_group) = mu(:,new_group)*group_size(new_group)+ X(:,n);
group_size(new_group) = group_size(new_group) + 1;
mu(:,new_group)=mu(:,new_group)/group_size(new_group);
min_group(n) = new_group;
changed = changed + 1;
end
end
%% check convergence
if p1_converged && changed <= N*0.001
break;
else
step = step + 1;
end
end
end
function d2 = distances2(X, mu)
K = size(mu,2);
N = size(X,2);
d2 = zeros(K,N);
for j = 1 : K
d2(j,:) = sum((X - repmat(mu(:,j),1,N)).^2,1);
end
end
function mu = kpp( X,K )
% kmeans++ init
D = size(X,1); %No. of r.v.
N = size(X,2); %No. of observations
mu = zeros(D, K);
mu(:,1) = X(:,round(rand(1) * (size(X, 2)-1)+1));
for k = 2 : K
% computing distances between centroids and observations
d2 = distances2(X, mu(:,1:k-1));
% assignment
[min_dist, ~] = min(d2,[],1);
% select new centroids by selecting point with the cumulative dist
% value (distance) larger than random value (falls in range between
% dist(n-1) : dist(n), dist(0)= 0)
rv = sum(min_dist) * rand(1);
for n = 1 : N
if min_dist(n) >= rv
mu(:,k) = X(:,n);
break;
else
rv = rv - min_dist(n);
end
end
end
end
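Note that, unlike the built-in kmeans, mykmeans expects one observation per column, so the X built above has to be transposed (a usage sketch):
[min_group, mu] = mykmeans(X', num_clusters); % X' is D x N (here 2 x N)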

All my weights for gradient descent become 0 on feature expansion

I have 2 features, which I expand to contain all possible combinations of the two features up to order 6. When I run MATLAB's fminunc, it returns a weight vector where all elements are 0.
The dataset is here
clear all;
clc;
data = load("P2-data1.txt");
m = length(data);
para = 0; % regularization parameter
%% Augment Feature
y = data(:,3);
new_data = newfeature(data(:,1), data(:,2), 3);
[~, n] = size(new_data);
betas1 = zeros(n,1); % initial weights
options = optimset('GradObj', 'on', 'MaxIter', 400);
[beta_new, cost] = fminunc(@(t)(regucostfunction(t, new_data, y, para)), betas1, options);
fprintf('Cost at theta found by fminunc: %f\n', cost);
fprintf('theta: \n');
fprintf(' %f \n', beta_new); % get all 0 here
% Compute accuracy on our training set
p_new = predict(beta_new, new_data);
fprintf('Train Accuracy after feature augmentation: %f\n', mean(double(p_new == y)) * 100);
fprintf('\n');
%% the functions are defined below
function g = sigmoid(z) % running properly
g = zeros(size(z));
g=ones(size(z))./(ones(size(z))+exp(-z));
end
function [J,grad] = regucostfunction(theta,x,y,para) % CalculateCost(x1,betas1,y);
m = length(y); % number of training examples
J = 0;
grad = zeros(size(theta));
hyp = sigmoid(x*theta);
err = (hyp - y)';
grad = (1/m)*(err)*x;
sum = 0;
for k = 2:length(theta)
sum = sum+theta(k)^2;
end
J = (1/m)*((-y' * log(hyp) - (1 - y)' * log(1 - hyp)) + para*(sum) );
end
function p = predict(theta, X)
m = size(X, 1); % Number of training examples
p = zeros(m, 1);
index = find(sigmoid(theta'*X') >= 0.5);
p(index,1) = 1;
end
function out = newfeature(X1, X2, degree)
out = ones(size(X1(:,1)));
for i = 1:degree
for j = 0:i
out(:, end+1) = (X1.^(i-j)).*(X2.^j);
end
end
end
data contains two feature columns followed by a third column of 0/1 labels.
Of the functions used, newfeature returns the expanded features and regucostfunction computes the cost. When I tried the same approach with the original features it worked, so I think the problem here is some coding issue.
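For reference, this is what newfeature produces for a small degree (this follows directly from the loops in newfeature; the numbers are only an illustration):
out = newfeature([2;3], [5;7], 2)
% columns are [1, x1, x2, x1.^2, x1.*x2, x2.^2]:
% out =
%      1     2     5     4    10    25
%      1     3     7     9    21    49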

CUDA loop on matlab

I have been playing around with parallelization using both OpenACC and OpenMP in Fortran. I am now trying to do the same in MATLAB. I find it very interesting that it seems to be very hard to parallelize a loop over the GPU in MATLAB. Apparently the only way to do it is by using the arrayfun function, but I might be wrong.
At a conceptual level, I am wondering why GPU usage in MATLAB is not more straightforward than in Fortran. At a more practical level, I am wondering how to use the GPU in the simple code below.
Below, I am sharing three codes and benchmarks:
Fortran OpenMP code
Fortran ACC code
Matlab parfor code
Matlab CUDA (?) - this is the one I don't know how to do.
Fortran OpenMP:
program rbc
use omp_lib ! For timing
use tools
implicit none
real, parameter :: beta = 0.984, eta = 2, alpha = 0.35, delta = 0.01, &
rho = 0.95, sigma = 0.005, zmin=-0.0480384, zmax=0.0480384;
integer, parameter :: nz = 4, nk=4800;
real :: zgrid(nz), kgrid(nk), t_tran_z(nz,nz), tran_z(nz,nz);
real :: kmax, kmin, tol, dif, c(nk), r(nk), w(nk);
real, dimension(nk,nz) :: v=0., v0=0., ev=0., c0=0.;
integer :: i, iz, ik, cnt;
logical :: ind(nk);
real(kind=8) :: start, finish ! For timing
real :: tmpmax, c1
call omp_set_num_threads(12)
!Grid for productivity z
! [1 x 4] grid of values for z
call linspace(zmin,zmax,nz,zgrid)
zgrid = exp(zgrid)
! [4 x 4] Markov transition matrix of z
tran_z(1,1) = 0.996757
tran_z(1,2) = 0.00324265
tran_z(1,3) = 0
tran_z(1,4) = 0
tran_z(2,1) = 0.000385933
tran_z(2,2) = 0.998441
tran_z(2,3) = 0.00117336
tran_z(2,4) = 0
tran_z(3,1) = 0
tran_z(3,2) = 0.00117336
tran_z(3,3) = 0.998441
tran_z(3,4) = 0.000385933
tran_z(4,1) = 0
tran_z(4,2) = 0
tran_z(4,3) = 0.00324265
tran_z(4,4) = 0.996757
! Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)**(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)**(1/(alpha-1));
! [1 x 4800] grid of possible values of k
call linspace(kmin, kmax, nk, kgrid)
! Compute initial wealth c0(k,z)
do iz=1,nz
c0(:,iz) = zgrid(iz)*kgrid**alpha + (1-delta)*kgrid;
end do
dif = 10000
tol = 1e-8
cnt = 1
do while(dif>tol)
!$omp parallel do default(shared) private(ik,iz,i,tmpmax,c1)
do ik=1,nk;
do iz = 1,nz;
tmpmax = -huge(0.)
do i = 1,nk
c1 = c0(ik,iz) - kgrid(i)
if(c1<0) exit
c1 = c1**(1-eta)/(1-eta)+ev(i,iz)
if(tmpmax<c1) tmpmax = c1
end do
v(ik,iz) = tmpmax
end do
end do
!$omp end parallel do
ev = beta*matmul(v,tran_z)
dif = maxval(abs(v-v0))
v0 = v
if(mod(cnt,1)==0) write(*,*) cnt, ':', dif
cnt = cnt+1
end do
end program
Fortran ACC:
Just replace the mainloop syntax on the above code with:
do while(dif>tol)
!$acc kernels
!$acc loop gang
do ik=1,nk;
!$acc loop gang
do iz = 1,nz;
tmpmax = -huge(0.)
do i = 1,nk
c1 = c0(ik,iz) - kgrid(i)
if(c1<0) exit
c1 = c1**(1-eta)/(1-eta)+ev(i,iz)
if(tmpmax<c1) tmpmax = c1
end do
v(ik,iz) = tmpmax
end do
end do
!$acc end kernels
ev = beta*matmul(v,tran_z)
dif = maxval(abs(v-v0))
v0 = v
if(mod(cnt,1)==0) write(*,*) cnt, ':', dif
cnt = cnt+1
end do
Matlab parfor:
(I know the code below could be made faster by using vectorized syntax, but the whole point of the exercise is to compare loop speeds).
tic;
beta = 0.984;
eta = 2;
alpha = 0.35;
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;
v=zeros(nk,nz);
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);
%Grid for productivity z
%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;
% Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));
% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);
% Compute initial wealth c0(k,z)
for iz=1:nz
c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end
dif = 10000;
tol = 1e-8;
cnt = 1;
while dif>tol
parfor ik=1:nk
for iz = 1:nz
tmpmax = -intmax;
for i = 1:nk
c1 = c0(ik,iz) - kgrid(i);
if (c1<0)
continue
end
c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
if tmpmax<c1
tmpmax = c1;
end
end
v(ik,iz) = tmpmax;
end
end
ev = beta*v*tran_z;
dif = max(max(abs(v-v0)));
v0 = v;
if mod(cnt,1)==0
fprintf('%1.5f : %1.5f \n', [cnt dif])
end
cnt = cnt+1;
end
toc
Matlab CUDA:
This is what I have no clue how to code. Is using arrayfun the only way of doing this? In Fortran it is so simple to move from OpenMP to OpenACC. Isn't there an easy way in MATLAB to go from parfor to GPU loops?
The time comparison between codes:
Fortran OpenMP: 83.1 seconds
Fortran ACC: 2.4 seconds
Matlab parfor: 1182 seconds
As a final remark, I should say that the codes above solve a simple Real Business Cycle model and were written based on this.
Matlab Coder
First, as Dev-iL already mentioned, you can use GPU Coder.
It (I use R2019a) would only require minor changes in your code:
function cdapted()
beta = 0.984;
eta = 2;
alpha = 0.35;
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;
v=zeros(nk,nz);
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);
%Grid for productivity z
%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z = zeros([4,4]);
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;
% Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));
% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);
% Compute initial wealth c0(k,z)
for iz=1:nz
c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end
dif = 10000;
tol = 1e-8;
cnt = 1;
while dif>tol
for ik=1:nk
for iz = 1:nz
tmpmax = double(intmin);
for i = 1:nk
c1 = c0(ik,iz) - kgrid(i);
if (c1<0)
continue
end
c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
if tmpmax<c1
tmpmax = c1;
end
end
v(ik,iz) = tmpmax;
end
end
ev = beta*v*tran_z;
dif = max(max(abs(v-v0)));
v0 = v;
% I've commented out fprintf because double2single cannot handle it
% (could be manually uncommented in the converted version if needed)
% ------------
% if mod(cnt,1)==0
% fprintf('%1.5f : %1.5f \n', cnt, dif);
% end
cnt = cnt+1;
end
end
The script to build this is:
% unload mex files
clear mex
%% Build for gpu, float64
% Produces ".\codegen\mex\cdapted" folder and "cdapted_mex.mexw64"
cfg = coder.gpuConfig('mex');
codegen -config cfg cdapted
% benchmark it (~7.14s on my GTX1080Ti)
timeit(@() cdapted_mex, 0)
%% Build for gpu, float32:
% Produces ".\codegen\cdapted\single" folder
scfg = coder.config('single');
codegen -double2single scfg cdapted
% Produces ".\codegen\mex\cdapted_single" folder and "cdapted_single_mex.mexw64"
cfg = coder.gpuConfig('mex');
codegen -config cfg .\codegen\cdapted\single\cdapted_single.m
% benchmark it (~2.09s on my GTX1080Ti)
timeit(@() cdapted_single_mex, 0)
So, if your Fortran binary uses float32 precision (I suspect it does), this Matlab Coder result is on par with it. That does not mean that both are highly efficient, though. The code generated by Matlab Coder is still far from efficient, and it does not fully utilize the GPU (even the TDP is at ~50%).
Vectorization and gpuArray
Next, I agree with user10597469 and Nicky Mattsson that your Matlab code does not look like normal "native" vectorized Matlab code.
There are many things to adjust (but arrayfun is hardly better than for). First, let's remove the for loops:
function vertorized1()
t_tot = tic();
beta = 0.984;
eta = 2;
alpha = 0.35;
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;
v=zeros(nk,nz);
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);
%Grid for productivity z
%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z = zeros([4,4]);
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;
% Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));
% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);
% Compute initial wealth c0(k,z)
for iz=1:nz
c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end
dif = 10000;
tol = 0.4;
tol = 1e-8;
cnt = 1;
t_acc=zeros([1,2]);
while dif>tol
%% orig-noparfor:
t=tic();
for ik=1:nk
for iz = 1:nz
tmpmax = -intmax;
for i = 1:nk
c1 = c0(ik,iz) - kgrid(i);
if (c1<0)
continue
end
c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
if tmpmax<c1
tmpmax = c1;
end
end
v(ik,iz) = tmpmax;
end
end
t_acc(1) = t_acc(1) + toc(t);
%% better:
t=tic();
kgrid_ = reshape(kgrid,[1 1 numel(kgrid)]);
c1_ = c0 - kgrid_;
c1_x = c1_.^(1-eta)/(1-eta);
c2 = c1_x + reshape(ev', [1 nz nk]);
c2(c1_<0) = -Inf;
v_ = max(c2,[],3);
t_acc(2) = t_acc(2) + toc(t);
%% compare
assert(isequal(v_,v));
v=v_;
%% other
ev = beta*v*tran_z;
dif = max(max(abs(v-v0)));
v0 = v;
if mod(cnt,1)==0
fprintf('%1.5f : %1.5f \n', cnt, dif);
end
cnt = cnt+1;
end
disp(t_acc);
disp(toc(t_tot));
end
% toc result:
% tol = 0.4 -> 12 iterations :: t_acc = [ 17.7 9.8]
% tol = 1e-8 -> 1124 iterations :: t_acc = [1758.6 972.0]
%
% (all 1124 iterations) with commented-out orig :: t_tot = 931.7443
Now it is strikingly evident that the most computationally intensive calculations inside the while loop (e.g. ^(1-eta)/(1-eta)) actually produce constants that could be pre-calculated. Once we fix that, the result is already a bit faster than the original parfor-based version (on my 2x E5-2630v3):
function vertorized2()
t_tot = tic();
beta = 0.984;
eta = 2;
alpha = 0.35;
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;
v=zeros(nk,nz);
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);
%Grid for productivity z
%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z = zeros([4,4]);
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;
% Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));
% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);
% Compute initial wealth c0(k,z)
for iz=1:nz
c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end
dif = 10000;
tol = 0.4;
tol = 1e-8;
cnt = 1;
t_acc=zeros([1,2]);
%% constants:
kgrid_ = reshape(kgrid,[1 1 numel(kgrid)]);
c1_ = c0 - kgrid_;
mask=zeros(size(c1_));
mask(c1_<0)=-Inf;
c1_x = c1_.^(1-eta)/(1-eta);
while dif>tol
%% orig:
t=tic();
parfor ik=1:nk
for iz = 1:nz
tmpmax = -intmax;
for i = 1:nk
c1 = c0(ik,iz) - kgrid(i);
if (c1<0)
continue
end
c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
if tmpmax<c1
tmpmax = c1;
end
end
v(ik,iz) = tmpmax;
end
end
t_acc(1) = t_acc(1) + toc(t);
%% better:
t=tic();
c2 = c1_x + reshape(ev', [1 nz nk]);
c2 = c2 + mask;
v_ = max(c2,[],3);
t_acc(2) = t_acc(2) + toc(t);
%% compare
assert(isequal(v_,v));
v=v_;
%% other
ev = beta*v*tran_z;
dif = max(max(abs(v-v0)));
v0 = v;
if mod(cnt,1)==0
fprintf('%1.5f : %1.5f \n', cnt, dif);
end
cnt = cnt+1;
end
disp(t_acc);
disp(toc(t_tot));
end
% toc result:
% tol = 0.4 -> 12 iterations :: t_acc = [ 2.4 1.7]
% tol = 1e-8 -> 1124 iterations :: t_acc = [188.3 115.9]
%
% (all 1124 iterations) with commented-out orig :: t_tot = 117.6217
This vectorized code is still inefficient (e.g. reshape(ev',...), which eats ~60% of the time, could easily be avoided by re-ordering dimensions; see the sketch after the next listing), but it is somewhat suitable for gpuArray():
function vectorized3g()
t0 = tic();
beta = 0.984;
eta = 2;
alpha = 0.35;
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;
v=zeros(nk,nz);
v0=zeros(nk,nz);
ev=gpuArray(zeros(nk,nz,'single'));
c0=zeros(nk,nz);
%Grid for productivity z
%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z = zeros([4,4]);
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;
% Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));
% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);
% Compute initial wealth c0(k,z)
for iz=1:nz
c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end
dif = 10000;
tol = 1e-8;
cnt = 1;
t_acc=zeros([1,2]);
%% constants:
kgrid_ = reshape(kgrid,[1 1 numel(kgrid)]);
c1_ = c0 - kgrid_;
mask=gpuArray(zeros(size(c1_),'single'));
mask(c1_<0)=-Inf;
c1_x = c1_.^(1-eta)/(1-eta);
c1_x = gpuArray(single(c1_x));
while dif>tol
%% orig:
% t=tic();
% parfor ik=1:nk
% for iz = 1:nz
% tmpmax = -intmax;
%
% for i = 1:nk
% c1 = c0(ik,iz) - kgrid(i);
% if (c1<0)
% continue
% end
% c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
% if tmpmax<c1
% tmpmax = c1;
% end
% end
% v(ik,iz) = tmpmax;
% end
%
% end
% t_acc(1) = t_acc(1) + toc(t);
%% better:
t=tic();
c2 = c1_x + reshape(ev', [1 nz nk]);
c2 = c2 + mask;
v_ = max(c2,[],3);
t_acc(2) = t_acc(2) + toc(t);
%% compare
% assert(isequal(v_,v));
v = v_;
%% other
ev = beta*v*tran_z;
dif = max(max(abs(v-v0)));
v0 = v;
if mod(cnt,1)==0
fprintf('%1.5f : %1.5f \n', cnt, dif);
end
cnt = cnt+1;
end
disp(t_acc);
disp(toc(t0));
end
% (all 849 iterations) with commented-out orig :: t_tot = 14.9040
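As a sketch of the dimension re-ordering mentioned above (my assumption, reusing the variable names from vectorized3g): if the constant arrays are permuted once so that the i index comes second, ev can be fed in with a copy-free reshape instead of the costly ev' transpose:
% once, outside the while loop:
c1_x_p = permute(c1_x, [1 3 2]); % nk x nk x nz, dimensions (ik, i, iz)
mask_p = permute(mask, [1 3 2]);
% inside the loop:
c2 = c1_x_p + reshape(ev, [1 nk nz]) + mask_p; % broadcast over (i, iz)
v_ = squeeze(max(c2, [], 2));                  % max over i -> nk x nz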
This ~15 s result is ~7x worse than the ~2 s we get from Matlab Coder, but this option requires fewer toolboxes. In practice, gpuArray is most handy when you start from writing "native Matlab code", including interactive use.
Finally, if you build this final vectorized version with Matlab Coder (you would have to do some trivial adjustments), it won't be faster than the first one. It would be 2x-3x slower.
So, this bit is what is going to mess you up on this project. MATLAB stands for Matrix Laboratory; vectors and matrices are kind of its thing. The number 1 way to optimize anything in MATLAB is to vectorize it. For this reason, when using performance-enhancing tools like CUDA, MATLAB assumes that you are going to vectorize your inputs if possible. Given the primacy of vectorizing inputs in the MATLAB coding style, it is not a fair comparison to assess its performance using only loops. It would be like assessing the performance of C++ while refusing to use pointers. If you want to use CUDA with MATLAB, the main way to go about it is to vectorize your inputs and use gpuArray. Honestly, I haven't looked too hard at your code, but it looks like your inputs are already mostly vectorized. You may be able to get away with something as simple as gpuArray(1:nk) or kgrid = gpuArray(linspace(kmin, kmax, nk)).
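A minimal sketch of that suggestion (assuming the Parallel Computing Toolbox; variable names taken from the question):
kgrid = gpuArray(linspace(kmin, kmax, nk)); % grid lives on the GPU
ev = zeros(nk, nz, 'gpuArray');             % GPU-resident array
% ... the vectorized operations on kgrid and ev then run on the GPU ...
result = gather(ev);                        % copy back to host memory when done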

MATLAB: Not Enough Input Arguments

I've attempted to run this code multiple times and have had zero luck since I added the last for loop. Before the error, the vector k wouldn't update, so the vector L was the same number repeated.
I can't figure out why I am getting the 'Not enough input arguments' error when it was working fine beforehand.
Any help would be much appreciated!
% Set up parameters of the functions
omega = 2*pi/10; % 1/s
g = 9.81; % m/s^2
h = 20; % m
parms = [omega, g, h];
% Set up the root finding variables
etol = 1e-6; % convergence criteria
iter = 100; % maximum number of iterations
f = @my_fun; % function handle to my_fun
fp = @my_fprime; % function handle to my_fprime
k0 = kguess(parms); % initial guess for root
% Find the root
[k, error, n_iterations] = newtraph(f, fp, k0, etol, iter, parms);
% Get the wavelength
if n_iterations < iter
% Converged correctly
L = 2 * pi / k;
else
% Did not converge
disp('ERROR: Maximum number of iterations exceeded')
return
end
wave = load('wavedata.dat');
dt = 0.04; %s
%dh = 0.234; %water depth in meters
wave = wave*.01; % converts from centimeters to meters
nw = wave([926:25501],1);
a = length(nw);
t = 0;
spot = 1;
points = zeros(1,100);
for i = 1:a-1
t=t+dt;
if nw(i) < 0
if nw(i+1) > 0
points(spot)=t;
spot=spot+1;
t=0;
end
end
end
omega = 2*pi./points; %w
l = length(points);
L = zeros(1,509);
k = zeros(1,509);
for j = 1:l
g = 9.81; % m/s^2
h = 0.234; % m
parms = [omega(j), g, h];
% Set up the root finding variables
etol = 1e-6; % convergence criteria
iter = 100; % maximum number of iterations
f = @my_fun; % function handle to my_fun
fp = @my_fprime; % function handle to my_fprime
k0(j) = kguess(parms); % initial guess for root
% Find the root
[k(j), error, n_iterations] = newtraph(f, fp, k0(j), etol, iter, parms);
% Get the wavelength
if n_iterations < iter
% Converged correctly
L(j) = 2 * pi / k(j);
else
% Did not converge
disp('ERROR: Maximum number of iterations exceeded')
return
end
end
function [ f ] = my_fun(k,parms)
%MY_FUN creates a function handle for linear dispersion
% Detailed explanation goes here
w = parms(1) ;
g = parms(2);
h = parms(3);
f = g*k*tanh(k*h)-(w^2);
end
function [ fp ] = my_fprime(k,parms)
%MY_FPRIME creates a function handle for first derivative of linear
% dispersion.
g = parms(2);
h = parms(3);
% w = 2*pi/10; % 1/s
% g = 9.81; % m/s^2
% h = 20; % m
fp = g*(k*h*((sech(k*h)).^2) + tanh(k*h));
end
function [ k, error, n_iterations ] = newtraph( f, fp, k0, etol, iterA, parms )
%NEWTRAPH Estimates the value of k using the newton raphson method.
if nargin<3,error('at least 3 input arguments required'),end
if nargin<4|isempty(etol),es=etol;end
if nargin<5|isempty(iterA),maxit=iterA;end
iter = 0;
k = k0;
%func = @f;
%dfunc = @fp;
while (1)
xrold = k;
k = k - f(k)/fp(k);
iter = iter + 1;
if k ~= 0, ea = abs((k - xrold)/k) * 100; end
if ea <= etol | iter >= iterA, break, end
end
error = ea;
n_iterations = iter;
end
In the function newtraph, at line 106 (the second line in the while(1) loop), you forgot to pass parms to the calls to f and fp:
k = k - f(k)/fp(k);
should become
k = k - f(k,parms)/fp(k,parms);