I have been playing around with parallelization both using ACC and OpenMP in Fortran. I am now trying to do the same in matlab. I find it very interesting that it seems to be very hard to paralelize a loop using GPUs in matlab. Apparently the only way to do it is to by using arrayfun function. But I might be wrong.
At a conceptual level, I am wondering why is the GPU usage in matlab not more straightforward than in fortran. At a more practical level, I am wondering how to use GPUs on the simple code below.
Below, I am sharing three codes and benchmarks:
Fortran OpenMP code
Fortran ACC code
Matlab parfor code
Matlab CUDA (?) this is the one I don't know how to do.
Fortran OpenMP:
program rbc
use omp_lib ! For timing
use tools
implicit none
real, parameter :: beta = 0.984, eta = 2, alpha = 0.35, delta = 0.01, &
rho = 0.95, sigma = 0.005, zmin=-0.0480384, zmax=0.0480384;
integer, parameter :: nz = 4, nk=4800;
real :: zgrid(nz), kgrid(nk), t_tran_z(nz,nz), tran_z(nz,nz);
real :: kmax, kmin, tol, dif, c(nk), r(nk), w(nk);
real, dimension(nk,nz) :: v=0., v0=0., ev=0., c0=0.;
integer :: i, iz, ik, cnt;
logical :: ind(nk);
real(kind=8) :: start, finish ! For timing
real :: tmpmax, c1
call omp_set_num_threads(12)
!Grid for productivity z
! [1 x 4] grid of values for z
call linspace(zmin,zmax,nz,zgrid)
zgrid = exp(zgrid)
! [4 x 4] Markov transition matrix of z
tran_z(1,1) = 0.996757
tran_z(1,2) = 0.00324265
tran_z(1,3) = 0
tran_z(1,4) = 0
tran_z(2,1) = 0.000385933
tran_z(2,2) = 0.998441
tran_z(2,3) = 0.00117336
tran_z(2,4) = 0
tran_z(3,1) = 0
tran_z(3,2) = 0.00117336
tran_z(3,3) = 0.998441
tran_z(3,4) = 0.000385933
tran_z(4,1) = 0
tran_z(4,2) = 0
tran_z(4,3) = 0.00324265
tran_z(4,4) = 0.996757
! Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)**(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)**(1/(alpha-1));
! [1 x 4800] grid of possible values of k
call linspace(kmin, kmax, nk, kgrid)
! Compute initial wealth c0(k,z)
do iz=1,nz
c0(:,iz) = zgrid(iz)*kgrid**alpha + (1-delta)*kgrid;
end do
dif = 10000
tol = 1e-8
cnt = 1
do while(dif>tol)
!$omp parallel do default(shared) private(ik,iz,i,tmpmax,c1)
do ik=1,nk;
do iz = 1,nz;
tmpmax = -huge(0.)
do i = 1,nk
c1 = c0(ik,iz) - kgrid(i)
if(c1<0) exit
c1 = c1**(1-eta)/(1-eta)+ev(i,iz)
if(tmpmax<c1) tmpmax = c1
end do
v(ik,iz) = tmpmax
end do
end do
!$omp end parallel do
ev = beta*matmul(v,tran_z)
dif = maxval(abs(v-v0))
v0 = v
if(mod(cnt,1)==0) write(*,*) cnt, ':', dif
cnt = cnt+1
end do
end program
Fortran ACC:
Just replace the mainloop syntax on the above code with:
do while(dif>tol)
!$acc kernels
!$acc loop gang
do ik=1,nk;
!$acc loop gang
do iz = 1,nz;
tmpmax = -huge(0.)
do i = 1,nk
c1 = c0(ik,iz) - kgrid(i)
if(c1<0) exit
c1 = c1**(1-eta)/(1-eta)+ev(i,iz)
if(tmpmax<c1) tmpmax = c1
end do
v(ik,iz) = tmpmax
end do
end do
!$acc end kernels
ev = beta*matmul(v,tran_z)
dif = maxval(abs(v-v0))
v0 = v
if(mod(cnt,1)==0) write(*,*) cnt, ':', dif
cnt = cnt+1
end do
Matlab parfor:
(I know the code below could be made faster by using vectorized syntax, but the whole point of the exercise is to compare loop speeds).
tic;
beta = 0.984;
eta = 2;
alpha = 0.35;
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;
v=zeros(nk,nz);
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);
%Grid for productivity z
%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;
% Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));
% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);
% Compute initial wealth c0(k,z)
for iz=1:nz
c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end
dif = 10000;
tol = 1e-8;
cnt = 1;
while dif>tol
parfor ik=1:nk
for iz = 1:nz
tmpmax = -intmax;
for i = 1:nk
c1 = c0(ik,iz) - kgrid(i);
if (c1<0)
continue
end
c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
if tmpmax<c1
tmpmax = c1;
end
end
v(ik,iz) = tmpmax;
end
end
ev = beta*v*tran_z;
dif = max(max(abs(v-v0)));
v0 = v;
if mod(cnt,1)==0
fprintf('%1.5f : %1.5f \n', [cnt dif])
end
cnt = cnt+1;
end
toc
Matlab CUDA:
This is what I have no clue how to code. Is using arrayfun the only way of doing this? In fortran is so simple to move from OpenMP to OpenACC. Isn't there an easy way in Matlab of going from parfor to GPUs loops?
The time comparison between codes:
Fortran OpenMP: 83.1 seconds
Fortran ACC: 2.4 seconds
Matlab parfor: 1182 seconds
Final remark, I should say the codes above solve a simple Real Business Cycle Model and were written based on this.
Matlab Coder
First, as Dev-iL already mentioned, you can use GPU coder.
It (I use R2019a) would only require minor changes in your code:
function cdapted()
beta = 0.984;
eta = 2;
alpha = 0.35;
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;
v=zeros(nk,nz);
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);
%Grid for productivity z
%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z = zeros([4,4]);
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;
% Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));
% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);
% Compute initial wealth c0(k,z)
for iz=1:nz
c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end
dif = 10000;
tol = 1e-8;
cnt = 1;
while dif>tol
for ik=1:nk
for iz = 1:nz
tmpmax = double(intmin);
for i = 1:nk
c1 = c0(ik,iz) - kgrid(i);
if (c1<0)
continue
end
c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
if tmpmax<c1
tmpmax = c1;
end
end
v(ik,iz) = tmpmax;
end
end
ev = beta*v*tran_z;
dif = max(max(abs(v-v0)));
v0 = v;
% I've commented out fprintf because double2single cannot handle it
% (could be manually uncommented in the converted version if needed)
% ------------
% if mod(cnt,1)==0
% fprintf('%1.5f : %1.5f \n', cnt, dif);
% end
cnt = cnt+1;
end
end
The script to build this is:
% unload mex files
clear mex
%% Build for gpu, float64
% Produces ".\codegen\mex\cdapted" folder and "cdapted_mex.mexw64"
cfg = coder.gpuConfig('mex');
codegen -config cfg cdapted
% benchmark it (~7.14s on my GTX1080Ti)
timeit(#() cdapted_mex,0)
%% Build for gpu, float32:
% Produces ".\codegen\cdapted\single" folder
scfg = coder.config('single');
codegen -double2single scfg cdapted
% Produces ".\codegen\mex\cdapted_single" folder and "cdapted_single_mex.mexw64"
cfg = coder.gpuConfig('mex');
codegen -config cfg .\codegen\cdapted\single\cdapted_single.m
% benchmark it (~2.09s on my GTX1080Ti)
timeit(#() cdapted_single_mex,0)
So, if your Fortran binary is using float32 precision (I suspect so), this Matlab Coder result is on par with it. That does not mean that both are highly efficient, though. The code, generated by Matlab Coder is still far from being efficient. And it does not fully utilize the GPU (even TDP is ~50%).
Vectorization and gpuArray
Next, I agree with user10597469 and Nicky Mattsson that your Matlab code does not look like normal "native" vectorized Matlab code.
There are many things to adjust. (But arrayfun is hardly better than for). Firstly, let's remove for loops:
function vertorized1()
t_tot = tic();
beta = 0.984;
eta = 2;
alpha = 0.35;
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;
v=zeros(nk,nz);
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);
%Grid for productivity z
%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z = zeros([4,4]);
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;
% Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));
% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);
% Compute initial wealth c0(k,z)
for iz=1:nz
c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end
dif = 10000;
tol = 0.4;
tol = 1e-8;
cnt = 1;
t_acc=zeros([1,2]);
while dif>tol
%% orig-noparfor:
t=tic();
for ik=1:nk
for iz = 1:nz
tmpmax = -intmax;
for i = 1:nk
c1 = c0(ik,iz) - kgrid(i);
if (c1<0)
continue
end
c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
if tmpmax<c1
tmpmax = c1;
end
end
v(ik,iz) = tmpmax;
end
end
t_acc(1) = t_acc(1) + toc(t);
%% better:
t=tic();
kgrid_ = reshape(kgrid,[1 1 numel(kgrid)]);
c1_ = c0 - kgrid_;
c1_x = c1_.^(1-eta)/(1-eta);
c2 = c1_x + reshape(ev', [1 nz nk]);
c2(c1_<0) = -Inf;
v_ = max(c2,[],3);
t_acc(2) = t_acc(2) + toc(t);
%% compare
assert(isequal(v_,v));
v=v_;
%% other
ev = beta*v*tran_z;
dif = max(max(abs(v-v0)));
v0 = v;
if mod(cnt,1)==0
fprintf('%1.5f : %1.5f \n', cnt, dif);
end
cnt = cnt+1;
end
disp(t_acc);
disp(toc(t_tot));
end
% toc result:
% tol = 0.4 -> 12 iterations :: t_acc = [ 17.7 9.8]
% tol = 1e-8 -> 1124 iterations :: t_acc = [1758.6 972.0]
%
% (all 1124 iterations) with commented-out orig :: t_tot = 931.7443
Now, it is strikingly evident that most computationally intense calculations inside the while loop (e.g. ^(1-eta)/(1-eta)) actually produce constants that could be pre-calculated. Once we fix that, the result would be already a bit faster than the original parfor-based version (on my 2xE5-2630v3):
function vertorized2()
t_tot = tic();
beta = 0.984;
eta = 2;
alpha = 0.35;
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;
v=zeros(nk,nz);
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);
%Grid for productivity z
%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z = zeros([4,4]);
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;
% Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));
% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);
% Compute initial wealth c0(k,z)
for iz=1:nz
c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end
dif = 10000;
tol = 0.4;
tol = 1e-8;
cnt = 1;
t_acc=zeros([1,2]);
%% constants:
kgrid_ = reshape(kgrid,[1 1 numel(kgrid)]);
c1_ = c0 - kgrid_;
mask=zeros(size(c1_));
mask(c1_<0)=-Inf;
c1_x = c1_.^(1-eta)/(1-eta);
while dif>tol
%% orig:
t=tic();
parfor ik=1:nk
for iz = 1:nz
tmpmax = -intmax;
for i = 1:nk
c1 = c0(ik,iz) - kgrid(i);
if (c1<0)
continue
end
c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
if tmpmax<c1
tmpmax = c1;
end
end
v(ik,iz) = tmpmax;
end
end
t_acc(1) = t_acc(1) + toc(t);
%% better:
t=tic();
c2 = c1_x + reshape(ev', [1 nz nk]);
c2 = c2 + mask;
v_ = max(c2,[],3);
t_acc(2) = t_acc(2) + toc(t);
%% compare
assert(isequal(v_,v));
v=v_;
%% other
ev = beta*v*tran_z;
dif = max(max(abs(v-v0)));
v0 = v;
if mod(cnt,1)==0
fprintf('%1.5f : %1.5f \n', cnt, dif);
end
cnt = cnt+1;
end
disp(t_acc);
disp(toc(t_tot));
end
% toc result:
% tol = 0.4 -> 12 iterations :: t_acc = [ 2.4 1.7]
% tol = 1e-8 -> 1124 iterations :: t_acc = [188.3 115.9]
%
% (all 1124 iterations) with commented-out orig :: t_tot = 117.6217
This vectorized code is still inefficient (e.g. reshape(ev',...), which eats ~60% of time, could be easily avoided by re-ordering dimensions), but it is somewhat suitable for gpuArray():
function vectorized3g()
t0 = tic();
beta = 0.984;
eta = 2;
alpha = 0.35;
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;
v=zeros(nk,nz);
v0=zeros(nk,nz);
ev=gpuArray(zeros(nk,nz,'single'));
c0=zeros(nk,nz);
%Grid for productivity z
%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z = zeros([4,4]);
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;
% Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));
% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);
% Compute initial wealth c0(k,z)
for iz=1:nz
c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end
dif = 10000;
tol = 1e-8;
cnt = 1;
t_acc=zeros([1,2]);
%% constants:
kgrid_ = reshape(kgrid,[1 1 numel(kgrid)]);
c1_ = c0 - kgrid_;
mask=gpuArray(zeros(size(c1_),'single'));
mask(c1_<0)=-Inf;
c1_x = c1_.^(1-eta)/(1-eta);
c1_x = gpuArray(single(c1_x));
while dif>tol
%% orig:
% t=tic();
% parfor ik=1:nk
% for iz = 1:nz
% tmpmax = -intmax;
%
% for i = 1:nk
% c1 = c0(ik,iz) - kgrid(i);
% if (c1<0)
% continue
% end
% c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
% if tmpmax<c1
% tmpmax = c1;
% end
% end
% v(ik,iz) = tmpmax;
% end
%
% end
% t_acc(1) = t_acc(1) + toc(t);
%% better:
t=tic();
c2 = c1_x + reshape(ev', [1 nz nk]);
c2 = c2 + mask;
v_ = max(c2,[],3);
t_acc(2) = t_acc(2) + toc(t);
%% compare
% assert(isequal(v_,v));
v = v_;
%% other
ev = beta*v*tran_z;
dif = max(max(abs(v-v0)));
v0 = v;
if mod(cnt,1)==0
fprintf('%1.5f : %1.5f \n', cnt, dif);
end
cnt = cnt+1;
end
disp(t_acc);
disp(toc(t0));
end
% (all 849 iterations) with commented-out orig :: t_tot = 14.9040
This ~15 sec result is ~7x worse than those (~2sec) we get from Matlab Coder. But this option requires fewer toolboxes. In practice, gpuArray is most handy when you start from writing "native Matlab code". Including interactive use.
Finally, if you build this final vectorized version with Matlab Coder (you would have to do some trivial adjustments), it won't be faster than the first one. It would be 2x-3x slower.
So, this bit is what is going to mess you up on this project. MATLAB stands for Matrix Laboratory. Vectors and matrices are kind of its thing. The number 1 way to optimize anything in MATLAB is to vectorize it. For this reason, when using performance enhancing tools like CUDA, MATLAB assumes that you are going to vectorize your inputs if possible. Given the primacy of vectorizing inputs in the MATLAB coding style, it is not a fair comparison to assess its performance using only loops. It would be like assessing the performance of C++ while refusing to use pointers. If you want to use CUDA with MATLAB, the main way to go about it is to vectorize your inputs and use gpuarray. Honestly, I haven't looked too hard at your code but it kind of looks like your inputs are already mostly vectorized. You may be able to get away with something as simple as gpuarray(1:nk) or kgrid=gpuarray(linspace(...).
I've implemented this error state kalman filter, which has IMU data as input (220Hz) and it corrects the prediction with UWB measurements (50 Hz).
I want to estimate the pose of a quadrotor.
The code is:
function [x_hat,bound_p] = ESKF(d_meas,p_am,clockUWB,dt,u)
% u = IMU inputs
% d_meas = UWB measure
% p_am = anchor coordinate. There are 4 anchors that send the measurement one at a time cyclically
g = [0 0 9.81]';
am = u(1:3); % from acc
wm = u(4:6); % from gyro
persistent P Q R I dx_hat x_n p q v ba bw Fw
if isempty(P)
sig_an = 0.05; % [m/s^2]
sig_wn = 0.01; % [rad/s]
sig_wbn = 0.0001; % [rad/s*sqrt(s)]
sig_abn = 0.0001; % [m/(s^2*sqrt(s)]
sig_uwb = 0.0214; % [m]
Q_an = sig_an*sig_an*eye(3); % [m^2/s^2]
Q_wn = sig_wn*sig_wn*eye(3); % [rad^2]
Q_abn = sig_abn*sig_abn*eye(3); % [m^2/s^4]
Q_wbn = sig_wbn*sig_wbn*eye(3); % [rad^2/s^2]
clockUWB = 0;
dx_hat = zeros(15,1); % error state
x_n = [5 4 5 1 0 0 1 0 0 0 0 0 0 0 0 0]'; % initial state
P = eye(15,15);
Fw = [zeros(3,12);eye(12,12)];
Q = blkdiag(Q_an,Q_wn,Q_wbn,Q_abn);
R = sig_uwb*sig_uwb;
I = eye(length(dx_hat));
p = x_n(1:3);
v = x_n(4:6);
q = x_n(7:10);
bw = x_n(11:13);
ba = x_n(14:16);
end
%% NOMINAL STATE
if ((wm - bw) == [0 0 0]')
qw = [1 0 0 0]';
else
nw = norm(wm - bw);
qw = [cos(nw*dt/2); ((wm - bw)/nw)*sin(nw*dt/2)];
end
R_ui = quat2rotm(q');
% PREDICTION
p = p + v*dt + 1/2*(R_ui*(am - ba) + g)*dt*dt;
v = v + (R_ui*(am - ba) + g)*dt;
q = quatmultiply(q',qw')';
q = q/norm(q);
% bw = bw;
% ba = ba;
%% ERROR STATE
% delta_p = delta_p + delta_v*dt;
% delta_v = delta_v + (-R_ui*skew(am - ba)*delta_th - R_ui*delta_ba)*dt + an);
% delta_th = R_ui'*delta_th - delta_bw*dt + wn;
% delta_bw = delta_bw + wbn*ones(3,1);
% delta_ba = delta_ba + abn*ones(3,1);
F = [ eye(3), eye(3)*dt, zeros(3,3), zeros(3,3), zeros(3,3);
zeros(3,3), eye(3), -R_ui*skew(am-ba)*dt, zeros(3,3), -R_ui*dt;
zeros(3,3), zeros(3,3), R_ui', -eye(3)*dt, zeros(3,3);
zeros(3,3), zeros(3,3), zeros(3,3), eye(3), zeros(3,3);
zeros(3,3), zeros(3,3), zeros(3,3), zeros(3,3), eye(3)];
%PREDCTION
dx_hat = F*dx_hat;
P = F * P * F' + Fw * Q * Fw';
%% UPDATE
% Only when measures arrive
if (clockUWB == 1)
h = p - p_am;
d_am = norm(h);
H = [ (1/(2*d_am))*2*h'*eye(3), zeros(1,3), zeros(1,3), zeros(1,3), zeros(1,3)];
K = P * H' * inv(H * P * H' + R);
dx_hat = K * (d_meas - d_am);
delta_x = [dx_hat(1:6);[1,1/2*dx_hat(7:9)']';dx_hat(10:15)];
P = (I - K * H) * P * (I - K * H)' + K * R * K';
%% ERROR INJECTION
p = p + delta_x(1:3);
v = v + delta_x(4:6);
q = quatmultiply(q',delta_x(7:10)')';
q = q/norm(q);
bw = bw + delta_x(11:13);
ba = ba + delta_x(14:16);
sig_p = 3*[sqrt(P(1,1));sqrt(P(2,2));sqrt(P(2,2))];
bound_p = [p(1) + sig_p(1),p(1) - sig_p(1),p(2) + sig_p(2),p(2) - sig_p(2),p(3) + sig_p(3),p(3) - sig_p(3)];
x_hat = [p;v;q;bw;ba];
yaw_err = abs(dx_hat(9));
yaw_err_old = yaw_err;
%% RESET ERROR
G = blkdiag(eye(6),eye(3) - skew(1/2*dx_hat(7:9)),eye(6));
delta_x = zeros(16,1);
dx_hat = zeros(15,1);
P = G * P * G';
else
x_hat = [p;v;q;bw;ba];
delta_x = zeros(16,1);
sig_p = 3*[sqrt(P(1,1));sqrt(P(2,2));sqrt(P(2,2))];
bound_p = [p(1) + sig_p(1),p(1) - sig_p(1),p(2) + sig_p(2),p(2) - sig_p(2),p(3) + sig_p(3),p(3) - sig_p(3)];
end
end
It estimates position perfectly, and also velocity is nice.
The problem is that it estimates roll, pitch and yaw (which I derive from the quaternion thanks to quat2eul function) very badly and some bias are totally wrong.
Can someone tell me where the code is wrong?
Thanks
This is the main simulink model
In the green block there is the function of the filter.
In order to simulate a trajectory and UWB measurements in the UWB block there is this script:
function [xt,yt,zt,d_meas,p_am] = fcn(t,clock)
sig_uwb = 0.0214;
dn = normrnd(0,sig_uwb);
persistent k i
if isempty(k)
k = 0;
clock = 0;
i = 0;
end
x = 3*cos(1/3*t) + 3;
y = 3*sin(1/3*t) + 3;
z = 5;
a1 = [-4.12,-3.67,2.72]; % anchors coordinates
a2 = [2.45,-2.70,0.063];
a3 = [-2.43,3.07,0.075];
a4 = [3.65,2.42,2.62];
d1 = norm(a1 - [x,y,z]);
d2 = norm(a2 - [x,y,z]);
d3 = norm(a3 - [x,y,z]);
d4 = norm(a4 - [x,y,z]);
A = [a1,d1;a2,d2;a3,d3;a4,d4];
if (clock == 1)
k = k + 1;
i = mod(k,4) + 1;
d_meas = A(i,4) + dn;
p_am = A(i,(1:3))';
else
d_meas = 0;
p_am = zeros(3,1);
end
xt = x;
yt = y;
zt = z;
So the drone simulates a circular trajectory with radius equals to 3.
The IMU block contains just 2 vector:
am = [0 -1/3 -9.81] and wm = [0 0 1/3].
The bias should be constant and 0. I obtain instead values like 3 or 2 and non constant.
Roll and pitch aren't 0 as they should.
The text I am using to implement the ESKF are Quaternion kinematics for the error-state KF from pag. 52.
I would like to perform what follows:
where
The parameters shown in the latter figure can be obtained as follows:
%% Inizialization
time = 614.4; % Analysis Time
Uhub = 11;
HubHt = 90;
alpha = 0.14;
TI = 'A'; % Turbulent Intensity (A,B,C as in the IEC or Specific value)
N1 = 4096;
N2 = 32;
N3 = 32;
N = N1*N2*N3; % Total Number of Point
t = 0:(time/(N1-1)):time; % Sampled Time Vector
L1 = Uhub*time; % Box length along X
L2 = 150; % Box length along Y
L3 = 220; % Box length along Z
dx = L1/N1; % Grid Resolution along X-axis
dy = L2/N2; % Grid Resolution along Y-axis
dz = L3/N3; % Grid Resolution along Z-axis
V = L1*L2*L3; % Analysis Box Volume
gamma = 3.9; % Turbulent Eddies Distorsion Factor
c = 1.476;
b = 5.6;
if HubHt < 60
lambda1 = 0.7*HubHt;
else
lambda1 = 42;
end
L = 0.8*lambda1;
if isequal(TI,'A')
Iref = 0.16;
sigma1 = Iref*(0.75*Uhub + b);
elseif isequal(TI,'B')
Iref = 0.14;
sigma1 = Iref*(0.75*Uhub + b);
elseif isequal(TI,'C')
Iref = 0.12;
sigma1 = Iref*(0.75*Uhub + b);
else
sigma1 = str2num(TI)*Uhub/100;
end
sigma_iso = 0.55*sigma1;
sigma2 = 0.7*sigma1;
sigma3 = 0.5*sigma1;
%% Wave number vectors
ik1 = cat(2,(-N1/2:-1/2),(1/2:N1/2));
ik2 = -N2/2:N2/2-1;
ik3 = -N3/2:N3/2-1;
[x y z] = ndgrid(ik1,ik2,ik3);
k1 = reshape((2*pi*L/L1)*x,N1*N2*N3,1);
k2 = reshape((2*pi*L/L2)*y,N1*N2*N3,1);
k3 = reshape((2*pi*L/L3)*z,N1*N2*N3,1);
k = sqrt(k1.^2 + k2.^2 + k3.^2);
%% Calculation of beta by means of the Energy Spectrum Integration
E = #(k) (1.453*k.^4)./((1 + k.^2).^(17/6));
%//Independent integration on segments
Local_int = arrayfun(#(i)quad(E,i-1,i), 2:(N1*N2*N3));
%//integral additivity + cumulative removal of queues
E_int = 1.5 - [0 fliplr(cumsum(fliplr(Local_int)))]; %//To remove queues
E_int = reshape(E_int,N,1);
S = k.*sqrt(E_int);
beta = (c*gamma)./S;
%% Help Parameters
k30 = k3 + beta.*k1;
k0 = sqrt(k1.^2 + k2.^2 + k30.^2);
C1 = (beta.*k1.^2.*(k1.^2 + k2.^2 - k3.*k30))./(k.^2.*(k1.^2 + k2.^2));
C2 = (k2.*k0.^2./((k1.^2 + k2.^2).^(3/2))).*atan2((beta.*k1.*sqrt(k1.^2 + k2.^2)),(k0.^2 - k30.*k1.*beta));
xhsi1 = C1 - (k2./k1).*C2;
xhsi2 = (k2./k1).*C1 + C2;
E_k0 = (1.453*k0.^4)./((1 + k0.^2).^(17/6));
For instance, typing in
phi_33 = #(k2,k3) (E_k0./(4*pi.*k.^4)).*((k1.^2 + k2.^2));
F_33 = arrayfun(#(i) dblquad(phi_33,k3(i),k3(i+1),k2(i),k2(i+1)), 1:((N1*N2*N3)-1));
Matlab retrieves the following error msg:
Error using +
Matrix dimensions must agree.
Error in #(k2,k3)(E_k0./(4*pi.*k.^4)).*((k1.^2+k2.^2))
Do you have a clue how to overcome this issue?
I really look forward to hearing from you.
Best regards,
FPE
The error is easily explained:
First you define E_k0 then you try to call Ek0.
phi_11 = #(k1,k2,k3) (E_k0./4*pi.*kabs.^4).*(k0abs.^2 - k1.^2 - 2*k1.*k03.*xhsi1 + (k1.^2 + k2.^2).*xhsi1.^2);
I solved it this way:
Code a function for each of the PHI elements, such as (for PHI11)
function phi_11 = phi_11_new(k1,k2,k3,beta,i)
k = sqrt(k1(i).^2 + k2.^2 + k3.^2);
k30 = k3 + beta(i).*k1(i);
k0 = sqrt(k1(i).^2 + k2.^2 + k30.^2);
E_k0 = 1.453.*k0.^4./((1 + k0.^2).^(17/6));
C1 = (beta(i).k1(i).^2).(k1(i).^2 + k2.^2 - k3.k30)./(k.^2.(k1(i).^2 + k2.^2));
C2 = k2.*k0.^2./((k1(i).^2 + k2.^2).^(3/2)).*atan2((beta(i).*k1(i).*sqrt(k1(i).^2 + k2.^2)),(k0.^2 - k30.*k1(i).*beta(i)));
xhsi1 = C1 - k2./k1(i).*C2;
xhsi1_q = xhsi1.^2;
phi_11 = E_k0./(4.*pi.k0.^4).(k0.^2 - k1(i).^2 - 2.*k1(i).*k30.*xhsi1 + (k1(i).^2 + k2.^2).*xhsi1_q);
end
I recall this function within the main code as follows
for l = 1:numel(k1)
phi11 = #(k2,k3) phi_11(k1,k2,k3,l)
F11(l) = integral2(phi,-1000,1000,-1000,1000);
end
at it seems it works.
I have this piece of code:
time = 614.4;
Uhub = 11;
HubHt = 90;
TI = 'A';
N1 = 4096;
N2 = 32;
N3 = 32;
L1 = Uhub*time;
L2 = 150;
L3 = 220;
V = L1*L2*L3;
gamma = 3.9;
c = 1.476;
b = 5.6;
if HubHt < 60
lambda1 = 0.7*HubHt;
else
lambda1 = 42;
end
L = 0.8*lambda1;
if isequal(TI,'A')
Iref = 0.16;
sigma1 = Iref*(0.75*Uhub + b);
elseif isequal(TI,'B')
Iref = 0.14;
sigma1 = Iref*(0.75*Uhub + b);
elseif isequal(TI,'C')
Iref = 0.12;
sigma1 = Iref*(0.75*Uhub + b);
else
sigma1 = str2num(TI)*Uhub/100;
end
sigma_iso = 0.55*sigma1;
%% Wave number vectors
ik1 = cat(2,(-N1/2:-1/2),(1/2:N1/2));
ik2 = -N2/2:N2/2-1;
ik3 = -N3/2:N3/2-1;
[x y z] = ndgrid(ik1,ik2,ik3);
k1 = reshape((2*pi*L/L1)*x,N1*N2*N3,1);
k2 = reshape((2*pi*L/L2)*y,N1*N2*N3,1);
k3 = reshape((2*pi*L/L3)*z,N1*N2*N3,1);
k = sqrt(k1.^2 + k2.^2 + k3.^2);
Now I should calculate
where
The procedure to calculate the integral is
At the moment I'm using this loop
E = #(k) (1.453*k.^4)./((1 + k.^2).^(17/6));
E_int = zeros(1,N1*N2*N3);
E_int(1) = 1.5;
for i = 2:(N1*N2*N3)
E_int(i) = E_int(i) + quad(E,i-1,i);
end
neglecting for the k>400 approximation. I believe that my loop is not right.
How would you suggest to calculate the integral?
I thank you in advance.
WKR,
Francesco
This is a list of correction from the more obvious to the possibly more subtle. (Indeed I start from what you wrote in the final part going upwards).
From what you write:
E = #(k) (1.453*k.^4)./((1 + k.^2).^(17/6));
E_int = zeros(1,N1*N2*N3);
E_int(1) = 1.5;
for i = 2:(N1*N2*N3)
%//No point in doing this:
%//E_int(i) = E_int(i) + quad(E,i-1,i);
%//According to what you write, it should be:
E_int(i) = E_int(i-1) + quad(E,i-1,i);
end
You could speed the whole thing up by doing
%//Independent integration on segments
Local_int = arrayfun(#(i)quad(E,i-1,i), 2:(N1*N2*N3));
Local_int = [1.5 Local_int];
%//integral additivity
E_int = cumsum(Local_int);
Moreover, if the known condition (point 2.) really is "... ( = 1.5 if k' = 0)", then the whole implementation should really be more like
%//Independent integration on segments
Local_int = arrayfun(#(i)quad(E,i-1,i), 2:(N1*N2*N3));
%//integral additivity + cumulative removal of queues
E_int = 1.5 - [0 fliplr(cumsum(fliplr(Local_int)))]; %//To remove queues