I'm trying to write an image compression script in MATLAB using multilayer 3D DWT(color image). along the way, I want to apply thresholding on coefficient matrices, both global and local thresholds.
I like to use the formula below to calculate my local threshold:
where sigma is variance and N is the number of elements.
Global thresholding works fine; but my problem is that the calculated local threshold is (most often!) greater than the maximum band coefficient, therefore no thresholding is applied.
Everything else works fine and I get a result too, but I suspect the local threshold is miscalculated. Also, the resulting image is larger than the original!
I'd appreciate any help on the correct way to calculate the local threshold, or if there's a pre-set MATLAB function.
here's an example output:
here's my code:
clear;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%% COMPRESSION %%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% read base image
% dwt 3/5-L on base images
% quantize coeffs (local/global)
% count zero value-ed coeffs
% calculate mse/psnr
% save and show result
% read images
base = imread('circ.jpg');
fam = 'haar'; % wavelet family
lvl = 3; % wavelet depth
% set to 1 to apply global thr
thr_type = 0;
% global threshold value
gthr = 180;
% convert base to grayscale
%base = rgb2gray(base);
% apply dwt on base image
dc = wavedec3(base, lvl, fam);
% extract coeffs
ll_base = dc.dec{1};
lh_base = dc.dec{2};
hl_base = dc.dec{3};
hh_base = dc.dec{4};
ll_var = var(ll_base, 0);
lh_var = var(lh_base, 0);
hl_var = var(hl_base, 0);
hh_var = var(hh_base, 0);
% count number of elements
ll_n = numel(ll_base);
lh_n = numel(lh_base);
hl_n = numel(hl_base);
hh_n = numel(hh_base);
% find local threshold
ll_t = ll_var * (sqrt(2 * log2(ll_n)));
lh_t = lh_var * (sqrt(2 * log2(lh_n)));
hl_t = hl_var * (sqrt(2 * log2(hl_n)));
hh_t = hh_var * (sqrt(2 * log2(hh_n)));
% global
if thr_type == 1
ll_t = gthr; lh_t = gthr; hl_t = gthr; hh_t = gthr;
end
% count zero values in bands
ll_size = size(ll_base);
lh_size = size(lh_base);
hl_size = size(hl_base);
hh_size = size(hh_base);
% count zero values in new band matrices
ll_zeros = sum(ll_base==0,'all');
lh_zeros = sum(lh_base==0,'all');
hl_zeros = sum(hl_base==0,'all');
hh_zeros = sum(hh_base==0,'all');
% initiate new matrices
ll_new = zeros(ll_size);
lh_new = zeros(lh_size);
hl_new = zeros(lh_size);
hh_new = zeros(lh_size);
% apply thresholding on bands
% if new value < thr => 0
% otherwise, keep the previous value
for id=1:ll_size(1)
for idx=1:ll_size(2)
if ll_base(id,idx) < ll_t
ll_new(id,idx) = 0;
else
ll_new(id,idx) = ll_base(id,idx);
end
end
end
for id=1:lh_size(1)
for idx=1:lh_size(2)
if lh_base(id,idx) < lh_t
lh_new(id,idx) = 0;
else
lh_new(id,idx) = lh_base(id,idx);
end
end
end
for id=1:hl_size(1)
for idx=1:hl_size(2)
if hl_base(id,idx) < hl_t
hl_new(id,idx) = 0;
else
hl_new(id,idx) = hl_base(id,idx);
end
end
end
for id=1:hh_size(1)
for idx=1:hh_size(2)
if hh_base(id,idx) < hh_t
hh_new(id,idx) = 0;
else
hh_new(id,idx) = hh_base(id,idx);
end
end
end
% count zeros of the new matrices
ll_new_size = size(ll_new);
lh_new_size = size(lh_new);
hl_new_size = size(hl_new);
hh_new_size = size(hh_new);
% count number of zeros among new values
ll_new_zeros = sum(ll_new==0,'all');
lh_new_zeros = sum(lh_new==0,'all');
hl_new_zeros = sum(hl_new==0,'all');
hh_new_zeros = sum(hh_new==0,'all');
% set new band matrices
dc.dec{1} = ll_new;
dc.dec{2} = lh_new;
dc.dec{3} = hl_new;
dc.dec{4} = hh_new;
% count how many coeff. were thresholded
ll_zeros_diff = ll_new_zeros - ll_zeros;
lh_zeros_diff = lh_zeros - lh_new_zeros;
hl_zeros_diff = hl_zeros - hl_new_zeros;
hh_zeros_diff = hh_zeros - hh_new_zeros;
% show coeff. matrices vs. thresholded version
figure
colormap(gray);
subplot(2,4,1); imagesc(ll_base); title('LL');
subplot(2,4,2); imagesc(lh_base); title('LH');
subplot(2,4,3); imagesc(hl_base); title('HL');
subplot(2,4,4); imagesc(hh_base); title('HH');
subplot(2,4,5); imagesc(ll_new); title({'LL thr';ll_zeros_diff});
subplot(2,4,6); imagesc(lh_new); title({'LH thr';lh_zeros_diff});
subplot(2,4,7); imagesc(hl_new); title({'HL thr';hl_zeros_diff});
subplot(2,4,8); imagesc(hh_new); title({'HH thr';hh_zeros_diff});
% idwt to reconstruct compressed image
cmp = waverec3(dc);
cmp = uint8(cmp);
% calculate mse/psnr
D = abs(cmp - base) .^2;
mse = sum(D(:))/numel(base);
psnr = 10*log10(255*255/mse);
% show images and mse/psnr
figure
subplot(1,2,1);
imshow(base); title("Original"); axis square;
subplot(1,2,2);
imshow(cmp); colormap(gray); axis square;
msg = strcat("MSE: ", num2str(mse), " | PSNR: ", num2str(psnr));
title({"Compressed";msg});
% save image locally
imwrite(cmp, 'compressed.png');
I solved the question.
the sigma in the local threshold formula is not variance, it's the standard deviation. I applied these steps:
used stdfilt() std2() to find standard deviation of my coeff. matrices (thanks to #Rotem for pointing this out)
used numel() to count the number of elements in coeff. matrices
this is a summary of the process. it's the same for other bands (LH, HL, HH))
[c, s] = wavedec2(image, wname, level); %apply dwt
ll = appcoeff2(c, s, wname); %find LL
ll_std = std2(ll); %find standard deviation
ll_n = numel(ll); %find number of coeffs in LL
ll_t = ll_std * (sqrt(2 * log2(ll_n))); %local the formula
ll_new = ll .* double(ll > ll_t); %thresholding
replace the LL values in c in a for loop
reconstruct by applying IDWT using waverec2
this is a sample output:
I am very new to Scilab, but so far have not been able to find an answer (either here or via google) to my question. I'm sure it's a simple solution, but I'm at a loss. I have a lot of MATLAB scripts I wrote in grad school, but now that I'm out of school, I no longer have access to MATLAB (and can't justify the cost). Scilab looked like the best open alternative. I'm trying to convert my .m files to Scilab compatible versions using mfile2sci, but when running the mfile2sci GUI, I get the error/message shown below. Attached is the original code from the M-file, in case it's relevant.
I Searched Stack Overflow and companion sites, Google, Scilab documentation.
The M-file code follows (it's a super basic MATLAB script as part of an old homework question -- I chose it as it's the shortest, most straightforward M-file I had):
Mmax = 15;
N = 20;
T = 2000;
%define upper limit for sparsity of signal
smax = 15;
mNE = zeros(smax,Mmax);
mESR= zeros(smax,Mmax);
for M = 1:Mmax
aNormErr = zeros(smax,1);
aSz = zeros(smax,1);
ESR = zeros(smax,1);
for s=1:smax % for-loop to loop script smax times
normErr = zeros(1,T);
vESR = zeros(1,T);
sz = zeros(1,T);
for t=1:T %for-loop to carry out 2000 trials per s-value
esr = 0;
A = randn(M,N); % generate random MxN matrix
[M,N] = size(A);
An = zeros(M,N); % initialize normalized matrix
for h = 1:size(A,2) % normalize columns of matrix A
V = A(:,h)/norm(A(:,h));
An(:,h) = V;
end
A = An; % replace A with its column-normalized counterpart
c = randperm(N,s); % create random support vector with s entries
x = zeros(N,1); % initialize vector x
for i = 1:size(c,2)
val = (10-1)*rand + 1;% generate interval [1,10]
neg = mod(randi(10),2); % include [-10,-1]
if neg~=0
val = -1*val;
end
x(c(i)) = val; %replace c(i)th value of x with the nonzero value
end
y = A*x; % generate measurement vector (y)
R = y;
S = []; % initialize array to store selected columns of A
indx = []; % vector to store indices of selected columns
coeff = zeros(1,s); % vector to store coefficients of approx.
stop = 10; % init. stop condition
in = 0; % index variable
esr = 0;
xhat = zeros(N,1); % intialize estimated x signal
while (stop>0.5 && size(S,2)<smax)
%MAX = abs(A(:,1)'*R);
maxV = zeros(1,N);
for i = 1:size(A,2)
maxV(i) = abs(A(:,i)'*R);
end
in = find(maxV == max(maxV));
indx = [indx in];
S = [S A(:,in)];
coeff = [coeff R'*S(:,size(S,2))]; % update coefficient vector
for w=1:size(S,2)
r = y - ((R'*S(:,w))*S(:,w)); % update residuals
if norm(r)<norm(R)
index = w;
end
R = r;
stop = norm(R); % update stop condition
end
for j=1:size(S,2) % place coefficients into xhat at correct indices
xhat(indx(j))=coeff(j);
end
nE = norm(x-xhat)/norm(x); % calculate normalized error for this estimate
%esr = 0;
indx = sort(indx);
c = sort(c);
if isequal(indx,c)
esr = esr+1;
end
end
vESR(t) = esr;
sz(t) = size(S,2);
normErr(t) = nE;
end
%avsz = sum(sz)/T;
aSz(s) = sum(sz)/T;
%aESR = sum(vESR)/T;
ESR(s) = sum(vESR)/T;
%avnormErr = sum(normErr)/T; % produce average normalized error for these run
aNormErr(s) = sum(normErr)/T; % add new avnormErr to vector of all av norm errors
end
% just put this here to view the vector
mNE(:,M) = aNormErr;
mESR(:,M) = ESR;
% had an 'end' placed here, might've been unmatched
mNE%reshape(mNE,[],Mmax)
mESR%reshape(mESR,[],Mmax)]
figure
dimx = [1 Mmax];
dimy = [1 smax];
imagesc(dimx,dimy,mESR)
colormap gray
strESR = sprintf('Average ESR, N=%d',N);
title(strESR);
xlabel('M');
ylabel('s');
strNE = sprintf('Average Normed Error, N=%d',N);
figure
imagesc(dimx,dimy,mNE)
colormap gray
title(strNE)
xlabel('M');
ylabel('s');
The command used (and results) follow:
--> mfile2sci
ans =
[]
****** Beginning of mfile2sci() session ******
File to convert: C:/Users/User/Downloads/WTF_new.m
Result file path: C:/Users/User/DOWNLO~1/
Recursive mode: OFF
Only double values used in M-file: NO
Verbose mode: 3
Generate formatted code: NO
M-file reading...
M-file reading: Done
Syntax modification...
Syntax modification: Done
File contains no instruction, no translation made...
****** End of mfile2sci() session ******
To convert the foo.m file one has to enter
mfile2sci <path>/foo.m
where stands for the path of the directoty where foo.m is. The result is written in /foo.sci
Remove the ```` at the begining of each line, the conversion will proceed normally ?. However, don't expect to obtain a working .sci file as the m2sci converter is (to me) still an experimental tool !
I have the following data:
N = 10^3;
x = randn(N,1);
y = randn(N,1);
z = randn(N,1);
f = x.^2+y.^2+z.^2;
Now I want to split this continuous 3D space into nB bins.
nB = 20;
[~,~,x_bins] = histcounts(x,nB);
[~,~,y_bins] = histcounts(y,nB);
[~,~,z_bins] = histcounts(z,nB);
And put in each cube average f or nan if no observations happen in the cube:
F = nan(50,50,50);
for iX = 1:20
for iY = 1:20
for iZ = 1:20
idx = (x_bins==iX)&(y_bins==iY)&(z_bins==iZ);
F(iX,iY,iZ) = mean(f(idx));
end
end
end
isosurface(F,0.5)
This code does what I want. My problem is the speed. This code is extremely slow when N > 10^5 and nB = 100.
How can I speed up this code?
I also tried the accumarray() function:
subs=([x_bins,y_bins,z_bins]);
F2 = accumarray(subs,f,[],#mean);
all(F(:) == F2(:)) % false
However, this code produces a different result.
The problem with the code in the OP is that it tests all elements of the data for each element in the output array. The output array has nB^3 elements, the data has N elements, so the algorithm is O(N*nB^3). Instead, one can loop over the N elements of the input, and set the corresponding element in the output array, which is an operation O(N) (2nd code block below).
The accumarray solution in the OP needs to use the fillvals parameter, set it to NaN (3rd code block below).
To compare the results, one needs to explicitly test that both arrays have NaN in the same locations, and have equal non-NaN values elsewhere:
all( ( isnan(F(:)) & isnan(F2(:)) ) | ( F(:) == F2(:) ) )
% \-------same NaN values------/ \--same values--/
Here is code. All three versions produce identical results. Timings in Octave 4.4.1 (no JIT), in MATLAB the loop code should be faster. (Using input data from OP, with N=10^3 and nB=20).
%% OP's code, O(N*nB^3)
tic
F = nan(nB,nB,nB);
for iX = 1:nB
for iY = 1:nB
for iZ = 1:nB
idx = (x_bins==iX)&(y_bins==iY)&(z_bins==iZ);
F(iX,iY,iZ) = mean(f(idx));
end
end
end
toc
% Elapsed time is 1.61736 seconds.
%% Looping over input, O(N)
tic
s = zeros(nB,nB,nB);
c = zeros(nB,nB,nB);
ind = sub2ind([nB,nB,nB],x_bins,y_bins,z_bins);
for ii=1:N
s(ind(ii)) = s(ind(ii)) + f(ii);
c(ind(ii)) = c(ind(ii)) + 1;
end
F2 = s ./ c;
toc
% Elapsed time is 0.0606539 seconds.
%% Other alternative, using accumarray
tic
ind = sub2ind([nB,nB,nB],x_bins,y_bins,z_bins);
F3 = accumarray(ind,f,[nB,nB,nB],#mean,NaN);
toc
% Elapsed time is 0.14113 seconds.
I am trying to implement a simple pixel level center-surround image enhancement. Center-surround technique makes use of statistics between the center pixel of the window and the surrounding neighborhood as a means to decide what enhancement needs to be done. In the code given below I have compared the center pixel with average of the surrounding information and based on that I switch between two cases to enhance the contrast. The code that I have written is as follows:
im = normalize8(im,1); %to set the range of pixel from 0-255
s1 = floor(K1/2); %K1 is the size of the window for surround
M = 1000; %is a constant value
out1 = padarray(im,[s1,s1],'symmetric');
out1 = CE(out1,s1,M);
out = (out1(s1+1:end-s1,s1+1:end-s1));
out = normalize8(out,0); %to set the range of pixel from 0-1
function [out] = CE(out,s,M)
B = 255;
out1 = out;
for i = s+1 : size(out,1) - s
for j = s+1 : size(out,2) - s
temp = out(i-s:i+s,j-s:j+s);
Yij = out1(i,j);
Sij = (1/(2*s+1)^2)*sum(sum(temp));
if (Yij>=Sij)
Aij = A(Yij-Sij,M);
out1(i,j) = ((B + Aij)*Yij)/(Aij+Yij);
else
Aij = A(Sij-Yij,M);
out1(i,j) = (Aij*Yij)/(Aij+B-Yij);
end
end
end
out = out1;
function [Ax] = A(x,M)
if x == 0
Ax = M;
else
Ax = M/x;
end
The code does the following things:
1) Normalize the image to 0-255 range and pad it with additional elements to perform windowing operation.
2) Calls the function CE.
3) In the function CE obtain the windowed image(temp).
4) Find the average of the window (Sij).
5) Compare the center of the window (Yij) with the average value (Sij).
6) Based on the result of comparison perform one of the two enhancement operation.
7) Finally set the range back to 0-1.
I have to run this for multiple window size (K1,K2,K3, etc.) and the images are of size 1728*2034. When the window size is selected as 100, the time consumed is very high.
Can I use vectorization at some stage to reduce the time for loops?
The profiler result (for window size 21) is as follows:
The profiler result (for window size 100) is as follows:
I have changed the code of my function and have written it without the sub-function. The code is as follows:
function [out] = CE(out,s,M)
B = 255;
Aij = zeros(1,2);
out1 = out;
n_factor = (1/(2*s+1)^2);
for i = s+1 : size(out,1) - s
for j = s+1 : size(out,2) - s
temp = out(i-s:i+s,j-s:j+s);
Yij = out1(i,j);
Sij = n_factor*sum(sum(temp));
if Yij-Sij == 0
Aij(1) = M;
Aij(2) = M;
else
Aij(1) = M/(Yij-Sij);
Aij(2) = M/(Sij-Yij);
end
if (Yij>=Sij)
out1(i,j) = ((B + Aij(1))*Yij)/(Aij(1)+Yij);
else
out1(i,j) = (Aij(2)*Yij)/(Aij(2)+B-Yij);
end
end
end
out = out1;
There is a slight improvement in the speed from 93 sec to 88 sec. Suggestions for any other improvements to my code are welcomed.
I have tried to incorporate the suggestions given to replace sliding window with convolution and then vectorize the rest of it. The code below is my implementation and I'm not getting the result expected.
function [out_im] = CE_conv(im,s,M)
B = 255;
temp = ones(2*s,2*s);
temp = temp ./ numel(temp);
out1 = conv2(im,temp,'same');
out_im = im;
Aij = im-out1; %same as Yij-Sij
Aij1 = out1-im; %same as Sij-Yij
Mij = Aij;
Mij(Aij>0) = M./Aij(Aij>0); % if Yij>Sij Mij = M/Yij-Sij;
Mij(Aij<0) = M./Aij1(Aij<0); % if Yij<Sij Mij = M/Sij-Yij;
Mij(Aij==0) = M; % if Yij-Sij == 0 Mij = M;
out_im(Aij>=0) = ((B + Mij(Aij>=0)).*im(Aij>=0))./(Mij(Aij>=0)+im(Aij>=0));
out_im(Aij<0) = (Mij(Aij<0).*im(Aij<0))./ (Mij(Aij<0)+B-im(Aij<0));
I am not able to figure out where I'm going wrong.
A detailed explanation of what I'm trying to implement is given in the following paper:
Vonikakis, Vassilios, and Ioannis Andreadis. "Multi-scale image contrast enhancement." In Control, Automation, Robotics and Vision, 2008. ICARCV 2008. 10th International Conference on, pp. 856-861. IEEE, 2008.
I've tried to see if I could get those times down by processing with colfiltand nlfilter, since both are usually much faster than for-loops for sliding window image processing.
Both worked fine for relatively small windows. For an image of 2048x2048 pixels and a window of 10x10, the solution with colfilt takes about 5 seconds (on my personal computer). With a window of 21x21 the time jumped to 27 seconds, but that is still a relative improvement on the times displayed on the question. Unfortunately I don't have enough memory to colfilt using windows of 100x100, but the solution with nlfilter works, though taking about 120 seconds.
Here the code
Solution with colfilt:
function outval = enhancematrix(inputmatrix,M,B)
%Inputmatrix is a 2D matrix or column vector, outval is a 1D row vector.
% If inputmatrix is made of integers...
inputmatrix = double(inputmatrix);
%1. Compute S and Y
normFactor = 1 / (size(inputmatrix,1) + 1).^2; %Size of column.
S = normFactor*sum(inputmatrix,1); % Sum over the columns.
Y = inputmatrix(ceil(size(inputmatrix,1)/2),:); % Center row.
% So far we have all S and Y, one value per column.
%2. Compute A(abs(Y-S))
A = Afunc(abs(S-Y),M);
% And all A: one value per column.
%3. The tricky part. If Y(i)-S(i) > 0 do something.
doPositive = (Y > S);
doNegative = ~doPositive;
outval = zeros(1,size(inputmatrix,2));
outval(doPositive) = (B + A(doPositive) .* Y(doPositive)) ./ (A(doPositive) + Y(doPositive));
outval(doNegative) = (A(doNegative) .* Y(doNegative)) ./ (A(doNegative) + B - Y(doNegative));
end
function out = Afunc(x,M)
% Input x is a row vector. Output is another row vector.
out = x;
out(x == 0) = M;
out(x ~= 0) = M./x(x ~= 0);
end
And to call it, simply do:
M = 1000; B = 255; enhancenow = #(x) enhancematrix(x,M,B);
w = 21 % windowsize
result = colfilt(inputImage,[w w],'sliding',enhancenow);
Solution with nlfilter:
function outval = enhanceimagecontrast(neighbourhood,M,B)
%1. Compute S and Y
normFactor = 1 / (length(neighbourhood) + 1).^2;
S = normFactor*sum(neighbourhood(:));
Y = neighbourhood(ceil(size(neighbourhood,1)/2),ceil(size(neighbourhood,2)/2));
%2. Compute A(abs(Y-S))
test = (Y>=S);
A = Afunc(abs(Y-S),M);
%3. Return outval
if test
outval = ((B + A) * Y) / (A + Y);
else
outval = (A * Y) / (A + B - Y);
end
function aval = Afunc(x,M)
if (x == 0)
aval = M;
else
aval = M/x;
end
And to call it, simply do:
M = 1000; B = 255; enhancenow = #(x) enhanceimagecontrast(x,M,B);
w = 21 % windowsize
result = nlfilter(inputImage,[w w], enhancenow);
I didn't spend much time checking that everything is 100% correct, but I did see some nice contrast enhancement (hair looks particularly nice).
This answer is the implementation that was suggested by Peter. I debugged the implementation and presenting the final working version of the fast implementation.
function [out_im] = CE_conv(im,s,M)
B = 255;
im = ( im - min(im(:)) ) ./ ( max(im(:)) - min(im(:)) )*255;
h = ones(s,s)./(s*s);
out1 = imfilter(im,h,'conv');
out_im = im;
Aij = im-out1; %same as Yij-Sij
Aij1 = out1-im; %same as Sij-Yij
Mij = Aij;
Mij(Aij>0) = M./Aij(Aij>0); % if Yij>Sij Mij = M/(Yij-Sij);
Mij(Aij<0) = M./Aij1(Aij<0); % if Yij<Sij Mij = M/(Sij-Yij);
Mij(Aij==0) = M; % if Yij-Sij == 0 Mij = M;
out_im(Aij>=0) = ((B + Mij(Aij>=0)).*im(Aij>=0))./(Mij(Aij>=0)+im(Aij>=0));
out_im(Aij<0) = (Mij(Aij<0).*im(Aij<0))./ (Mij(Aij<0)+B-im(Aij<0));
out_im = ( out_im - min(out_im(:)) ) ./ ( max(out_im(:)) - min(out_im(:)) );
To call this use the following code
I = imread('pout.tif');
w_size = 51;
M = 4000;
output = CE_conv(I(:,:,1),w_size,M);
The output for the 'pout.tif' image is given below
The execution time for Bigger image and with 100*100 block size is around 5 secs with this implementation.
I've found myself needing to do a least-squares (or similar matrix-based operation) for every pixel in an image. Every pixel has a set of numbers associated with it, and so it can be arranged as a 3D matrix.
(This next bit can be skipped)
Quick explanation of what I mean by least-squares estimation :
Let's say we have some quadratic system that is modeled by Y = Ax^2 + Bx + C and we're looking for those A,B,C coefficients. With a few samples (at least 3) of X and the corresponding Y, we can estimate them by:
Arrange the (lets say 10) X samples into a matrix like X = [x(:).^2 x(:) ones(10,1)];
Arrange the Y samples into a similar matrix: Y = y(:);
Estimate the coefficients A,B,C by solving: coeffs = (X'*X)^(-1)*X'*Y;
Try this on your own if you want:
A = 5; B = 2; C = 1;
x = 1:10;
y = A*x(:).^2 + B*x(:) + C + .25*randn(10,1); % added some noise here
X = [x(:).^2 x(:) ones(10,1)];
Y = y(:);
coeffs = (X'*X)^-1*X'*Y
coeffs =
5.0040
1.9818
0.9241
START PAYING ATTENTION AGAIN IF I LOST YOU THERE
*MAJOR REWRITE*I've modified to bring it as close to the real problem that I have and still make it a minimum working example.
Problem Setup
%// Setup
xdim = 500;
ydim = 500;
ncoils = 8;
nshots = 4;
%// matrix size for each pixel is ncoils x nshots (an overdetermined system)
%// each pixel has a matrix stored in the 3rd and 4rth dimensions
regressor = randn(xdim,ydim, ncoils,nshots);
regressand = randn(xdim, ydim,ncoils);
So my problem is that I have to do a (X'*X)^-1*X'*Y (least-squares or similar) operation for every pixel in an image. While that itself is vectorized/matrixized the only way that I have to do it for every pixel is in a for loop, like:
Original code style
%// Actual work
tic
estimate = zeros(xdim,ydim);
for col=1:size(regressor,2)
for row=1:size(regressor,1)
X = squeeze(regressor(row,col,:,:));
Y = squeeze(regressand(row,col,:));
B = X\Y;
% B = (X'*X)^(-1)*X'*Y; %// equivalently
estimate(row,col) = B(1);
end
end
toc
Elapsed time = 27.6 seconds
EDITS in reponse to comments and other ideas
I tried some things:
1. Reshaped into a long vector and removed the double for loop. This saved some time.
2. Removed the squeeze (and in-line transposing) by permute-ing the picture before hand: This save alot more time.
Current example:
%// Actual work
tic
estimate2 = zeros(xdim*ydim,1);
regressor_mod = permute(regressor,[3 4 1 2]);
regressor_mod = reshape(regressor_mod,[ncoils,nshots,xdim*ydim]);
regressand_mod = permute(regressand,[3 1 2]);
regressand_mod = reshape(regressand_mod,[ncoils,xdim*ydim]);
for ind=1:size(regressor_mod,3) % for every pixel
X = regressor_mod(:,:,ind);
Y = regressand_mod(:,ind);
B = X\Y;
estimate2(ind) = B(1);
end
estimate2 = reshape(estimate2,[xdim,ydim]);
toc
Elapsed time = 2.30 seconds (avg of 10)
isequal(estimate2,estimate) == 1;
Rody Oldenhuis's way
N = xdim*ydim*ncoils; %// number of columns
M = xdim*ydim*nshots; %// number of rows
ii = repmat(reshape(1:N,[ncoils,xdim*ydim]),[nshots 1]); %//column indicies
jj = repmat(1:M,[ncoils 1]); %//row indicies
X = sparse(ii(:),jj(:),regressor_mod(:));
Y = regressand_mod(:);
B = X\Y;
B = reshape(B(1:nshots:end),[xdim ydim]);
Elapsed time = 2.26 seconds (avg of 10)
or 2.18 seconds (if you don't include the definition of N,M,ii,jj)
SO THE QUESTION IS:
Is there an (even) faster way?
(I don't think so.)
You can achieve a ~factor of 2 speed up by precomputing the transposition of X. i.e.
for x=1:size(picture,2) % second dimension b/c already transposed
X = picture(:,x);
XX = X';
Y = randn(n_timepoints,1);
%B = (X'*X)^-1*X'*Y; ;
B = (XX*X)^-1*XX*Y;
est(x) = B(1);
end
Before: Elapsed time is 2.520944 seconds.
After: Elapsed time is 1.134081 seconds.
EDIT:
Your code, as it stands in your latest edit, can be replaced by the following
tic
xdim = 500;
ydim = 500;
n_timepoints = 10; % for example
% Actual work
picture = randn(xdim,ydim,n_timepoints);
picture = reshape(picture, [xdim*ydim,n_timepoints])'; % note transpose
YR = randn(n_timepoints,size(picture,2));
% (XX*X).^-1 = sum(picture.*picture).^-1;
% XX*Y = sum(picture.*YR);
est = sum(picture.*picture).^-1 .* sum(picture.*YR);
est = reshape(est,[xdim,ydim]);
toc
Elapsed time is 0.127014 seconds.
This is an order of magnitude speed up on the latest edit, and the results are all but identical to the previous method.
EDIT2:
Okay, so if X is a matrix, not a vector, things are a little more complicated. We basically want to precompute as much as possible outside of the for-loop to keep our costs down. We can also get a significant speed-up by computing XT*X manually - since the result will always be a symmetric matrix, we can cut a few corners to speed things up. First, the symmetric multiplication function:
function XTX = sym_mult(X) % X is a 3-d matrix
n = size(X,2);
XTX = zeros(n,n,size(X,3));
for i=1:n
for j=i:n
XTX(i,j,:) = sum(X(:,i,:).*X(:,j,:));
if i~=j
XTX(j,i,:) = XTX(i,j,:);
end
end
end
Now the actual computation script
xdim = 500;
ydim = 500;
n_timepoints = 10; % for example
Y = randn(10,xdim*ydim);
picture = randn(xdim,ydim,n_timepoints); % 500x500x10
% Actual work
tic % start timing
picture = reshape(picture, [xdim*ydim,n_timepoints])';
% Here we precompute the (XT*Y) calculation to speed things up later
picture_y = [sum(Y);sum(Y.*picture)];
% initialize
est = zeros(size(picture,2),1);
picture = permute(picture,[1,3,2]);
XTX = cat(2,ones(n_timepoints,1,size(picture,3)),picture);
XTX = sym_mult(XTX); % precompute (XT*X) for speed
X = zeros(2,2); % preallocate for speed
XY = zeros(2,1);
for x=1:size(picture,2) % second dimension b/c already transposed
%For some reason this is a lot faster than X = XTX(:,:,x);
X(1,1) = XTX(1,1,x);
X(2,1) = XTX(2,1,x);
X(1,2) = XTX(1,2,x);
X(2,2) = XTX(2,2,x);
XY(1) = picture_y(1,x);
XY(2) = picture_y(2,x);
% Here we utilise the fact that A\B is faster than inv(A)*B
% We also use the fact that (A*B)*C = A*(B*C) to speed things up
B = X\XY;
est(x) = B(1);
end
est = reshape(est,[xdim,ydim]);
toc % end timing
Before: Elapsed time is 4.56 seconds.
After: Elapsed time is 2.24 seconds.
This is a speed up of about a factor of 2. This code should be extensible to X being any dimensions you want. For instance, in the case where X = [1 x x^2], you would change picture_y to the following
picture_y = [sum(Y);sum(Y.*picture);sum(Y.*picture.^2)];
and change XTX to
XTX = cat(2,ones(n_timepoints,1,size(picture,3)),picture,picture.^2);
You would also change a lot of 2s to 3s in the code, and add XY(3) = picture_y(3,x) to the loop. It should be fairly straight-forward, I believe.
Results
I sped up your original version, since your edit 3 was actually not working (and also does something different).
So, on my PC:
Your (original) version: 8.428473 seconds.
My obfuscated one-liner given below: 0.964589 seconds.
First, for no other reason than to impress, I'll give it as I wrote it:
%%// Some example data
xdim = 500;
ydim = 500;
n_timepoints = 10; % for example
estimate = zeros(xdim,ydim); %// initialization with explicit size
picture = randn(xdim,ydim,n_timepoints);
%%// Your original solution
%// (slightly altered to make my version's results agree with yours)
tic
Y = randn(n_timepoints,xdim*ydim);
ii = 1;
for x = 1:xdim
for y = 1:ydim
X = squeeze(picture(x,y,:)); %// or similar creation of X matrix
B = (X'*X)^(-1)*X' * Y(:,ii);
ii = ii+1;
%// sometimes you keep everything and do
%// estimate(x,y,:) = B(:);
%// sometimes just the first element is important and you do
estimate(x,y) = B(1);
end
end
toc
%%// My version
tic
%// UNLEASH THE FURY!!
estimate2 = reshape(sparse(1:xdim*ydim*n_timepoints, ...
builtin('_paren', ones(n_timepoints,1)*(1:xdim*ydim),:), ...
builtin('_paren', permute(picture, [3 2 1]),:))\Y(:), ydim,xdim).'; %'
toc
%%// Check for equality
max(abs(estimate(:)-estimate2(:))) % (always less than ~1e-14)
Breakdown
First, here's the version that you should actually use:
%// Construct sparse block-diagonal matrix
%// (Type "help sparse" for more information)
N = xdim*ydim; %// number of columns
M = N*n_timepoints; %// number of rows
ii = 1:N;
jj = ones(n_timepoints,1)*(1:N);
s = permute(picture, [3 2 1]);
X = sparse(ii,jj(:), s(:));
%// Compute ALL the estimates at once
estimates = X\Y(:);
%// You loop through the *second* dimension first, so to make everything
%// agree, we have to extract elements in the "wrong" order, and transpose:
estimate2 = reshape(estimates, ydim,xdim).'; %'
Here's an example of what picture and the corresponding matrix X looks like for xdim = ydim = n_timepoints = 2:
>> clc, picture, full(X)
picture(:,:,1) =
-0.5643 -2.0504
-0.1656 0.4497
picture(:,:,2) =
0.6397 0.7782
0.5830 -0.3138
ans =
-0.5643 0 0 0
0.6397 0 0 0
0 -2.0504 0 0
0 0.7782 0 0
0 0 -0.1656 0
0 0 0.5830 0
0 0 0 0.4497
0 0 0 -0.3138
You can see why sparse is necessary -- it's mostly zeros, but will grow large quickly. The full matrix would quickly consume all your RAM, while the sparse one will not consume much more than the original picture matrix does.
With this matrix X, the new problem
X·b = Y
now contains all the problems
X1 · b1 = Y1
X2 · b2 = Y2
...
where
b = [b1; b2; b3; ...]
Y = [Y1; Y2; Y3; ...]
so, the single command
X\Y
will solve all your systems at once.
This offloads all the hard work to a set of highly specialized, compiled to machine-specific code, optimized-in-every-way algorithms, rather than the interpreted, generic, always-two-steps-away from the hardware loops in MATLAB.
It should be straightforward to convert this to a version where X is a matrix; you'll end up with something like what blkdiag does, which can also be used by mldivide in exactly the same way as above.
I had a wee play around with an idea, and I decided to stick it as a separate answer, as its a completely different approach to my other idea, and I don't actually condone what I'm about to do. I think this is the fastest approach so far:
Orignal (unoptimised): 13.507176 seconds.
Fast Cholesky-decomposition method: 0.424464 seconds
First, we've got a function to quickly do the X'*X multiplication. We can speed things up here because the result will always be symmetric.
function XX = sym_mult(X)
n = size(X,2);
XX = zeros(n,n,size(X,3));
for i=1:n
for j=i:n
XX(i,j,:) = sum(X(:,i,:).*X(:,j,:));
if i~=j
XX(j,i,:) = XX(i,j,:);
end
end
end
The we have a function to do LDL Cholesky decomposition of a 3D matrix (we can do this because the (X'*X) matrix will always be symmetric) and then do forward and backwards substitution to solve the LDL inversion equation
function Y = fast_chol(X,XY)
n=size(X,2);
L = zeros(n,n,size(X,3));
D = zeros(n,n,size(X,3));
B = zeros(n,1,size(X,3));
Y = zeros(n,1,size(X,3));
% These loops compute the LDL decomposition of the 3D matrix
for i=1:n
D(i,i,:) = X(i,i,:);
L(i,i,:) = 1;
for j=1:i-1
L(i,j,:) = X(i,j,:);
for k=1:(j-1)
L(i,j,:) = L(i,j,:) - L(i,k,:).*L(j,k,:).*D(k,k,:);
end
D(i,j,:) = L(i,j,:);
L(i,j,:) = L(i,j,:)./D(j,j,:);
if i~=j
D(i,i,:) = D(i,i,:) - L(i,j,:).^2.*D(j,j,:);
end
end
end
for i=1:n
B(i,1,:) = XY(i,:);
for j=1:(i-1)
B(i,1,:) = B(i,1,:)-D(i,j,:).*B(j,1,:);
end
B(i,1,:) = B(i,1,:)./D(i,i,:);
end
for i=n:-1:1
Y(i,1,:) = B(i,1,:);
for j=n:-1:(i+1)
Y(i,1,:) = Y(i,1,:)-L(j,i,:).*Y(j,1,:);
end
end
Finally, we have the main script which calls all of this
xdim = 500;
ydim = 500;
n_timepoints = 10; % for example
Y = randn(10,xdim*ydim);
picture = randn(xdim,ydim,n_timepoints); % 500x500x10
tic % start timing
picture = reshape(pr, [xdim*ydim,n_timepoints])';
% Here we precompute the (XT*Y) calculation
picture_y = [sum(Y);sum(Y.*picture)];
% initialize
est2 = zeros(size(picture,2),1);
picture = permute(picture,[1,3,2]);
% Now we calculate the X'*X matrix
XTX = cat(2,ones(n_timepoints,1,size(picture,3)),picture);
XTX = sym_mult(XTX);
% Call our fast Cholesky decomposition routine
B = fast_chol(XTX,picture_y);
est2 = B(1,:);
est2 = reshape(est2,[xdim,ydim]);
toc
Again, this should work equally well for a Nx3 X matrix, or however big you want.
I use octave, thus I can't say anything about the resulting performance in Matlab, but would expect this code to be slightly faster:
pictureT=picture'
est=arrayfun(#(x)( (pictureT(x,:)*picture(:,x))^-1*pictureT(x,:)*randn(n_ti
mepoints,1)),1:size(picture,2));