How to read input matrix of a Matlab mex function row-wise? - matlab

I need to create a Matlab mex function that will take an input matrix and return its rows spread along a diagonal, as in the example below.
Input:
1 2 3
4 5 6
Expected output:
1 2 3 0 0 0
0 0 0 4 5 6
My problem is that since Matlab reads matrices column-wise instead of row-wise, my mex function gives the wrong output.
Current output:
1 4 0 0 0 0
0 0 2 5 0 0
0 0 0 0 3 6
How would you go about changing my code to read the input matrix row-wise so I can have the proper output?
My code is as follows:
#include <matrix.h>
#include <mex.h>
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
mxArray *a_in, *b_out;
const mwSize *dims;
double *a, *b;
int rows, cols;
// Input array:
a_in = mxDuplicateArray(prhs[0]);
// Get dimensions of input array:
dims = mxGetDimensions(prhs[0]);
rows = (int) dims[0];
cols = (int) dims[1];
// Output array:
if(rows == cols){
b_out = plhs[0] = mxCreateDoubleMatrix(rows, rows*cols, mxREAL);
}
else{
b_out = plhs[0] = mxCreateDoubleMatrix(cols, rows*cols, mxREAL);
}
// Access the contents of the input and output arrays:
a = mxGetPr(a_in);
b = mxGetPr(b_out);
// Compute exdiag function of the input array
int count = 0;
for (int i = 0; i < rows; i++) {
for(int j = 0; j<cols;j++){
if(rows == cols){
b[rows*count+count/rows] = a[j + rows * i];
count++;
}
else if(rows < cols){
b[cols*count+count/rows] = a[j + cols * i];
count++;
}
else if(rows>cols){
b[cols*count+count/rows] = a[j + cols * i];
count++;
}
}
}
}

Inside your loops, i is the row index and j is the column index. You do a[j + rows * i], mixing up the two indices. MATLAB stores data column-wise, so you need to do a[i + rows * j] to read the input matrix correctly.
For indexing the output, you want the row to remain i, and you want the column to be i * cols + j:
b[i + rows * (i * cols + j)] = a[i + rows * j];
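If it helps to see that index arithmetic concretely, here is a minimal standalone check in plain C (my own illustration, outside any MEX context, using the 2x3 example from the question):
#include <stdio.h>

int main(void) {
    const int rows = 2, cols = 3;
    /* a = [1 2 3; 4 5 6] stored column-major, the way MATLAB stores it */
    double a[6] = {1, 4, 2, 5, 3, 6};
    double b[12] = {0};   /* rows x (rows*cols) output, also column-major */
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            b[i + rows * (i * cols + j)] = a[i + rows * j];
    /* prints: 1 2 3 0 0 0 / 0 0 0 4 5 6, the expected output from the question */
    for (int i = 0; i < rows; i++) {
        for (int c = 0; c < rows * cols; c++)
            printf("%g ", b[i + rows * c]);
        printf("\n");
    }
    return 0;
}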
Note that you don't need to do a_in = mxDuplicateArray(prhs[0]), since you are not writing into a_in. You can directly access the prhs[0] matrix, or do a_in = prhs[0] if you want an alias.
Also, casting array sizes to int will cause trouble if the array is very large. It is best to use mwSize and mwIndex for array sizes and indices.
Finally, you should always check the type of the input array; if you are given an array that is not doubles, you will likely cause an out-of-bounds read.
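As a hedged sketch (my own addition, not part of the answer's code below), a slightly fuller set of guards for this particular MEX function could look like this:
if (nrhs != 1)
    mexErrMsgTxt("One input argument required.");
if (!mxIsDouble(prhs[0]) || mxIsComplex(prhs[0]) || mxIsSparse(prhs[0]))
    mexErrMsgTxt("Input must be a real, dense double matrix.");
if (mxGetNumberOfDimensions(prhs[0]) != 2)
    mexErrMsgTxt("Input must be a 2-D matrix.");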
This is my code:
#include <matrix.h>
#include <mex.h>
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
mwSize const* dims;
double *a, *b;
mwSize rows, cols;
if (!mxIsDouble(prhs[0])) {
mexErrMsgTxt("Input must be doubles");
}
// Get dimensions of input array:
dims = mxGetDimensions(prhs[0]);
rows = dims[0];
cols = dims[1];
// Output array:
plhs[0] = mxCreateDoubleMatrix(rows, rows*cols, mxREAL);
// Access the contents of the input and output arrays:
a = mxGetPr(prhs[0]);
b = mxGetPr(plhs[0]);
// Compute exdiag function of the input array
for (mwIndex i = 0; i < rows; i++) {
for (mwIndex j = 0; j < cols; j++) {
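// element (i,j) of the input goes to row i, column i*cols+j of the output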
b[i + rows * (i * cols + j)] = a[i + rows * j];
}
}
}

Related

Determine if matrix A is subset of matrix B

For a matrix such as
A = [...
12 34 67;
90 78 15;
10 71 24];
how could we efficiently determine whether it is a subset of a larger matrix?
B = [...
12 34 67; % found
89 67 45;
90 78 15; % found
10 71 24; % found, so A is subset of B.
54 34 11];
Here are the conditions:
all numbers are integers
the matrices are large, i.e., row# > 100000; column# may vary from 1 to 10 (same for A and B).
Edit:
It seems that ismember, for the case of this question, works just fine when called only a few times. My initial impression was due to previous experiences where ismember was invoked many times inside a nested loop, resulting in very poor performance.
clear all; clc
n = 200000;
k = 10;
B = randi(n,n,k);
f = randperm(n);
A = B(f(1:1000),:);
tic
assert(sum(ismember(A,B,'rows')) == size(A,1));
toc
tic
assert(all(any(all(bsxfun(@eq,B,permute(A,[3,2,1])),2),1))); %user2999345
toc
which results in:
Elapsed time is 1.088552 seconds.
Elapsed time is 12.154969 seconds.
Here are more benchmarks:
clear all; clc
n = 20000;
f = randperm(n);
k = 10;
t1 = 0;
t2 = 0;
t3 = 0;
for i=1:7
B = randi(n,n,k);
A = B(f(1:n/10),:);
%A(100,2) = 0; % to make A not submat of B
tic
b = sum(ismember(A,B,'rows')) == size(A,1);
t1 = t1+toc;
assert(b);
tic
b = ismember_mex(A,sortrows(B));
t2 = t2+toc;
assert(b);
tic
b = issubmat(A,B);
t3 = t3+toc;
assert(b);
end
                              George's       skm's
                 ismember | ismember_mex | issubmat
n=20000,k=10       0.6326 |       0.1064 |  11.6899
n=1000,k=100       0.2652 |       0.0155 |   0.0577
n=1000,k=1000      1.1705 |       0.1582 |   0.2202
n=1000,k=10000    13.2470 |       2.0033 |   2.6367
*issubmat eats RAM when n or k is over 10000!
*In issubmat(A,B), A is checked as a submatrix of B.
It seems that ismember is hard to beat, at least using MATLAB code. I created a C implementation that can be compiled with the MEX compiler.
#include "mex.h"
#if MX_API_VER < 0x07030000
typedef int mwIndex;
typedef int mwSize;
#endif /* MX_API_VER */
#include <math.h>
#include <stdlib.h>
#include <string.h>
int ismember(const double *y, const double *x, int yrow, int xrow, int ncol);
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[])
{
mwSize xcol, ycol, xrow, yrow;
/* output data */
mxLogical* result;
/* arguments */
const mxArray* y;
const mxArray* x;
if (nrhs != 2)
{
mexErrMsgTxt("2 input required.");
}
y = prhs[0];
x = prhs[1];
ycol = mxGetN(y);
yrow = mxGetM(y);
xcol = mxGetN(x);
xrow = mxGetM(x);
/* Both inputs must be full double arrays. */
if (!mxIsDouble(y) || !mxIsDouble(x))
{
mexErrMsgTxt("Input must be of type 'double'.");
}
if (xcol != ycol)
{
mexErrMsgTxt("Inputs must have the same number of columns");
}
plhs[0] = mxCreateLogicalMatrix(1, 1);
result = mxGetLogicals(plhs[0]);
*result = ismember(mxGetPr(y), mxGetPr(x), yrow, xrow, ycol);
}
int ismemberinner(const double *y, int idx, const double *x, int yrow, int xrow, int ncol) {
int from, to, i;
from = 0;
to = xrow-1;
for(i = 0; i < ncol; ++i) {
// Perform binary search
double yi = *(y + i * yrow + idx);
double *curx = x + i * xrow;
int l = from;
int u = to;
while(l <= u) {
int mididx = l + (u-l)/2;
if(yi < curx[mididx]) {
u = mididx-1;
}
else if(yi > curx[mididx]) {
l = mididx+1;
}
else {
// This can be further optimized by performing additional binary searches
for(from = mididx; from > l && curx[from-1] == yi; --from);
for(to = mididx; to < u && curx[to+1] == yi; ++to);
break;
}
}
if(l > u) {
return 0;
}
}
return 1;
}
int ismember(const double *y, const double *x, int yrow, int xrow, int ncol) {
int i;
for(i = 0; i < yrow; ++i) {
if(!ismemberinner(y, i, x, yrow, xrow, ncol)) {
return 0;
}
}
return 1;
}
Compile it using:
mex -O ismember_mex.c
It can be called as follows:
ismember_mex(y, sortrows(x))
First of all, it assumes that the two matrices have the same number of columns. It works by first sorting the rows of the larger matrix (x in this case, the second argument to the function). Then, a type of binary search is employed to identify whether the rows of the smaller matrix (y hereafter) are contained in x. This is done for each row of y separately (see the ismember C function).
For a given row of y, it starts from the first entry and uses binary search to find the range of indices (tracked by the from and to variables) that match the first column of x. This is repeated for the remaining entries, unless some value is not found, in which case it terminates and returns 0.
I tried implementing this idea in MATLAB, but it didn't work that well. Regarding performance, I found that: (a) in case there are mismatches, it is usually much faster than ismember, (b) in case the range of values in x and y is large, it is again faster than ismember, and (c) in case everything matches and the number of possible values in x and y is small (e.g. less than 1000), then ismember may be faster in some situations.
Finally, I want to point out that some parts of the C implementation may be further optimized.
EDIT 1
I fixed the warnings and further improved the function.
#include "mex.h"
#include <math.h>
#include <stdlib.h>
#include <string.h>
int ismember(const double *y, const double *x, unsigned int nrowy, unsigned int nrowx, unsigned int ncol);
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[])
{
unsigned int xcol, ycol, nrowx, nrowy;
/* arguments */
const mxArray* y;
const mxArray* x;
if (nrhs != 2)
{
mexErrMsgTxt("2 inputs required.");
}
y = prhs[0];
x = prhs[1];
ycol = (unsigned int) mxGetN(y);
nrowy = (unsigned int) mxGetM(y);
xcol = (unsigned int) mxGetN(x);
nrowx = (unsigned int) mxGetM(x);
/* Both inputs must be full double arrays. */
if (!mxIsDouble(y) || !mxIsDouble(x))
{
mexErrMsgTxt("Input must be of type 'double'.");
}
if (xcol != ycol)
{
mexErrMsgTxt("Inputs must have the same number of columns");
}
plhs[0] = mxCreateLogicalScalar(ismember(mxGetPr(y), mxGetPr(x), nrowy, nrowx, ycol));
}
int ismemberinner(const double *y, const double *x, unsigned int nrowy, unsigned int nrowx, unsigned int ncol) {
unsigned int from = 0, to = nrowx-1, i;
for(i = 0; i < ncol; ++i) {
// Perform binary search
const double yi = *(y + i * nrowy);
const double *curx = x + i * nrowx;
unsigned int l = from;
unsigned int u = to;
while(l <= u) {
const unsigned int mididx = l + (u-l)/2;
const double midx = curx[mididx];
if(yi < midx) {
u = mididx-1;
}
else if(yi > midx) {
l = mididx+1;
}
else {
{
// Binary search to identify smallest index of x that equals yi
// Equivalent to for(from = mididx; from > l && curx[from-1] == yi; --from)
unsigned int limit = mididx;
while(curx[from] != yi) {
const unsigned int mididx = from + (limit-from)/2;
if(curx[mididx] < yi) {
from = mididx+1;
}
else {
limit = mididx-1;
}
}
}
{
// Binary search to identify largest index of x that equals yi
// Equivalent to for(to = mididx; to < u && curx[to+1] == yi; ++to);
unsigned int limit = mididx;
while(curx[to] != yi) {
const unsigned int mididx = limit + (to-limit)/2;
if(curx[mididx] > yi) {
to = mididx-1;
}
else {
limit = mididx+1;
}
}
}
break;
}
}
if(l > u) {
return 0;
}
}
return 1;
}
int ismember(const double *y, const double *x, unsigned int nrowy, unsigned int nrowx, unsigned int ncol) {
unsigned int i;
for(i = 0; i < nrowy; ++i) {
if(!ismemberinner(y + i, x, nrowy, nrowx, ncol)) {
return 0;
}
}
return 1;
}
Using this version I wasn't able to identify any case where ismember is faster. Also, I noticed that one reason ismember is hard to beat is that it uses all cores of the machine! Of course, the function I provided can be optimized to do this too, but this requires much more effort.
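For what it's worth, here is a hedged sketch (my own, untested) of how the outer loop of the ismember driver above could be parallelized with OpenMP; it assumes the ismemberinner function from the listing above, and the MEX file would have to be built with OpenMP enabled (e.g. mex CFLAGS="$CFLAGS -fopenmp" LDFLAGS="$LDFLAGS -fopenmp" ismember_mex.c with gcc):
/* Sketch only: each thread checks a subset of the rows of y and the
   results are combined with a logical-AND reduction. */
int ismember_parallel(const double *y, const double *x,
                      unsigned int nrowy, unsigned int nrowx, unsigned int ncol) {
    int all_found = 1;
    #pragma omp parallel for reduction(&&:all_found)
    for (int i = 0; i < (int)nrowy; ++i) {
        if (!ismemberinner(y + i, x, nrowy, nrowx, ncol))
            all_found = 0;
    }
    return all_found;
}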
Finally, before using my implementation I would advise you to do extensive testing. I did some testing and it seems to work, but I suggest you also do some additional testing.
For small matrices, ismember should probably be enough.
Usage: ismember(B,A,'rows')
ans =
1
0
1
1
0
I put this answer here to emphasize the need for solutions with higher performance. I will accept this answer only if no better solution appears.
Using ismember, if a row of A appears twice in B while another one is missing, it might wrongly indicate that A is a subset of B. The following solution is suitable if the rows of A and B don't need to be in the same order. However, I haven't tested its performance for large matrices.
A = [...
34 12 67;
90 78 15;
10 71 24];
B = [...
34 12 67; % found
89 67 45;
90 78 15; % found
10 71 24; % found, so A is subset of B.
54 34 11];
A = permute(A,[3 2 1]);
rowIdx = all(bsxfun(@eq,B,A),2);
colIdx = any(rowIdx,1);
isAMemberB = all(colIdx);
You have said the number of columns is <= 10. In addition, if the matrix elements are all integers representable as bytes, you could encode each row into two 64-bit integers. That would reduce the number of comparisons by a factor of 64.
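As a hedged sketch of that packing idea (my own illustration, not code from this thread): if every entry fits in an unsigned byte and there are at most 16 columns, each row can be packed into two 64-bit keys, so comparing two rows costs two integer comparisons; the packed keys could then be sorted and binary-searched just like the full rows above.
#include <stdint.h>

/* Pack one row of up to 16 byte-sized integer entries into two 64-bit keys. */
typedef struct { uint64_t lo, hi; } row_key;

static row_key pack_row(const unsigned char *row, int ncol) {
    row_key k = {0, 0};
    for (int j = 0; j < ncol && j < 8; ++j)      /* columns 0..7  -> lo */
        k.lo |= (uint64_t)row[j] << (8 * j);
    for (int j = 8; j < ncol && j < 16; ++j)     /* columns 8..15 -> hi */
        k.hi |= (uint64_t)row[j] << (8 * (j - 8));
    return k;
}

static int rows_equal(row_key a, row_key b) {
    return a.lo == b.lo && a.hi == b.hi;
}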
For the general case, the following may not be all that much better for thin matrices, but scales very well as the matrices get fat due to the level 3 multiplication:
function yes = is_submat(A,B)
ma = size(A, 1);
mb = size(B, 1);
n = size(B, 2);
yes = false;
if ma >= mb
a = A(:,1);
b = B(:,1);
D = (0 == bsxfun(@minus, a, b'));
q = any(D, 2);
yes = all(any(D,1));
if yes && (n > 1)
A = A(q, :);
C = B*A';
za = sum(A.*A, 2);
zb = sum(B.*B, 2);
Z = sqrt(zb)*sqrt(za');
[~, ix] = max(C./Z, [], 2);
A = A(ix,:);
yes = all(A(:) == B(:));
end
end
end
In the above, I use the fact that the dot product of two unit vectors is maximized (it equals 1) exactly when the two vectors are equal.
For fat matrices (say 5000+ columns) with large numbers of unique elements the performance beats ismember quite handily, but otherwise, it is slower than ismember. For thin matrices ismember is faster by an order of magnitude.
Best case test for this function:
A = randi(50000, [10000, 10000]);
B = A(2:3:end, :);
B = B(randperm(size(B,1)),:);
fprintf('%s: %u\n', 'Number of columns', size(A,2));
fprintf('%s: %u\n', 'Element spread', 50000);
tic; is_submat(A,B); toc;
tic; all(ismember(B,A,'rows')); toc;
fprintf('________\n\n');
is_submat_test;
Number of columns: 10000
Element spread: 50000
Elapsed time is 10.713310 seconds (is_submat).
Elapsed time is 17.446682 seconds (ismember).
So I have to admit that, all round, ismember seems to be much better.
Edits: corrected a bug when there is only one column; fixing this also results in more efficient code. The previous version also did not distinguish between positive and negative numbers. Added timing tests.

Matlab crashed when I call mexCallMATLAB(1,a,1,b,"transpose")

I want to create a new sparse matrix, pass the values of the input sparse matrix to it, and then call MATLAB to get its transpose. The code crashes; can you give me some suggestions?
extern void mexFunction(int iNbOut, mxArray *pmxOut[],
int iNbIn, const mxArray *pmxIn[])
{
mxArray *A, *B;
double *d,*p,*pin,*out;
mwSize m,n,nzmax;
mwIndex *ir, *jc;
// get the information of input sparse matrix
m = mxGetM(pmxIn[0]);
n = mxGetN(pmxIn[0]);
nzmax = mxGetNzmax(pmxIn[0]);
ir = mxGetIr(pmxIn[0]);
jc = mxGetJc(pmxIn[0]);
pin = mxGetPr(pmxIn[0]);
// create a new sparse matrix and pass the input matrix to it
A = mxCreateSparse(m, n, nzmax, mxREAL);
mxSetIr(A, ir);
mxSetJc(A, jc);
mxSetPr(A, pin);
d = mxGetPr(A);
B = mxCreateSparse(m, n, nzmax, mxREAL);
// get the transpose of A by call matlab
mexCallMATLAB(1, &B, 1, &A, "transpose");
p = mxGetPr(B);
pmxOut[0] = mxCreateNumericArray(1 , nzmax, mxSINGLE_CLASS,mxREAL);
out = mxGetPr(pmxOut[0]);
for(mwSize i = 0; i < nzmax; i++)
{
out[i] = p[i];
mexPrintf("pmxOut = %f", out[i]);
}
}

Accelerate Matlab nested for loop with bsxfun

I have an n x n graph W described by its adjacency matrix, and an n-vector of group labels (integers), one for every node.
I need to count the number of links (edges) between nodes in group c and nodes in group d for every pair of groups. To do this I wrote a nested for loop, but I'm sure this is not the fastest way to compute the matrix that in the code I call mcd, i.e. the matrix that counts the number of edges between groups c and d.
Is it possible to make this operation faster using bsxfun?
function mcd = interlinks(W,ci)
%// W is the adjacency matrix of a simple undirected graph
%// ci are the group labels of every node in the graph, can be from 1 to |C|
n = length(W); %// number of nodes in the graph
m = sum(nonzeros(triu(W))); %// number of edges in the graph
ncomms = length(unique(ci)); %// number of groups of ci
mcd = zeros(ncomms); %// this is the matrix that counts the number of edges between group c and group d, twice the number of it if c==d
for c=1:ncomms
nodesc = find(ci==c); %// nodes in group c
for d=1:ncomms
nodesd = find(ci==d); %// nodes in group d
M = W(nodesc,nodesd); %// submatrix of edges between c and d
mcd(c,d) = sum(sum(M)); %// count of edges between c and d
end
end
%// Divide diagonal half because counted twice
mcd(1:ncomms+1:ncomms*ncomms)=mcd(1:ncomms+1:ncomms*ncomms)/2;
For example in the picture here the adjacency matrix is
W=[0 1 1 0 0 0;
1 0 1 1 0 0;
1 1 0 0 1 1;
0 1 0 0 1 0;
0 0 1 1 0 1;
0 0 1 0 1 0];
the group label vector is ci=[ 1 1 1 2 2 3] and the resulting matrix mcd is:
mcd=[3 2 1;
2 1 1;
1 1 0];
It means for example that group 1 has 3 links with itself, 2 links with group 2 and 1 link with group 3.
How about this?
C = bsxfun(@eq, ci, unique(ci)');
mcd = C*W*C'
mcd(logical(eye(size(mcd)))) = mcd(logical(eye(size(mcd))))./2;
I think it is what you wanted: C is an ncomms-by-n indicator matrix (C(c,i) is 1 exactly when node i belongs to group c), so C*W*C' sums the entries of W over every pair of groups, and the diagonal is then halved because each within-group edge is counted twice.
IIUC, and assuming ci is a sorted array, you are basically doing blockwise summations, but with irregular block sizes. Thus, you can use cumsum along the rows and columns and then take differences at the shift positions in ci, which will give you the blockwise summations.
The implementation would look like this -
%// Get cumulative sums row-wise and column-wise
csums = cumsum(cumsum(W,1),2)
%/ Get IDs of shifts and thus get cumsums at those positions
[~,idx] = unique(ci) %// OR find(diff([ci numel(ci)]))
csums_indexed = csums(idx,idx)
%// Get the blockwise summations by differentiations on csums at shifts
col1 = diff(csums_indexed(:,1),[],1)
row1 = diff(csums_indexed(1,:),[],2)
rest2D = diff(diff(csums_indexed,[],2),[],1)
out = [[csums_indexed(1,1) ; col1] [row1 ; rest2D]]
If you're not opposed to a mex function, you can use my code below.
testing code
n = 2000;
n_labels = 800;
W = rand(n, n);
W = W * W' > .5; % generate symmetric adjacency matrix of logicals
Wd = double(W);
ci = floor(rand(n, 1) * n_labels ) + 1; % generate ids from 1 to n_labels
[C, IA, IC] = unique(ci);
disp(sprintf('base avg fun time = %g ',timeit(@() interlinks(W, IC))));
disp(sprintf('mex avg fun time = %g ',timeit(@() interlink_mex(W, IC))));
%note this function requires a symmetric W (function from @aarbelle)
disp(sprintf('bsx avg fun time = %g ',timeit(@() interlinks_bsx(Wd, IC'))));
x1 = interlinks(W, IC);
x2 = interlink_mex(W, IC);
x3 = interlinks_bsx(Wd, IC');
disp(sprintf('norm(x1 - x2) = %g', norm(x1 - x2)));
disp(sprintf('norm(x1 - x3) = %g', norm(x1 - x3)));
testing results
Testing results with these settings:
base avg fun time = 4.94275
mex avg fun time = 0.0373092
bsx avg fun time = 0.126406
norm(x1 - x2) = 0
norm(x1 - x3) = 0
Basically, for small n_labels, the bsx function does very well but you can make it large enough so that the mex function is faster.
c++ code
Throw it into a file named interlink_mex.cpp and compile with mex interlink_mex.cpp. You need a C++ compiler set up for MEX on your machine.
#include "mex.h"
#include "matrix.h"
#include <math.h>
// Author: Matthew Gunn
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
if(nrhs != 2)
mexErrMsgTxt("Invalid number of inputs. Shoudl be 2 input argument.");
if(nlhs != 1)
mexErrMsgTxt("Invalid number of outputs. Should be 1 output arguments.");
if(!mxIsLogical(prhs[0])) {
mexErrMsgTxt("First argument should be a logical array (i.e. type logical)");
}
if(!mxIsDouble(prhs[1])) {
mexErrMsgTxt("Second argument should be an array of type double");
}
const mxArray *W = prhs[0];
const mxArray *ci = prhs[1];
size_t W_m = mxGetM(W);
size_t W_n = mxGetN(W);
if(W_m != W_n)
mexErrMsgTxt("Rows and columns of W are not equal");
// size_t ci_m = mxGetM(ci);
size_t ci_n = mxGetNumberOfElements(ci);
mxLogical *W_data = mxGetLogicals(W);
// double *W_data = mxGetPr(W);
double *ci_data = mxGetPr(ci);
size_t *ci_data_size_t = (size_t*) mxCalloc(ci_n, sizeof(size_t));
size_t ncomms = 0;
double intpart;
for(size_t i = 0; i < ci_n; i++) {
double x = ci_data[i];
if(x < 1 || x > 65536 || modf(x, &intpart) != 0.0) {
mexErrMsgTxt("Input ci is not all integers from 1 to a maximum value of 65536 (can edit source code to change this)");
}
size_t xx = (size_t) x;
if(xx > ncomms)
ncomms = xx;
ci_data_size_t[i] = xx - 1;
}
mxArray *mcd = mxCreateDoubleMatrix(ncomms, ncomms, mxREAL);
double *mcd_data = mxGetPr(mcd);
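// tally edges: for every node pair (i,j), add 1 to mcd(ci(i), ci(j)) when W(i,j) is nonzero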
for(size_t i = 0; i < W_n; i++) {
size_t ii = ci_data_size_t[i];
for(size_t j = 0; j < W_n; j++) {
size_t jj = ci_data_size_t[j];
mcd_data[ii + jj * ncomms] += (W_data[i + j * W_m] != 0);
}
}
for(size_t i = 0; i < ncomms * ncomms; i+= ncomms + 1) //go along diagonal
mcd_data[i]/=2; //divide by 2
mxFree(ci_data_size_t);
plhs[0] = mcd;
}

finding local mean in an image using mex-cuda

I have an image named HSIImage, of size 565x585, in which I have to find the local mean and standard deviation at every pixel. For this I am using a window W of size 9x9; if we are finding the mean of x(i,j), we need the values in W where x(i,j) is at its center.
To handle the corner and edge pixels, I am padding HSIImage and naming the result HSIImage2.
MATLAB code
[m,n,~] = size(HSIImage);
HSIImage2=padarray(HSIImage,[4,4],'symmetric');
mean1 = zeros(m,n);
sd = zeros(m,n);
phi_x=zeros(m,n);
for i=5:m+4
for j=5:n+4
mean1(i-4,j-4) = mean( mean(HSIImage2(i-4:i+4, j-4:j+4, 3) )); %sum / (4*4);
sd(i-4,j-4) = std( std(HSIImage2(i-4:i+4, j-4:j+4, 3), 1));
end
end
[phi_x2,mean2,sd2] = getPhi(HSIImage(:,:,3)',HSIImage2(:,:,3)',m,n);
Serial mean displayed as image.
My cuda code for finding mean and sd is
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
sum= sum + d_HSIImage2[i*col+j];
}
}
d_mean[(Y-4)*col+X-4] = sum/81;
double mean = sum/81;
sum = 0;
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
int index = i*col+j;
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
}
d_std[(Y-4)*col+X-4] = sqrt(sum/81);
}
}
void mexFunction( int nlhs, mxArray *plhs[],int nrhs, const mxArray *prhs[])
{
double* HSIImage;
double* d_HSIImage;
double* HSIImage2;
double* d_HSIImage2;
double row;
double col;
double* phi_x;
double* d_phi_x;
double* mean2;
double* d_mean;
double* d_std;
double* sd2;
HSIImage = (double*)mxGetPr(prhs[0]);
HSIImage2 = (double*)mxGetPr(prhs[1]);
row = mxGetScalar(prhs[2]);
col = mxGetScalar(prhs[3]);
plhs[0] = mxCreateDoubleMatrix(row,col,mxREAL);
phi_x = mxGetPr(plhs[0]);
plhs[1] = mxCreateDoubleMatrix(row,col,mxREAL);
mean2 = mxGetPr(plhs[1]);
plhs[2] = mxCreateDoubleMatrix(row,col,mxREAL);
sd2 = mxGetPr(plhs[2]);
dim3 grid(((col+8)/TILE_WIDTH)+1,((row+8)/TILE_WIDTH)+1,1);
dim3 block(TILE_WIDTH,TILE_WIDTH,1);
if ( cudaMalloc(&d_HSIImage,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_HSIImage2,sizeof(double)*(row+8)*(col+8))!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_phi_x,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_mean,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_std,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
cudaMemcpy(d_HSIImage,HSIImage,sizeof(double)*row*col,cudaMemcpyHostToDevice);
cudaMemcpy(d_HSIImage2,HSIImage2,sizeof(double)*(row+8)*(col+8),cudaMemcpyHostToDevice);
phi <<< grid,block >>> (d_HSIImage,d_HSIImage2,row,col,d_phi_x,d_mean,d_std);
cudaMemcpy(phi_x,d_phi_x,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaMemcpy(mean2,d_mean,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaMemcpy(sd2,d_std,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaFree(d_HSIImage);
cudaFree(d_HSIImage2);
cudaFree(d_phi_x);
}
It's working fine when the image is full of ones, but when I give a regular image, there is a lot of difference between the serial (MATLAB) and parallel (CUDA) outputs (when mean1 and mean2 are compared). Please tell me the error.
I am launching with
dim3 grid(((col+8)/TILE_WIDTH)+1,((row+8)/TILE_WIDTH)+1,1);
dim3 block(TILE_WIDTH,TILE_WIDTH,1);
TILE_WIDTH is 32. row=565, col=584.
Parallel mean displayed as image
It is important to note that MATLAB's C API is column-major ordered; however, as mentioned in the comments, the OP has made sure of that consistency. The problem is that the stride used to access the data does not include the padding of the image. Going from one row to another requires a stride of col+8 (8 being a padding of 4 on each side).
changing
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
sum= sum + d_HSIImage2[i*col+j];
}
}
d_mean[(Y-4)*col+X-4] = sum/81;
double mean = sum/81;
sum = 0;
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
int index = i*col+j;
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
}
d_std[(Y-4)*col+X-4] = sqrt(sum/81);
}
}
to
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
sum= sum + d_HSIImage2[i*(col+8)+j];
}
}
d_mean[(Y-4)*col+X-4] = sum/81;
double mean = sum/81;
sum = 0;
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
int index = i*(col+8)+j;
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
}
d_std[(Y-4)*col+X-4] = sqrt(sum/81);
}
}
This should work; however, I have included a compilable example that I validated on a small sample and that should be easy to expand.
It is not optimized, but that wasn't part of your question. Optimization using shared memory would give a large performance boost.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cuda.h>
using namespace std;
__global__ void phi(double *img, int row, int col, double *d_mean){
int X=blockDim.x*blockIdx.x+threadIdx.x+4;
int Y=blockDim.y*blockIdx.y+threadIdx.y+4;
double sum = 0;
if(Y<row+4 && X<col+4){
for(int i=-4; i<=4; ++i){
for(int j=-4; j<=4; ++j){
sum+=img[ (Y+j)*(col+8)+X+i];
}
}
sum/=81;
d_mean[(Y-4)*col+X-4]=sum;
}
}
int main(int argc, char * argv[]) {
int width=10, height=10;
double *h_img=new double[(width+8)*(height+8)];
for(int i=0; i<height+8; i++){
for(int j=0; j<width+8; j++){
h_img[i*(width+8)+j]=0.0;
}
}
for(int i=0; i<height; i++){
for(int j=0; j<width; j++){
int index = (i+4)*(width+8)+j+4;
h_img[index]=i*width+j;
}
}
for(int i=0; i<height+8; i++){
for(int j=0; j<width+8; j++){
cout<<h_img[i*(width+8)+j]<<" ";
}cout<<endl;
}
double *d_img;
size_t size=sizeof(double)*(height+8)*(width+8);
cudaMalloc(&d_img, size);
cudaMemcpy(d_img, h_img, size, cudaMemcpyHostToDevice);
size = sizeof(double)*height*width;
double *d_avg;
cudaMalloc(&d_avg, size);
dim3 block(32, 32, 1);
dim3 grid(width/32+1, height/32+1, 1);
phi<<<grid, block>>>(d_img, height, width, d_avg);
cudaDeviceSynchronize();
double *h_avg=new double[width*height];
cudaMemcpy(h_avg, d_avg, size, cudaMemcpyDeviceToHost);
for(int i=0; i<height; i++){
for(int j=0; j<width; j++){
cout<<h_avg[i*width+j]<<" ";
}cout<<endl;
}
return 0;
}
Here's my 2 cents regarding local mean and local std.
You should check whether using MATLAB's optimized built-in functions (conv2 and stdfilt, with their GPU support) gives you better performance than a "simple" mex version. For example, to take the local mean, the fastest will be to use conv2 as follows:
local_mean_image=conv2(image,normalized_window,'same');
where in your case normalized_window=ones(9)./9^2;
For local std use stdfilt :
local_std_image = stdfilt(image, ones(9));
Both options are available for faster GPU performance; I use conv2 with Jacket routinely, and I saw that stdfilt supports gpuArray variables.
By observing the answers of @Christian Sarofeen and @bla, I made some changes to my code and now I am able to find the mean exactly the same as in MATLAB. I am posting this thinking that someone may use it in the future (I am sending the image as-is from MATLAB). Finding the standard deviation is still a bit of a problem.
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
int index = (X-4)*row+Y-4;
for(i=-4;i<=4;i++){
for(j=-4;j<=4;j++){
sum= sum + d_HSIImage2[(X+j)*(row+8)+(Y+i)];
}
}
d_mean[index] = sum/81;
double mean = 0;
double temp_std[9] = {0} ;
for(j=-4;j<=4;j++){
sum = 0;
for(i=-4;i<=4;i++){
sum = sum + d_HSIImage2[(X+j)*(row+8)+(Y+i)];//vector mean
}
mean = sum/9;
sum =0 ;
for(i=-4;i<=4;i++){
int index = (X+j)*(row+8)+(Y+i);
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
temp_std[j+4] = (sqrt(sum/9));//vector std
}
sum =0 ;
for(j=-4;j<=4;j++){
sum = sum + temp_std[j+4];//mean of vectors
}
mean = sum/9;
sum = 0 ;
for(j=-4;j<=4;j++){
sum = sum + (temp_std[j+4]-mean) * (temp_std[j+4]-mean);
}
d_std[index] = sqrt(sum);//std of vectors
d_phi_x[index] = 1.0/(1.0+exp((d_mean[index]-d_HSIImage[index])/d_std[index]));
}
}

Most Efficient Way of Using mexCallMATLAB in Converting Double* to mxArray*

I am writing MEX code in which I need to use the pinv function. I am trying to find the most efficient way to pass an array of type double to pinv using mexCallMATLAB. For the sake of example, let's say the array is named G and its size is 100.
double *G = (double*) mxMalloc( 100 * sizeof(double) );
where
G[0] = G11; G[1] = G12;
G[2] = G21; G[3] = G22;
This means every four consecutive elements of G form a 2×2 matrix; G stores 25 such 2×2 matrices.
I should note that these 2×2 matrices are not well-conditioned and may even be all zeros. How can I use the pinv function to calculate the pseudoinverse of the matrices stored in G? For example, how can I pass the array to mexCallMATLAB in order to calculate the pseudoinverse of the first 2×2 matrix in G?
I thought of the following approach:
mxArray *G_PINV_input = mxCreateDoubleMatrix(2, 2, mxREAL);
mxArray *G_PINV_output = mxCreateDoubleMatrix(2, 2, mxREAL);
double *G_PINV_input_ptr = mxGetPr(G_PINV_input);
memcpy( G_PINV_input_ptr, &G[0], 4 * sizeof(double));
mexCallMATLAB(1, G_PINV_output, 1, G_PINV_input, "pinv");
I am not sure how good this approach is. Copying the values is not economical at all because the total number of elements in G in my actual application is large. Is there any way to skip this copying?
Here is my implementation of the MEX-function:
my_pinv.cpp
#include "mex.h"
void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[])
{
// validate arguments
if (nrhs!=1 || nlhs>1)
mexErrMsgIdAndTxt("mex:error", "Wrong number of arguments");
if (!mxIsDouble(prhs[0]) || mxIsComplex(prhs[0]) || mxIsSparse(prhs[0]))
mexErrMsgIdAndTxt("mex:error", "Input isnt real dense double array");
if (mxGetNumberOfElements(prhs[0]) != 100)
mexErrMsgIdAndTxt("mex:error", "numel() != 100");
// create necessary arrays
mxArray *rhs[1], *lhs[1];
plhs[0] = mxCreateDoubleMatrix(100, 1, mxREAL);
rhs[0] = mxCreateDoubleMatrix(2, 2, mxREAL);
double *in = mxGetPr(prhs[0]);
double *out = mxGetPr(plhs[0]);
double *x = mxGetPr(rhs[0]), *y;
// for each 2x2 matrix
for (mwIndex i=0; i<100; i+=4) {
// copy 2x2 matrix into rhs
x[0] = in[i+0];
x[2] = in[i+1];
x[1] = in[i+2];
x[3] = in[i+3];
// lhs = pinv(rhs)
mexCallMATLAB(1, lhs, 1, rhs, "pinv");
// copy 2x2 matrix from lhs
y = mxGetPr(lhs[0]);
out[i+0] = y[0];
out[i+1] = y[1];
out[i+2] = y[2];
out[i+3] = y[3];
// free array
mxDestroyArray(lhs[0]);
}
// cleanup
mxDestroyArray(rhs[0]);
}
Here is a baseline implementation in MATLAB so that we can verify the results are correct:
my_pinv0.m
function y = my_pinv0(x)
y = zeros(size(x));
for i=1:4:numel(x)
y(i:i+3) = pinv(x([0 1; 2 3]+i));
end
end
Now we test the MEX-function:
% some vector
x = randn(100,1);
% MEX vs. MATLAB function
y = my_pinv0(x);
yy = my_pinv(x);
% compare
assert(isequal(y,yy))
EDIT:
Here is another implementation:
my_pinv2.cpp
#include "mex.h"
inline void call_pinv(const double &a, const double &b, const double &c,
const double &d, double *out)
{
mxArray *rhs[1], *lhs[1];
// create input matrix [a b; c d]
rhs[0] = mxCreateDoubleMatrix(2, 2, mxREAL);
double *x = mxGetPr(rhs[0]);
x[0] = a;
x[1] = c;
x[2] = b;
x[3] = d;
// lhs = pinv(rhs)
mexCallMATLAB(1, lhs, 1, rhs, "pinv");
// get values from output matrix
const double *y = mxGetPr(lhs[0]);
out[0] = y[0];
out[1] = y[1];
out[2] = y[2];
out[3] = y[3];
// cleanup
mxDestroyArray(lhs[0]);
mxDestroyArray(rhs[0]);
}
void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[])
{
// validate arguments
if (nrhs!=1 || nlhs>1)
mexErrMsgIdAndTxt("mex:error", "Wrong number of arguments");
if (!mxIsDouble(prhs[0]) || mxIsComplex(prhs[0]) || mxIsSparse(prhs[0]))
mexErrMsgIdAndTxt("mex:error", "Input isnt real dense double array");
if (mxGetNumberOfElements(prhs[0]) != 100)
mexErrMsgIdAndTxt("mex:error", "numel() != 100");
// allocate output
plhs[0] = mxCreateDoubleMatrix(100, 1, mxREAL);
double *out = mxGetPr(plhs[0]);
const double *in = mxGetPr(prhs[0]);
// for each 2x2 matrix
for (mwIndex i=0; i<100; i+=4) {
// 2x2 input matrix [a b; c d], and its determinant
const double a = in[i+0];
const double b = in[i+1];
const double c = in[i+2];
const double d = in[i+3];
const double det = (a*d - b*c);
if (det != 0) {
// inverse of 2x2 matrix [d -b; -c a]/det
out[i+0] = d/det;
out[i+1] = -c/det;
out[i+2] = -b/det;
out[i+3] = a/det;
}
else {
// singular matrix, fallback to pseudo-inverse
call_pinv(a, b, c, d, &out[i]);
}
}
}
This time we compute the determinant of the 2x2 matrix; if it is non-zero, we calculate the inverse ourselves using the closed-form expression inv([a b; c d]) = [d -b; -c a] / (a*d - b*c).
Otherwise we fall back to invoking pinv from MATLAB for the pseudo-inverse.
Here is a quick benchmark:
% 100x1 vector
x = randn(100,1); % average case, with normal 2x2 matrices
% running time
funcs = {@my_pinv0, @my_pinv1, @my_pinv2};
t = cellfun(@(f) timeit(@() f(x)), funcs, 'Uniform',true);
% compare results
y = cellfun(@(f) f(x), funcs, 'Uniform',false);
assert(isequal(y{1},y{2}))
I get the following timings:
>> fprintf('%.6f\n', t);
0.002111 % MATLAB function
0.001498 % first MEX-file with mexCallMATLAB
0.000010 % second MEX-file with "unrolled" matrix inverse (+ PINV as fallback)
The error is acceptable and within machine precision:
>> norm(y{1}-y{3})
ans =
2.1198e-14
You could also test the worst case, when many of the 2x2 matrices are singular:
x = randi([0 1], [100 1]);
You don't need to allocate the output. Just declare the pointer and let pinv (via mexCallMATLAB) create the mxArray automatically.
mxArray *lhs;
Then just pass its address with &, like:
mexCallMATLAB(1, &lhs, 1, &rhs, "pinv");