Using Thrust library of CUDA for large values - matlab

Hi, I wanted to implement an extremely large loop in Thrust, but I find it much slower than normal C++ code. Can you please tell me where I am going wrong?
fi and fj are host vectors
xsize usually is around a 7-8 digit number
// Host result buffer and device buffer, both sized by the same expression in r and n.
thrust::host_vector <double> df((2*floor(r)*(floor(r)+1)+1)*n*n);
thrust::device_vector<double> gpu_df((2*floor(r)*(floor(r)+1)+1)*n*n);
// NOTE(review): this is an ordinary host-side loop; every gpu_df[i] below is a
// single-element access to a device_vector issued from host code, one element
// per iteration (presumably one host<->device transfer each -- confirm).
// NOTE(review): the loop runs to xsize while the vectors are sized from r and n;
// confirm that xsize never exceeds the vector size.
for(i=0;i<xsize;i++)
{
// Store the difference of the two host inputs for this element.
gpu_df[i]=(fi[i]-fj[i]);
// A negative difference becomes 0; otherwise the stored difference is multiplied
// by the difference again, i.e. squared.
if(gpu_df[i]<0)
gpu_df[i]=0;
else
gpu_df[i]=gpu_df[i]*(fi[i]-fj[i]);
// Limit the result to at most 255.
if(gpu_df[i]>255)
gpu_df[i]=255;
// cout<<fi[i]<<"\n";
}
// Copy the device results into the host vector.
df=gpu_df;
I feel the code is not being parallelized. Could you please help me out.

To run programs on the GPU with Thrust you need to write them in terms of Thrust algorithms like reduce, transform, sort, etc. In this case we can write the computation in terms of transform, since the loop was just computing a function F(fi[i], fj[i]) and storing the result in df[i]. Note that we must first move the input arrays to the device before calling transform because Thrust requires the input and output arrays to live in the same place.
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <cstdio>
// Binary functor for thrust::transform: maps a pair (fi, fj) to the clamped
// squared difference described on operator().
struct my_functor
    : public thrust::binary_function<float,float,float>
{
    // Returns 0 when fi - fj is negative, otherwise (fi - fj) squared,
    // limited to a maximum of 255.
    __host__ __device__
    float operator()(float fi, float fj)
    {
        const float diff = fi - fj;
        float result = (diff < 0) ? 0 : diff * diff;
        if (result > 255)
            result = 255;
        return result;
    }
};
// Demo driver: builds five sample (fi, fj) pairs on the host, applies
// my_functor to them on the device with thrust::transform, copies the
// result back and prints one line per pair.
int main(void)
{
    const size_t N = 5;

    // sample inputs for fi and fj
    const float fi_values[5] = { 2.0f, 0.0f, 3.0f, 4.0f,  8.0f };
    const float fj_values[5] = { 0.0f, 2.0f, 1.0f, 5.0f, -8.0f };

    // host storage, initialized from the sample inputs
    thrust::host_vector<float> cpu_fi(fi_values, fi_values + N);
    thrust::host_vector<float> cpu_fj(fj_values, fj_values + N);

    // device copies of the inputs, plus device storage for the result
    thrust::device_vector<float> gpu_fi(cpu_fi.begin(), cpu_fi.end());
    thrust::device_vector<float> gpu_fj(cpu_fj.begin(), cpu_fj.end());
    thrust::device_vector<float> gpu_df(N);

    // gpu_df[i] = my_functor()(gpu_fi[i], gpu_fj[i])
    thrust::transform(gpu_fi.begin(), gpu_fi.end(),
                      gpu_fj.begin(),
                      gpu_df.begin(),
                      my_functor());

    // bring the results back to the host
    thrust::host_vector<float> cpu_df(N);
    thrust::copy(gpu_df.begin(), gpu_df.end(), cpu_df.begin());

    // report each pair together with its result
    for (size_t k = 0; k != N; ++k)
    {
        printf("f(%2.0lf,%2.0lf) = %3.0lf\n", cpu_fi[k], cpu_fj[k], cpu_df[k]);
    }
    return 0;
}
For reference, here's the output of the program:
f( 2, 0) = 4
f( 0, 2) = 0
f( 3, 1) = 4
f( 4, 5) = 0
f( 8,-8) = 255

Related

Determine if matrix A is subset of matrix B

For a matrix such as
A = [...
12 34 67;
90 78 15;
10 71 24];
how could we determine efficiently if it is subset of a larger matrix?
B = [...
12 34 67; % found
89 67 45;
90 78 15; % found
10 71 24; % found, so A is subset of B.
54 34 11];
Here are conditions:
all numbers are integers
matrices are so large, i.e., row# > 100000, column# may vary from 1 to 10 (same for A and B).
Edit:
It seems that ismember for the case of this question, when called only few times works just fine. My initial impression was due to previous experiences where ismember was being invoked many times inside a nested loop resulting in the worst performance.
% Benchmark: ismember(...,'rows') against a bsxfun-based check for testing
% whether every row of A occurs in B.
clear all; clc
n = 200000;
k = 10;
B = randi(n,n,k);       % n-by-k matrix of random integers drawn from 1..n
f = randperm(n);
A = B(f(1:1000),:);     % 1000 rows taken from B, so every row of A is in B
tic
assert(sum(ismember(A,B,'rows')) == size(A,1));
toc
tic
% A is moved to the third dimension so that bsxfun compares every row of B
% with every row of A. The function handle must be written @eq.
assert(all(any(all(bsxfun(@eq,B,permute(A,[3,2,1])),2),1))); %user2999345
toc
which results in:
Elapsed time is 1.088552 seconds.
Elapsed time is 12.154969 seconds.
Here are more benchmarks:
% Benchmark of three ways to test that all rows of A occur in B:
% ismember(...,'rows'), ismember_mex and issubmat. t1..t3 accumulate the
% elapsed time of each method over 7 repetitions.
clear all; clc
n = 20000;
f = randperm(n);
k = 10;
t1 = 0;
t2 = 0;
t3 = 0;
for i=1:7
% Fresh random B each repetition; A is n/10 rows taken from B.
B = randi(n,n,k);
A = B(f(1:n/10),:);
%A(100,2) = 0; % to make A not submat of B
% Method 1: built-in ismember on rows.
tic
b = sum(ismember(A,B,'rows')) == size(A,1);
t1 = t1+toc;
assert(b);
% Method 2: ismember_mex; the sortrows(B) call is inside the timed section.
tic
b = ismember_mex(A,sortrows(B));
t2 = t2+toc;
assert(b);
% Method 3: issubmat.
tic
b = issubmat(A,B);
t3 = t3+toc;
assert(b);
end
George's skm's
ismember | ismember_mex | issubmat
n=20000,k=10 0.6326 0.1064 11.6899
n=1000,k=100 0.2652 0.0155 0.0577
n=1000,k=1000 1.1705 0.1582 0.2202
n=1000,k=10000 13.2470 2.0033 2.6367
*issubmat eats RAM when n or k is over 10000!
*issubmat(A,B), A is being checked as submat of B.
It seems that ismember is hard to beat, at least using MATLAB code. I created a C implementation which can be used using the MEX compiler.
#include "mex.h"
#if MX_API_VER < 0x07030000
typedef int mwIndex;
typedef int mwSize;
#endif /* MX_API_VER */
#include <math.h>
#include <stdlib.h>
#include <string.h>
int ismember(const double *y, const double *x, int yrow, int xrow, int ncol);
/*
 * MEX gateway: tf = ismember_mex(y, x)
 *
 * prhs[0] (y) and prhs[1] (x) must be full, real double matrices with the
 * same number of columns. plhs[0] receives a logical scalar holding the
 * result of ismember(), i.e. whether every row of y was found in x.
 * The search in ismember() is a binary search, so the rows of x are expected
 * to be sorted (the caller passes sortrows(x)).
 */
void mexFunction(int nlhs, mxArray *plhs[],
                 int nrhs, const mxArray *prhs[])
{
    mwSize xcol, ycol, xrow, yrow;
    /* arguments */
    const mxArray* y;
    const mxArray* x;

    (void) nlhs; /* a single output is always produced */

    if (nrhs != 2)
    {
        mexErrMsgTxt("2 input required.");
    }
    y = prhs[0];
    x = prhs[1];

    /* Both inputs must hold double data. */
    if (!mxIsDouble(y) || !mxIsDouble(x))
    {
        mexErrMsgTxt("Input must be of type 'double'.");
    }
    /* Sparse and complex arrays are also class double, but their data is not
     * a dense rows-by-columns block, which is what ismember() indexes. */
    if (mxIsSparse(y) || mxIsSparse(x) || mxIsComplex(y) || mxIsComplex(x))
    {
        mexErrMsgTxt("Inputs must be full, real matrices.");
    }

    ycol = mxGetN(y);
    yrow = mxGetM(y);
    xcol = mxGetN(x);
    xrow = mxGetM(x);
    if (xcol != ycol)
    {
        mexErrMsgTxt("Inputs must have the same number of columns");
    }

    /* The output is a logical scalar. The previous code wrote an int through
     * the pointer returned by mxGetPr(), which does not match the storage of
     * a logical array. */
    plhs[0] = mxCreateLogicalScalar(
        ismember(mxGetPr(y), mxGetPr(x), (int) yrow, (int) xrow, (int) ycol) != 0);
}
/*
 * Tests whether row `idx` of y occurs as a row of x.
 *
 * y    : yrow-by-ncol matrix stored column by column (element (r,c) is at
 *        y[c*yrow + r])
 * x    : xrow-by-ncol matrix stored the same way; its rows must be sorted
 *        (as produced by sortrows), because each column is binary-searched
 *        inside the range of rows that matched all previous columns
 * Returns 1 when a matching row exists, 0 otherwise. With ncol == 0 the
 * result is 1; with xrow == 0 and ncol > 0 it is 0.
 */
int ismemberinner(const double *y, int idx, const double *x, int yrow, int xrow, int ncol) {
    /* Rows of x that still match the y row: half-open range [from, to). */
    int from = 0, to = xrow, i;
    for (i = 0; i < ncol; ++i) {
        const double yi = y[i * yrow + idx];
        const double *curx = x + i * xrow;  /* stays const, like x itself */
        int lo = from, hi = to;

        /* Lower bound: first index in [from, to) whose value is >= yi. */
        while (lo < hi) {
            const int mid = lo + (hi - lo) / 2;
            if (curx[mid] < yi) {
                lo = mid + 1;
            }
            else {
                hi = mid;
            }
        }
        /* The range check comes first so curx is never read out of range. */
        if (lo >= to || curx[lo] != yi) {
            return 0;
        }
        from = lo;

        /* Upper bound: first index in [from, to) whose value is > yi. */
        hi = to;
        while (lo < hi) {
            const int mid = lo + (hi - lo) / 2;
            if (curx[mid] > yi) {
                hi = mid;
            }
            else {
                lo = mid + 1;
            }
        }
        to = lo;
    }
    return 1;
}
/*
 * Returns 1 when ismemberinner() reports a match for every row of y,
 * and 0 as soon as one row of y has no match in x.
 */
int ismember(const double *y, const double *x, int yrow, int xrow, int ncol) {
    int row = 0;
    while (row < yrow) {
        if (ismemberinner(y, row, x, yrow, xrow, ncol) == 0) {
            return 0;
        }
        ++row;
    }
    return 1;
}
Compile it using:
mex -O ismember_mex.c
It can be called as follows:
ismember_mex(x, sortrows(x))
First of all, it assumes that the columns of the matrices have the same size. It works by first sorting the rows of the larger matrix (x in this case, the second argument to the function). Then, a type of binary search is employed to identify whether the rows of the smaller matrix (y hereafter) are contained in x. This is done for each row of y separately (see ismember C function).
For a given row of y, it starts from the first entry and finds the range of indices (using the from and to variables) that match with the first column of x using binary search. This is repeated for the remaining entries, unless some value is not found, in which case it terminates and returns 0.
I tried implementing this idea in MATLAB, but it didn't work that well. Regarding performance, I found that: (a) in case there are mismatches, it is usually much faster than ismember (b) in case the range of values in x and y is large, it is again faster than ismember, and (c) in case everything matches and the number of possible values in x and y is small (e.g. less than 1000), then ismember may be faster in some situations.
Finally, I want to point out that some parts of the C implementation may be further optimized.
EDIT 1
I fixed the warnings and further improved the function.
#include "mex.h"
#include <math.h>
#include <stdlib.h>
#include <string.h>
int ismember(const double *y, const double *x, unsigned int nrowy, unsigned int nrowx, unsigned int ncol);
/*
 * MEX gateway: tf = ismember_mex(y, x)
 *
 * prhs[0] (y) and prhs[1] (x) must be full, real double matrices with the
 * same number of columns. plhs[0] receives a logical scalar holding the
 * result of ismember(), i.e. whether every row of y was found in x.
 * The search is a binary search, so the rows of x are expected to be sorted
 * (the caller passes sortrows(x)).
 */
void mexFunction(int nlhs, mxArray *plhs[],
                 int nrhs, const mxArray *prhs[])
{
    unsigned int xcol, ycol, nrowx, nrowy;
    /* arguments */
    const mxArray* y;
    const mxArray* x;

    (void) nlhs; /* a single output is always produced */

    if (nrhs != 2)
    {
        mexErrMsgTxt("2 inputs required.");
    }
    y = prhs[0];
    x = prhs[1];

    /* Both inputs must hold double data. */
    if (!mxIsDouble(y) || !mxIsDouble(x))
    {
        mexErrMsgTxt("Input must be of type 'double'.");
    }
    /* Sparse and complex arrays are also class double, but their data is not
     * a dense rows-by-columns block, which is what ismember() indexes. */
    if (mxIsSparse(y) || mxIsSparse(x) || mxIsComplex(y) || mxIsComplex(x))
    {
        mexErrMsgTxt("Inputs must be full, real matrices.");
    }

    /* NOTE(review): the dimensions are narrowed to unsigned int here. */
    ycol = (unsigned int) mxGetN(y);
    nrowy = (unsigned int) mxGetM(y);
    xcol = (unsigned int) mxGetN(x);
    nrowx = (unsigned int) mxGetM(x);
    if (xcol != ycol)
    {
        mexErrMsgTxt("Inputs must have the same number of columns");
    }
    plhs[0] = mxCreateLogicalScalar(ismember(mxGetPr(y), mxGetPr(x), nrowy, nrowx, ycol));
}
/*
 * Tests whether one row of y occurs as a row of x.
 *
 * y    : points at the first element of the row to look up inside an
 *        nrowy-by-ncol matrix stored column by column, so the row's value in
 *        column c is y[c*nrowy]
 * x    : nrowx-by-ncol matrix stored column by column; its rows must be
 *        sorted (as produced by sortrows), because each column is
 *        binary-searched inside the range of rows that matched all previous
 *        columns
 * Returns 1 when a matching row exists, 0 otherwise. With ncol == 0 the
 * result is 1; with nrowx == 0 and ncol > 0 it is 0.
 *
 * All searches use half-open ranges, so no index is ever decremented below
 * zero. The previous closed-range version computed `mididx-1` and `nrowx-1`
 * on unsigned values, which wrapped around when a looked-up value was smaller
 * than every candidate or when x had no rows.
 */
int ismemberinner(const double *y, const double *x, unsigned int nrowy, unsigned int nrowx, unsigned int ncol) {
    /* Rows of x that still match the y row: half-open range [from, to). */
    unsigned int from = 0, to = nrowx, i;
    for (i = 0; i < ncol; ++i) {
        /* Offsets are formed in size_t so they cannot wrap at 32 bits. */
        const double yi = y[(size_t) i * nrowy];
        const double *curx = x + (size_t) i * nrowx;
        unsigned int lo = from, hi = to;

        /* Lower bound: first index in [from, to) whose value is >= yi. */
        while (lo < hi) {
            const unsigned int mid = lo + (hi - lo) / 2;
            if (curx[mid] < yi) {
                lo = mid + 1;
            }
            else {
                hi = mid;
            }
        }
        /* The range check comes first so curx is never read out of range. */
        if (lo >= to || curx[lo] != yi) {
            return 0;
        }
        from = lo;

        /* Upper bound: first index in [from, to) whose value is > yi. */
        hi = to;
        while (lo < hi) {
            const unsigned int mid = lo + (hi - lo) / 2;
            if (curx[mid] > yi) {
                hi = mid;
            }
            else {
                lo = mid + 1;
            }
        }
        to = lo;
    }
    return 1;
}
/*
 * Returns 1 when ismemberinner() reports a match for every one of the nrowy
 * rows of y, and 0 as soon as one row has no match in x.
 */
int ismember(const double *y, const double *x, unsigned int nrowy, unsigned int nrowx, unsigned int ncol) {
    unsigned int row = 0;
    while (row < nrowy) {
        /* y + row is the first element of row `row` (column-wise storage). */
        if (ismemberinner(y + row, x, nrowy, nrowx, ncol) == 0) {
            return 0;
        }
        ++row;
    }
    return 1;
}
Using this version I wasn't able to identify any case where ismember is faster. Also, I noticed that one reason ismember is hard to beat is that it uses all cores of the machine! Of course, the function I provided can be optimized to do this too, but this requires much more effort.
Finally, before using my implementation I would advise you to do extensive testing. I did some testing and it seems to work, but I suggest you also do some additional testing.
For small matrices ismember should be enough, probably.
Usage: ismember(B,A,'rows')
ans =
1
0
1
1
0
I put this answer here, emphasizing on a need to solutions with higher performance. I will accept this answer only if there was no better solution.
When using ismember, if a row of A appears twice in B while another one is missing, the result might wrongly indicate that A is a member of B. The following solution is suitable if the rows of A and B don't need to be in the same order. However, I haven't tested its performance for large matrices.
% Example: check that every row of A appears somewhere in B.
A = [...
34 12 67;
90 78 15;
10 71 24];
B = [...
34 12 67; % found
89 67 45;
90 78 15; % found
10 71 24; % found, so A is subset of B.
54 34 11];
% Move the rows of A to the third dimension (1-by-cols-by-rows) so that
% bsxfun compares every row of B with every row of A.
A = permute(A,[3 2 1]);
% rowIdx(i,1,j) is true when row i of B equals row j of A in every column.
% The function handle must be written @eq.
rowIdx = all(bsxfun(@eq,B,A),2);
% colIdx(1,1,j) is true when row j of A matched at least one row of B.
colIdx = any(rowIdx,1);
% true when every row of A matched.
isAMemberB = all(colIdx);
You have said that the number of columns is <= 10. In addition, if the matrix elements are all integers representable as bytes, you could encode each row into two 64-bit integers. That would reduce the number of comparisons by a factor of 64.
For the general case, the following may not be all that much better for thin matrices, but scales very well as the matrices get fat due to the level 3 multiplication:
function yes = is_submat(A,B)
% IS_SUBMAT  Row-matching test between A and B.
%   yes = is_submat(A,B) returns false when A has fewer rows than B.
%   Otherwise it
%     1. requires every first-column value of B to occur in the first
%        column of A,
%     2. keeps only the rows of A whose first entry occurs in B's first
%        column,
%     3. pairs each row of B with the kept row of A that has the largest
%        normalized dot product with it, and
%     4. returns true when the paired rows equal B element by element.
%   With a single column the result of step 1 is returned directly.
ma = size(A, 1);
mb = size(B, 1);
n = size(B, 2);
yes = false;
if ma >= mb
    a = A(:,1);
    b = B(:,1);
    % D(i,j) is true when A(i,1) == B(j,1). The function handle must be
    % written @minus.
    D = (0 == bsxfun(@minus, a, b'));
    q = any(D, 2);            % rows of A whose first entry occurs in B
    yes = all(any(D,1));      % every first entry of B occurs in A
    if yes && (n > 1)
        A = A(q, :);
        C = B*A';             % dot products of rows of B with rows of A
        za = sum(A.*A, 2);
        zb = sum(B.*B, 2);
        Z = sqrt(zb)*sqrt(za');   % products of the row norms
        % For each row of B, the index of the row of A with the largest
        % normalized dot product.
        [~, ix] = max(C./Z, [], 2);
        A = A(ix,:);
        yes = all(A(:) == B(:));
    end
end
end
In the above, I use the fact that the dot product is maximized when two unit vectors are equal.
For fat matrices (say 5000+ columns) with large numbers of unique elements the performance beats ismember quite handily, but otherwise, it is slower than ismember. For thin matrices ismember is faster by an order of magnitude.
Best case test for this function:
A = randi(50000, [10000, 10000]);
B = A(2:3:end, :);
B = B(randperm(size(B,1)),:);
fprintf('%s: %u\n', 'Number of columns', size(A,2));
fprintf('%s: %u\n', 'Element spread', 50000);
tic; is_submat(A,B); toc;
tic; all(ismember(B,A,'rows')); toc;
fprintf('________\n\n');
is_submat_test;
Number of columns: 10000
Element spread: 50000
Elapsed time is 10.713310 seconds (is_submat).
Elapsed time is 17.446682 seconds (ismember).
So I have to admit, all round ismember seems to be much better.
Edits: Edited to correct bug when there is only one column - fixing this also results in more efficient code. Also previous version did not distinguish between positive and negative numbers. Added timing tests.

Cuda matrix multiplication results differs from MATLAB

It's been two days and I still can't figure out why my implementation of CUDA matrix multiplication differs from the results produced in MATLAB.
CUDA kernel: A(200x60000) = W(200x784) * Data(784x60000)
// Each thread computes one element of A: the sum over k of
// W.elements[r*W.col + k] * Data.elements[k*Data.col + c], where r and c come
// from the thread's y and x grid coordinates.
__global__ void CalculateA(Matrix W, Matrix Data, Matrix A)
{
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int c = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads that fall outside the output do nothing.
    if (r >= W.row || c >= Data.col) {
        return;
    }

    float acc = 0.0;
    for (int k = 0; k < W.col; ++k) {
        acc += W.elements[r*W.col + k] * Data.elements[k*Data.col + c];
    }
    A.elements[r*A.col + c] = acc;
}
And calling the kernel:
// Multiplies W1 by data on the GPU with the CalculateA kernel and prints the
// first and last element of the result.
//
// W1, data: host matrices (row, col, elements). The product has W1.row rows
// and data.col columns.
// Every CUDA call is checked; on the first failure the remaining steps are
// skipped and the error text is written to stderr. All device buffers and the
// host result buffer are released before returning.
void myFunc(Matrix W1, Matrix data){
    Matrix d_W1, d_data, d_a2, a2;

    // Start from null pointers so the cleanup at the end is valid on every path.
    d_W1.elements = 0;
    d_data.elements = 0;
    d_a2.elements = 0;
    a2.elements = 0;

    a2.row = W1.row;       d_a2.row = a2.row;
    a2.col = data.col;     d_a2.col = a2.col;
    d_W1.row = W1.row;     d_W1.col = W1.col;
    d_data.col = data.col; d_data.row = data.row;

    // Widen before multiplying so element and byte counts are formed in size_t.
    const size_t a2Count   = static_cast<size_t>(a2.row) * a2.col;
    const size_t a2Bytes   = a2Count * sizeof(float);
    const size_t w1Bytes   = static_cast<size_t>(W1.row) * W1.col * sizeof(float);
    const size_t dataBytes = static_cast<size_t>(data.row) * data.col * sizeof(float);

    cudaError_t err = cudaMalloc(&d_a2.elements, a2Bytes);
    if (err == cudaSuccess)
        err = cudaMalloc(&d_W1.elements, w1Bytes);
    if (err == cudaSuccess)
        err = cudaMemcpy(d_W1.elements, W1.elements, w1Bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
        err = cudaMalloc(&d_data.elements, dataBytes);
    if (err == cudaSuccess)
        err = cudaMemcpy(d_data.elements, data.elements, dataBytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        dim3 dimGrid(data.col/32 + 1, W1.row/32 + 1, 1);
        dim3 dimBlock(32, 32, 1);
        CalculateA<<<dimGrid, dimBlock>>>(d_W1, d_data, d_a2);
        err = cudaGetLastError();   // reports a failed launch
    }

    if (err == cudaSuccess) {
        a2.elements = new float [a2Count];
        // This copy also returns errors raised while the kernel was running.
        err = cudaMemcpy(a2.elements, d_a2.elements, a2Bytes, cudaMemcpyDeviceToHost);
    }

    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error in myFunc: %s\n", cudaGetErrorString(err));
    } else if (a2Count > 0) {
        printf("\nA2 first and last member %f - %f\n",a2.elements[0],a2.elements[a2Count-1]);
    }

    // Release everything that was allocated; null pointers are accepted here.
    delete [] a2.elements;
    cudaFree(d_data.elements);
    cudaFree(d_W1.elements);
    cudaFree(d_a2.elements);
}
The difference in results is not small: for example, the first and last elements from the CUDA code are 0.011322 and -0.179534, but multiplying in MATLAB gives 0.4280 and 0.0056.
this is how I do it in MATLAB:
>> size(W1) ans = 200 784
>> size(data) ans = 784 60000
>> z2=W1*data;
>> size(z2) ans = 200 60000
>> z2 = z2(:);
>> z2(1) ans = 0.4280
>> z2(200*60000)ans = 0.0056
There is nothing wrong with the code you posted. If I expand your kernel and function into a complete running example like this:
#include <iostream>
// Matrix descriptor: the two dimensions plus a pointer to the element storage.
struct Matrix
{
    int row;
    int col;
    float *elements;

    // Element access, callable from host and device code. The element for
    // (r, c) is located at offset r*col + c in `elements`.
    __device__ __host__
    float& operator()(int r, int c)
    {
        const int offset = r*col + c;
        return elements[offset];
    }
};
// One thread per output element: the thread at grid position (outCol, outRow)
// accumulates W.elements[outRow*W.col + k] * Data.elements[k*Data.col + outCol]
// over k and stores the sum in A.elements[outRow*A.col + outCol].
__global__ void CalculateA(Matrix W, Matrix Data, Matrix A)
{
    const int outRow = blockIdx.y * blockDim.y + threadIdx.y;
    const int outCol = blockIdx.x * blockDim.x + threadIdx.x;
    const bool insideOutput = (outRow < W.row) && (outCol < Data.col);

    if (insideOutput) {
        float sum = 0.0;
        int k = 0;
        while (k < W.col) {
            sum += W.elements[outRow*W.col + k] * Data.elements[k*Data.col + outCol];
            ++k;
        }
        A.elements[outRow*A.col + outCol] = sum;
    }
}
// Multiplies W1 by data on the GPU with the CalculateA kernel and prints the
// result to std::cout: one output line per column index j, listing a2(i,j)
// for every row index i.
//
// Every CUDA call is checked; on the first failure the remaining steps are
// skipped and the error text is written to std::cerr. All device buffers and
// the host result buffer are released before returning.
void myFunc(Matrix W1, Matrix data)
{
    Matrix d_W1, d_data, d_a2, a2;

    // Start from null pointers so the cleanup at the end is valid on every path.
    d_W1.elements = 0;
    d_data.elements = 0;
    d_a2.elements = 0;
    a2.elements = 0;

    a2.row = W1.row;       d_a2.row = a2.row;
    a2.col = data.col;     d_a2.col = a2.col;
    d_W1.row = W1.row;     d_W1.col = W1.col;
    d_data.col = data.col; d_data.row = data.row;

    // Widen before multiplying so element and byte counts are formed in size_t.
    const size_t a2Count   = static_cast<size_t>(a2.row) * a2.col;
    const size_t a2Bytes   = a2Count * sizeof(float);
    const size_t w1Bytes   = static_cast<size_t>(W1.row) * W1.col * sizeof(float);
    const size_t dataBytes = static_cast<size_t>(data.row) * data.col * sizeof(float);

    cudaError_t err = cudaMalloc(&d_a2.elements, a2Bytes);
    if (err == cudaSuccess)
        err = cudaMalloc(&d_W1.elements, w1Bytes);
    if (err == cudaSuccess)
        err = cudaMemcpy(d_W1.elements, W1.elements, w1Bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
        err = cudaMalloc(&d_data.elements, dataBytes);
    if (err == cudaSuccess)
        err = cudaMemcpy(d_data.elements, data.elements, dataBytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        dim3 dimGrid(data.col/32 + 1, W1.row/32 + 1, 1);
        dim3 dimBlock(32, 32, 1);
        CalculateA<<<dimGrid, dimBlock>>>(d_W1, d_data, d_a2);
        err = cudaGetLastError();   // reports a failed launch
    }

    if (err == cudaSuccess) {
        a2.elements = new float [a2Count];
        // This copy also returns errors raised while the kernel was running.
        err = cudaMemcpy(a2.elements, d_a2.elements, a2Bytes, cudaMemcpyDeviceToHost);
    }

    if (err == cudaSuccess) {
        for(int j=0; j<a2.col; ++j) {
            for(int i=0; i<a2.row; ++i) {
                std::cout << a2(i,j) << " ";
            }
            std::cout << std::endl;
        }
    } else {
        std::cerr << "CUDA error in myFunc: " << cudaGetErrorString(err) << std::endl;
    }

    // Release everything that was allocated; null pointers are accepted here.
    delete [] a2.elements;
    cudaFree(d_data.elements);
    cudaFree(d_W1.elements);
    cudaFree(d_a2.elements);
}
// Runs myFunc on a 2x3 matrix W1 and a 3x2 matrix Data whose elements live
// in two local arrays.
int main(void)
{
    float w_values[6] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f };
    float d_values[6] = { 0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f};

    Matrix W1;
    W1.row = 2;
    W1.col = 3;
    W1.elements = w_values;

    Matrix Data;
    Data.row = 3;
    Data.col = 2;
    Data.elements = d_values;

    myFunc(W1, Data);
    return 0;
}
and run it, I get this:
>nvcc -arch=sm_21 -Xptxas="-v" -m32 matrix.cu
matrix.cu
tmpxft_000014f4_00000000-5_matrix.cudafe1.gpu
tmpxft_000014f4_00000000-10_matrix.cudafe2.gpu
matrix.cu
ptxas : info : 132 bytes gmem, 28 bytes cmem[14]
ptxas : info : Compiling entry function '_Z10CalculateA6MatrixS_S_' for 'sm_21'
ptxas : info : Function properties for _Z10CalculateA6MatrixS_S_
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas : info : Used 14 registers, 68 bytes cmem[0]
tmpxft_000014f4_00000000-5_matrix.cudafe1.cpp
tmpxft_000014f4_00000000-15_matrix.ii
>cuda-memcheck a.exe
========= CUDA-MEMCHECK
2.2 4.9
2.8 6.4
========= ERROR SUMMARY: 0 errors
which is the correct answer for the dot product assuming column major ordering (which is the Matlab convention).
So if your results don't agree, it is because of something you haven't shown us. One likelihood is that your test problem is so large (and kernel so inefficient) that if you are running this on a display GPU, your program is hitting the display driver watchdog timer limit and being killed before the kernel finishes running. Also note that you have no CUDA API error checking whatsoever, so it is possible that you are getting runtime errors which is either stopping your kernel from finishing or even running at all, but you simply don't notice because of the lack of error checking.

Perceptual (or average) image hashing

I need to calculate the perceptual hash of an image and should do it without using any external libraries.
I tried using pHash (http://phash.org/) but I wasn't able to compile it for iOS (5) and I haven't found a real tutorial on how to do it.
One (library-dependent) solution is to use the pHashing functionality added to ImageMagick in version 6.8.8.3, which has iOS binaries available. Usage examples are documented here.
Here's also a simple reference function (in C#) for generating your own comparable image average hash, found on this blog.
/// <summary>
/// Calculates a 64-bit hash of an image based on visual characteristics.
/// Described at http://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html
/// </summary>
/// <param name="theImage">Image to hash; it is drawn into an 8x8 bitmap.</param>
/// <returns>
/// One bit per pixel of the 8x8 image, first pixel in the highest bit: 1 when
/// the pixel's gray tone is equal to or above the average tone, 0 otherwise.
/// </returns>
public static ulong AverageHash(System.Drawing.Image theImage)
{
    byte[] grayScaleImage = new byte[64];
    uint averageValue = 0;
    ulong finalHash = 0;

    // Bitmap and Graphics hold native GDI+ resources, so both are disposed
    // deterministically instead of being left to the finalizer.
    using (Bitmap squeezedImage = new Bitmap(8, 8, PixelFormat.Format32bppRgb))
    {
        // Squeeze the image down to an 8x8 image.
        using (Graphics drawingArea = Graphics.FromImage(squeezedImage))
        {
            drawingArea.CompositingQuality = CompositingQuality.HighQuality;
            drawingArea.InterpolationMode = InterpolationMode.HighQualityBilinear;
            drawingArea.SmoothingMode = SmoothingMode.HighQuality;
            drawingArea.DrawImage(theImage, 0, 0, 8, 8);
        }

        // Reduce to 8-bit grayscale and accumulate the pixel values.
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                Color pixelColour = squeezedImage.GetPixel(x, y);
                uint grayTone = ((uint)((pixelColour.R * 0.3) + (pixelColour.G * 0.59) + (pixelColour.B * 0.11)));
                grayScaleImage[x + y * 8] = (byte)grayTone;
                averageValue += grayTone;
            }
        }
    }

    averageValue /= 64;

    // Return 1-bits when the tone is equal to or above the average,
    // and 0-bits when it's below the average.
    for (int k = 0; k < 64; k++)
    {
        if (grayScaleImage[k] >= averageValue)
        {
            finalHash |= (1UL << (63 - k));
        }
    }
    return finalHash;
}

imregionalmax matlab function's equivalent in opencv

I have an image of connected components(circles filled).If i want to segment them i can use watershed algorithm.I prefer writing my own function for watershed instead of using the inbuilt function in OPENCV.I have successfu How do i find the regionalmax of objects using opencv?
I wrote a function myself. My results were quite similar to MATLAB, although not exact. This function is implemented for CV_32F but it can easily be modified for other types.
I mark all the points that are not part of a minimum region by checking all the neighbors. The remaining regions are either minima, maxima or areas of inflection.
I use connected components to label each region.
I check each region for any point belonging to a maxima, if yes then I push that label into a vector.
Finally I sort the bad labels, erase all duplicates and then mark all the points in the output as not minima.
All that remains are the regions of minima.
Here is the code:
// output is a binary image
// 1: not a min region
// 0: part of a min region
// 2: not sure if min or not
// 3: uninitialized
void imregionalmin(cv::Mat& img, cv::Mat& out_img)
{
// pad the border of img with 1 and copy to img_pad
cv::Mat img_pad;
cv::copyMakeBorder(img, img_pad, 1, 1, 1, 1, IPL_BORDER_CONSTANT, 1);
// initialize binary output to 2, unknown if min
out_img = cv::Mat::ones(img.rows, img.cols, CV_8U)+2;
// initialize pointers to matrices
float* in = (float *)(img_pad.data);
uchar* out = (uchar *)(out_img.data);
// size of matrix
int in_size = img_pad.cols*img_pad.rows;
int out_size = img.cols*img.rows;
int x, y;
for (int i = 0; i < out_size; i++) {
// find x, y indexes
y = i % img.cols;
x = i / img.cols;
neighborCheck(in, out, i, x, y, img_pad.cols); // all regions are either min or max
}
cv::Mat label;
cv::connectedComponents(out_img, label);
int* lab = (int *)(label.data);
in = (float *)(img.data);
in_size = img.cols*img.rows;
std::vector<int> bad_labels;
for (int i = 0; i < out_size; i++) {
// find x, y indexes
y = i % img.cols;
x = i / img.cols;
if (lab[i] != 0) {
if (neighborCleanup(in, out, i, x, y, img.rows, img.cols) == 1) {
bad_labels.push_back(lab[i]);
}
}
}
std::sort(bad_labels.begin(), bad_labels.end());
bad_labels.erase(std::unique(bad_labels.begin(), bad_labels.end()), bad_labels.end());
for (int i = 0; i < out_size; ++i) {
if (lab[i] != 0) {
if (std::find(bad_labels.begin(), bad_labels.end(), lab[i]) != bad_labels.end()) {
out[i] = 0;
}
}
}
}
// Looks at the up-to-eight neighbours of pixel (x, y) that lie inside an
// x_lim-by-y_lim grid (flat index = x*y_lim + y). Returns 1 when some
// neighbour has the same value as in[i] and is marked 0 in `out`, otherwise 0.
int inline neighborCleanup(float* in, uchar* out, int i, int x, int y, int x_lim, int y_lim)
{
    for (int dx = -1; dx <= 1; ++dx) {
        const int nx = x + dx;
        if (nx < 0 || nx >= x_lim)
            continue;   // this whole line of neighbours is outside the grid
        for (int dy = -1; dy <= 1; ++dy) {
            const int ny = y + dy;
            const bool is_center = (dx == 0) && (dy == 0);
            if (is_center || ny < 0 || ny >= y_lim)
                continue;
            const int neighbor = nx*y_lim + ny;
            if ((out[neighbor] == 0) && (in[neighbor] == in[i]))
                return 1;
        }
    }
    return 0;
}
// Classifies output pixel i. `in` is indexed with a row stride of x_lim and
// an offset of one in both directions, so the pixel itself is at
// (x + 1)*x_lim + (y + 1) and its eight neighbours surround that position.
// out[i] becomes 0 when any neighbour is smaller than the pixel; otherwise a
// value of 3 is replaced by 1.
void inline neighborCheck(float* in, uchar* out, int i, int x, int y, int x_lim)
{
    const int center = (x + 1)*x_lim + (y + 1);
    bool has_smaller_neighbor = false;

    for (int dr = 0; dr <= 2 && !has_smaller_neighbor; ++dr) {
        for (int dc = 0; dc <= 2; ++dc) {
            if (dr == 1 && dc == 1)
                continue;   // skip the pixel itself
            if (in[(x + dr)*x_lim + y + dc] < in[center]) {
                has_smaller_neighbor = true;
                break;
            }
        }
    }

    if (has_smaller_neighbor)
        out[i] = 0;
    if (out[i] == 3)
        out[i] = 1;
}
The following listing is a function similar to Matlab's "imregionalmax". It looks for at most nLocMax local maxima above threshold, where the found local maxima are at least minDistBtwLocMax pixels apart. It returns the actual number of local maxima found. Notice that it uses OpenCV's minMaxLoc to find global maxima. It is "opencv-self-contained" except for the (easy to implement) function vdist, which computes the (euclidian) distance between points (r,c) and (row,col).
input is one-channel CV_32F matrix, and locations is nLocMax (rows) by 2 (columns) CV_32S matrix.
// Greedy peak picking: repeatedly takes the global maximum of a working copy
// of `input`, records it, and suppresses all pixels within minDistBtwLocMax
// of it, until nLocMax peaks were found or no value above `threshold` remains.
//
// input            : image read through at<float>, i.e. one-channel CV_32F
// nLocMax          : maximum number of peaks to report
// threshold        : a peak is accepted only when its value is > threshold
// minDistBtwLocMax : suppression radius around an accepted peak, as measured
//                    by vdist
// locations        : written through at<int>; row i receives (row, col) of
//                    the i-th peak, so it needs at least nLocMax rows and
//                    2 columns
// Returns the number of peaks written to `locations`.
int imregionalmax(Mat input, int nLocMax, float threshold, float minDistBtwLocMax, Mat locations)
{
    Mat scratch = input.clone();
    int nFoundLocMax = 0;
    for (int i = 0; i < nLocMax; i++) {
        Point location;
        double maxVal;
        minMaxLoc(scratch, NULL, &maxVal, NULL, &location);
        if (!(maxVal > threshold)) {
            break;  // nothing above the threshold is left
        }
        nFoundLocMax += 1;
        const int row = location.y;
        const int col = location.x;
        locations.at<int>(i,0) = row;
        locations.at<int>(i,1) = col;

        // Bounding box of the suppression disc, clipped to the image.
        const int r0 = (row-minDistBtwLocMax > -1 ? static_cast<int>(row-minDistBtwLocMax) : 0);
        const int r1 = (row+minDistBtwLocMax < scratch.rows ? static_cast<int>(row+minDistBtwLocMax) : scratch.rows-1);
        const int c0 = (col-minDistBtwLocMax > -1 ? static_cast<int>(col-minDistBtwLocMax) : 0);
        const int c1 = (col+minDistBtwLocMax < scratch.cols ? static_cast<int>(col+minDistBtwLocMax) : scratch.cols-1);
        for (int r = r0; r <= r1; r++) {
            for (int c = c0; c <= c1; c++) {
                if (vdist(Point2DMake(r, c),Point2DMake(row, col)) <= minDistBtwLocMax) {
                    // Suppress with `threshold` itself: that value can never
                    // pass the strict `maxVal > threshold` test again. Writing
                    // 0.0 here let suppressed pixels be reported as new peaks
                    // whenever the threshold was negative.
                    scratch.at<float>(r,c) = threshold;
                }
            }
        }
    }
    return nFoundLocMax;
}
I do not know if it is what you want, but in my answer to this post, I gave some code to find local maxima (peaks) in a grayscale image (resulting from distance transform).
The approach relies on subtracting the original image from the dilated image and finding the zero pixels.
I hope it helps,
Good luck
I had the same problem some time ago, and the solution was to reimplement the imregionalmax algorithm in OpenCV/Cpp. It is not that complicated, because you can find the C++ source code of the function in the Matlab distribution. (somewhere in toolbox). All you have to do is to read carefully and understand the algorithm described there. Then rewrite it or remove the matlab-specific checks and you'll have it.

iPhone FFT with Accelerate framework vDSP

I'm having difficulty implementing an FFT using vDSP. I understand the theory but am looking for a specific code example please.
I have data from a wav file as below:
Question 1. How do I put the audio data into the FFT?
Question 2. How do I get the output data out of the FFT?
Question 3. The ultimate goal is to check for low frequency sounds. How would I do this?
// Opens the audio file at inputURL, reads its data format into mASBD and its
// packet count into packetCount, then reads all packets into a newly
// allocated audioData buffer.
// Returns noErr on success, otherwise the OSStatus of the first call that
// failed (or -1 when the buffer could not be allocated).
-(OSStatus)open:(CFURLRef)inputURL{
    OSStatus result = AudioFileOpenURL (inputURL, kAudioFileReadPermission, 0, &mAudioFile);
    if (result != noErr) {
        return result;
    }

    //get format info
    UInt32 size = sizeof(mASBD);
    result = AudioFileGetProperty(mAudioFile, kAudioFilePropertyDataFormat, &size, &mASBD);
    if (result != noErr) {
        // Stop here; otherwise the next call would overwrite this error.
        return result;
    }

    UInt32 dataSize = sizeof packetCount;
    result = AudioFileGetProperty(mAudioFile, kAudioFilePropertyAudioDataPacketCount, &dataSize, &packetCount);
    if (result != noErr) {
        return result;
    }
    // The format string is passed to NSLog directly, and the value is cast so
    // the specifier is correct whatever integer type packetCount has.
    NSLog(@"File Opened, packet Count: %llu", (unsigned long long)packetCount);

    UInt32 packetsRead = packetCount;
    UInt32 numBytesRead = -1;
    if (packetCount > 0) {
        //allocate buffer
        // NOTE(review): assumes 2 bytes per packet (e.g. 16-bit mono) -- confirm against mASBD.
        audioData = (SInt16*)malloc( 2 *packetCount);
        if (audioData == NULL) {
            return -1;
        }
        //read the packets
        result = AudioFileReadPackets (mAudioFile, false, &numBytesRead, NULL, 0, &packetsRead, audioData);
        NSLog(@"Read %u bytes, %u packets", (unsigned int)numBytesRead, (unsigned int)packetsRead);
    }
    return result;
}
FFT code below:
/* Round-trip check of the real FFT: forward transform, inverse transform,
 * rescale, and compare with the original signal. The variables used here
 * (N, n, log2n, stride, nOver2, A, originalReal, obtainedReal, setupReal,
 * scale, i) are declared outside this fragment. */
/* N is used as the base-2 logarithm of the transform length n. */
log2n = N;
n = 1 << log2n;
stride = 1;
nOver2 = n / 2;
printf("1D real FFT of length log2 ( %d ) = %d\n\n", n, log2n);
/* Allocate memory for the input operands and check its availability,
* use the vector version to get 16-byte alignment. */
/* The split-complex buffer A holds nOver2 floats in each of realp and imagp;
 * the two real buffers hold n floats each. */
A.realp = (float *) malloc(nOver2 * sizeof(float));
A.imagp = (float *) malloc(nOver2 * sizeof(float));
originalReal = (float *) malloc(n * sizeof(float));
obtainedReal = (float *) malloc(n * sizeof(float));
/* NOTE(review): obtainedReal is not part of this NULL check, and exit(0)
 * reports success to the caller although the allocation failed. */
if (originalReal == NULL || A.realp == NULL || A.imagp == NULL) {
printf("\nmalloc failed to allocate memory for the real FFT"
"section of the sample.\n");
exit(0);
}
/* Generate an input signal in the real domain. */
/* The test signal is the ramp 1, 2, ..., n. */
for (i = 0; i < n; i++)
originalReal[i] = (float) (i + 1);
/* Look at the real signal as an interleaved complex vector by
* casting it. Then call the transformation function vDSP_ctoz to
* get a split complex vector, which for a real signal, divides into
* an even-odd configuration. */
vDSP_ctoz((COMPLEX *) originalReal, 2, &A, 1, nOver2);
/* Set up the required memory for the FFT routines and check its
* availability. */
setupReal = vDSP_create_fftsetup(log2n, FFT_RADIX2);
if (setupReal == NULL) {
printf("\nFFT_Setup failed to allocate enough memory for"
"the real FFT.\n");
exit(0);
}
/* Carry out a Forward and Inverse FFT transform. */
/* Both transforms operate in place on A. */
vDSP_fft_zrip(setupReal, &A, stride, log2n, FFT_FORWARD);
vDSP_fft_zrip(setupReal, &A, stride, log2n, FFT_INVERSE);
/* Verify correctness of the results, but first scale it by 2n. */
/* Both halves of A are multiplied by 1/(2n). */
scale = (float) 1.0 / (2 * n);
vDSP_vsmul(A.realp, 1, &scale, A.realp, 1, nOver2);
vDSP_vsmul(A.imagp, 1, &scale, A.imagp, 1, nOver2);
/* The output signal is now in a split real form. Use the function
* vDSP_ztoc to get a split real vector. */
vDSP_ztoc(&A, 1, (COMPLEX *) obtainedReal, 2, nOver2);
/* Check for accuracy by looking at the inverse transform results. */
Compare(originalReal, obtainedReal, n);
Thanks
You put your audio sample data into the real part of the input, and zero the imaginary part.
If you are just interested in the magnitude of each bin in the frequency domain then you calculate sqrt(re*re + im*im) for each output bin. If you're only interested in relative magnitude then you can drop the sqrt and just calculate the squared magnitude, (re*re + im*im).
You would look at the magnitudes of the bin or bins (see (2)) that correspond to your frequency or frequencies of interest. If your sample rate is Fs, and your FFT size is N, then the corresponding frequency for output bin i is given by f = i * Fs / N. Conversely if you are interested in a specific frequency f then the bin of interest, i, is given by i = N * f / Fs.
Additional note: you will need to apply a suitable window function (e.g. Hann aka Hanning) to your FFT input data, prior to calculating the FFT itself.
You can check Apple's documentation and take good care of data packing.
Here is my example:
// main.cpp
// FFTTest
//
// Created by Harry-Chris Stamatopoulos on 11/23/12.
//
/*
This is an example of a hilbert transformer using
Apple's VDSP fft/ifft & other VDSP calls.
Output signal has a PI/2 phase shift.
COMPLEX_SPLIT vector "B" was used to cross-check
real and imaginary parts coherence with the original vector "A"
that is obtained straight from the fft.
Tested and working.
Cheers!
*/
#include <iostream>
#include <Accelerate/Accelerate.h>
#define PI 3.14159265
#define DEBUG_PRINT 1
int main(int argc, const char * argv[])
{
float fs = 44100; //sample rate
float f0 = 440; //sine frequency
uint32_t i = 0;
uint32_t L = 1024;
/* vector allocations*/
float *input = new float [L];
float *output = new float[L];
float *mag = new float[L/2];
float *phase = new float[L/2];
for (i = 0 ; i < L; i++)
{
input[i] = cos(2*PI*f0*i/fs);
}
uint32_t log2n = log2f((float)L);
uint32_t n = 1 << log2n;
//printf("FFT LENGTH = %lu\n", n);
FFTSetup fftSetup;
COMPLEX_SPLIT A;
COMPLEX_SPLIT B;
A.realp = (float*) malloc(sizeof(float) * L/2);
A.imagp = (float*) malloc(sizeof(float) * L/2);
B.realp = (float*) malloc(sizeof(float) * L/2);
B.imagp = (float*) malloc(sizeof(float) * L/2);
fftSetup = vDSP_create_fftsetup(log2n, FFT_RADIX2);
/* Carry out a Forward and Inverse FFT transform. */
vDSP_ctoz((COMPLEX *) input, 2, &A, 1, L/2);
vDSP_fft_zrip(fftSetup, &A, 1, log2n, FFT_FORWARD);
mag[0] = sqrtf(A.realp[0]*A.realp[0]);
//get phase
vDSP_zvphas (&A, 1, phase, 1, L/2);
phase[0] = 0;
//get magnitude;
for(i = 1; i < L/2; i++){
mag[i] = sqrtf(A.realp[i]*A.realp[i] + A.imagp[i] * A.imagp[i]);
}
//after done with possible phase and mag processing re-pack the vectors in VDSP format
B.realp[0] = mag[0];
B.imagp[0] = mag[L/2 - 1];;
//unwrap, process & re-wrap phase
for(i = 1; i < L/2; i++){
phase[i] -= 2*PI*i * fs/L;
phase[i] -= PI / 2 ;
phase[i] += 2*PI*i * fs/L;
}
//construct real & imaginary part of the output packed vector (input to ifft)
for(i = 1; i < L/2; i++){
B.realp[i] = mag[i] * cosf(phase[i]);
B.imagp[i] = mag[i] * sinf(phase[i]);
}
#if DEBUG_PRINT
for (i = 0 ; i < L/2; i++)
{
printf("A REAL = %f \t A IMAG = %f \n", A.realp[i], A.imagp[i]);
printf("B REAL = %f \t B IMAG = %f \n", B.realp[i], B.imagp[i]);
}
#endif
//ifft
vDSP_fft_zrip(fftSetup, &B, 1, log2n, FFT_INVERSE);
//scale factor
float scale = (float) 1.0 / (2*L);
//scale values
vDSP_vsmul(B.realp, 1, &scale, B.realp, 1, L/2);
vDSP_vsmul(B.imagp, 1, &scale, B.imagp, 1, L/2);
//unpack B to real interleaved output
vDSP_ztoc(&B, 1, (COMPLEX *) output, 2, L/2);
// print output signal values to console
printf("Shifted signal x = \n");
for (i = 0 ; i < L/2; i++)
printf("%f\n", output[i]);
//release resources
free(input);
free(output);
free(A.realp);
free(A.imagp);
free(B.imagp);
free(B.realp);
free(mag);
free(phase);
}
One thing you need to be careful about is the DC component of the calculated FFT. I compared my results with the fftw library FFT and the imaginary part of the transform calculated with the vDSP library always had a different value at index 0 (which means 0 frequency, so DC).
Another measure I applied was to divide both real and imaginary parts by a factor of 2. I guess this is due to the algorithm used in the function. Also, both these problems occurred in the FFT process but not in the IFFT process.
I used vDSP_fft_zrip.