Recursively use of self-implemented cuIDFT.cu leads to changing output every time when re-runing the code - matlab

I have implemented a CUDA version of inverse discrete cosine transform (IDCT), by "translating" the MATLAB built-in function idct.m into CUDA:
My implementation is cuIDCT.cu, works when m = n and both m and n are even numbers.
cuIDCT.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cufft.h>
#include <cuComplex.h>
// round up n/m
inline int iDivUp(int n, int m)
{
return (n + m - 1) / m;
}
typedef cufftComplex complex;
#define PI 3.1415926535897932384626433832795028841971693993751
__global__
void idct_ComputeWeightsKernel(const int n, complex *ww)
{
const int pos = threadIdx.x + blockIdx.x * blockDim.x;
if (pos >= n) return;
ww[pos].x = sqrtf(2*n) * cosf(pos*PI/(2*n));
ww[pos].y = sqrtf(2*n) * sinf(pos*PI/(2*n));
}
__global__
void idct_ComputeEvenKernel(const float *b, const int n, const int m, complex *ww, complex *y)
{
const int ix = threadIdx.x + blockIdx.x * blockDim.x;
const int iy = threadIdx.y + blockIdx.y * blockDim.y;
if (ix >= n || iy >= m) return;
const int pos = ix + iy*n;
// Compute precorrection factor
ww[0].x = ww[0].x / sqrtf(2);
ww[0].y = ww[0].y / sqrtf(2);
y[iy + ix*m].x = ww[iy].x * b[pos];
y[iy + ix*m].y = ww[iy].y * b[pos];
}
__global__
void Reordering_a0_Kernel(complex *y, const int n, const int m, complex *yy)
{
const int ix = threadIdx.x + blockIdx.x * blockDim.x;
const int iy = threadIdx.y + blockIdx.y * blockDim.y;
if (ix >= n || iy >= m) return;
const int pos = ix + iy*n;
yy[iy + ix*n].x = y[pos].x / (float) n;
yy[iy + ix*n].y = y[pos].y / (float) n;
}
__global__
void Reordering_a_Kernel(complex *yy, const int n, const int m, float *a)
{
const int ix = threadIdx.x + blockIdx.x * blockDim.x;
const int iy = threadIdx.y + blockIdx.y * blockDim.y;
if (ix >= n || iy >= m) return;
const int pos = ix + iy*n;
// Re-order elements of each column according to equations (5.93) and (5.94) in Jain
if (iy < n/2)
{
a[ix + 2*iy*n] = yy[pos].x;
a[ix + (2*iy+1)*n] = yy[ix + (m-iy-1)*n].x;
}
}
/**
* a = idct(b), where a is of size [n m].
* #param b, input array
* #param n, first dimension of a
* #param m, second dimension of a
* #param a, output array
*/
void cuIDCT(float *h_in, int n, int m, float *h_out) // a is of size [n m]
{
const int data_size = n * m * sizeof(float);
// device memory allocation
float *d_in, *d_out;
cudaMalloc(&d_in, data_size);
cudaMalloc(&d_out, data_size);
// transfer data from host to device
cudaMemcpy(d_in, h_in, data_size, cudaMemcpyHostToDevice);
// compute IDCT using CUDA
// begin============================================
// Compute weights
complex *ww;
cudaMalloc(&ww, n*sizeof(complex));
dim3 threads(256);
dim3 blocks(iDivUp(n, threads.x));
idct_ComputeWeightsKernel<<<blocks, threads>>>(n, ww);
complex *y;
complex *yy;
cufftHandle plan;
dim3 threads1(32, 6);
dim3 blocks2(iDivUp(n, threads1.x), iDivUp(m, threads1.y)); // for even case
int Length[1] = {m}; // for each IFFT, the length is m
cudaMalloc(&y, n*m*sizeof(complex));
idct_ComputeEvenKernel<<<blocks2, threads1>>>(d_in, n, m, ww, y);
cufftPlanMany(&plan, 1, Length,
Length, 1, m,
Length, 1, m, CUFFT_C2C, n);
cufftExecC2C(plan, y, y, CUFFT_INVERSE); // y is of size [n m]
cudaMalloc(&yy, n*m*sizeof(complex));
Reordering_a0_Kernel<<<blocks2, threads1>>>(y, n, m, yy);
Reordering_a_Kernel<<<blocks2, threads1>>>(yy, n, m, d_out);
// end============================================
// transfer result from device to host
cudaMemcpy(h_out, d_out, data_size, cudaMemcpyDeviceToHost);
// cleanup
cufftDestroy(plan);
cudaFree(ww);
cudaFree(y);
cudaFree(yy);
cudaFree(d_in);
cudaFree(d_out);
}
Then I compared the result of my CUDA IDCT (i.e. cuIDCT.cu) against MATLAB idct.m using following code:
a test main.cpp function, and
a MATLAB main function main.m to read result from CUDA and compare it against MATLAB.
main.cpp
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <stdlib.h>
#include <stdio.h>
// N must equal to M, and both must be even numbers
#define N 256
#define M 256
void WriteDataFile(const char *name, int w, int h, const float *in, const float *out)
{
FILE *stream;
stream = fopen(name, "wb");
float data = 202021.25f;
fwrite(&data, sizeof(float), 1, stream);
fwrite(&w, sizeof(w), 1, stream);
fwrite(&h, sizeof(h), 1, stream);
for (int i = 0; i < h; i++)
for (int j = 0; j < w; j++)
{
const int pos = j + i * h;
fwrite(in + pos, sizeof(float), 1, stream);
fwrite(out + pos, sizeof(float), 1, stream);
}
fclose(stream);
}
void cuIDCT(float *b, int n, int m, float *a);
int main()
{
// host memory allocation
float *h_in = new float [N * M];
float *h_out = new float [N * M];
float *h_temp = new float [N * M];
// input data initialization
for (int i = 0; i < N * M; i++)
{
h_in[i] = (float)rand()/(float)RAND_MAX;
h_out[i] = h_in[i];
h_temp[i] = h_in[i];
}
// please comment either one case for testing
// test case 1: use cuIDCT.cu once
// cuIDCT(h_in, N, M, h_out);
// test case 2: iteratively use cuIDCT.cu
for (int i = 0; i < 4; i++)
{
if (i % 2 == 0)
cuIDCT(h_out, N, M, h_temp);
else
cuIDCT(h_temp, N, M, h_out);
}
// write data, for further visualization using MATLAB
WriteDataFile("test.flo", N, M, h_in, h_out);
// cleanup
delete [] h_in;
delete [] h_out;
delete [] h_temp;
cudaDeviceReset();
}
main.m
clc;clear;
% read
[h_in, h_out] = read_data('test.flo');
% MATLAB result, for test case 1, comment the for-loop
matlab_out = h_in;
for i = 1:4
matlab_out = idct(matlab_out);
end
% compare
err = matlab_out - h_out;
% show
figure(1);
subplot(221); imshow(h_in, []); title('h\_in'); colorbar
subplot(222); imshow(h_out, []); title('h\_out'); colorbar
subplot(223); imshow(matlab_out, []); title('matlab\_out'); colorbar
subplot(224); imshow(err, []); title('error map'); colorbar
disp(['maximum error between CUDA and MATLAB is ' ...
num2str(max(max(abs(err))))])
I ran the code on Visual Studio 11 (i.e. VS2012) in Windows 7 with Nvidia GPU Tesla K20c, using CUDA Toolkit version 7.5, and my MATLAB version is R2015b.
My test steps:
For test case 1. Un-comment test case 1 and comment test case 2.
Run main.cpp.
Run main.m in MATLAB.
Repeat step 1 and step 2 (without any change, just re-run the code).
I repeated step 3 for 20 times. The output result is unchanged, and results in main.m are:
results of test case 1
The maximum error is 7.7152e-07.
For test case 2. Un-comment test case 2 and comment test case 1.
Run main.cpp.
Run main.m in MATLAB.
Repeat step 1 and step 2 (without any change, just re-run the code).
I repeated step 3 for 20 times. The output result is changed, and results in main.m are (not enough reputation to put all images, only wrong case is shown below):
one situation (the wrong one) of test case 2
The maximum error is 0.45341 (2 times), 0.44898 (1 time), 0.26186 (1 time), 0.26301 (1 time), and 9.5716e-07 (15 times).
From the test results, my conclusion is:
From test case 1: cuIDCT.cu is numerically correct (error ~10^-7) to idct.m.
From test case 2: recursively use of cuIDCT.cu leads to unstable result (i.e. the output changes every time when re-run the code and may sometimes be numerically wrong, error ~0.1)
My question:
From test case 1 we know cuIDCT.cu is numerically correct to idct.m. But why recursiviely use of cuIDCT.cu leads to different output result each time when re-run the code?
Any helps or suggestions are highly appreciated.

I believe the variability in your results is coming about due to this code in your idct_ComputeEvenKernel:
// Compute precorrection factor
ww[0].x = ww[0].x / sqrtf(2);
ww[0].y = ww[0].y / sqrtf(2);
It's not entirely clear what your intent is here, but it's doubtful that this code could be doing what you want. You may be confused about the CUDA execution model.
The above code will be executed by every CUDA thread that you launch for that kernel that passes the thread check:
if (ix >= n || iy >= m) return;
I believe this means 65536 threads will all execute this code in that kernel. Furthermore, the threads will execute that code in more-or-less any order (not all CUDA threads execute in lock-step). They may even step on each other as they are trying to write out their values to the location ww[0]. So the final result in ww[0] will be quite unpredictable.
When I comment out those lines of code, the results become stable for me (albeit different from what they were with those lines in place), unchanging from run to run.
I'd like to point something else out. Wherever you are calculating the .x and .y values of a complex quantity, my suggestion would be to rework the code from this (for example):
y[iy + ix*m].x = ww[iy].x * b[pos];
y[iy + ix*m].y = ww[iy].y * b[pos];
to this:
complex temp1, temp2;
temp1 = ww[iy];
temp2.x = temp1.x * b[pos];
temp2.y = temp2.y * b[pos];
y[iy + ix*m] = temp2;
At least according to my testing, the compiler doesn't seem to be making this optimization for you, and one side-effect benefit is that it's much easier to test your code with cuda-memcheck --tool initcheck .... In the first realization, the compiler will load y[iy + ix*m] as an 8 byte quantity, modify either 4 or 8 bytes of it, then store y[iy + ix*m] as an 8 byte quantity. The second realization should be more efficient (it eliminates the load of y[]), and eliminates the load of an uninitialized quantity (y[]), which the cuda-memcheck tool will report as a hazard.
This variability I'm describing should be possible whether you run either the 1-pass version of your code or the 4-pass version of your code. Therefore I think your statements about the 1-pass version being correct are suspect. I think if you run the 1-pass version enough, you will eventually see variability (although it may require varying initial memory conditions, or running on different GPU types). Even in your own results, we see that 15 out of 20 runs of the 4 pass code produce "correct" results, i.e. the residual error is ~1e-7
Here's my modified cuIDCT.cu file, modified from the version you posted here. The assumption I'm making below is that you wanted to compute the scaling on ww[0] only once, in which case we can easily handle that arithmetic as an addendum to the previous idct_ComputeWeightsKernel:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cufft.h>
#include <cuComplex.h>
#include <helper_cuda.h>
#include "assert.h"
// round up n/m
inline int iDivUp(int n, int m)
{
return (n + m - 1) / m;
}
typedef cufftComplex complex;
#define PI 3.1415926535897932384626433832795028841971693993751
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
if( CUFFT_SUCCESS != err) {
fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
__global__
void idct_ComputeWeightsKernel(const int n, complex *ww)
{
const int pos = threadIdx.x + blockIdx.x * blockDim.x;
if (pos >= n) return;
complex temp;
temp.x = sqrtf(2*n) * cosf(pos*PI/(2*n));
temp.y = sqrtf(2*n) * sinf(pos*PI/(2*n));
if (pos == 0) {
temp.x /= sqrtf(2);
temp.y /= sqrtf(2);}
ww[pos] = temp;
}
__global__
void idct_ComputeEvenKernel(const float *b, const int n, const int m, complex *ww, complex *y)
{
const int ix = threadIdx.x + blockIdx.x * blockDim.x;
const int iy = threadIdx.y + blockIdx.y * blockDim.y;
if (ix >= n || iy >= m) return;
const int pos = ix + iy*n;
/* handle this in idct_ComputeWeightsKernel
// Compute precorrection factor
ww[0].x = ww[0].x / sqrtf(2);
ww[0].y = ww[0].y / sqrtf(2);
*/
complex temp1, temp2;
temp1 = ww[iy];
temp2.x = temp1.x * b[pos];
temp2.y = temp1.y * b[pos];
y[iy + ix*m] = temp2;
}
__global__
void Reordering_a0_Kernel(complex *y, const int n, const int m, complex *yy)
{
const int ix = threadIdx.x + blockIdx.x * blockDim.x;
const int iy = threadIdx.y + blockIdx.y * blockDim.y;
if (ix >= n || iy >= m) return;
const int pos = ix + iy*n;
complex temp1, temp2;
temp1 = y[pos];
temp2.x = temp1.x / (float) n;
temp2.y = temp1.y / (float) n;
yy[iy + ix*n] = temp2;
}
__global__
void Reordering_a_Kernel(complex *yy, const int n, const int m, float *a)
{
const int ix = threadIdx.x + blockIdx.x * blockDim.x;
const int iy = threadIdx.y + blockIdx.y * blockDim.y;
if (ix >= n || iy >= m) return;
const int pos = ix + iy*n;
// Re-order elements of each column according to equations (5.93) and (5.94) in Jain
if (iy < n/2)
{
a[ix + 2*iy*n] = yy[pos].x;
a[ix + (2*iy+1)*n] = yy[ix + (m-iy-1)*n].x;
}
}
/**
* a = idct(b), where a is of size [n m].
* #param b, input array
* #param n, first dimension of a
* #param m, second dimension of a
* #param a, output array
*/
void cuIDCT(float *h_in, int n, int m, float *h_out) // a is of size [n m]
{
const int data_size = n * m * sizeof(float);
// device memory allocation
float *d_in, *d_out;
checkCudaErrors(cudaMalloc(&d_in, data_size));
checkCudaErrors(cudaMalloc(&d_out, data_size));
// transfer data from host to device
checkCudaErrors(cudaMemcpy(d_in, h_in, data_size, cudaMemcpyHostToDevice));
// compute IDCT using CUDA
// begin============================================
// Compute weights
complex *ww;
checkCudaErrors(cudaMalloc(&ww, n*sizeof(complex)));
dim3 threads(256);
dim3 blocks(iDivUp(n, threads.x));
idct_ComputeWeightsKernel<<<blocks, threads>>>(n, ww);
complex *y;
complex *yy;
cufftHandle plan;
dim3 threads1(32, 6);
dim3 blocks2(iDivUp(n, threads1.x), iDivUp(m, threads1.y)); // for even case
int Length[1] = {m}; // for each IFFT, the length is m
checkCudaErrors(cudaMalloc(&y, n*m*sizeof(complex)));
idct_ComputeEvenKernel<<<blocks2, threads1>>>(d_in, n, m, ww, y);
cufftSafeCall(cufftPlanMany(&plan, 1, Length,
Length, 1, m,
Length, 1, m, CUFFT_C2C, n));
cufftSafeCall(cufftExecC2C(plan, y, y, CUFFT_INVERSE)); // y is of size [n m]
checkCudaErrors(cudaMalloc(&yy, n*m*sizeof(complex)));
Reordering_a0_Kernel<<<blocks2, threads1>>>(y, n, m, yy);
cudaMemset(d_out, 0, data_size);
Reordering_a_Kernel<<<blocks2, threads1>>>(yy, n, m, d_out);
// end============================================
// transfer result from device to host
checkCudaErrors(cudaMemcpy(h_out, d_out, data_size, cudaMemcpyDeviceToHost));
// cleanup
cufftDestroy(plan);
checkCudaErrors(cudaFree(ww));
checkCudaErrors(cudaFree(y));
checkCudaErrors(cudaFree(yy));
checkCudaErrors(cudaFree(d_in));
checkCudaErrors(cudaFree(d_out));
}
You'll note I threw an extra cudaMemset on d_out in there, because it helped me clean up an issue with cuda-memcheck --tool initcheck .... It shouldn't be necessary, you can delete it if you want.

Related

Log2 approximation in fixed-point

I'v already implemented fixed-point log2 function using lookup table and low-order polynomial approximation but not quite happy with accuracy across the entire 32-bit fixed-point range [-1,+1). The input format is s0.31 and the output format is s15.16.
I'm posting this question here so that another user can post his answer (some comments were exchanged in another thread but they prefer to provide comprehensive answer in a separate thread). Any other answers are welcome, I would much appreciate if you could provide some speed vs accuracy details of your algorithm and its implementation.
Thanks.
By simply counting the leading zero bits in a fixed-point number x, one can determine log2(x) to the closest strictly smaller integer. On many processor architectures, there is a "count leading zeros" machine instruction or intrinsic. Where this is not available, a fairly efficient implementation of clz() can be constructed in a variety of ways, one of which is included in the code below.
To compute the fractional part of the logarithm, the two main obvious contenders are interpolation in a table and minimax polynomial approximation. In this specific case, quadratic interpolation in a fairly small table seems to be the more attractive option. x = 2i * (1+f), with 0 ≤ f < 1. We determine i as described above and use the leading bits of f to index into the table. A parabola is fit through this and two following table entries, computing the parameters of the parabola on the fly. The result is rounded, and a heuristic adjustment is applied to partially compensate for the truncating nature of fixed-point arithmetic. Finally, the integer portion is added, yielding the final result.
It should be noted that the computation involves right shifts of signed integers which may be negative. We need those right shifts to map to arithmetic right shifts at machine code level, something which is not guaranteed by the ISO-C standard. However, in practice most compilers do what is desired. In this case I used the Intel compiler on an x64 platform running Windows.
With a 66-entry table of 32-bit words, the maximum absolute error can be reduced to 8.18251e-6, so full s15.16 accuracy is achieved.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#define FRAC_BITS_OUT (16)
#define INT_BITS_OUT (15)
#define FRAC_BITS_IN (31)
#define INT_BITS_IN ( 0)
/* count leading zeros: intrinsic or machine instruction on many architectures */
int32_t clz (uint32_t x)
{
uint32_t n, y;
n = 31 + (!x);
if ((y = (x & 0xffff0000U))) { n -= 16; x = y; }
if ((y = (x & 0xff00ff00U))) { n -= 8; x = y; }
if ((y = (x & 0xf0f0f0f0U))) { n -= 4; x = y; }
if ((y = (x & 0xccccccccU))) { n -= 2; x = y; }
if (( (x & 0xaaaaaaaaU))) { n -= 1; }
return n;
}
#define LOG2_TBL_SIZE (6)
#define TBL_SIZE ((1 << LOG2_TBL_SIZE) + 2)
/* for i = [0,65]: log2(1 + i/64) * (1 << 31) */
const uint32_t log2Tab [TBL_SIZE] =
{
0x00000000, 0x02dcf2d1, 0x05aeb4dd, 0x08759c50,
0x0b31fb7d, 0x0de42120, 0x108c588d, 0x132ae9e2,
0x15c01a3a, 0x184c2bd0, 0x1acf5e2e, 0x1d49ee4c,
0x1fbc16b9, 0x22260fb6, 0x24880f56, 0x26e2499d,
0x2934f098, 0x2b803474, 0x2dc4439b, 0x30014ac6,
0x32377512, 0x3466ec15, 0x368fd7ee, 0x38b25f5a,
0x3acea7c0, 0x3ce4d544, 0x3ef50ad2, 0x40ff6a2e,
0x43041403, 0x450327eb, 0x46fcc47a, 0x48f10751,
0x4ae00d1d, 0x4cc9f1ab, 0x4eaecfeb, 0x508ec1fa,
0x5269e12f, 0x5440461c, 0x5612089a, 0x57df3fd0,
0x59a80239, 0x5b6c65aa, 0x5d2c7f59, 0x5ee863e5,
0x60a02757, 0x6253dd2c, 0x64039858, 0x65af6b4b,
0x675767f5, 0x68fb9fce, 0x6a9c23d6, 0x6c39049b,
0x6dd2523d, 0x6f681c73, 0x70fa728c, 0x72896373,
0x7414fdb5, 0x759d4f81, 0x772266ad, 0x78a450b8,
0x7a231ace, 0x7b9ed1c7, 0x7d17822f, 0x7e8d3846,
0x80000000, 0x816fe50b
};
#define RND_SHIFT (31 - FRAC_BITS_OUT)
#define RND_CONST ((1 << RND_SHIFT) / 2)
#define RND_ADJUST (0x10d) /* established heuristically */
/*
compute log2(x) in s15.16 format, where x is in s0.31 format
maximum absolute error 8.18251e-6 # 0x20352845 (0.251622232)
*/
int32_t fixed_log2 (int32_t x)
{
int32_t f1, f2, dx, a, b, approx, lz, i, idx;
uint32_t t;
/* x = 2**i * (1 + f), 0 <= f < 1. Find i */
lz = clz (x);
i = INT_BITS_IN - lz;
/* normalize f */
t = (uint32_t)x << (lz + 1);
/* index table of log2 values using LOG2_TBL_SIZE msbs of fraction */
idx = t >> (32 - LOG2_TBL_SIZE);
/* difference between argument and smallest sampling point */
dx = t - (idx << (32 - LOG2_TBL_SIZE));
/* fit parabola through closest three sampling points; find coeffs a, b */
f1 = (log2Tab[idx+1] - log2Tab[idx]);
f2 = (log2Tab[idx+2] - log2Tab[idx]);
a = f2 - (f1 << 1);
b = (f1 << 1) - a;
/* find function value for argument by computing ((a*dx+b)*dx) */
approx = (int32_t)((((int64_t)a)*dx) >> (32 - LOG2_TBL_SIZE)) + b;
approx = (int32_t)((((int64_t)approx)*dx) >> (32 - LOG2_TBL_SIZE + 1));
approx = log2Tab[idx] + approx;
/* round fractional part of result */
approx = (((uint32_t)approx) + RND_CONST + RND_ADJUST) >> RND_SHIFT;
/* combine integer and fractional parts of result */
return (i << FRAC_BITS_OUT) + approx;
}
/* convert from s15.16 fixed point to double-precision floating point */
double fixed_to_float_s15_16 (int32_t a)
{
return a / 65536.0;
}
/* convert from s0.31 fixed point to double-precision floating point */
double fixed_to_float_s0_31 (int32_t a)
{
return a / (65536.0 * 32768.0);
}
int main (void)
{
double a, res, ref, err, maxerr = 0.0;
int32_t x, start, end;
start = 0x00000001;
end = 0x7fffffff;
printf ("testing fixed_log2 with inputs in [%17.10e, %17.10e)\n",
fixed_to_float_s0_31 (start), fixed_to_float_s0_31 (end));
for (x = start; x < end; x++) {
a = fixed_to_float_s0_31 (x);
ref = log2 (a);
res = fixed_to_float_s15_16 (fixed_log2 (x));
err = fabs (res - ref);
if (err > maxerr) {
maxerr = err;
}
}
printf ("max. err = %g\n", maxerr);
return EXIT_SUCCESS;
}
For completeness, I am showing the minimax polynomial approximation below. The coefficients for such approximations can be generated by several tools such as Maple, Mathematica, Sollya or with homebrew code using the Remez algorithm, which is what I used here. The code below shows the original floating-point coefficients, the dynamic scaling used to maximize accuracy in intermediate computation, and the heuristic adjustments applied to mitigate the impact of non-rounding fixed-point arithmetic.
A typical approach for computation of log2(x) is to use x = 2i * (1+f) and use approximation of log2(1+f) for (1+f) in [√½, √2], which means that we use a polynomial p(f) on the primary approximation interval [√½-1, √2-1].
The intermediate computation scales up operands as far as feasible for improved accuracy under the restriction that we want to use a 32-bit mulhi operation as its basic building block, as this is a native instruction on many 32-bit architectures, accessible either via inline machine code or as an intrinsic. As in the table-based code, there are right shifts of signed data which may be negative, and such right shifts must map to arithmetic right shifts, something that ISO-C doesn't guarantee but most C compilers do.
I managed to get the maximum absolute error for this variant down to 1.11288e-5, so almost full s15.16 accuracy but slightly worse than for the table-based variant. I suspect I should have added one additional term to the polynomial.
/* on 32-bit architectures, there is often an instruction/intrinsic for this */
int32_t mulhi (int32_t a, int32_t b)
{
return (int32_t)(((int64_t)a * (int64_t)b) >> 32);
}
#define RND_SHIFT (25 - FRAC_BITS_OUT)
#define RND_CONST ((1 << RND_SHIFT) / 2)
#define RND_ADJUST (-2) /* established heuristically */
/*
compute log2(x) in s15.16 format, where x is in s0.31 format
maximum absolute error 1.11288e-5 # 0x5a82689f (0.707104757)
*/
int32_t fixed_log2 (int32_t x)
{
int32_t lz, i, f, p, approx;
uint32_t t;
/* x = 2**i * (1 + f), 0 <= f < 1. Find i */
lz = clz (x);
i = INT_BITS_IN - lz;
/* force (1+f) into range [sqrt(0.5), sqrt(2)] */
t = (uint32_t)x << lz;
if (t > (uint32_t)(1.414213562 * (1U << 31))) {
i++;
t = t >> 1;
}
/* compute log2(1+f) for f in [-0.2929, 0.4142] */
f = t - (1U << 31);
p = + (int32_t)(-0.206191055 * (1U << 31) - 1);
p = mulhi (p, f) + (int32_t)( 0.318199910 * (1U << 30) - 18);
p = mulhi (p, f) + (int32_t)(-0.366491705 * (1U << 29) + 22);
p = mulhi (p, f) + (int32_t)( 0.479811855 * (1U << 28) - 2);
p = mulhi (p, f) + (int32_t)(-0.721206390 * (1U << 27) + 37);
p = mulhi (p, f) + (int32_t)( 0.442701618 * (1U << 26) + 35);
p = mulhi (p, f) + (f >> (31 - 25));
/* round fractional part of the result */
approx = (p + RND_CONST + RND_ADJUST) >> RND_SHIFT;
/* combine integer and fractional parts of result */
return (i << FRAC_BITS_OUT) + approx;
}

Determine if matrix A is subset of matrix B

For a matrix such as
A = [...
12 34 67;
90 78 15;
10 71 24];
how could we determine efficiently if it is subset of a larger matrix?
B = [...
12 34 67; % found
89 67 45;
90 78 15; % found
10 71 24; % found, so A is subset of B.
54 34 11];
Here are conditions:
all numbers are integers
matrices are so large, i.e., row# > 100000, column# may vary from 1 to 10 (same for A and B).
Edit:
It seems that ismember for the case of this question, when called only few times works just fine. My initial impression was due to previous experiences where ismember was being invoked many times inside a nested loop resulting in the worst performance.
clear all; clc
n = 200000;
k = 10;
B = randi(n,n,k);
f = randperm(n);
A = B(f(1:1000),:);
tic
assert(sum(ismember(A,B,'rows')) == size(A,1));
toc
tic
assert(all(any(all(bsxfun(#eq,B,permute(A,[3,2,1])),2),1))); %user2999345
toc
which results in:
Elapsed time is 1.088552 seconds.
Elapsed time is 12.154969 seconds.
Here are more benchmarks:
clear all; clc
n = 20000;
f = randperm(n);
k = 10;
t1 = 0;
t2 = 0;
t3 = 0;
for i=1:7
B = randi(n,n,k);
A = B(f(1:n/10),:);
%A(100,2) = 0; % to make A not submat of B
tic
b = sum(ismember(A,B,'rows')) == size(A,1);
t1 = t1+toc;
assert(b);
tic
b = ismember_mex(A,sortrows(B));
t2 = t2+toc;
assert(b);
tic
b = issubmat(A,B);
t3 = t3+toc;
assert(b);
end
George's skm's
ismember | ismember_mex | issubmat
n=20000,k=10 0.6326 0.1064 11.6899
n=1000,k=100 0.2652 0.0155 0.0577
n=1000,k=1000 1.1705 0.1582 0.2202
n=1000,k=10000 13.2470 2.0033 2.6367
*issubmat eats RAM when n or k is over 10000!
*issubmat(A,B), A is being checked as submat of B.
It seems that ismember is hard to beat, at least using MATLAB code. I created a C implementation which can be used using the MEX compiler.
#include "mex.h"
#if MX_API_VER < 0x07030000
typedef int mwIndex;
typedef int mwSize;
#endif /* MX_API_VER */
#include <math.h>
#include <stdlib.h>
#include <string.h>
int ismember(const double *y, const double *x, int yrow, int xrow, int ncol);
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[])
{
mwSize xcol, ycol, xrow, yrow;
/* output data */
int* result;
/* arguments */
const mxArray* y;
const mxArray* x;
if (nrhs != 2)
{
mexErrMsgTxt("2 input required.");
}
y = prhs[0];
x = prhs[1];
ycol = mxGetN(y);
yrow = mxGetM(y);
xcol = mxGetN(x);
xrow = mxGetM(x);
/* The first input must be a sparse matrix. */
if (!mxIsDouble(y) || !mxIsDouble(x))
{
mexErrMsgTxt("Input must be of type 'double'.");
}
if (xcol != ycol)
{
mexErrMsgTxt("Inputs must have the same number of columns");
}
plhs[0] = mxCreateLogicalMatrix(1, 1);
result = mxGetPr(plhs[0]);
*result = ismember(mxGetPr(y), mxGetPr(x), yrow, xrow, ycol);
}
int ismemberinner(const double *y, int idx, const double *x, int yrow, int xrow, int ncol) {
int from, to, i;
from = 0;
to = xrow-1;
for(i = 0; i < ncol; ++i) {
// Perform binary search
double yi = *(y + i * yrow + idx);
double *curx = x + i * xrow;
int l = from;
int u = to;
while(l <= u) {
int mididx = l + (u-l)/2;
if(yi < curx[mididx]) {
u = mididx-1;
}
else if(yi > curx[mididx]) {
l = mididx+1;
}
else {
// This can be further optimized by performing additional binary searches
for(from = mididx; from > l && curx[from-1] == yi; --from);
for(to = mididx; to < u && curx[to+1] == yi; ++to);
break;
}
}
if(l > u) {
return 0;
}
}
return 1;
}
int ismember(const double *y, const double *x, int yrow, int xrow, int ncol) {
int i;
for(i = 0; i < yrow; ++i) {
if(!ismemberinner(y, i, x, yrow, xrow, ncol)) {
return 0;
}
}
return 1;
}
Compile it using:
mex -O ismember_mex.c
It can be called as follows:
ismember_mex(x, sortrows(x))
First of all, it assumes that the columns of the matrices have the same size. It works by first sorting the rows of the larger matrix (x in this case, the second argument to the function). Then, a type of binary search is employed to identify whether the rows of the smaller matrix (y hereafter) are contained in x. This is done for each row of y separately (see ismember C function).
For a given row of y, it starts from the first entry and finds the range of indices (using the from and to variables) that match with the first column of x using binary search. This is repeated for the remaining entries, unless some value is not found, in which case it terminates and returns 0.
I tried implementing it this idea in MATLAB, but it didn't work that well. Regarding performance, I found that: (a) in case there are mismatches, it is usually much faster than ismember (b) in case the range of values in x and y is large, it is again faster than ismember, and (c) in case everything matches and the number of possible values in x and y is small (e.g. less than 1000), then ismember may be faster in some situations.
Finally, I want to point out that some parts of the C implementation may be further optimized.
EDIT 1
I fixed the warnings and further improved the function.
#include "mex.h"
#include <math.h>
#include <stdlib.h>
#include <string.h>
int ismember(const double *y, const double *x, unsigned int nrowy, unsigned int nrowx, unsigned int ncol);
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[])
{
unsigned int xcol, ycol, nrowx, nrowy;
/* arguments */
const mxArray* y;
const mxArray* x;
if (nrhs != 2)
{
mexErrMsgTxt("2 inputs required.");
}
y = prhs[0];
x = prhs[1];
ycol = (unsigned int) mxGetN(y);
nrowy = (unsigned int) mxGetM(y);
xcol = (unsigned int) mxGetN(x);
nrowx = (unsigned int) mxGetM(x);
/* The first input must be a sparse matrix. */
if (!mxIsDouble(y) || !mxIsDouble(x))
{
mexErrMsgTxt("Input must be of type 'double'.");
}
if (xcol != ycol)
{
mexErrMsgTxt("Inputs must have the same number of columns");
}
plhs[0] = mxCreateLogicalScalar(ismember(mxGetPr(y), mxGetPr(x), nrowy, nrowx, ycol));
}
int ismemberinner(const double *y, const double *x, unsigned int nrowy, unsigned int nrowx, unsigned int ncol) {
unsigned int from = 0, to = nrowx-1, i;
for(i = 0; i < ncol; ++i) {
// Perform binary search
const double yi = *(y + i * nrowy);
const double *curx = x + i * nrowx;
unsigned int l = from;
unsigned int u = to;
while(l <= u) {
const unsigned int mididx = l + (u-l)/2;
const double midx = curx[mididx];
if(yi < midx) {
u = mididx-1;
}
else if(yi > midx) {
l = mididx+1;
}
else {
{
// Binary search to identify smallest index of x that equals yi
// Equivalent to for(from = mididx; from > l && curx[from-1] == yi; --from)
unsigned int limit = mididx;
while(curx[from] != yi) {
const unsigned int mididx = from + (limit-from)/2;
if(curx[mididx] < yi) {
from = mididx+1;
}
else {
limit = mididx-1;
}
}
}
{
// Binary search to identify largest index of x that equals yi
// Equivalent to for(to = mididx; to < u && curx[to+1] == yi; ++to);
unsigned int limit = mididx;
while(curx[to] != yi) {
const unsigned int mididx = limit + (to-limit)/2;
if(curx[mididx] > yi) {
to = mididx-1;
}
else {
limit = mididx+1;
}
}
}
break;
}
}
if(l > u) {
return 0;
}
}
return 1;
}
int ismember(const double *y, const double *x, unsigned int nrowy, unsigned int nrowx, unsigned int ncol) {
unsigned int i;
for(i = 0; i < nrowy; ++i) {
if(!ismemberinner(y + i, x, nrowy, nrowx, ncol)) {
return 0;
}
}
return 1;
}
Using this version I wasn't able to identify any case where ismember is faster. Also, I noticed that one reason ismember is hard to beat is that it uses all cores of the machine! Of course, the function I provided can be optimized to do this too, but this requires much more effort.
Finally, before using my implementation I would advise you to do extensive testing. I did some testing and it seems to work, but I suggest you also do some additional testing.
For small matrices ismember should be enough, probably.
Usage: ismember(B,A,'rows')
ans =
1
0
1
1
0
I put this answer here, emphasizing on a need to solutions with higher performance. I will accept this answer only if there was no better solution.
Using ismember, if a row of A appears twice in B while another one is missing, might wrongly indicate that A is a member of B. The following solution is suitable if the rows of A and B doesn't need to be in the same order. However, I haven't tested its performance for large matrices.
A = [...
34 12 67;
90 78 15;
10 71 24];
B = [...
34 12 67; % found
89 67 45;
90 78 15; % found
10 71 24; % found, so A is subset of B.
54 34 11];
A = permute(A,[3 2 1]);
rowIdx = all(bsxfun(#eq,B,A),2);
colIdx = any(rowIdx,1);
isAMemberB = all(colIdx);
You have said number of columns <= 10. In addition, if the matrix elements are all integers representable as bytes, you could code each row into a two 64 bit integers. That would reduce the number of comparisons by a factor of 64.
For the general case, the following may not be all that much better for thin matrices, but scales very well as the matrices get fat due to the level 3 multiplication:
function yes = is_submat(A,B)
ma = size(A, 1);
mb = size(B, 1);
n = size(B, 2);
yes = false;
if ma >= mb
a = A(:,1);
b = B(:,1);
D = (0 == bsxfun(#minus, a, b'));
q = any(D, 2);
yes = all(any(D,1));
if yes && (n > 1)
A = A(q, :);
C = B*A';
za = sum(A.*A, 2);
zb = sum(B.*B, 2);
Z = sqrt(zb)*sqrt(za');
[~, ix] = max(C./Z, [], 2);
A = A(ix,:);
yes = all(A(:) == B(:));
end
end
end
In the above, I use the fact that the dot product is maximized when two unit vectors are equal.
For fat matrices (say 5000+ columns) with large numbers of unique elements the performance beats ismember quite handily, but otherwise, it is slower than ismember. For thin matrices ismember is faster by an order of magnitude.
Best case test for this function:
A = randi(50000, [10000, 10000]);
B = A(2:3:end, :);
B = B(randperm(size(B,1)),:);
fprintf('%s: %u\n', 'Number of columns', size(A,2));
fprintf('%s: %u\n', 'Element spread', 50000);
tic; is_submat(A,B); toc;
tic; all(ismember(B,A,'rows')); toc;
fprintf('________\n\n');
is_submat_test;
Number of columns: 10000
Element spread: 50000
Elapsed time is 10.713310 seconds (is_submat).
Elapsed time is 17.446682 seconds (ismember).
So I have to admit, all round ismember seems to be much better.
Edits: Edited to correct bug when there is only one column - fixing this also results in more efficient code. Also previous version did not distinguish between positive and negative numbers. Added timing tests.

MATLAB produces different result than CUBLAS + Kernel

I have the following MATLAB code :
[N, d] = size(X); % data size and dimensions
R = rand(d,dt); % Form a random matrix with elements in [0,1]
% Random projection
Y = X * R;
w=720; % hashing step
b = w * rand(dt,1);
% Compute the hash codes of the data
binId = floor( bsxfun(#plus, Y, b') / w);
and I tried to make it parallel using CUBLAS and a Kernel as follows :
__global__ void compute(const int N,const int dt,const int w,const float *old, int *newt){
int col = blockDim.y * blockIdx.y + threadIdx.y;
int row = blockDim.x * blockIdx.x + threadIdx.x;
int id = row+N*col;
if(row<N && col<dt){
newt[id]=(floor)(old[id]/w);
}
}
void gpu_blas_mmul(cublasHandle_t handle, const float *A, const float *B, float *C, const int m, const int k, const int n, const float bet) {
int lda=m,ldb=k,ldc=m;
const float alf = 1.0;
const float *alpha = &alf;
const float *beta = &bet;
// Do the actual multiplication and addition
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
}
float *d_R, *d_RX, *d_B_row;
int *d_H;
thrust::device_vector<float> d_X(h_X, h_X + N * d);
cudaMalloc(&d_R,d * dt * sizeof(float));
cudaMemcpy(d_R,h_R,d * dt * sizeof(float),cudaMemcpyHostToDevice);
cudaMalloc(&d_B_row,dt * sizeof(float));
cudaMemcpy(d_B_row,h_B_row,dt * sizeof(float),cudaMemcpyHostToDevice);
cudaMalloc(&d_RX,N * dt * sizeof(float));
cudaMalloc(&d_H,N * dt * sizeof(int));
//-------------------------CuBLAS-----------------------
cublasHandle_t handle;
cublasCreate(&handle);
thrust::device_vector<float> d_B_col(N, w);
gpu_blas_mmul(handle, thrust::raw_pointer_cast(&d_B_col[0]), d_B_row, d_RX, N, 1, dt,0.0);
gpu_blas_mmul(handle, thrust::raw_pointer_cast(&d_X[0]), d_R, d_RX, N, d, dt, 1.0);
cublasDestroy(handle);
//-----------------------Kernel----------------------------
dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE,1);
int linGrid1 = (int)ceil(N/(float)BLOCK_SIZE);
int linGrid2 = (int)ceil(dt/(float)BLOCK_SIZE);
dim3 gridSize(linGrid1,linGrid2,1);
compute<<<gridSize, blockSize>>>(N, dt, w, d_RX, d_H);
In h_X, h_R and h_B_row I have saved (in column-major order) X, R and b produced by MATLAB. The dataset I am using is ANN_SIFT1M from http://corpus-texmex.irisa.fr/
For about 10000 values the results produced are exactly the same, but when I try with 50000 values for example there are some differences which become more and more as the number of values increases.
Any idea about what I am doing wrong?
Your MATLAB code uses double point precision so the result is more accurate. In contrast to that, CUDA kernel you provided uses single point precision, type float, and therefore produces less accurate result. And as usually when facing single vs. double point precision issue, the problem only gets worse once you start increasing the size of your input data.
Solution would be to use type double instead of float.

finding local mean in an image using mex-cuda

I have an image named HSIImage, of size is 565x585, in which I have find the local mean and standard deviation at every pixel. For this I am using a window W of size 9x9, if we a re finding the mean of x(i,j) we need values in the W where x(i,j) is at its center.
For working on the corner and edge pixels, I am padding the HSIImage and naming it as HSIImage2.
MATLAB code
[m,n,~] = size(HSIImage);
HSIImage2=padarray(HSIImage,[4,4],'symmetric');
mean1 = zeros(m,n);
sd = zeros(m,n);
phi_x=zeros(m,n);
for i=5:m+4
for j=5:n+4
mean1(i-4,j-4) = mean( mean(HSIImage2(i-4:i+4, j-4:j+4, 3) )); %sum / (4*4);
sd(i-4,j-4) = std( std(HSIImage2(i-4:i+4, j-4:j+4, 3), 1));
end
end
[phi_x2,mean2,sd2] = getPhi(HSIImage(:,:,3)',HSIImage2(:,:,3)',m,n);
Serial mean displayed as image.
My cuda code for finding mean and sd is
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
sum= sum + d_HSIImage2[i*col+j];
}
}
d_mean[(Y-4)*col+X-4] = sum/81;
double mean = sum/81;
sum = 0;
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
int index = i*col+j;
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
}
d_std[(Y-4)*col+X-4] = sqrt(sum/81);
}
void mexFunction( int nlhs, mxArray *plhs[],int nrhs, const mxArray *prhs[])
{
double* HSIImage;
double* d_HSIImage;
double* HSIImage2;
double* d_HSIImage2;
double row;
double col;
double* phi_x;
double* d_phi_x;
double* mean2;
double* d_mean;
double* d_std;
double* sd2;
HSIImage = (double*)mxGetPr(prhs[0]);
HSIImage2 = (double*)mxGetPr(prhs[1]);
row = mxGetScalar(prhs[2]);
col = mxGetScalar(prhs[3]);
plhs[0] = mxCreateDoubleMatrix(row,col,mxREAL);
phi_x = mxGetPr(plhs[0]);
plhs[1] = mxCreateDoubleMatrix(row,col,mxREAL);
mean2 = mxGetPr(plhs[1]);
plhs[2] = mxCreateDoubleMatrix(row,col,mxREAL);
sd2 = mxGetPr(plhs[2]);
dim3 grid(((col+8)/TILE_WIDTH)+1,((row+8)/TILE_WIDTH)+1,1);
dim3 block(TILE_WIDTH,TILE_WIDTH,1);
if ( cudaMalloc(&d_HSIImage,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_HSIImage2,sizeof(double)*(row+8)*(col+8))!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_phi_x,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_mean,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_std,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
cudaMemcpy(d_HSIImage,HSIImage,sizeof(double)*row*col,cudaMemcpyHostToDevice);
cudaMemcpy(d_HSIImage2,HSIImage2,sizeof(double)*(row+8)*(col+8),cudaMemcpyHostToDevice);
phi <<< grid,block >>> (d_HSIImage,d_HSIImage2,row,col,d_phi_x,d_mean,d_std);
cudaMemcpy(phi_x,d_phi_x,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaMemcpy(mean2,d_mean,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaMemcpy(sd2,d_std,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaFree(d_HSIImage);
cudaFree(d_HSIImage2);
cudaFree(d_phi_x);
}
its working fine when image is full of ones. but when I give regular image, there is lot of difference in serial(MATLAB) and parallel(CUDA) outputs(When mean1 and mean2 are compared). Please tell me the error.
I am launching with
dim3 grid(((col+8)/TILE_WIDTH)+1,((row+8)/TILE_WIDTH)+1,1);
dim3 block(TILE_WIDTH,TILE_WIDTH,1);
TILEWIDTH is 32. row=565, col=584.
Parallel mean displayed as image
It is important to note Matlab's c api is column-major ordered, however as mentioned in the comments OP has made sure of the consistency. The problem is that the stride used to access the data did not include the pads of the image. Going from one row to another requires a stride of col+8 (8 being padding of 4 on each side.
changing
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
sum= sum + d_HSIImage2[i*col+j];
}
}
d_mean[(Y-4)*col+X-4] = sum/81;
double mean = sum/81;
sum = 0;
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
int index = i*col+j;
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
}
d_std[(Y-4)*col+X-4] = sqrt(sum/81);
}
to
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
sum= sum + d_HSIImage2[i*(col+8)+j];
}
}
d_mean[(Y-4)*col+X-4] = sum/81;
double mean = sum/81;
sum = 0;
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
int index = i*(col+8)+j;
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
}
d_std[(Y-4)*col+X-4] = sqrt(sum/81);
}
Should work, however, I have included a compilable example that I validated on a small sample, that should be easy to expand.
It is not optimized, but that wasn't part of your question. Optimization using shared memory would give a large performance boost.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cuda.h>
using namespace std;
__global__ void phi(double *img, int row, int col, double *d_mean){
int X=blockDim.x*blockIdx.x+threadIdx.x+4;
int Y=blockDim.y*blockIdx.y+threadIdx.y+4;
double sum = 0;
if(Y<row+4 && X<col+4){
for(int i=-4; i<=4; ++i){
for(int j=-4; j<=4; ++j){
sum+=img[ (Y+j)*(col+8)+X+i];
}
}
sum/=81;
d_mean[(Y-4)*col+X-4]=sum;
}
}
int main(int argc, char * argv[]) {
int width=10, height=10;
double *h_img=new double[(width+8)*(height+8)];
for(int i=0; i<height+8; i++){
for(int j=0; j<width+8; j++){
h_img[i*(width+8)+j]=0.0;
}
}
for(int i=0; i<height; i++){
for(int j=0; j<width; j++){
int index = (i+4)*(width+8)+j+4;
h_img[index]=i*width+j;
}
}
for(int i=0; i<height+8; i++){
for(int j=0; j<width+8; j++){
cout<<h_img[i*(width+8)+j]<<" ";
}cout<<endl;
}
double *d_img;
size_t size=sizeof(double)*(height+8)*(width*8);
cudaMalloc(&d_img, size);
cudaMemcpy(d_img, h_img, size, cudaMemcpyHostToDevice);
size = sizeof(double)*height*width;
double *d_avg;
cudaMalloc(&d_avg, size);
dim3 block(32, 32, 1);
dim3 grid(width/32+1, height/32+1, 1);
phi<<<grid, block>>>(d_img, height, width, d_avg);
cudaDeviceSynchronize();
double *h_avg=new double[width*height];
cudaMemcpy(h_avg, d_avg, size, cudaMemcpyDeviceToHost);
for(int i=0; i<height; i++){
for(int j=0; j<width; j++){
cout<<h_avg[i*width+j]<<" ";
}cout<<endl;
}
return 0;
}
Here's my 2 cents regarding local mean and local std.
You should check whether using matlab's optimized built-in functions (conv2 and stdfilt , with their gpu support) gives you better performance than a "simple" mex version. For example, to take the local mean, the fastest will be to use conv2 as follows:
local_mean_image=conv2(image,normalized_window,'same');
where in your case normalized_window=ones(9)./9^2;
For local std use stdfilt :
local_std_image = stdfilt(image, ones(9));
Both options are available for faster GPU performance, I use conv2 with Jacket routinely, and I saw the stdfilt supports gpuarray variables.
By observing the answers of #Christian Sarofeen and of #bla, I made some changes to my code and now I am able to find the mean exactly same as MATLAB. I posting this thinking that some one may use it in future(I am sending the image as is from MATLAB). Still finding standard deviation is little problem.
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
int index = (X-4)*row+Y-4;
for(i=-4;i<=4;i++){
for(j=-4;j<=4;j++){
sum= sum + d_HSIImage2[(X+j)*(row+8)+(Y+i)];
}
}
d_mean[index] = sum/81;
double mean = 0;
double temp_std[9] = {0} ;
for(j=-4;j<=4;j++){
sum = 0;
for(i=-4;i<=4;i++){
sum = sum + d_HSIImage2[(X+j)*(row+8)+(Y+i)];//vector mean
}
mean = sum/9;
sum =0 ;
for(i=-4;i<=4;i++){
int index = (X+j)*(row+8)+(Y+i);
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
temp_std[j+4] = (sqrt(sum/9));//vector std
}
sum =0 ;
for(j=-4;j<=4;j++){
sum = sum + temp_std[j+4];//mean of vectors
}
mean = sum/9;
sum = 0 ;
for(j=-4;j<=4;j++){
sum = sum + (temp_std[j+4]-mean) * (temp_std[j+4]-mean);
}
d_std[index] = sqrt(sum);//std of vectors
d_phi_x[index] = 1.0/(1.0+exp((d_mean[index]-d_HSIImage[index])/d_std[index]));
}
}

Cuda matrix multiplication results differs from MATLAB

Its been two days and I am still cant figure it out why my implementation of CUDA matrix multiplication differs from the results produced in MATLAB.
CUDA kernel: A(200x60000) = W(200x784) * Data(784x6000)
__global__ void CalculateA(Matrix W, Matrix Data, Matrix A)
{
int Row = blockIdx.y * blockDim.y + threadIdx.y;
int Col = blockIdx.x * blockDim.x + threadIdx.x;
if ((Row < W.row) && (Col < Data.col)){
float Cvalue = 0.0;
for (int i = 0; i < W.col; ++i){
Cvalue += W.elements[Row*W.col+i] * Data.elements[i*Data.col+Col];
}
A.elements[Row*A.col+Col] = Cvalue;
}
}
And calling the kernel:
void myFunc(Matrix W1, Matrix data){
Matrix d_W1, d_data, d_a2, a2;
size_t size;
a2.row = W1.row; d_a2.row = a2.row;
a2.col = data.col; d_a2.col = a2.col;
size = a2.col*a2.row*sizeof(float);
cudaMalloc(&d_a2.elements,size);
d_W1.row = W1.row; d_W1.col = W1.col;
size = W1.col*W1.row*sizeof(float);
cudaMalloc(&d_W1.elements,size);
cudaMemcpy(d_W1.elements,W1.elements,size,cudaMemcpyHostToDevice);
d_data.col = data.col; d_data.row = data.row;
size = data.row*data.col*sizeof(float);
cudaMalloc(&d_data.elements,size);
cudaMemcpy(d_data.elements,data.elements,size,cudaMemcpyHostToDevice);
dim3 dimGrid(data.col/32 + 1, W1.row/32 + 1, 1);
dim3 dimBlock(32, 32, 1);
CalculateA<<<dimGrid, dimBlock>>>(d_W1, d_data, d_a2);
a2.elements = new float [a2.row*a2.col];
cudaMemcpy(a2.elements,d_a2.elements,sizeof(float)*a2.row*a2.col,cudaMemcpyDeviceToHost);
printf("\nA2 first and last member %f - %f\n",a2.elements[0],a2.elements[a2.row*a2.col-1]);
}
Results difference is not low for example first and last elements of CUDA code is 0.011322 and -0.179534 but multiplying in MATLAB results in 0.4280 and 0.0056.
this is how I do it in MATLAB:
>> size(W1) ans = 200 784
>> size(data) ans = 784 60000
>> z2=W1*data;
>> size(z2) ans = 200 60000
>> z2 = z2(:);
>> z2(1) ans = 0.4280
>> z2(200*60000)ans = 0.0056
There is nothing wrong with the code you posted. If I expand your kernel and function into a complete running example like this:
#include <iostream>
struct Matrix
{
int row;
int col;
float *elements;
__device__ __host__
float& operator()(int r, int c) { return elements[r*col + c]; };
};
__global__ void CalculateA(Matrix W, Matrix Data, Matrix A)
{
int Row = blockIdx.y * blockDim.y + threadIdx.y;
int Col = blockIdx.x * blockDim.x + threadIdx.x;
if ((Row < W.row) && (Col < Data.col)){
float Cvalue = 0.0;
for (int i = 0; i < W.col; ++i){
Cvalue += W.elements[Row*W.col+i] * Data.elements[i*Data.col+Col];
}
A.elements[Row*A.col+Col] = Cvalue;
}
}
void myFunc(Matrix W1, Matrix data)
{
Matrix d_W1, d_data, d_a2, a2;
size_t size;
a2.row = W1.row; d_a2.row = a2.row;
a2.col = data.col; d_a2.col = a2.col;
size = a2.col*a2.row*sizeof(float);
cudaMalloc(&d_a2.elements,size);
d_W1.row = W1.row; d_W1.col = W1.col;
size = W1.col*W1.row*sizeof(float);
cudaMalloc(&d_W1.elements,size);
cudaMemcpy(d_W1.elements,W1.elements,size,cudaMemcpyHostToDevice);
d_data.col = data.col; d_data.row = data.row;
size = data.row*data.col*sizeof(float);
cudaMalloc(&d_data.elements,size);
cudaMemcpy(d_data.elements,data.elements,size,cudaMemcpyHostToDevice);
dim3 dimGrid(data.col/32 + 1, W1.row/32 + 1, 1);
dim3 dimBlock(32, 32, 1);
CalculateA<<<dimGrid, dimBlock>>>(d_W1, d_data, d_a2);
a2.elements = new float [a2.row*a2.col];
cudaMemcpy(a2.elements,d_a2.elements,sizeof(float)*a2.row*a2.col,cudaMemcpyDeviceToHost);
for(int j=0; j<a2.col; ++j) {
for(int i=0; i<a2.row; ++i) {
std::cout << a2(i,j) << " ";
}
std::cout << std::endl;
}
}
int main(void)
{
float a[6] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f };
float b[6] = { 0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f};
Matrix W1; W1.row=2; W1.col=3; W1.elements = &a[0];
Matrix Data; Data.row=3; Data.col=2; Data.elements = &b[0];
myFunc(W1, Data);
return 0;
}
and run it, I get this:
>nvcc -arch=sm_21 -Xptxas="-v" -m32 matrix.cu
matrix.cu
tmpxft_000014f4_00000000-5_matrix.cudafe1.gpu
tmpxft_000014f4_00000000-10_matrix.cudafe2.gpu
matrix.cu
ptxas : info : 132 bytes gmem, 28 bytes cmem[14]
ptxas : info : Compiling entry function '_Z10CalculateA6MatrixS_S_' for 'sm_21'
ptxas : info : Function properties for _Z10CalculateA6MatrixS_S_
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas : info : Used 14 registers, 68 bytes cmem[0]
tmpxft_000014f4_00000000-5_matrix.cudafe1.cpp
tmpxft_000014f4_00000000-15_matrix.ii
>cuda-memcheck a.exe
========= CUDA-MEMCHECK
2.2 4.9
2.8 6.4
========= ERROR SUMMARY: 0 errors
which is the correct answer for the dot product assuming column major ordering (which is the Matlab convention).
So if your results don't agree, it is because of something you haven't shown us. One likelihood is that your test problem is so large (and kernel so inefficient) that if you are running this on a display GPU, your program is hitting the display driver watchdog timer limit and being killed before the kernel finishes running. Also note that you have no CUDA API error checking whatsoever, so it is possible that you are getting runtime errors which is either stopping your kernel from finishing or even running at all, but you simply don't notice because of the lack of error checking.