Making a screenshot using Xlib and Cairo libs [fail] - png

I'm trying to make a screenshot using Xlib and Cairo, however I'm not sure to do it the good way, "stride" is really confusing me.
Here's the code :
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <cairo.h>
#include <X11/Xlib.h>
#include <X11/Xutil.h>
int main(int argc, char** argv) {
int x, y;
Display *disp;
Window root;
XWindowAttributes watts;
XImage *image;
cairo_surface_t *surface;
unsigned int width;
unsigned int height;
int stride;
disp = XOpenDisplay(NULL);
root = DefaultRootWindow(disp);
XGetWindowAttributes(disp, root, &watts);
width = watts.width;
height = watts.height;
image = XGetImage(disp, root, watts.x, watts.y, width, height, AllPlanes, ZPixmap);
stride = cairo_format_stride_for_width(CAIRO_FORMAT_RGB24, width);
unsigned char *data = malloc(width * height * 3);
for (y = 0; y < height; ++y)
for (x = 0; x < width; ++x) {
unsigned long pixel = XGetPixel(image, x, y);
unsigned char red = (image->red_mask & pixel);
unsigned char green = (image->green_mask & pixel) >> 8;
unsigned char blue = (image->blue_mask & pixel) >> 16;
data[(y * width + x) * 3] = red;
data[(y * width + x) * 3 + 1] = green;
data[(y * width + x) * 3 + 2] = blue;
}
surface = cairo_image_surface_create_for_data(
data,
CAIRO_FORMAT_RGB24,
width, height,
stride);
cairo_surface_write_to_png(
surface,
"test.png");
cairo_surface_destroy(surface);
free(data);
return (EXIT_SUCCESS);
}
When I compile and run the program, everything seems to work just fine. However here's the resulting image :
quite a mess right ?..
What am I possibly doing wrong ?

Instead of doing all this complicated magic, let cairo do it for you:
#include <cairo.h>
#include <cairo-xlib.h>
#include <X11/Xlib.h>
int main(int argc, char** argv) {
Display *disp;
Window root;
cairo_surface_t *surface;
int scr;
disp = XOpenDisplay(NULL);
scr = DefaultScreen(disp);
root = DefaultRootWindow(disp);
surface = cairo_xlib_surface_create(disp, root, DefaultVisual(disp, scr),
DisplayWidth(disp, scr), DisplayHeight(disp, scr));
cairo_surface_write_to_png(
surface,
"test.png");
cairo_surface_destroy(surface);
return 0;
}

TFM:
CAIRO_FORMAT_RGB24
each pixel is a 32-bit quantity, with the upper 8 bits unused
TFM:
stride = cairo_format_stride_for_width (format, width);
data = malloc (stride * height);
Hence, the correct index calculation is
data[y * stride + x * 4 + 0] = blue;
data[y * stride + x * 4 + 1] = green;
data[y * stride + x * 4 + 2] = red; /* yes, in this order */
Also, masks are taken from the image and shifts are hard-coded, which makes absolutely no sense. Calculate the shifts from the masks.

Related

How to access the results of a Magma Routine

I am trying to access the results of the eigenvalue decomposition of a general real matrix, using the magma_sgeev routine. My code is as follows -
#include <cstdlib>
#include <iostream>
#include <map>
#include <typeindex>
#include <typeinfo>
#include <magma_v2.h>
#include <random>
#define N 10
#define LDA N
#define LDVL N
#define LDVR N
/* Main program */
int main()
{
magma_init();
/* Locals */
int n = N, lda = LDA, ldvl = LDVL, ldvr = LDVR, info, lwork;
float wkopt;
float *work;
/* Local arrays */
float wr[N], wi[N], vl[LDVL * N], vr[LDVR * N];
float *a = (float *)malloc(LDA * N * sizeof(float));
for (int i = 0; i < LDA * N; i++)
a[i] = rand();
printf("Done populating matrix\n");
lwork = -1;
magma_sgeev_m(MagmaNoVec, MagmaNoVec, n, a, lda, wr, wi, vl, ldvl, vr, ldvr,
work, lwork, &info);
lwork = (int)work[0];
work = (float *)malloc(lwork * sizeof(float));
printf("%d\n", lwork);
magma_sgeev_m(MagmaNoVec, MagmaNoVec, n, a, lda, wr, wi, vl, ldvl, vr, ldvr,
work, lwork, &info);
for (int i = 0; i < N; i++)
printf("%f ", wr[i]);
std::cout << std::endl;
for (int i = 0; i < N; i++)
printf("%f ", wi[i]);
printf("\n");
if (info > 0)
{
printf("The algorithm failed to compute eigenvalues.\n");
exit(1);
}
exit(0);
magma_finalize();
}
As the documentation says, the arrays wr and wi hold the results of the computation. However, there are two issues with this code,
When I try to access the wr and wi, I get a segmentation fault. I am not aware of the inner workings of the magma library, what am I missing, and how do I fix it ?
When I increase the size of the matrix, for example from 5x5 to 10x10. I get a segmentation fault in the first call of magma_sgeev itself. What am I missing
TIA

Recursively use of self-implemented cuIDFT.cu leads to changing output every time when re-runing the code

I have implemented a CUDA version of inverse discrete cosine transform (IDCT), by "translating" the MATLAB built-in function idct.m into CUDA:
My implementation is cuIDCT.cu, works when m = n and both m and n are even numbers.
cuIDCT.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cufft.h>
#include <cuComplex.h>
// round up n/m
inline int iDivUp(int n, int m)
{
return (n + m - 1) / m;
}
typedef cufftComplex complex;
#define PI 3.1415926535897932384626433832795028841971693993751
__global__
void idct_ComputeWeightsKernel(const int n, complex *ww)
{
const int pos = threadIdx.x + blockIdx.x * blockDim.x;
if (pos >= n) return;
ww[pos].x = sqrtf(2*n) * cosf(pos*PI/(2*n));
ww[pos].y = sqrtf(2*n) * sinf(pos*PI/(2*n));
}
__global__
void idct_ComputeEvenKernel(const float *b, const int n, const int m, complex *ww, complex *y)
{
const int ix = threadIdx.x + blockIdx.x * blockDim.x;
const int iy = threadIdx.y + blockIdx.y * blockDim.y;
if (ix >= n || iy >= m) return;
const int pos = ix + iy*n;
// Compute precorrection factor
ww[0].x = ww[0].x / sqrtf(2);
ww[0].y = ww[0].y / sqrtf(2);
y[iy + ix*m].x = ww[iy].x * b[pos];
y[iy + ix*m].y = ww[iy].y * b[pos];
}
__global__
void Reordering_a0_Kernel(complex *y, const int n, const int m, complex *yy)
{
const int ix = threadIdx.x + blockIdx.x * blockDim.x;
const int iy = threadIdx.y + blockIdx.y * blockDim.y;
if (ix >= n || iy >= m) return;
const int pos = ix + iy*n;
yy[iy + ix*n].x = y[pos].x / (float) n;
yy[iy + ix*n].y = y[pos].y / (float) n;
}
__global__
void Reordering_a_Kernel(complex *yy, const int n, const int m, float *a)
{
const int ix = threadIdx.x + blockIdx.x * blockDim.x;
const int iy = threadIdx.y + blockIdx.y * blockDim.y;
if (ix >= n || iy >= m) return;
const int pos = ix + iy*n;
// Re-order elements of each column according to equations (5.93) and (5.94) in Jain
if (iy < n/2)
{
a[ix + 2*iy*n] = yy[pos].x;
a[ix + (2*iy+1)*n] = yy[ix + (m-iy-1)*n].x;
}
}
/**
* a = idct(b), where a is of size [n m].
* #param b, input array
* #param n, first dimension of a
* #param m, second dimension of a
* #param a, output array
*/
void cuIDCT(float *h_in, int n, int m, float *h_out) // a is of size [n m]
{
const int data_size = n * m * sizeof(float);
// device memory allocation
float *d_in, *d_out;
cudaMalloc(&d_in, data_size);
cudaMalloc(&d_out, data_size);
// transfer data from host to device
cudaMemcpy(d_in, h_in, data_size, cudaMemcpyHostToDevice);
// compute IDCT using CUDA
// begin============================================
// Compute weights
complex *ww;
cudaMalloc(&ww, n*sizeof(complex));
dim3 threads(256);
dim3 blocks(iDivUp(n, threads.x));
idct_ComputeWeightsKernel<<<blocks, threads>>>(n, ww);
complex *y;
complex *yy;
cufftHandle plan;
dim3 threads1(32, 6);
dim3 blocks2(iDivUp(n, threads1.x), iDivUp(m, threads1.y)); // for even case
int Length[1] = {m}; // for each IFFT, the length is m
cudaMalloc(&y, n*m*sizeof(complex));
idct_ComputeEvenKernel<<<blocks2, threads1>>>(d_in, n, m, ww, y);
cufftPlanMany(&plan, 1, Length,
Length, 1, m,
Length, 1, m, CUFFT_C2C, n);
cufftExecC2C(plan, y, y, CUFFT_INVERSE); // y is of size [n m]
cudaMalloc(&yy, n*m*sizeof(complex));
Reordering_a0_Kernel<<<blocks2, threads1>>>(y, n, m, yy);
Reordering_a_Kernel<<<blocks2, threads1>>>(yy, n, m, d_out);
// end============================================
// transfer result from device to host
cudaMemcpy(h_out, d_out, data_size, cudaMemcpyDeviceToHost);
// cleanup
cufftDestroy(plan);
cudaFree(ww);
cudaFree(y);
cudaFree(yy);
cudaFree(d_in);
cudaFree(d_out);
}
Then I compared the result of my CUDA IDCT (i.e. cuIDCT.cu) against MATLAB idct.m using following code:
a test main.cpp function, and
a MATLAB main function main.m to read result from CUDA and compare it against MATLAB.
main.cpp
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <stdlib.h>
#include <stdio.h>
// N must equal to M, and both must be even numbers
#define N 256
#define M 256
void WriteDataFile(const char *name, int w, int h, const float *in, const float *out)
{
FILE *stream;
stream = fopen(name, "wb");
float data = 202021.25f;
fwrite(&data, sizeof(float), 1, stream);
fwrite(&w, sizeof(w), 1, stream);
fwrite(&h, sizeof(h), 1, stream);
for (int i = 0; i < h; i++)
for (int j = 0; j < w; j++)
{
const int pos = j + i * h;
fwrite(in + pos, sizeof(float), 1, stream);
fwrite(out + pos, sizeof(float), 1, stream);
}
fclose(stream);
}
void cuIDCT(float *b, int n, int m, float *a);
int main()
{
// host memory allocation
float *h_in = new float [N * M];
float *h_out = new float [N * M];
float *h_temp = new float [N * M];
// input data initialization
for (int i = 0; i < N * M; i++)
{
h_in[i] = (float)rand()/(float)RAND_MAX;
h_out[i] = h_in[i];
h_temp[i] = h_in[i];
}
// please comment either one case for testing
// test case 1: use cuIDCT.cu once
// cuIDCT(h_in, N, M, h_out);
// test case 2: iteratively use cuIDCT.cu
for (int i = 0; i < 4; i++)
{
if (i % 2 == 0)
cuIDCT(h_out, N, M, h_temp);
else
cuIDCT(h_temp, N, M, h_out);
}
// write data, for further visualization using MATLAB
WriteDataFile("test.flo", N, M, h_in, h_out);
// cleanup
delete [] h_in;
delete [] h_out;
delete [] h_temp;
cudaDeviceReset();
}
main.m
clc;clear;
% read
[h_in, h_out] = read_data('test.flo');
% MATLAB result, for test case 1, comment the for-loop
matlab_out = h_in;
for i = 1:4
matlab_out = idct(matlab_out);
end
% compare
err = matlab_out - h_out;
% show
figure(1);
subplot(221); imshow(h_in, []); title('h\_in'); colorbar
subplot(222); imshow(h_out, []); title('h\_out'); colorbar
subplot(223); imshow(matlab_out, []); title('matlab\_out'); colorbar
subplot(224); imshow(err, []); title('error map'); colorbar
disp(['maximum error between CUDA and MATLAB is ' ...
num2str(max(max(abs(err))))])
I ran the code on Visual Studio 11 (i.e. VS2012) in Windows 7 with Nvidia GPU Tesla K20c, using CUDA Toolkit version 7.5, and my MATLAB version is R2015b.
My test steps:
For test case 1. Un-comment test case 1 and comment test case 2.
Run main.cpp.
Run main.m in MATLAB.
Repeat step 1 and step 2 (without any change, just re-run the code).
I repeated step 3 for 20 times. The output result is unchanged, and results in main.m are:
results of test case 1
The maximum error is 7.7152e-07.
For test case 2. Un-comment test case 2 and comment test case 1.
Run main.cpp.
Run main.m in MATLAB.
Repeat step 1 and step 2 (without any change, just re-run the code).
I repeated step 3 for 20 times. The output result is changed, and results in main.m are (not enough reputation to put all images, only wrong case is shown below):
one situation (the wrong one) of test case 2
The maximum error is 0.45341 (2 times), 0.44898 (1 time), 0.26186 (1 time), 0.26301 (1 time), and 9.5716e-07 (15 times).
From the test results, my conclusion is:
From test case 1: cuIDCT.cu is numerically correct (error ~10^-7) to idct.m.
From test case 2: recursively use of cuIDCT.cu leads to unstable result (i.e. the output changes every time when re-run the code and may sometimes be numerically wrong, error ~0.1)
My question:
From test case 1 we know cuIDCT.cu is numerically correct to idct.m. But why recursiviely use of cuIDCT.cu leads to different output result each time when re-run the code?
Any helps or suggestions are highly appreciated.
I believe the variability in your results is coming about due to this code in your idct_ComputeEvenKernel:
// Compute precorrection factor
ww[0].x = ww[0].x / sqrtf(2);
ww[0].y = ww[0].y / sqrtf(2);
It's not entirely clear what your intent is here, but it's doubtful that this code could be doing what you want. You may be confused about the CUDA execution model.
The above code will be executed by every CUDA thread that you launch for that kernel that passes the thread check:
if (ix >= n || iy >= m) return;
I believe this means 65536 threads will all execute this code in that kernel. Furthermore, the threads will execute that code in more-or-less any order (not all CUDA threads execute in lock-step). They may even step on each other as they are trying to write out their values to the location ww[0]. So the final result in ww[0] will be quite unpredictable.
When I comment out those lines of code, the results become stable for me (albeit different from what they were with those lines in place), unchanging from run to run.
I'd like to point something else out. Wherever you are calculating the .x and .y values of a complex quantity, my suggestion would be to rework the code from this (for example):
y[iy + ix*m].x = ww[iy].x * b[pos];
y[iy + ix*m].y = ww[iy].y * b[pos];
to this:
complex temp1, temp2;
temp1 = ww[iy];
temp2.x = temp1.x * b[pos];
temp2.y = temp2.y * b[pos];
y[iy + ix*m] = temp2;
At least according to my testing, the compiler doesn't seem to be making this optimization for you, and one side-effect benefit is that it's much easier to test your code with cuda-memcheck --tool initcheck .... In the first realization, the compiler will load y[iy + ix*m] as an 8 byte quantity, modify either 4 or 8 bytes of it, then store y[iy + ix*m] as an 8 byte quantity. The second realization should be more efficient (it eliminates the load of y[]), and eliminates the load of an uninitialized quantity (y[]), which the cuda-memcheck tool will report as a hazard.
This variability I'm describing should be possible whether you run either the 1-pass version of your code or the 4-pass version of your code. Therefore I think your statements about the 1-pass version being correct are suspect. I think if you run the 1-pass version enough, you will eventually see variability (although it may require varying initial memory conditions, or running on different GPU types). Even in your own results, we see that 15 out of 20 runs of the 4 pass code produce "correct" results, i.e. the residual error is ~1e-7
Here's my modified cuIDCT.cu file, modified from the version you posted here. The assumption I'm making below is that you wanted to compute the scaling on ww[0] only once, in which case we can easily handle that arithmetic as an addendum to the previous idct_ComputeWeightsKernel:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cufft.h>
#include <cuComplex.h>
#include <helper_cuda.h>
#include "assert.h"
// round up n/m
inline int iDivUp(int n, int m)
{
return (n + m - 1) / m;
}
typedef cufftComplex complex;
#define PI 3.1415926535897932384626433832795028841971693993751
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
if( CUFFT_SUCCESS != err) {
fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
__global__
void idct_ComputeWeightsKernel(const int n, complex *ww)
{
const int pos = threadIdx.x + blockIdx.x * blockDim.x;
if (pos >= n) return;
complex temp;
temp.x = sqrtf(2*n) * cosf(pos*PI/(2*n));
temp.y = sqrtf(2*n) * sinf(pos*PI/(2*n));
if (pos == 0) {
temp.x /= sqrtf(2);
temp.y /= sqrtf(2);}
ww[pos] = temp;
}
__global__
void idct_ComputeEvenKernel(const float *b, const int n, const int m, complex *ww, complex *y)
{
const int ix = threadIdx.x + blockIdx.x * blockDim.x;
const int iy = threadIdx.y + blockIdx.y * blockDim.y;
if (ix >= n || iy >= m) return;
const int pos = ix + iy*n;
/* handle this in idct_ComputeWeightsKernel
// Compute precorrection factor
ww[0].x = ww[0].x / sqrtf(2);
ww[0].y = ww[0].y / sqrtf(2);
*/
complex temp1, temp2;
temp1 = ww[iy];
temp2.x = temp1.x * b[pos];
temp2.y = temp1.y * b[pos];
y[iy + ix*m] = temp2;
}
__global__
void Reordering_a0_Kernel(complex *y, const int n, const int m, complex *yy)
{
const int ix = threadIdx.x + blockIdx.x * blockDim.x;
const int iy = threadIdx.y + blockIdx.y * blockDim.y;
if (ix >= n || iy >= m) return;
const int pos = ix + iy*n;
complex temp1, temp2;
temp1 = y[pos];
temp2.x = temp1.x / (float) n;
temp2.y = temp1.y / (float) n;
yy[iy + ix*n] = temp2;
}
__global__
void Reordering_a_Kernel(complex *yy, const int n, const int m, float *a)
{
const int ix = threadIdx.x + blockIdx.x * blockDim.x;
const int iy = threadIdx.y + blockIdx.y * blockDim.y;
if (ix >= n || iy >= m) return;
const int pos = ix + iy*n;
// Re-order elements of each column according to equations (5.93) and (5.94) in Jain
if (iy < n/2)
{
a[ix + 2*iy*n] = yy[pos].x;
a[ix + (2*iy+1)*n] = yy[ix + (m-iy-1)*n].x;
}
}
/**
* a = idct(b), where a is of size [n m].
* #param b, input array
* #param n, first dimension of a
* #param m, second dimension of a
* #param a, output array
*/
void cuIDCT(float *h_in, int n, int m, float *h_out) // a is of size [n m]
{
const int data_size = n * m * sizeof(float);
// device memory allocation
float *d_in, *d_out;
checkCudaErrors(cudaMalloc(&d_in, data_size));
checkCudaErrors(cudaMalloc(&d_out, data_size));
// transfer data from host to device
checkCudaErrors(cudaMemcpy(d_in, h_in, data_size, cudaMemcpyHostToDevice));
// compute IDCT using CUDA
// begin============================================
// Compute weights
complex *ww;
checkCudaErrors(cudaMalloc(&ww, n*sizeof(complex)));
dim3 threads(256);
dim3 blocks(iDivUp(n, threads.x));
idct_ComputeWeightsKernel<<<blocks, threads>>>(n, ww);
complex *y;
complex *yy;
cufftHandle plan;
dim3 threads1(32, 6);
dim3 blocks2(iDivUp(n, threads1.x), iDivUp(m, threads1.y)); // for even case
int Length[1] = {m}; // for each IFFT, the length is m
checkCudaErrors(cudaMalloc(&y, n*m*sizeof(complex)));
idct_ComputeEvenKernel<<<blocks2, threads1>>>(d_in, n, m, ww, y);
cufftSafeCall(cufftPlanMany(&plan, 1, Length,
Length, 1, m,
Length, 1, m, CUFFT_C2C, n));
cufftSafeCall(cufftExecC2C(plan, y, y, CUFFT_INVERSE)); // y is of size [n m]
checkCudaErrors(cudaMalloc(&yy, n*m*sizeof(complex)));
Reordering_a0_Kernel<<<blocks2, threads1>>>(y, n, m, yy);
cudaMemset(d_out, 0, data_size);
Reordering_a_Kernel<<<blocks2, threads1>>>(yy, n, m, d_out);
// end============================================
// transfer result from device to host
checkCudaErrors(cudaMemcpy(h_out, d_out, data_size, cudaMemcpyDeviceToHost));
// cleanup
cufftDestroy(plan);
checkCudaErrors(cudaFree(ww));
checkCudaErrors(cudaFree(y));
checkCudaErrors(cudaFree(yy));
checkCudaErrors(cudaFree(d_in));
checkCudaErrors(cudaFree(d_out));
}
You'll note I threw an extra cudaMemset on d_out in there, because it helped me clean up an issue with cuda-memcheck --tool initcheck .... It shouldn't be necessary, you can delete it if you want.

finding local mean in an image using mex-cuda

I have an image named HSIImage, of size is 565x585, in which I have find the local mean and standard deviation at every pixel. For this I am using a window W of size 9x9, if we a re finding the mean of x(i,j) we need values in the W where x(i,j) is at its center.
For working on the corner and edge pixels, I am padding the HSIImage and naming it as HSIImage2.
MATLAB code
[m,n,~] = size(HSIImage);
HSIImage2=padarray(HSIImage,[4,4],'symmetric');
mean1 = zeros(m,n);
sd = zeros(m,n);
phi_x=zeros(m,n);
for i=5:m+4
for j=5:n+4
mean1(i-4,j-4) = mean( mean(HSIImage2(i-4:i+4, j-4:j+4, 3) )); %sum / (4*4);
sd(i-4,j-4) = std( std(HSIImage2(i-4:i+4, j-4:j+4, 3), 1));
end
end
[phi_x2,mean2,sd2] = getPhi(HSIImage(:,:,3)',HSIImage2(:,:,3)',m,n);
Serial mean displayed as image.
My cuda code for finding mean and sd is
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
sum= sum + d_HSIImage2[i*col+j];
}
}
d_mean[(Y-4)*col+X-4] = sum/81;
double mean = sum/81;
sum = 0;
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
int index = i*col+j;
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
}
d_std[(Y-4)*col+X-4] = sqrt(sum/81);
}
void mexFunction( int nlhs, mxArray *plhs[],int nrhs, const mxArray *prhs[])
{
double* HSIImage;
double* d_HSIImage;
double* HSIImage2;
double* d_HSIImage2;
double row;
double col;
double* phi_x;
double* d_phi_x;
double* mean2;
double* d_mean;
double* d_std;
double* sd2;
HSIImage = (double*)mxGetPr(prhs[0]);
HSIImage2 = (double*)mxGetPr(prhs[1]);
row = mxGetScalar(prhs[2]);
col = mxGetScalar(prhs[3]);
plhs[0] = mxCreateDoubleMatrix(row,col,mxREAL);
phi_x = mxGetPr(plhs[0]);
plhs[1] = mxCreateDoubleMatrix(row,col,mxREAL);
mean2 = mxGetPr(plhs[1]);
plhs[2] = mxCreateDoubleMatrix(row,col,mxREAL);
sd2 = mxGetPr(plhs[2]);
dim3 grid(((col+8)/TILE_WIDTH)+1,((row+8)/TILE_WIDTH)+1,1);
dim3 block(TILE_WIDTH,TILE_WIDTH,1);
if ( cudaMalloc(&d_HSIImage,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_HSIImage2,sizeof(double)*(row+8)*(col+8))!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_phi_x,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_mean,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_std,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
cudaMemcpy(d_HSIImage,HSIImage,sizeof(double)*row*col,cudaMemcpyHostToDevice);
cudaMemcpy(d_HSIImage2,HSIImage2,sizeof(double)*(row+8)*(col+8),cudaMemcpyHostToDevice);
phi <<< grid,block >>> (d_HSIImage,d_HSIImage2,row,col,d_phi_x,d_mean,d_std);
cudaMemcpy(phi_x,d_phi_x,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaMemcpy(mean2,d_mean,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaMemcpy(sd2,d_std,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaFree(d_HSIImage);
cudaFree(d_HSIImage2);
cudaFree(d_phi_x);
}
its working fine when image is full of ones. but when I give regular image, there is lot of difference in serial(MATLAB) and parallel(CUDA) outputs(When mean1 and mean2 are compared). Please tell me the error.
I am launching with
dim3 grid(((col+8)/TILE_WIDTH)+1,((row+8)/TILE_WIDTH)+1,1);
dim3 block(TILE_WIDTH,TILE_WIDTH,1);
TILEWIDTH is 32. row=565, col=584.
Parallel mean displayed as image
It is important to note Matlab's c api is column-major ordered, however as mentioned in the comments OP has made sure of the consistency. The problem is that the stride used to access the data did not include the pads of the image. Going from one row to another requires a stride of col+8 (8 being padding of 4 on each side.
changing
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
sum= sum + d_HSIImage2[i*col+j];
}
}
d_mean[(Y-4)*col+X-4] = sum/81;
double mean = sum/81;
sum = 0;
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
int index = i*col+j;
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
}
d_std[(Y-4)*col+X-4] = sqrt(sum/81);
}
to
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
sum= sum + d_HSIImage2[i*(col+8)+j];
}
}
d_mean[(Y-4)*col+X-4] = sum/81;
double mean = sum/81;
sum = 0;
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
int index = i*(col+8)+j;
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
}
d_std[(Y-4)*col+X-4] = sqrt(sum/81);
}
Should work, however, I have included a compilable example that I validated on a small sample, that should be easy to expand.
It is not optimized, but that wasn't part of your question. Optimization using shared memory would give a large performance boost.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cuda.h>
using namespace std;
__global__ void phi(double *img, int row, int col, double *d_mean){
int X=blockDim.x*blockIdx.x+threadIdx.x+4;
int Y=blockDim.y*blockIdx.y+threadIdx.y+4;
double sum = 0;
if(Y<row+4 && X<col+4){
for(int i=-4; i<=4; ++i){
for(int j=-4; j<=4; ++j){
sum+=img[ (Y+j)*(col+8)+X+i];
}
}
sum/=81;
d_mean[(Y-4)*col+X-4]=sum;
}
}
int main(int argc, char * argv[]) {
int width=10, height=10;
double *h_img=new double[(width+8)*(height+8)];
for(int i=0; i<height+8; i++){
for(int j=0; j<width+8; j++){
h_img[i*(width+8)+j]=0.0;
}
}
for(int i=0; i<height; i++){
for(int j=0; j<width; j++){
int index = (i+4)*(width+8)+j+4;
h_img[index]=i*width+j;
}
}
for(int i=0; i<height+8; i++){
for(int j=0; j<width+8; j++){
cout<<h_img[i*(width+8)+j]<<" ";
}cout<<endl;
}
double *d_img;
size_t size=sizeof(double)*(height+8)*(width*8);
cudaMalloc(&d_img, size);
cudaMemcpy(d_img, h_img, size, cudaMemcpyHostToDevice);
size = sizeof(double)*height*width;
double *d_avg;
cudaMalloc(&d_avg, size);
dim3 block(32, 32, 1);
dim3 grid(width/32+1, height/32+1, 1);
phi<<<grid, block>>>(d_img, height, width, d_avg);
cudaDeviceSynchronize();
double *h_avg=new double[width*height];
cudaMemcpy(h_avg, d_avg, size, cudaMemcpyDeviceToHost);
for(int i=0; i<height; i++){
for(int j=0; j<width; j++){
cout<<h_avg[i*width+j]<<" ";
}cout<<endl;
}
return 0;
}
Here's my 2 cents regarding local mean and local std.
You should check whether using matlab's optimized built-in functions (conv2 and stdfilt , with their gpu support) gives you better performance than a "simple" mex version. For example, to take the local mean, the fastest will be to use conv2 as follows:
local_mean_image=conv2(image,normalized_window,'same');
where in your case normalized_window=ones(9)./9^2;
For local std use stdfilt :
local_std_image = stdfilt(image, ones(9));
Both options are available for faster GPU performance, I use conv2 with Jacket routinely, and I saw the stdfilt supports gpuarray variables.
By observing the answers of #Christian Sarofeen and of #bla, I made some changes to my code and now I am able to find the mean exactly same as MATLAB. I posting this thinking that some one may use it in future(I am sending the image as is from MATLAB). Still finding standard deviation is little problem.
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
int index = (X-4)*row+Y-4;
for(i=-4;i<=4;i++){
for(j=-4;j<=4;j++){
sum= sum + d_HSIImage2[(X+j)*(row+8)+(Y+i)];
}
}
d_mean[index] = sum/81;
double mean = 0;
double temp_std[9] = {0} ;
for(j=-4;j<=4;j++){
sum = 0;
for(i=-4;i<=4;i++){
sum = sum + d_HSIImage2[(X+j)*(row+8)+(Y+i)];//vector mean
}
mean = sum/9;
sum =0 ;
for(i=-4;i<=4;i++){
int index = (X+j)*(row+8)+(Y+i);
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
temp_std[j+4] = (sqrt(sum/9));//vector std
}
sum =0 ;
for(j=-4;j<=4;j++){
sum = sum + temp_std[j+4];//mean of vectors
}
mean = sum/9;
sum = 0 ;
for(j=-4;j<=4;j++){
sum = sum + (temp_std[j+4]-mean) * (temp_std[j+4]-mean);
}
d_std[index] = sqrt(sum);//std of vectors
d_phi_x[index] = 1.0/(1.0+exp((d_mean[index]-d_HSIImage[index])/d_std[index]));
}
}

Scaling in inverse FFT by cuFFT

Whenever I'm plotting the values obtained by a programme using the cuFFT and comparing the results with that of Matlab, I'm getting the same shape of graphs and the values of maxima and minima are getting at the same points. However, the values resulting by the cuFFT are much greater than those resulting from Matlab. The Matlab code is
fs = 1000; % sample freq
D = [0:1:4]'; % pulse delay times
t = 0 : 1/fs : 4000/fs; % signal evaluation time
w = 0.5; % width of each pulse
yp = pulstran(t,D,'rectpuls',w);
filt = conj(fliplr(yp));
xx = fft(yp,1024).*fft(filt,1024);
xx = (abs(ifft(xx)));
and the CUDA code with the same input is like:
cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD);
cufftExecC2C(plan, (cufftComplex *)d_filter_signal, (cufftComplex *)d_filter_signal, CUFFT_FORWARD);
ComplexPointwiseMul<<<blocksPerGrid, threadsPerBlock>>>(d_signal, d_filter_signal, NX);
cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE);
The cuFFT performs also a 1024 points FFT with batch size of 2.
With the scaling factor of NX=1024, the values are not coming correct. Please tell what to do.
This is a late answer to remove this question from the unanswered list.
You are not giving enough information to diagnose your problem, since you are missing to specify the way you are setting up the cuFFT plan. You are even not specifying whether you have exactly the same shape for the Matlab's and cuFFT's signals (so you have just a scaling) or you have approximately the same shape. However, let me make the following two observations:
The yp vector has 4000 elements; opposite to thatm by fft(yp,1024), you are performing an FFT by truncating the signal to 1024 elements;
The inverse cuFFT does not perform the scaling by the number of vector elements.
For the sake of convenience (it could be useful to other users), I'm reporting below a simple FFT-IFFT scheme which includes also the scaling performed by using the CUDA Thrust library.
#include <cufft.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
/*********************/
/* SCALE BY CONSTANT */
/*********************/
class Scale_by_constant
{
private:
float c_;
public:
Scale_by_constant(float c) { c_ = c; };
__host__ __device__ float2 operator()(float2 &a) const
{
float2 output;
output.x = a.x / c_;
output.y = a.y / c_;
return output;
}
};
int main(void){
const int N=4;
// --- Setting up input device vector
thrust::device_vector<float2> d_vec(N,make_cuComplex(1.f,2.f));
cufftHandle plan;
cufftPlan1d(&plan, N, CUFFT_C2C, 1);
// --- Perform in-place direct Fourier transform
cufftExecC2C(plan, thrust::raw_pointer_cast(d_vec.data()),thrust::raw_pointer_cast(d_vec.data()), CUFFT_FORWARD);
// --- Perform in-place inverse Fourier transform
cufftExecC2C(plan, thrust::raw_pointer_cast(d_vec.data()),thrust::raw_pointer_cast(d_vec.data()), CUFFT_INVERSE);
thrust::transform(d_vec.begin(), d_vec.end(), d_vec.begin(), Scale_by_constant((float)(N)));
// --- Setting up output host vector
thrust::host_vector<float2> h_vec(d_vec);
for (int i=0; i<N; i++) printf("Element #%i; Real part = %f; Imaginary part: %f\n",i,h_vec[i].x,h_vec[i].y);
getchar();
}
With the introduction of the cuFFT callback feature, the normalization required by the inverse FFT performed by the cuFFT can be embedded directly within the cufftExecC2C call by defining the normalization operation as a __device__ function.
Besides the cuFFT User Guide, for the cuFFT callback features, see
CUDA Pro Tip: Use cuFFT Callbacks for Custom Data Processing
Below is an example of implementing the IFFT normalization by cuFFT callback.
#include <stdio.h>
#include <assert.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <cufftXt.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/*********************/
/* CUFFT ERROR CHECK */
/*********************/
// See http://stackoverflow.com/questions/16267149/cufft-error-handling
#ifdef _CUFFT_H_
static const char *_cudaGetErrorEnum(cufftResult error)
{
switch (error)
{
case CUFFT_SUCCESS:
return "CUFFT_SUCCESS";
case CUFFT_INVALID_PLAN:
return "CUFFT_INVALID_PLAN";
case CUFFT_ALLOC_FAILED:
return "CUFFT_ALLOC_FAILED";
case CUFFT_INVALID_TYPE:
return "CUFFT_INVALID_TYPE";
case CUFFT_INVALID_VALUE:
return "CUFFT_INVALID_VALUE";
case CUFFT_INTERNAL_ERROR:
return "CUFFT_INTERNAL_ERROR";
case CUFFT_EXEC_FAILED:
return "CUFFT_EXEC_FAILED";
case CUFFT_SETUP_FAILED:
return "CUFFT_SETUP_FAILED";
case CUFFT_INVALID_SIZE:
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
}
return "<unknown>";
}
#endif
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
if( CUFFT_SUCCESS != err) {
fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
__device__ void IFFT_Scaling(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPtr) {
float *scaling_factor = (float*)callerInfo;
float2 output;
output.x = cuCrealf(element);
output.y = cuCimagf(element);
output.x = output.x / scaling_factor[0];
output.y = output.y / scaling_factor[0];
((float2*)dataOut)[offset] = output;
}
__device__ cufftCallbackStoreC d_storeCallbackPtr = IFFT_Scaling;
/********/
/* MAIN */
/********/
int main() {
const int N = 16;
cufftHandle plan;
float2 *h_input = (float2*)malloc(N*sizeof(float2));
float2 *h_output1 = (float2*)malloc(N*sizeof(float2));
float2 *h_output2 = (float2*)malloc(N*sizeof(float2));
float2 *d_input; gpuErrchk(cudaMalloc((void**)&d_input, N*sizeof(float2)));
float2 *d_output1; gpuErrchk(cudaMalloc((void**)&d_output1, N*sizeof(float2)));
float2 *d_output2; gpuErrchk(cudaMalloc((void**)&d_output2, N*sizeof(float2)));
float *h_scaling_factor = (float*)malloc(sizeof(float));
h_scaling_factor[0] = 16.0f;
float *d_scaling_factor; gpuErrchk(cudaMalloc((void**)&d_scaling_factor, sizeof(float)));
gpuErrchk(cudaMemcpy(d_scaling_factor, h_scaling_factor, sizeof(float), cudaMemcpyHostToDevice));
for (int i=0; i<N; i++) {
h_input[i].x = 1.0f;
h_input[i].y = 0.f;
}
gpuErrchk(cudaMemcpy(d_input, h_input, N*sizeof(float2), cudaMemcpyHostToDevice));
cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_C2C, 1));
cufftSafeCall(cufftExecC2C(plan, d_input, d_output1, CUFFT_FORWARD));
gpuErrchk(cudaMemcpy(h_output1, d_output1, N*sizeof(float2), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) printf("Direct transform - %d - (%f, %f)\n", i, h_output1[i].x, h_output1[i].y);
cufftCallbackStoreC h_storeCallbackPtr;
gpuErrchk(cudaMemcpyFromSymbol(&h_storeCallbackPtr, d_storeCallbackPtr, sizeof(h_storeCallbackPtr)));
cufftSafeCall(cufftXtSetCallback(plan, (void **)&h_storeCallbackPtr, CUFFT_CB_ST_COMPLEX, (void **)&d_scaling_factor));
cufftSafeCall(cufftExecC2C(plan, d_output1, d_output2, CUFFT_INVERSE));
gpuErrchk(cudaMemcpy(h_output2, d_output2, N*sizeof(float2), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) printf("Inverse transform - %d - (%f, %f)\n", i, h_output2[i].x, h_output2[i].y);
cufftSafeCall(cufftDestroy(plan));
gpuErrchk(cudaFree(d_input));
gpuErrchk(cudaFree(d_output1));
gpuErrchk(cudaFree(d_output2));
return 0;
}
EDIT
The "moment" the callback operation is performed is specified by CUFFT_CB_ST_COMPLEX in the call to cufftXtSetCallback. Notice that you can then have load and store callbacks with the same cuFFT plan.
PERFORMANCE
I'm adding a further answer to compare the callback performance with the non-callback version of the same code for this particular case of IFFT scaling. The code I'm using is
#include <stdio.h>
#include <assert.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <cufftXt.h>
#include <thrust/device_vector.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
//#define DISPLAY
/*******************************/
/* THRUST FUNCTOR IFFT SCALING */
/*******************************/
class Scale_by_constant
{
private:
float c_;
public:
Scale_by_constant(float c) { c_ = c; };
__host__ __device__ float2 operator()(float2 &a) const
{
float2 output;
output.x = a.x / c_;
output.y = a.y / c_;
return output;
}
};
/**********************************/
/* IFFT SCALING CALLBACK FUNCTION */
/**********************************/
__device__ void IFFT_Scaling(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPtr) {
float *scaling_factor = (float*)callerInfo;
float2 output;
output.x = cuCrealf(element);
output.y = cuCimagf(element);
output.x = output.x / scaling_factor[0];
output.y = output.y / scaling_factor[0];
((float2*)dataOut)[offset] = output;
}
__device__ cufftCallbackStoreC d_storeCallbackPtr = IFFT_Scaling;
/********/
/* MAIN */
/********/
int main() {
const int N = 100000000;
cufftHandle plan; cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_C2C, 1));
TimingGPU timerGPU;
float2 *h_input = (float2*)malloc(N*sizeof(float2));
float2 *h_output1 = (float2*)malloc(N*sizeof(float2));
float2 *h_output2 = (float2*)malloc(N*sizeof(float2));
float2 *d_input; gpuErrchk(cudaMalloc((void**)&d_input, N*sizeof(float2)));
float2 *d_output1; gpuErrchk(cudaMalloc((void**)&d_output1, N*sizeof(float2)));
float2 *d_output2; gpuErrchk(cudaMalloc((void**)&d_output2, N*sizeof(float2)));
// --- Callback function parameters
float *h_scaling_factor = (float*)malloc(sizeof(float));
h_scaling_factor[0] = 16.0f;
float *d_scaling_factor; gpuErrchk(cudaMalloc((void**)&d_scaling_factor, sizeof(float)));
gpuErrchk(cudaMemcpy(d_scaling_factor, h_scaling_factor, sizeof(float), cudaMemcpyHostToDevice));
// --- Initializing the input on the host and moving it to the device
for (int i = 0; i < N; i++) {
h_input[i].x = 1.0f;
h_input[i].y = 0.f;
}
gpuErrchk(cudaMemcpy(d_input, h_input, N * sizeof(float2), cudaMemcpyHostToDevice));
// --- Execute direct FFT on the device and move the results to the host
cufftSafeCall(cufftExecC2C(plan, d_input, d_output1, CUFFT_FORWARD));
#ifdef DISPLAY
gpuErrchk(cudaMemcpy(h_output1, d_output1, N * sizeof(float2), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) printf("Direct transform - %d - (%f, %f)\n", i, h_output1[i].x, h_output1[i].y);
#endif
// --- Execute inverse FFT with subsequent scaling on the device and move the results to the host
timerGPU.StartCounter();
cufftSafeCall(cufftExecC2C(plan, d_output1, d_output2, CUFFT_INVERSE));
thrust::transform(thrust::device_pointer_cast(d_output2), thrust::device_pointer_cast(d_output2) + N, thrust::device_pointer_cast(d_output2), Scale_by_constant((float)(N)));
#ifdef DISPLAY
gpuErrchk(cudaMemcpy(h_output2, d_output2, N * sizeof(float2), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) printf("Inverse transform - %d - (%f, %f)\n", i, h_output2[i].x, h_output2[i].y);
#endif
printf("Timing NO callback %f\n", timerGPU.GetCounter());
// --- Setup store callback
// timerGPU.StartCounter();
cufftCallbackStoreC h_storeCallbackPtr;
gpuErrchk(cudaMemcpyFromSymbol(&h_storeCallbackPtr, d_storeCallbackPtr, sizeof(h_storeCallbackPtr)));
cufftSafeCall(cufftXtSetCallback(plan, (void **)&h_storeCallbackPtr, CUFFT_CB_ST_COMPLEX, (void **)&d_scaling_factor));
// --- Execute inverse callback FFT on the device and move the results to the host
timerGPU.StartCounter();
cufftSafeCall(cufftExecC2C(plan, d_output1, d_output2, CUFFT_INVERSE));
#ifdef DISPLAY
gpuErrchk(cudaMemcpy(h_output2, d_output2, N * sizeof(float2), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) printf("Inverse transform - %d - (%f, %f)\n", i, h_output2[i].x, h_output2[i].y);
#endif
printf("Timing callback %f\n", timerGPU.GetCounter());
cufftSafeCall(cufftDestroy(plan));
gpuErrchk(cudaFree(d_input));
gpuErrchk(cudaFree(d_output1));
gpuErrchk(cudaFree(d_output2));
return 0;
}
For such large 1D arrays and simple processing (scaling), the timing on a Kepler K20c is the following
Non-callback 69.029762 ms
Callback 65.868607 ms
So, there is not much improvement. I expect that the improvement one sees is due to avoiding a separate kernel call in the non-callback case. For smaller 1D arrays, there is either no improvement or the non-callback case runs faster.

OpenCV: how to rotate IplImage?

I need to rotate an image by very small angle, like 1-5 degrees. Does OpenCV provide simple way of doing that? From reading docs i can assume that getAffineTransform() should be involved, but there is no direct example of doing something like:
IplImage *rotateImage( IplImage *source, double angle);
If you use OpenCV > 2.0 it is as easy as
using namespace cv;
Mat rotateImage(const Mat& source, double angle)
{
Point2f src_center(source.cols/2.0F, source.rows/2.0F);
Mat rot_mat = getRotationMatrix2D(src_center, angle, 1.0);
Mat dst;
warpAffine(source, dst, rot_mat, source.size());
return dst;
}
Note: angle is in degrees, not radians.
See the C++ interface documentation for more details and adapt as you need:
getRotationMatrix
warpAffine
Edit: To down voter: Please comment the reason for down voting a tried and tested code?
#include "cv.h"
#include "highgui.h"
#include "math.h"
int main( int argc, char** argv )
{
IplImage* src = cvLoadImage("lena.jpg", 1);
IplImage* dst = cvCloneImage( src );
int delta = 1;
int angle = 0;
int opt = 1; // 1: rotate & zoom
// 0: rotate only
double factor;
cvNamedWindow("src", 1);
cvShowImage("src", src);
for(;;)
{
float m[6];
CvMat M = cvMat(2, 3, CV_32F, m);
int w = src->width;
int h = src->height;
if(opt)
factor = (cos(angle*CV_PI/180.) + 1.05) * 2;
else
factor = 1;
m[0] = (float)(factor*cos(-angle*2*CV_PI/180.));
m[1] = (float)(factor*sin(-angle*2*CV_PI/180.));
m[3] = -m[1];
m[4] = m[0];
m[2] = w*0.5f;
m[5] = h*0.5f;
cvGetQuadrangleSubPix( src, dst, &M);
cvNamedWindow("dst", 1);
cvShowImage("dst", dst);
if( cvWaitKey(1) == 27 )
break;
angle =(int)(angle + delta) % 360;
}
return 0;
}
UPDATE: See the following code for rotation using warpaffine
https://code.google.com/p/opencvjp-sample/source/browse/trunk/cpp/affine2_cpp.cpp?r=48
#include <cv.h>
#include <highgui.h>
using namespace cv;
int
main(int argc, char **argv)
{
// (1)load a specified file as a 3-channel color image,
// set its ROI, and allocate a destination image
const string imagename = argc > 1 ? argv[1] : "../image/building.png";
Mat src_img = imread(imagename);
if(!src_img.data)
return -1;
Mat dst_img = src_img.clone();
// (2)set ROI
Rect roi_rect(cvRound(src_img.cols*0.25), cvRound(src_img.rows*0.25), cvRound(src_img.cols*0.5), cvRound(src_img.rows*0.5));
Mat src_roi(src_img, roi_rect);
Mat dst_roi(dst_img, roi_rect);
// (2)With specified three parameters (angle, rotation center, scale)
// calculate an affine transformation matrix by cv2DRotationMatrix
double angle = -45.0, scale = 1.0;
Point2d center(src_roi.cols*0.5, src_roi.rows*0.5);
const Mat affine_matrix = getRotationMatrix2D( center, angle, scale );
// (3)rotate the image by warpAffine taking the affine matrix
warpAffine(src_roi, dst_roi, affine_matrix, dst_roi.size(), INTER_LINEAR, BORDER_CONSTANT, Scalar::all(255));
// (4)show source and destination images with a rectangle indicating ROI
rectangle(src_img, roi_rect.tl(), roi_rect.br(), Scalar(255,0,255), 2);
namedWindow("src", CV_WINDOW_AUTOSIZE);
namedWindow("dst", CV_WINDOW_AUTOSIZE);
imshow("src", src_img);
imshow("dst", dst_img);
waitKey(0);
return 0;
}
Check my answer to a similar problem:
Rotating an image in C/C++
Essentially, use cvWarpAffine - I've described how to get the 2x3 transformation matrix from the angle in my previous answer.
Updating full answer for OpenCV 2.4 and up
// ROTATE p by R
/**
* Rotate p according to rotation matrix (from getRotationMatrix2D()) R
* #param R Rotation matrix from getRotationMatrix2D()
* #param p Point2f to rotate
* #return Returns rotated coordinates in a Point2f
*/
Point2f rotPoint(const Mat &R, const Point2f &p)
{
Point2f rp;
rp.x = (float)(R.at<double>(0,0)*p.x + R.at<double>(0,1)*p.y + R.at<double>(0,2));
rp.y = (float)(R.at<double>(1,0)*p.x + R.at<double>(1,1)*p.y + R.at<double>(1,2));
return rp;
}
//COMPUTE THE SIZE NEEDED TO LOSSLESSLY STORE A ROTATED IMAGE
/**
* Return the size needed to contain bounding box bb when rotated by R
* #param R Rotation matrix from getRotationMatrix2D()
* #param bb bounding box rectangle to be rotated by R
* #return Size of image(width,height) that will compleley contain bb when rotated by R
*/
Size rotatedImageBB(const Mat &R, const Rect &bb)
{
//Rotate the rectangle coordinates
vector<Point2f> rp;
rp.push_back(rotPoint(R,Point2f(bb.x,bb.y)));
rp.push_back(rotPoint(R,Point2f(bb.x + bb.width,bb.y)));
rp.push_back(rotPoint(R,Point2f(bb.x + bb.width,bb.y+bb.height)));
rp.push_back(rotPoint(R,Point2f(bb.x,bb.y+bb.height)));
//Find float bounding box r
float x = rp[0].x;
float y = rp[0].y;
float left = x, right = x, up = y, down = y;
for(int i = 1; i<4; ++i)
{
x = rp[i].x;
y = rp[i].y;
if(left > x) left = x;
if(right < x) right = x;
if(up > y) up = y;
if(down < y) down = y;
}
int w = (int)(right - left + 0.5);
int h = (int)(down - up + 0.5);
return Size(w,h);
}
/**
* Rotate region "fromroi" in image "fromI" a total of "angle" degrees and put it in "toI" if toI exists.
* If toI doesn't exist, create it such that it will hold the entire rotated region. Return toI, rotated imge
* This will put the rotated fromroi piece of fromI into the toI image
*
* #param fromI Input image to be rotated
* #param toI Output image if provided, (else if &toI = 0, it will create a Mat fill it with the rotated image roi, and return it).
* #param fromroi roi region in fromI to be rotated.
* #param angle Angle in degrees to rotate
* #return Rotated image (you can ignore if you passed in toI
*/
Mat rotateImage(const Mat &fromI, Mat *toI, const Rect &fromroi, double angle)
{
//CHECK STUFF
// you should protect against bad parameters here ... omitted ...
//MAKE OR GET THE "toI" MATRIX
Point2f cx((float)fromroi.x + (float)fromroi.width/2.0,fromroi.y +
(float)fromroi.height/2.0);
Mat R = getRotationMatrix2D(cx,angle,1);
Mat rotI;
if(toI)
rotI = *toI;
else
{
Size rs = rotatedImageBB(R, fromroi);
rotI.create(rs,fromI.type());
}
//ADJUST FOR SHIFTS
double wdiff = (double)((cx.x - rotI.cols/2.0));
double hdiff = (double)((cx.y - rotI.rows/2.0));
R.at<double>(0,2) -= wdiff; //Adjust the rotation point to the middle of the dst image
R.at<double>(1,2) -= hdiff;
//ROTATE
warpAffine(fromI, rotI, R, rotI.size(), INTER_CUBIC, BORDER_CONSTANT, Scalar::all(0));
//& OUT
return(rotI);
}
IplImage* rotate(double angle, float centreX, float centreY, IplImage* src, bool crop)
{
int w=src->width;
int h=src->height;
CvPoint2D32f centre;
centre.x = centreX;
centre.y = centreY;
CvMat* warp_mat = cvCreateMat(2, 3, CV_32FC1);
cv2DRotationMatrix(centre, angle, 1.0, warp_mat);
double m11= cvmGet(warp_mat,0,0);
double m12= cvmGet(warp_mat,0,1);
double m13= cvmGet(warp_mat,0,2);
double m21= cvmGet(warp_mat,1,0);
double m22= cvmGet(warp_mat,1,1);
double m23= cvmGet(warp_mat,1,2);
double m31= 0;
double m32= 0;
double m33= 1;
double x=0;
double y=0;
double u0= (m11*x + m12*y + m13)/(m31*x + m32*y + m33);
double v0= (m21*x + m22*y + m23)/(m31*x + m32*y + m33);
x=w;
y=0;
double u1= (m11*x + m12*y + m13)/(m31*x + m32*y + m33);
double v1= (m21*x + m22*y + m23)/(m31*x + m32*y + m33);
x=0;
y=h;
double u2= (m11*x + m12*y + m13)/(m31*x + m32*y + m33);
double v2= (m21*x + m22*y + m23)/(m31*x + m32*y + m33);
x=w;
y=h;
double u3= (m11*x + m12*y + m13)/(m31*x + m32*y + m33);
double v3= (m21*x + m22*y + m23)/(m31*x + m32*y + m33);
int left= MAX(MAX(u0,u2),0);
int right= MIN(MIN(u1,u3),w);
int top= MAX(MAX(v0,v1),0);
int bottom= MIN(MIN(v2,v3),h);
ASSERT(left<right&&top<bottom); // throw message?
if (left<right&&top<bottom)
{
IplImage* dst= cvCreateImage( cvGetSize(src), IPL_DEPTH_8U, src->nChannels);
cvWarpAffine(src, dst, warp_mat/*, CV_INTER_LINEAR + CV_WARP_FILL_OUTLIERS, cvScalarAll(0)*/);
if (crop) // crop and resize to initial size
{
IplImage* dst_crop= cvCreateImage(cvSize(right-left, bottom-top), IPL_DEPTH_8U, src->nChannels);
cvSetImageROI(dst,cvRect(left,top,right-left,bottom-top));
cvCopy(dst,dst_crop);
cvReleaseImage(&dst);
cvReleaseMat(&warp_mat);
//ver1
//return dst_crop;
// ver2 resize
IplImage* out= cvCreateImage(cvSize(w, h), IPL_DEPTH_8U, src->nChannels);
cvResize(dst_crop,out);
cvReleaseImage(&dst_crop);
return out;
}
else
{
/*cvLine( dst, cvPoint(left,top),cvPoint(left, bottom), cvScalar(0, 0, 255, 0) ,1,CV_AA);
cvLine( dst, cvPoint(right,top),cvPoint(right, bottom), cvScalar(0, 0, 255, 0) ,1,CV_AA);
cvLine( dst, cvPoint(left,top),cvPoint(right, top), cvScalar(0, 0, 255, 0) ,1,CV_AA);
cvLine( dst, cvPoint(left,bottom),cvPoint(right, bottom), cvScalar(0, 0, 255, 0) ,1,CV_AA);*/
cvReleaseMat(&warp_mat);
return dst;
}
}
else
{
return NULL; //assert?
}
}