When I solve this question(149. Max Points on a Line) on leetcode, it have a bug when met this case:
Input [[0,0],[94911151,94911150],[94911152,94911151]]
Output 3
Expected 2
This is my code:
/**
* Definition for a point.
* struct Point {
* int x;
* int y;
* Point() : x(0), y(0) {}
* Point(int a, int b) : x(a), y(b) {}
* };
*/
class Solution {
public:
int maxPoints(vector<Point>& points) {
int size = points.size();
int ans = 0;
if (size == 0) return 0;
unordered_map<double, int> mp;
double k;
for (int i = 0; i < size; ++i) {
int num = 0;
for (int j = i + 1; j < size; ++j) {
if (points[i].x == points[j].x && points[i].y == points[j].y) {
num++;
continue;
}
// my question in below code.
// how can I get the hash key according to slope
if (points[j].x - points[i].x != 0)
k = (double)(points[j].y - points[i].y) / (double)(points[j].x - points[i].x); // calculate the slope.
else k = INT_MAX;
mp[k]++;
}
if (mp[k] == 0) mp[k] = 1, num--;
for (auto it = mp.begin(); it != mp.end(); ++it) {
if (it->second > ans) {
ans = it->second;
ans += num;
}
}
mp.clear();
}
return ans+1;
}
};
In above test case, when it calculate the slope with [0,0] and [94911151,94911150] it comeback k = 1. So I want to know how to get the right hash key to solve this problem?
I am implementing part of FFT algorithm using Renderscript in Android, When I ran the code my application got hanged. I want to process 512 values from real and img allocation at a time. Kernel will execute 512 times using provided dummy allocation of size 512.
Here is my java code
RenderScript rs = RenderScript.create(WajinViewerApplication
.getApplication());
ScriptC_fft scriptC_fft = new ScriptC_fft(rs);
float inReal[] = new float[512 * 512];
float inImg[] = new float[512 * 512];
int k = 0;
for (int i = 0; i < 512; i++) {
for (int j = 0; j < 512; j++) {
// copy values from complex 2d array to 1d array
inReal[k] = data[i][j].real;
inImg[k] = data[i][j].imaginary;
k++;
}
}
Allocation realAllocation = Allocation.createSized(rs, Element.F32(rs),
512 * 512);
Allocation imgAllocation = Allocation.createSized(rs, Element.F32(rs),
512 * 512);
realAllocation.copyFrom(inReal);
imgAllocation.copyFrom(inImg);
scriptC_fft.set_real(realAllocation);
scriptC_fft.set_img(imgAllocation);
Allocation inAllocation = Allocation.createSized(rs, Element.U16(rs),
512);
Allocation outAllocation = Allocation.createTyped(rs,
inAllocation.getType());
inAllocation.copyFrom(new short[512]);
// set direction
if (direction == Direction.Forward) {
scriptC_fft.set_is_forward(true);
} else {
scriptC_fft.set_is_forward(false);
}
scriptC_fft.set_len(512);
scriptC_fft.set_levels(Integer.numberOfLeadingZeros(512));
scriptC_fft.forEach_root(inAllocation, outAllocation);
outAllocation.copyTo(new short[512]);
float outReal[] = new float[512 * 512];
float outImg[] = new float[512 * 512];
scriptC_fft.get_real().copyTo(outReal);
scriptC_fft.get_img().copyTo(outImg);
k = 0;
for (int i = 0; i < 512; i++) {
for (int j = 0; j < 512; j++) {
// copy values from complex 1d array to 2d array
data[i][j].real = outReal[k];
data[i][j].imaginary = outImg[k];
k++;
}
}
rs.destroy();
And here is my Renderscript code
#pragma version(1)
#pragma rs java_package_name(jp.drmh.wajin.newversion)
#include "common.rsh"
rs_allocation real;
rs_allocation img;
bool is_forward;
uint32_t len;
uint32_t levels;
uint16_t __attribute__((kernel)) root(uint16_t in, uint32_t x, uint32_t y){
// rsDebug("call",x);
float realval[512];
float imagval[512];
if(is_forward){
for (uint32_t i = 0; i < len; i++) {
realval[i]=rsGetElementAt_float(real,x*512+i);
imagval[i]=rsGetElementAt_float(img,x*512+i);
//rsDebug("values", realval[i], imagval[i]);
}
}else{
for (uint32_t i = 0; i < len; i++) {
realval[i]=rsGetElementAt_float(img,x*512+i);
imagval[i]=rsGetElementAt_float(real,x*512+i);
}
}
float costable[256],sintable[256];
for (uint32_t i = 0; i < len / 2; i++) {
costable[i]=cos(2 * M_PI * i / len);
sintable[i]=sin(2 * M_PI * i / len);
}
// Bit-reversed addressing permutation
for (uint32_t i = 0; i < len; i++) {
uint32_t j = bit_reverse32(i);
uint32_t ans=j>>(32 - levels);
if (j > i) {
float temp = realval[i];
realval[i] = realval[j];
realval[j] = temp;
temp = imagval[i];
imagval[i] = imagval[j];
imagval[j] = temp;
}
}
for (uint32_t size = 2; size <= len; size *= 2) {
uint32_t halfsize = size / 2;
uint32_t tablestep = len / size;
for (uint32_t i = 0; i < len; i += size) {
for (uint32_t j = i, k = 0; j < i + halfsize; j++, k += tablestep) {
float tpre=realval[j + halfsize] * costable[k]
+ imagval[j + halfsize] * sintable[k];
float tpim = -realval[j + halfsize] * sintable[k]
+ imagval[j + halfsize] * costable[k];
realval[j + halfsize] = realval[j] - tpre;
imagval[j + halfsize] = imagval[j] - tpim;
realval[j] += tpre;
imagval[j] += tpim;
}
}
if (size == len)
break;
}
if(!is_forward){
for(uint32_t i = 0; i < len; i++){
realval[i]=realval[i]/len;
imagval[i]=imagval[i]/len;
rsDebug("values", realval[i], imagval[i]);
}
for (uint32_t i = 0; i < len; i++) {
rsSetElementAt_float(real, realval[i], x*512+i);
rsSetElementAt_float(img, imagval[i], x*512+i);
}
}
return in;
}
I have an image named HSIImage, of size is 565x585, in which I have find the local mean and standard deviation at every pixel. For this I am using a window W of size 9x9, if we a re finding the mean of x(i,j) we need values in the W where x(i,j) is at its center.
For working on the corner and edge pixels, I am padding the HSIImage and naming it as HSIImage2.
MATLAB code
[m,n,~] = size(HSIImage);
HSIImage2=padarray(HSIImage,[4,4],'symmetric');
mean1 = zeros(m,n);
sd = zeros(m,n);
phi_x=zeros(m,n);
for i=5:m+4
for j=5:n+4
mean1(i-4,j-4) = mean( mean(HSIImage2(i-4:i+4, j-4:j+4, 3) )); %sum / (4*4);
sd(i-4,j-4) = std( std(HSIImage2(i-4:i+4, j-4:j+4, 3), 1));
end
end
[phi_x2,mean2,sd2] = getPhi(HSIImage(:,:,3)',HSIImage2(:,:,3)',m,n);
Serial mean displayed as image.
My cuda code for finding mean and sd is
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
sum= sum + d_HSIImage2[i*col+j];
}
}
d_mean[(Y-4)*col+X-4] = sum/81;
double mean = sum/81;
sum = 0;
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
int index = i*col+j;
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
}
d_std[(Y-4)*col+X-4] = sqrt(sum/81);
}
void mexFunction( int nlhs, mxArray *plhs[],int nrhs, const mxArray *prhs[])
{
double* HSIImage;
double* d_HSIImage;
double* HSIImage2;
double* d_HSIImage2;
double row;
double col;
double* phi_x;
double* d_phi_x;
double* mean2;
double* d_mean;
double* d_std;
double* sd2;
HSIImage = (double*)mxGetPr(prhs[0]);
HSIImage2 = (double*)mxGetPr(prhs[1]);
row = mxGetScalar(prhs[2]);
col = mxGetScalar(prhs[3]);
plhs[0] = mxCreateDoubleMatrix(row,col,mxREAL);
phi_x = mxGetPr(plhs[0]);
plhs[1] = mxCreateDoubleMatrix(row,col,mxREAL);
mean2 = mxGetPr(plhs[1]);
plhs[2] = mxCreateDoubleMatrix(row,col,mxREAL);
sd2 = mxGetPr(plhs[2]);
dim3 grid(((col+8)/TILE_WIDTH)+1,((row+8)/TILE_WIDTH)+1,1);
dim3 block(TILE_WIDTH,TILE_WIDTH,1);
if ( cudaMalloc(&d_HSIImage,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_HSIImage2,sizeof(double)*(row+8)*(col+8))!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_phi_x,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_mean,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_std,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
cudaMemcpy(d_HSIImage,HSIImage,sizeof(double)*row*col,cudaMemcpyHostToDevice);
cudaMemcpy(d_HSIImage2,HSIImage2,sizeof(double)*(row+8)*(col+8),cudaMemcpyHostToDevice);
phi <<< grid,block >>> (d_HSIImage,d_HSIImage2,row,col,d_phi_x,d_mean,d_std);
cudaMemcpy(phi_x,d_phi_x,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaMemcpy(mean2,d_mean,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaMemcpy(sd2,d_std,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaFree(d_HSIImage);
cudaFree(d_HSIImage2);
cudaFree(d_phi_x);
}
its working fine when image is full of ones. but when I give regular image, there is lot of difference in serial(MATLAB) and parallel(CUDA) outputs(When mean1 and mean2 are compared). Please tell me the error.
I am launching with
dim3 grid(((col+8)/TILE_WIDTH)+1,((row+8)/TILE_WIDTH)+1,1);
dim3 block(TILE_WIDTH,TILE_WIDTH,1);
TILEWIDTH is 32. row=565, col=584.
Parallel mean displayed as image
It is important to note Matlab's c api is column-major ordered, however as mentioned in the comments OP has made sure of the consistency. The problem is that the stride used to access the data did not include the pads of the image. Going from one row to another requires a stride of col+8 (8 being padding of 4 on each side.
changing
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
sum= sum + d_HSIImage2[i*col+j];
}
}
d_mean[(Y-4)*col+X-4] = sum/81;
double mean = sum/81;
sum = 0;
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
int index = i*col+j;
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
}
d_std[(Y-4)*col+X-4] = sqrt(sum/81);
}
to
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
sum= sum + d_HSIImage2[i*(col+8)+j];
}
}
d_mean[(Y-4)*col+X-4] = sum/81;
double mean = sum/81;
sum = 0;
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
int index = i*(col+8)+j;
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
}
d_std[(Y-4)*col+X-4] = sqrt(sum/81);
}
Should work, however, I have included a compilable example that I validated on a small sample, that should be easy to expand.
It is not optimized, but that wasn't part of your question. Optimization using shared memory would give a large performance boost.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cuda.h>
using namespace std;
__global__ void phi(double *img, int row, int col, double *d_mean){
int X=blockDim.x*blockIdx.x+threadIdx.x+4;
int Y=blockDim.y*blockIdx.y+threadIdx.y+4;
double sum = 0;
if(Y<row+4 && X<col+4){
for(int i=-4; i<=4; ++i){
for(int j=-4; j<=4; ++j){
sum+=img[ (Y+j)*(col+8)+X+i];
}
}
sum/=81;
d_mean[(Y-4)*col+X-4]=sum;
}
}
int main(int argc, char * argv[]) {
int width=10, height=10;
double *h_img=new double[(width+8)*(height+8)];
for(int i=0; i<height+8; i++){
for(int j=0; j<width+8; j++){
h_img[i*(width+8)+j]=0.0;
}
}
for(int i=0; i<height; i++){
for(int j=0; j<width; j++){
int index = (i+4)*(width+8)+j+4;
h_img[index]=i*width+j;
}
}
for(int i=0; i<height+8; i++){
for(int j=0; j<width+8; j++){
cout<<h_img[i*(width+8)+j]<<" ";
}cout<<endl;
}
double *d_img;
size_t size=sizeof(double)*(height+8)*(width*8);
cudaMalloc(&d_img, size);
cudaMemcpy(d_img, h_img, size, cudaMemcpyHostToDevice);
size = sizeof(double)*height*width;
double *d_avg;
cudaMalloc(&d_avg, size);
dim3 block(32, 32, 1);
dim3 grid(width/32+1, height/32+1, 1);
phi<<<grid, block>>>(d_img, height, width, d_avg);
cudaDeviceSynchronize();
double *h_avg=new double[width*height];
cudaMemcpy(h_avg, d_avg, size, cudaMemcpyDeviceToHost);
for(int i=0; i<height; i++){
for(int j=0; j<width; j++){
cout<<h_avg[i*width+j]<<" ";
}cout<<endl;
}
return 0;
}
Here's my 2 cents regarding local mean and local std.
You should check whether using matlab's optimized built-in functions (conv2 and stdfilt , with their gpu support) gives you better performance than a "simple" mex version. For example, to take the local mean, the fastest will be to use conv2 as follows:
local_mean_image=conv2(image,normalized_window,'same');
where in your case normalized_window=ones(9)./9^2;
For local std use stdfilt :
local_std_image = stdfilt(image, ones(9));
Both options are available for faster GPU performance, I use conv2 with Jacket routinely, and I saw the stdfilt supports gpuarray variables.
By observing the answers of #Christian Sarofeen and of #bla, I made some changes to my code and now I am able to find the mean exactly same as MATLAB. I posting this thinking that some one may use it in future(I am sending the image as is from MATLAB). Still finding standard deviation is little problem.
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
int index = (X-4)*row+Y-4;
for(i=-4;i<=4;i++){
for(j=-4;j<=4;j++){
sum= sum + d_HSIImage2[(X+j)*(row+8)+(Y+i)];
}
}
d_mean[index] = sum/81;
double mean = 0;
double temp_std[9] = {0} ;
for(j=-4;j<=4;j++){
sum = 0;
for(i=-4;i<=4;i++){
sum = sum + d_HSIImage2[(X+j)*(row+8)+(Y+i)];//vector mean
}
mean = sum/9;
sum =0 ;
for(i=-4;i<=4;i++){
int index = (X+j)*(row+8)+(Y+i);
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
temp_std[j+4] = (sqrt(sum/9));//vector std
}
sum =0 ;
for(j=-4;j<=4;j++){
sum = sum + temp_std[j+4];//mean of vectors
}
mean = sum/9;
sum = 0 ;
for(j=-4;j<=4;j++){
sum = sum + (temp_std[j+4]-mean) * (temp_std[j+4]-mean);
}
d_std[index] = sqrt(sum);//std of vectors
d_phi_x[index] = 1.0/(1.0+exp((d_mean[index]-d_HSIImage[index])/d_std[index]));
}
}
I have already passed an image to my mexFunction but now I need to pass an array of images and I am struggling to get the thing right. This is my code to get the simple Image. This works perfectly but when I go into 3D I don't understand how the information is arranged in the mxArray.
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray *prhs[])
{
mxArray *matrixIn = prhs[0];
inputImage=(double *)mxGetPr(matrixIn);
int x = int(dims[0]);
int y = int(dims[1]);
volume3D image(inputImage, x, y, 1);
}
volume3D::volume3D(double* image, int x, int y, int z)
{
allocateVolume( x, y, z);
for(int i=0; i<xSize; i++)
for(int j=0; j<ySize; j++) {
volume[i][j][0] = double(image[(i)*x+j]);
}
}
I did something like this to pass it the other way around
mwSize mrows,ncols;
mrows = mxGetM(prhs[0]);
ncols = mxGetN(prhs[0]);
plhs[0] = mxCreateNumericMatrix(mrows, ncols, mxDOUBLE_CLASS, mxREAL);
double *matlabTumorMap = mxGetPr(plhs[0]);
const int * dims = mxGetDimensions( plhs[0]);
int x = int(dims[0]);
int y = int(dims[1]);
int z = int(dims[2]);
mwIndex subs[3];
mexPrintf("x %i\n",x);
mexPrintf("y %i\n",y);
mexPrintf("z %i\n",z);
mxArray *matrixTumor = plhs[0];
for(subs[0]=0; subs[0]<x; subs[0]++)
for(subs[1]=0; subs[1]<y; subs[1]++)
for(subs[2]=0; subs[2]<z; subs[2]++)
{
mwIndex x = mxCalcSingleSubscript( matrixTumor,3,subs);
matlabTumorMap[x] = tumorMap.getVoxel(subs[0],subs[1],subs[2]);
}
According to http://www.mathworks.de/help/techdoc/apiref/bqoqnz0.html, there is a mxCalcSingleSubscript which helps you calculating these data.
Something like
mxArray *matrixIn = prhs[0];
volume3D image(matrixIn);
}
volume3D::volume3D(MxArray* matrixIn)
{
double * inputImage=(double *)mxGetPr(matrixIn);
assert(mxGetNumberOfDimensions(matrixIn) >= 3)
mwSize * dims = mxGetDimensions(matrixIn);
int x = int(dims[0]);
int y = int(dims[1]);
int z = int(dims[2]);
double * image = mxGetPr(matrixIn);
mwIndex subs[3];
allocateVolume( x, y, z);
for(subs[0]=0; subs[0]<x; subs[0]++)
for(subs[1]=0; subs[1]<y; subs[1]++)
for(subs[2]=0; subs[2]<z; subs[2]++) {
mwIndex x = mxCalcSingleSubscript(matrixIn, 3, subs);
/* <unsure> */volume[subs[0]][subs[1]][subs[2]] /* </unsure> */ = image[x];
}
BTW: Pay attention if mixing C and C++ - it can lead to even more headache due to name mangling etc.
You are doing the thing right.
The only problem is your indexing, I think. you should write:
volume[i][j][0] = double(image[i+j*x]);
and also you forgot to write:
mwSize* dims = mxGetDimensions(matrixIn);
In a language that passes parameters by reference, given the following function:
int function g(x, y) {
x = x + 1;
y = y + 2;
return x + y;
}
If i = 3, and g(i,i) is called, what is value returned? I thought it is 9, is this correct?
If it's pass-by-reference (your original question was C but C doesn't have pass-by-reference and the question has changed since then anyway, so I'll answer generically), it's probably the case that x and y will simply modify the variables that are passed in for them. That's what a reference is, after all.
In this case, they're both a reference to the same variable i, so your sequence is likely to be:
i = i + 1; // i becomes 4.
i = i + 2; // i becomes 6.
return i + i; // return i + i, or 12.
You can see this in operation with the following C (using pointers to emulate pass-by-reference):
pax$ cat qq.c
#include <stdio.h>
int g(int *x, int *y) {
*x = *x + 1;
*y = *y + 2;
return *x + *y;
}
int main (void) {
int i = 3;
int rv = g (&i, &i);
printf ("Returned: %d\n", rv);
return 0;
}
pax$ gcc -o qq qq.c ; ./qq
Returned: 12
Your result of 9 seems to be assuming that the references are distinct from one another, such as in the following code:
#include <stdio.h>
int g(int *x, int *y) {
*x = *x + 1;
*y = *y + 2;
return *x + *y;
}
int main (void) {
int i1 = 3, i2 = 3;
int rv = g (&i1, &i2);
printf ("Returned: %d\n", rv);
return 0;
}
(this does output 9) but that's not usually the case with reference types.