CUDA class with multidimensional pointers - class

I have been struggling with this class implementation now for quite a while and hope someone can help me with it.
// Plain data record holding one scalar and two pointer members.
// NOTE(review): the host code further down allocates and copies
// `Material_Properties_Class` (without the `_device` suffix); presumably the
// two types share this layout - confirm.
class Material_Properties_Class_device
{
public:
// Scalar member, copied by value when the struct itself is memcpy'd.
int max_variables;
// Pointer to Logical data. A raw struct copy only copies the pointer value,
// so the pointee must be allocated/copied separately and the pointer patched.
Logical * table_prop;
// Pointer to an array of Table_Class pointers (two levels of indirection);
// both the pointer array and each pointed-to table need their own copies.
Table_Class ** prop_table;
};
The implementation for the pointers looks like this
// Host-side array of DEVICE pointers, indexed 1..3 (slot 0 is unused).
// Value-initialised so the unused slot 0 is NULL instead of garbage when the
// whole 4-entry array is uploaded at the end.
Material_Properties_Class **d_material_prop = new Material_Properties_Class*[4]();
Logical *table_prop;

// Step 1: shallow-copy each host struct into its own device allocation.
// Pointer members still hold host addresses after this copy.
for (int k = 1; k <= 3; k++)
{
    cutilSafeCall(cudaMalloc((void**)&(d_material_prop[k]), sizeof(Material_Properties_Class)));
    cutilSafeCall(cudaMemcpy(d_material_prop[k], material_prop[k], sizeof(Material_Properties_Class), cudaMemcpyHostToDevice));
}

// Step 2: give every device struct its own device copy of table_prop and
// patch the struct's pointer member so it refers to that device buffer.
for (int i = 1; i <= 3; i++)
{
    cutilSafeCall(cudaMalloc((void**)&(table_prop), sizeof(Logical)));
    // &(ptr->member) only computes the member's device address; nothing is
    // dereferenced on the host. Every copy is now error-checked like the
    // allocations are, so a failure is reported where it happens.
    cutilSafeCall(cudaMemcpy(&(d_material_prop[i]->table_prop), &(table_prop), sizeof(Logical*), cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(table_prop, material_prop[i]->table_prop, sizeof(Logical), cudaMemcpyHostToDevice));
}

// Step 3: upload the array of device pointers itself so kernels can index it.
cutilSafeCall(cudaMalloc((void ***)&material_prop_device, (4) * sizeof(Material_Properties_Class *)));
cutilSafeCall(cudaMemcpy(material_prop_device, d_material_prop, (4) * sizeof(Material_Properties_Class *), cudaMemcpyHostToDevice));
This implementation works, but I can't get it working for the `**prop_table` member.
I assume it must somehow follow the same principle but I just can't get my head around it.
I have already tried
// Host-side staging array of device pointers. It is indexed 1..3 (slot 0
// unused), so it needs 4 slots; 3 slots would make index 3 out of bounds.
// Value-initialised so the unused slot is NULL.
Table_Class_device **prop_table = new Table_Class_device*[4]();
and insert another loop inside the second for loop
// Deep-copy every table of material i into its own device allocation.
for (int k = 1; k <= 3; k++ )
{
    cutilSafeCall(cudaMalloc((void**)&(prop_table[k]), sizeof(Table_Class)));
    // Copy the whole object: the byte count must be sizeof(Table_Class),
    // not sizeof(Table_Class *), which would transfer only pointer-size bytes.
    cutilSafeCall(cudaMemcpy( prop_table[k], material_prop[i]->prop_table[k], sizeof( Table_Class ), cudaMemcpyHostToDevice));
}
// prop_table now holds device addresses but lives on the host. It still has
// to be uploaded to a device array, and d_material_prop[i]->prop_table has to
// be patched to point at that array (same pattern as table_prop).
Help would be much appreciated.

Some magic. Maybe it'll help:
// Table of fading coefficients together with its two axis arrays.
struct fading_coefficient
{
// Array of frequency_size doubles.
double* frequency_array;
// Array of temperature_size doubles.
double* temperature_array;
// Element count of frequency_array and of each fading_coefficients row.
int frequency_size;
// Element count of temperature_array and number of fading_coefficients rows.
int temperature_size;
// temperature_size row pointers, each row holding frequency_size doubles
// (as the upload code that follows allocates and copies them).
double** fading_coefficients;
};
// Deep-copies the host struct `fading_coefficient` (a pointer declared
// elsewhere) to the device. The host arrays are freed as they are uploaded
// and the host struct's pointers are overwritten with DEVICE addresses, so
// afterwards the host struct is only useful as the image copied to the GPU.
// NOTE(review): CUDA return codes are not checked here; wrap the calls in the
// project's error-check macro if one is available.
struct fading_coefficient* cuda_fading_coefficient;
double* frequency_array = NULL;
double* temperature_array = NULL;
double** fading_coefficients = NULL;
// Host staging array with one device row pointer per row. There are
// temperature_size rows (see the loop below), so that is the size required;
// sizing it with frequency_size overflows when temperature_size is larger.
double** fading_coefficients1 = (double **)malloc(fading_coefficient->temperature_size * sizeof(double *));

// 1-D axis arrays: allocate on the device, copy, release the host copy.
cudaMalloc((void**)&frequency_array,fading_coefficient->frequency_size *sizeof(double));
cudaMemcpy( frequency_array, fading_coefficient->frequency_array, fading_coefficient->frequency_size *sizeof(double), cudaMemcpyHostToDevice );
free(fading_coefficient->frequency_array);
cudaMalloc((void**)&temperature_array,fading_coefficient->temperature_size *sizeof(double));
cudaMemcpy( temperature_array, fading_coefficient->temperature_array, fading_coefficient->temperature_size *sizeof(double), cudaMemcpyHostToDevice );
free(fading_coefficient->temperature_array);

// 2-D table: device array of row pointers, then one device buffer per row.
cudaMalloc((void***)&fading_coefficients,fading_coefficient->temperature_size *sizeof(double*));
for (int i = 0; i < fading_coefficient->temperature_size; i++)
{
    cudaMalloc((void**)&(fading_coefficients1[i]),fading_coefficient->frequency_size *sizeof(double));
    cudaMemcpy( fading_coefficients1[i], fading_coefficient->fading_coefficients[i], fading_coefficient->frequency_size *sizeof(double), cudaMemcpyHostToDevice );
    free(fading_coefficient->fading_coefficients[i]);
}
// Upload the row pointers; the host staging array is not needed afterwards.
cudaMemcpy(fading_coefficients, fading_coefficients1, fading_coefficient->temperature_size *sizeof(double*), cudaMemcpyHostToDevice );
free(fading_coefficients1);

// Point the host struct at the device buffers, then copy the struct itself.
fading_coefficient->frequency_array = frequency_array;
fading_coefficient->temperature_array = temperature_array;
fading_coefficient->fading_coefficients = fading_coefficients;
cudaMalloc((void**)&cuda_fading_coefficient,sizeof(struct fading_coefficient));
cudaMemcpy( cuda_fading_coefficient, fading_coefficient, sizeof(struct fading_coefficient), cudaMemcpyHostToDevice );

This question comes up frequently. Multidimensional pointers are especially challenging.
If possible, it's recommended that you flatten multidimensional pointer usage (**) to single-dimensional pointer usage (*), and as you've seen, even that is somewhat cumbersome.
The single-dimensional case (*) is further described here. Although you seem to have already figured it out.
If you really want to handle the 2 dimensional (**) case, look here.
An example implementation for 3 dimensional case (***) is here. ("madness!")
Working with 2 and 3 dimensions this way is quite difficult. Thus the recommendation to flatten.

Related

Extracting data from a matlab struct in mex

I'm following this example but I'm not sure what I missed. Specifically, I have this struct in MATLAB:
a = struct; a.one = 1.0; a.two = 2.0; a.three = 3.0; a.four = 4.0;
And this is my test code in MEX ---
First, I wanted to make sure that I'm passing in the right thing, so I did this check:
/* Query how many fields the first MEX input has and print that count. */
int nfields = mxGetNumberOfFields(prhs[0]);
mexPrintf("nfields =%i \n\n", nfields);
And it does yield 4, since I have four fields.
However, when I tried to extract the value in field three:
/* Look up the field named "three" in element 0 of the struct array. */
tmp = mxGetField(prhs[0], 0, "three");
/* NOTE(review): the result of mxGetData is cast to double* and the POINTER
   is passed to %f; it is never dereferenced, so the number printed is not
   the field's value. */
mexPrintf("data =%f \n\n", (double *)mxGetData(tmp) );
It returns data =1.000000. I'm not sure what I did wrong. My logic is that I want to get the first element (hence index is 0) of the field three, so I expected data =3.00000.
Can I get a pointer or a hint?
EDITED
Ok, since you didn't provide your full code but you are working on a test, let's try to make a new one from scratch.
On Matlab side, use the following code:
% Build a struct with four numeric fields.
a.one = 1;
a.two = 2;
a.three = 3;
a.four = 4;
% Pass the struct as the only argument to the MEX function.
read_struct(a);
Now, create and compile the MEX read_struct function as follows:
#include "mex.h"
/*
 * read_struct - print the first value of every field of every element of a
 * struct array.
 *
 * nlhs, plhs : output count / output array (unused, nothing is returned)
 * nrhs, prhs : input count / input array; exactly one struct input expected
 *
 * Raises a MATLAB error when the argument count is wrong or the input is not
 * a struct. Fields that are empty or not real doubles are reported instead
 * of being dereferenced through mxGetPr.
 */
void read_struct(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    int ne;
    int nf;
    int i;
    int j;
    const char *fieldName;
    const mxArray *mxValue;

    (void)nlhs;
    (void)plhs;

    if (nrhs != 1)
        mexErrMsgTxt("One input argument required.");
    /* Let's check if the input is a struct... */
    if (!mxIsStruct(prhs[0]))
        mexErrMsgTxt("The input must be a structure.");

    ne = (int)mxGetNumberOfElements(prhs[0]);
    nf = mxGetNumberOfFields(prhs[0]);
    mexPrintf("The structure contains %i elements and %i fields.\n", ne, nf);

    for (i = 0; i < nf; ++i)
    {
        fieldName = mxGetFieldNameByNumber(prhs[0], i);
        for (j = 0; j < ne; ++j)
        {
            mxValue = mxGetFieldByNumber(prhs[0], (mwIndex)j, i);
            /* An unassigned field comes back as NULL; an assigned one may be
               empty. Neither has a first element to print. */
            if (mxValue == NULL || mxIsEmpty(mxValue))
            {
                mexPrintf("Field %s(%d) = []\n", fieldName, j);
                continue;
            }
            /* mxGetPr is only meaningful for real double arrays. */
            if (!mxIsDouble(mxValue) || mxIsComplex(mxValue))
            {
                mexPrintf("Field %s(%d) is not a real double\n", fieldName, j);
                continue;
            }
            /* j is a plain int, so it matches the %d conversion. */
            mexPrintf("Field %s(%d) = %.1f\n", fieldName, j, mxGetPr(mxValue)[0]);
        }
    }
    return;
}

/* MEX gateway: MATLAB looks for an entry point with exactly this name; the
   command name (read_struct) comes from the compiled file's name. */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    read_struct(nlhs, plhs, nrhs, prhs);
}
Does this correctly print your structure?

can a constant pointer be changed during for iterations?

I've just started learning C++ with openCV and came across a weird thing, for me at least...
in this code, there's a for loop, in which there is an assignment to a "const uchar*" as a function of the index (j).
// Walk every row except the first and the last, because each output row
// reads the row above and the row below it.
for(int j = 1; j < myImage.rows - 1; ++j)
{
// `const uchar*` is a pointer to const data: the bytes it points at cannot be
// modified through it, but the pointer variable itself is not const. Each of
// these is also a new local, declared and initialised on every pass of the
// outer loop, so nothing is ever re-assigned.
const uchar* previous = myImage.ptr<uchar>(j - 1);
const uchar* current = myImage.ptr<uchar>(j );
const uchar* next = myImage.ptr<uchar>(j + 1);
// Writable pointer into row j of the result image.
uchar* output = Result.ptr<uchar>(j);
// i runs over the row skipping nChannels entries at each end, so the
// i - nChannels and i + nChannels reads stay inside the row.
for(int i = nChannels; i < nChannels * (myImage.cols - 1); ++i)
{
// 5*centre minus the four neighbours (left, right, above, below),
// clamped to the uchar range by saturate_cast; output advances by one.
*output++ = saturate_cast<uchar>(5 * current[i]
-current[i - nChannels] - current[i + nChannels] - previous[i] - next[i]);
}
}
As far as I know, constants can only be assigned once, during the construction, so what is going on in here?
I think I am missing a big part of the algorithm, and wish to understand its nature.
Please don't refer to the implementation itself for it is known to be non effective whatsoever. Also it's my first question posted to StackOverflow, so please consider ;)

Expression result unused

I got some codes and I'm trying to fix some compiling bugs:
// Processes `frames` in place: for every frame, the sample at `channel` is
// replaced by tick(sample) and the sample that follows it receives
// lastFrame_[1]. Returns the same `frames` object.
StkFrames& PRCRev :: tick( StkFrames& frames, unsigned int channel )
{
#if defined(_STK_DEBUG_)
// Debug-only check: `channel` must leave room for one more channel after it.
if ( channel >= frames.channels() - 1 ) {
errorString_ << "PRCRev::tick(): channel and StkFrames arguments are incompatible!";
handleError( StkError::FUNCTION_ARGUMENT );
}
#endif
StkFloat *samples = &frames[channel];
// NOTE(review): the loop header adds `hop` (= channels()) on top of the
// increment inside the body, so the pointer advances channels() + 1 per
// iteration - confirm that this is the intended stride.
unsigned int hop = frames.channels();
for ( unsigned int i=0; i<frames.frames(); i++, samples += hop ) {
*samples = tick( *samples );
// `*samples++` parses as `*(samples++)`: the pointer IS advanced, but the
// value read through the old pointer is discarded, which is what the
// compiler reports. A bare `samples++;` has the same effect.
*samples++; <<<<<<<<<--------- Expression result unused.
*samples = lastFrame_[1];
}
return frames;
}
I don't understand what the code is trying to do. The code base is huge and I have already fixed quite a few problems, but googling didn't help with this one.
Any ideas?
First, you do an increment (the line which actually gives you warning).
*samples++;
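And then you assign something else through that pointer. Note that `*samples++` parses as `*(samples++)`: the increment does take effect, but the dereferenced value is never used, and that discarded read is the "unused expression result" (`samples++;` alone would be enough).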
*samples = lastFrame_[1];
I recommend you to read this code inside 'for' loop more carefully. It doesn't look very logical.

Finding log2() using sqrt()

This is an interview question I saw on some site.
It was mentioned that the answer involves forming a recurrence of log2() as follows:
// Recursive log2 attempt that repeatedly takes the square root of x.
// NOTE(review): mathematically log2(sqrt(x)) * 2 equals log2(x) for every
// x > 0, so the extra "+ 1" in the last branch is not part of that identity.
double log2(double x )
{
// Base case: returns 1 for every x <= 2, including x == 1 (true log2 is 0).
if ( x<=2 ) return 1;
// IsSqureNum is not defined in this snippet; presumably it tests whether x
// is a perfect square - confirm.
if ( IsSqureNum(x) )
return log2(sqrt(x) ) * 2;
return log2( sqrt(x) ) * 2 + 1; // Why the plus one here.
}
as for the recurrence, clearly the +1 is wrong. Also, the base case is also erroneous.
Does anyone know a better answer?
How is log() and log10() actually implemented in C.
Perhaps I have found the exact answers the interviewers were looking for. From my part, I would say it's little bit difficult to derive this under interview pressure. The idea is, say you want to find log2(13), you can know that it lies between 3 to 4. Also 3 = log2(8) and 4 = log2(16),
From the properties of logarithms, we know that log2(sqrt(8*16)) = (log2(8) + log2(16))/2 = (3+4)/2 = 3.5.
Now, sqrt(8*16) = 11.3137 and log2(11.3137) = 3.5. Since 11.3137<13, we know that our desired log2(13) would lie between 3.5 and 4 and we proceed to locate that. It is easy to notice that this has a Binary Search solution and we iterate up to a point when our value converges to the value whose log2() we wish to find. Code is given below:
// Base-2 logarithm computed with sqrt() only.
//
// The argument is first scaled by exact powers of two into [1, 2) while the
// integer part of the result is counted. The fractional part is then found by
// bisection: the exponent interval [lo, hi] is halved while the matching value
// interval [lval, rval] is replaced by its geometric mean sqrt(lval * rval),
// because log2(sqrt(a * b)) = (log2(a) + log2(b)) / 2.
//
// Returns NaN for negative or NaN input, -HUGE_VAL for 0 and HUGE_VAL for
// +infinity. The result is accurate to about 1e-12.
double Log2(double val)
{
    if (val != val || val < 0.0)
        return NAN;
    if (val == 0.0)
        return -HUGE_VAL;
    if (val == HUGE_VAL)
        return HUGE_VAL;

    // Integer part. Scaling in floating point avoids the int shifts (1 << n),
    // which are undefined for val <= 1 and overflow for val >= 2^31.
    double lo = 0.0;
    while (val >= 2.0) { val *= 0.5; lo += 1.0; }
    while (val < 1.0)  { val *= 2.0; lo -= 1.0; }

    double hi = lo + 1.0;
    double lval = 1.0;
    double rval = 2.0;

    // Stop on the width of the exponent interval, which halves every pass, so
    // the loop always terminates. A test on |lval - val| can never be met
    // when val is so large that neighbouring doubles are further apart than
    // the tolerance.
    while (hi - lo > 1e-12)
    {
        const double mid = (lo + hi) / 2;
        const double midValue = sqrt(lval * rval);
        if (midValue > val)
        {
            hi = mid;
            rval = midValue;
        }
        else
        {
            lo = mid;
            lval = midValue;
        }
    }
    return lo;
}
It's been a long time since I've written pure C, so here it is in C++ (I think the only difference is the output function, so you should be able to follow it):
#include <cmath>
#include <iostream>
using namespace std;
// Resolution of the fractional part: bits are produced until 2^-n drops
// below this value (34 bits for 1e-10).
const static double CUTOFF = 1e-10;

// Fractional-bit extraction for log2(x), x expected in (1, 2].
//
// x              : value whose remaining logarithm is being extracted
// power          : 2^(2^-n), the threshold for the current bit
// twoToTheMinusN : weight 2^-n of the current bit
// accumulator    : bits collected so far (callers start with 0)
//
// Each step yields one bit: if x exceeds `power` the bit is 1 and x is
// divided by it; `power` is then replaced by its square root. Returns the
// collected bits scaled to a fraction.
double log2_aux(double x, double power, double twoToTheMinusN, unsigned int accumulator) {
    // The bits are gathered in a double, which is exact up to 2^53. With the
    // CUTOFF above 34 bits are produced; a 32-bit unsigned int loses the top
    // ones, so every fraction >= 0.5 came out wrong.
    double bits = accumulator;
    while (twoToTheMinusN >= CUTOFF) {
        int thisBit = 0;
        if (x > power) {
            thisBit = 1;
            x /= power;
        }
        bits = bits * 2 + thisBit;
        power = sqrt(power);
        twoToTheMinusN /= 2.0;
    }
    return bits * twoToTheMinusN * 2;
}

// log2(x) built on log2_aux.
//
// Reduces x to the range (1, 2] with log identities and adds the fraction
// from log2_aux. Returns NaN for negative or NaN input, -HUGE_VAL for 0 and
// HUGE_VAL for +infinity; those inputs used to recurse without end.
double mylog2(double x) {
    if (x != x || x < 0)
        return NAN;
    if (x == 0)
        return -HUGE_VAL;
    if (x == HUGE_VAL)
        return x;
    if (x < 1)
        return -mylog2(1.0 / x);
    if (x == 1)
        return 0;

    // log2(x) = log2(x / 2) + 1, applied until x <= 2.
    double whole = 0;
    while (x > 2.0) {
        x /= 2.0;
        whole += 1;
    }
    return whole + log2_aux(x, 2.0, 1.0, 0);
}
// Demo driver: prints each sample label followed by mylog2 of that sample.
int main() {
    static const char *const labels[] = { "5 ", "1.25 " };
    static const double samples[] = { 5, 1.25 };
    const int sampleCount = sizeof(samples) / sizeof(samples[0]);

    for (int s = 0; s != sampleCount; ++s) {
        cout << labels[s] << mylog2(samples[s]) << "\n";
    }
    return 0;
}
The function 'mylog2' does some simple log trickery to get a related number which is between 1 and 2, then call log2_aux with that number.
The log2_aux more or less follows the algorithm that Scorpi0 linked to above. At each step, you get 1 bit of the result. When you have enough bits, stop.
If you can get a hold of a copy, the Feynman Lectures on Physics, number 23, starts off with a great explanation of logs and more or less how to do this conversion. Vastly superior to the Wikipedia article.

Carefully deleting N items from a "circular" vector (or perhaps just an NSMutableArray)

Imagine a std::vector, say, with 100 things in it (0 to 99) currently. You are treating it as a loop. So the 105th item is index 4; forward 7 from index 98 is 5.
You want to delete N items after index position P.
So, delete 5 items after index 50; easy.
Or 5 items after index 99: as you delete 0 five times, or 4 through 0, noting that position at 99 will be erased from existence.
Worst, 5 items after index 97 - you have to deal with both modes of deletion.
What's the elegant and solid approach?
Here's a boring routine I wrote
// Removes up to `desired` objects that follow index `nn` in `original`.
// In the non-loop case removal stops at the end of the array; in the loop
// case removal continues from index 0 once the end has been reached.
-(void)knotRemovalHelper:(NSMutableArray*)original
after:(NSInteger)nn howManyToDelete:(NSInteger)desired
{
// Current element count, re-evaluated on every use because the array shrinks.
#define ORCO ((NSInteger)[original count])
// Declared static, so these keep their storage between calls.
static NSInteger kount, howManyUntilLoop, howManyExtraAferLoop;
// The condition below is a placeholder in the original post, not real code.
if ( ... our array is NOT a loop ... )
// trivial, if messy...
{
for ( kount = 1; kount<=desired; ++kount )
{
// Nothing left after nn: stop early.
if ( (nn+1) >= ORCO )
return;
// Always index nn+1: the following items shift down after each removal.
[original removeObjectAtIndex:( nn+1 )];
}
return;
}
else // our array is a loop
// messy, confusing and inelegant. how to improve?
// here we go...
{
// Number of items between nn and the end of the array.
howManyUntilLoop = (ORCO-1) - nn;
// Enough items after nn: no wrap-around needed.
if ( howManyUntilLoop > desired )
{
for ( kount = 1; kount<=desired; ++kount )
[original removeObjectAtIndex:( nn+1 )];
return;
}
// Otherwise remove everything after nn, then the remainder from the front.
howManyExtraAferLoop = desired - howManyUntilLoop;
for ( kount = 1; kount<=howManyUntilLoop; ++kount )
[original removeObjectAtIndex:( nn+1 )];
for ( kount = 1; kount<=howManyExtraAferLoop; ++kount )
[original removeObjectAtIndex:0];
return;
}
#undef ORCO
}
Update!
InVariant's second answer leads to the following excellent solution. "starting with" is much better than "starting after". So the routine now uses "start with". Invariant's second answer leads to this very simple solution...
N times do if P < currentsize remove P else remove 0
// Removes `countToDelete` objects from `ra`, treating it as circular and
// starting WITH index `removeThisOneFirst`.
-(void)removeLoopilyFrom:(NSMutableArray*)ra
startingWithThisOne:(NSInteger)removeThisOneFirst
howManyToDelete:(NSInteger)countToDelete
{
// exception if removeThisOneFirst > ra highestIndex
// exception if countToDelete is > ra size
// so easy thanks to Invariant:
// The loop header below is a placeholder in the original post, not real code.
for ( do this countToDelete times )
{
// While the start index still exists, remove there (later items shift down
// into it); once the array is shorter than that, continue from the front.
if ( removeThisOneFirst < [ra count] )
[ra removeObjectAtIndex:removeThisOneFirst];
else
[ra removeObjectAtIndex:0];
}
}
Update!
Toolbox has pointed out the excellent idea of working to a new array - super KISS.
Here's an idea off the top of my head.
First, generate an array of integers representing the indices to remove. So "remove 5 from index 97" would generate [97,98,99,0,1]. This can be done with the application of a simple modulus operator.
Then, sort this array descending giving [99,98,97,1,0] and then remove the entries in that order.
Should work in all cases.
This solution seems to work, and it copies all remaining elements in the vector only once (to their final destination).
Assume kNumElements, kStartIndex, and kNumToRemove are defined as const size_t values.
vector<int> my_vec(kNumElements);
for (size_t i = 0; i < my_vec.size(); ++i) {
my_vec[i] = i;
}
for (size_t i = 0, cur = 0; i < my_vec.size(); ++i) {
// What is the "distance" from the current index to the start, taking
// into account the wrapping behavior?
size_t distance = (i + kNumElements - kStartIndex) % kNumElements;
// If it's not one of the ones to remove, then we keep it by copying it
// into its proper place.
if (distance >= kNumToRemove) {
my_vec[cur++] = my_vec[i];
}
}
my_vec.resize(kNumElements - kNumToRemove);
There's nothing wrong with two loop solutions as long as they're readable and don't do anything redundant. I don't know Objective-C syntax, but here's the pseudocode approach I'd take:
itemsToEnd = Len - after - 1 //how many items sit after index 'after'
if (howManyToDelete > itemsToEnd) //will have a second loop
firstpass = itemsToEnd; //handle end in the first loop, beginning in second
else
firstpass = howManyToDelete; //the first loop will get them all
for (kount = 0; kount < firstpass; kount++)
remove after+1
for ( ; kount < howManyToDelete; kount++) //if firstpass < howManyToDelete, clean up leftovers
remove 0
This solution doesn't use mod, does the limit calculation outside the loop, and touches the relevant samples once each. The second for loop won't execute if all the samples were handled in the first loop.
The common way to do this in DSP is with a circular buffer. This is just a fixed length buffer with two associated counters:
//make sure BUFSIZE is a power of 2 for quick mod trick
#define BUFSIZE 1024
int CircBuf[BUFSIZE];
// Running totals of items written / read; only their low bits are used as
// buffer positions.
int InCtr, OutCtr;

// Appends `count` items from Buf to the circular buffer.
// `count` must not exceed BUFSIZE; the second loop has no bound of its own.
void PutData(int *Buf, int count) {
    int srcCtr;
    int destCtr = InCtr & (BUFSIZE - 1); // if BUFSIZE is a power of 2, equivalent to and faster than destCtr = InCtr % BUFSIZE
    // First pass: fill up to the physical end of the buffer.
    for (srcCtr = 0; (srcCtr < count) && (destCtr < BUFSIZE); srcCtr++, destCtr++)
        CircBuf[destCtr] = Buf[srcCtr];
    // Second pass: whatever is left wraps around to the start.
    for (destCtr = 0; srcCtr < count; srcCtr++, destCtr++)
        CircBuf[destCtr] = Buf[srcCtr];
    InCtr += count;
}

// Copies the next `count` items out of the circular buffer into Buf.
// `count` must not exceed BUFSIZE.
void GetData(int *Buf, int count) {
    int srcCtr = OutCtr & (BUFSIZE - 1);
    int destCtr;
    // First pass: read up to the physical end of the buffer.
    for (destCtr = 0; (srcCtr < BUFSIZE) && (destCtr < count); srcCtr++, destCtr++)
        Buf[destCtr] = CircBuf[srcCtr];
    // Second pass: only the items still missing are read from the start.
    // The loop is bounded by destCtr; bounding it by srcCtr would write
    // `count` extra items past the end of Buf on every call.
    for (srcCtr = 0; destCtr < count; srcCtr++, destCtr++)
        Buf[destCtr] = CircBuf[srcCtr];
    OutCtr += count;
}

// Non-zero when more unread data has been written than the buffer can hold.
int BufferOverflow() {
    return ((InCtr - OutCtr) > BUFSIZE);
}
This is pretty lightweight, but effective. And aside from the ctr = BigCtr & (SIZE-1) stuff, I'd argue it's highly readable. The only reason for the & trick is in old DSP environments, mod was an expensive operation so for something that ran often, like every time a buffer was ready for processing, you'd find ways to remove stuff like that. And if you were doing FFT's, your buffers were probably a power of 2 anyway.
These days, of course, you have 1 GHz processors and magically resizing arrays. You kids get off my lawn.
Another method:
N times do {remove entry at index P mod max(ArraySize, P)}
Example:
N=5, P=97, ArraySize=100
1: max(100, 97)=100 so remove at 97%100 = 97
2: max(99, 97)=99 so remove at 97%99 = 97 // array size is now 99
3: max(98, 97)=98 so remove at 97%98 = 97
4: max(97, 97)=97 so remove at 97%97 = 0
5: max(96, 97)=97 so remove at 97%97 = 0
I don't program for the iPhone for now, so I'll imagine a std::vector; it's quite easy, simple and elegant enough:
#include <iostream>
using std::cout;
#include <vector>
using std::vector;
#include <cassert> //no need for using, assert is macro
// Erases `count` consecutive elements from `vec`, treating it as circular.
//
// vec      : vector to modify in place
// position : index of the first element to erase; values >= vec.size() wrap
// count    : number of elements to erase; must not exceed vec.size()
//            (asserted in debug builds, clamped otherwise)
//
// The run is split into the part that reaches the end of the vector and the
// part that wraps to the front. Each part is removed with one range erase.
template<typename T>
void eraseCircularVector(vector<T> & vec, size_t position, size_t count)
{
    assert(count <= vec.size());
    if (count > vec.size())
        count = vec.size();   // keep release builds inside the vector
    if (count == 0)
        return;               // also covers the empty vector

    position %= vec.size(); //normalize position

    // Sizes are computed explicitly rather than from a wrapped end index:
    // with count == vec.size() the wrapped end equals the start, which looks
    // like an empty range and erased nothing.
    const size_t untilEnd = vec.size() - position;
    const size_t tailCount = count < untilEnd ? count : untilEnd;
    const size_t headCount = count - tailCount;

    // Tail first: it does not shift the elements at the front.
    vec.erase(vec.begin() + position, vec.begin() + position + tailCount);
    vec.erase(vec.begin(), vec.begin() + headCount);
}
// Writes "Values: ", then every element followed by a space, then a newline.
static void printValues(const vector<int> & values)
{
    cout << "Values: ";
    for (size_t idx = 0; idx < values.size(); ++idx)
        cout << values[idx] << ' ';
    cout << '\n';
}

// Demo: fill 0..9, show it, erase two circular runs, show the remainder.
int main()
{
    vector<int> values;
    int next = 0;
    while (next < 10)
        values.push_back(next++);

    printValues(values);

    eraseCircularVector(values, 5, 1); //remains 9: 0,1,2,3,4,6,7,8,9
    eraseCircularVector(values, 16, 5); //remains 4: 3,4,6,7

    printValues(values);
    return 0;
}
However, you might consider:
creating new loop_vector class, if you use this kind of functionality enough
using list if you perform many deletions (or few deletions (not from end, that's simple pop_back) but large array)
If your container (NSMutableArray or whatever) is not list, but vector (i.e. resizable array), you most definitely don't want to delete items one by one, but whole range (e.g. std::vector's erase(begin, end)!
Edit: reacting to comment, to fully realize what must be done by vector, if you erase element other than the last one: it must copy all values after that element (e.g. 1000 items in array, you erase first, 999x copying (moving) of item, that is very costly).
Example:
#include <iostream>
#include <vector>
#include <ctime>
using namespace std;
// Micro-benchmark: emptying a vector by erasing its first element repeatedly
// versus erasing the whole range in one call. Times come from clock() and are
// printed in seconds.
int main()
{
    clock_t start, end;
    vector<int> vec;
    const int items = 64 * 1024;
    cout << "using " << items << " items in vector\n";

    // int index: matches the type of `items`, no signed/unsigned comparison.
    for (int i = 0; i < items; ++i) vec.push_back(i);
    start = clock();
    // Each erase at the front shifts every remaining element down by one.
    while (!vec.empty()) vec.erase(vec.begin());
    end = clock();
    // Ticks divided by CLOCKS_PER_SEC are seconds, so the unit label says so.
    cout << "Inefficient method took: "
         << (end - start) * 1.0 / CLOCKS_PER_SEC << " s\n";

    for (int i = 0; i < items; ++i) vec.push_back(i);
    start = clock();
    // One range erase removes everything without any shifting.
    vec.erase(vec.begin(), vec.end());
    end = clock();
    cout << "Efficient method took: "
         << (end - start) * 1.0 / CLOCKS_PER_SEC << " s\n";
    return 0;
}
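Produces output (the figures are elapsed seconds, i.e. clock ticks divided by CLOCKS_PER_SEC, despite the "ms" label):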
using 65536 items in vector
Inefficient method took: 1.705 ms
Efficient method took: 0 ms
Note that it's very easy to end up with the inefficient version; have a look, e.g., at http://www.cplusplus.com/reference/stl/vector/erase/