CUDA fft different results from MATLAB fft - matlab

I have tried to do a simple fft and compare the results between MATLAB and CUDA.
MATLAB:
Vector of 9 numbers 1-9
I = [1 2 3 4 5 6 7 8 9];
and use this code:
fft(I)
gives the results:
45.0000 + 0.0000i
-4.5000 +12.3636i
-4.5000 + 5.3629i
-4.5000 + 2.5981i
-4.5000 + 0.7935i
-4.5000 - 0.7935i
-4.5000 - 2.5981i
-4.5000 - 5.3629i
-4.5000 -12.3636i
And CUDA code:
int FFT_Test_Function() {
int n = 9;
double* in = new double[n];
Complex* out = new Complex[n];
for (int i = 0; i<n; i++)
{
in[i] = i + 1;
}
// Allocate the buffer
cufftDoubleReal *d_in;
cufftDoubleComplex *d_out;
unsigned int out_mem_size = sizeof(cufftDoubleComplex)*n;
unsigned int in_mem_size = sizeof(cufftDoubleReal)*n;
cudaMalloc((void **)&d_in, in_mem_size);
cudaMalloc((void **)&d_out, out_mem_size);
// Save time stamp
milliseconds timeStart = getCurrentTimeStamp();
cufftHandle plan;
cufftResult res = cufftPlan1d(&plan, n, CUFFT_D2Z, 1);
if (res != CUFFT_SUCCESS) { cout << "cufft plan error: " << res << endl; return 1; }
cudaCheckErrors("cuda malloc fail");
cudaMemcpy(d_in, in, in_mem_size, cudaMemcpyHostToDevice);
cudaCheckErrors("cuda memcpy H2D fail");
res = cufftExecD2Z(plan, d_in, d_out);
if (res != CUFFT_SUCCESS) { cout << "cufft exec error: " << res << endl; return 1; }
cudaMemcpy(out, d_out, out_mem_size, cudaMemcpyDeviceToHost);
cudaCheckErrors("cuda memcpy D2H fail");
milliseconds timeEnd = getCurrentTimeStamp();
milliseconds totalTime = timeEnd - timeStart;
std::cout << "Total time: " << totalTime.count() << std::endl;
return 0;
}
In this CUDA code i got the result:
You can see that CUDA gives 4 zeros (cells 5-9).
What am I missing?
Thank you very much for your attention!

CUFFT_D2Z is a real-to-complex FFT, so cuFFT only writes the first N/2 + 1 output points (integer division); the remaining N - (N/2 + 1) points are redundant - they are just the complex conjugates of the corresponding points in the bottom half of the transform (you can see this in the MATLAB output if you compare pairs of terms which are mirrored about the mid-point).
You can fill in these "missing" terms if you need them, by just taking the complex conjugate of each corresponding term, but usually there isn't much point in doing this.

Related

Saving Nef polyhedron as Polyhedron_3 or Surface_mesh gives different results

I wanted to save a Nef polyhedron into an OFF file for visualizing it. As written in the CGAL Nef polyhedra user manual (see paragraphs 5.4 and 5.5), a Nef polyhedron can be converted both to a Polyhedron_3 or a Surface_mesh.
However, I noticed that when converting to those structures and then saving it into an OFF file, the results are different.
Here I report the code for a minimal example:
#include <list>
#include <iostream>
#include <fstream>
#include <CGAL/Exact_predicates_exact_constructions_kernel.h>
#include <CGAL/Polyhedron_3.h>
#include <CGAL/Nef_polyhedron_3.h>
#include <CGAL/IO/Nef_polyhedron_iostream_3.h>
#include <CGAL/Surface_mesh.h>
#include <CGAL/boost/graph/convert_nef_polyhedron_to_polygon_mesh.h>
typedef CGAL::Exact_predicates_exact_constructions_kernel Kernel;
typedef Kernel::Point_3 Point_3;
typedef CGAL::Surface_mesh<Point_3> Mesh;
typedef CGAL::Polyhedron_3<Kernel> Polyhedron_3;
typedef CGAL::Nef_polyhedron_3<Kernel> Nef_polyhedron;
typedef Kernel::Vector_3 Vector_3;
typedef Kernel::Aff_transformation_3 Aff_transformation_3;
int convertStlToOff(const char* inputFilename, const char* outputFilename)
{
//read 80 bytes and put in std::cerr
std::ifstream obj(inputFilename, std::ios::in | std::ios::binary);
for (int i = 0; i < 80; i++) {
boost::uint8_t c;
obj.read(reinterpret_cast<char*>(&c), sizeof(c));
std::cerr << c;
}
std::cerr << std::endl;
//read 4 bytes and initialize number of triangles
boost::uint32_t N32;
obj.read(reinterpret_cast<char*>(&N32), sizeof(N32));
unsigned int N = N32;
std::cerr << N << " triangles" << std::endl;
//reserve space for N faces
std::vector<Point_3> points;
std::map<Point_3, int> pmap;
typedef boost::tuple<int, int, int> Face;
std::vector<Face> faces;
faces.reserve(N);
//read all faces
int number_of_points = 0;
int number_of_snapped_points = 0;
for (int i = 0; i < N; i++)
{
//read face normal (it is ignored)
float normal[3];
obj.read(reinterpret_cast<char*>(&normal[0]), sizeof(normal[0]));
obj.read(reinterpret_cast<char*>(&normal[1]), sizeof(normal[1]));
obj.read(reinterpret_cast<char*>(&normal[2]), sizeof(normal[2]));
//read coordinates of all 3 points
int index[3];
for (int j = 0; j < 3; j++)
{
float x, y, z;
obj.read(reinterpret_cast<char*>(&x), sizeof(x));
obj.read(reinterpret_cast<char*>(&y), sizeof(y));
obj.read(reinterpret_cast<char*>(&z), sizeof(z));
Point_3 p(x, y, z);
if (pmap.find(p) == pmap.end())
{
// check brute force if there is a close point
bool found_close_point = false;
/*for (int k = 0; k < points.size(); k++)
{
if (sqrt(CGAL::squared_distance(p, points[k])) < 0.00001)
{
index[j] = k;
found_close_point = true;
number_of_snapped_points++;
}
}*/
if (!found_close_point)
{
points.push_back(p);
index[j] = number_of_points;
pmap[p] = number_of_points++;
}
}
else {
index[j] = pmap[p];
}
}
faces.push_back(boost::make_tuple(index[0], index[1], index[2]));
//read two additional bytes, and ignore them
char c;
obj.read(reinterpret_cast<char*>(&c), sizeof(c));
obj.read(reinterpret_cast<char*>(&c), sizeof(c));
}
std::cerr << number_of_snapped_points << " snapped points" << std::endl;
std::ofstream outputFile(outputFilename);
outputFile.precision(20);
outputFile << "OFF\n" << points.size() << " " << faces.size() << " 0" << std::endl;
for (int i = 0; i < points.size(); i++)
{
outputFile << points[i] << std::endl;
}
for (int i = 0; i < faces.size(); i++)
{
outputFile << "3 " << boost::get<0>(faces[i]) << " " << boost::get<1>(faces[i]) << " " << boost::get<2>(faces[i]) << std::endl;
}
return 0;
}
// Fills `poly` with a cube spanning [-1, 1] on every axis by parsing an
// embedded OFF description: 8 vertices followed by 12 triangular facets.
void fill_cube_1(Polyhedron_3 & poly)
{
    // Adjacent literals are concatenated by the compiler into one OFF text.
    static const char* const cubeOff =
        "OFF\n"
        "8 12 0\n"
        "-1 -1 -1\n"
        "-1 1 -1\n"
        "1 1 -1\n"
        "1 -1 -1\n"
        "-1 -1 1\n"
        "-1 1 1\n"
        "1 1 1\n"
        "1 -1 1\n"
        "3 0 1 3\n"
        "3 3 1 2\n"
        "3 0 4 1\n"
        "3 1 4 5\n"
        "3 3 2 7\n"
        "3 7 2 6\n"
        "3 4 0 3\n"
        "3 7 4 3\n"
        "3 6 4 7\n"
        "3 6 5 4\n"
        "3 1 5 6\n"
        "3 2 1 6";
    std::stringstream offText;
    offText << cubeOff;
    offText >> poly;
}
// Selects the intermediate structure a Nef polyhedron is converted to before
// it is written out by saveNefObjectInOffFile.
enum savingModality
{
    SAVE_AS_POLYHEDRON_3,  // 0: convert to Polyhedron_3
    SAVE_AS_SURFACE_MESH   // 1: convert to Surface_mesh
};
// Writes a Nef polyhedron to `filename` in OFF format.
//
// offObject : the Nef polyhedron to export; it must be simple.
// filename  : path of the OFF file to create.
// modality  : intermediate structure used for the export.
//
// Both modalities now produce a triangulated surface: convert_to_Polyhedron
// output is triangulated, and the Surface_mesh conversion is asked to
// triangulate all faces as well. Without that flag the Surface_mesh output
// keeps faces that some viewers cannot display.
//
// Returns 0 on success; 1 if the object is not simple, the modality is
// unknown, or the file cannot be opened or written.
int saveNefObjectInOffFile(Nef_polyhedron offObject, const char* filename, savingModality modality)
{
    if (!offObject.is_simple())
    {
        printf("Object is not simple. Cannot convert to mesh or polyhedron\n");
        return 1;
    }
    // Reject bad modalities before creating the file so no empty file is left.
    if (modality != SAVE_AS_POLYHEDRON_3 && modality != SAVE_AS_SURFACE_MESH)
    {
        printf("Unknown saving modality %d\n", static_cast<int>(modality));
        return 1;
    }
    std::ofstream outStream(filename);
    if (!outStream)
    {
        printf("Cannot open %s for writing\n", filename);
        return 1;
    }
    if (modality == SAVE_AS_POLYHEDRON_3)
    {
        Polyhedron_3 outputPolyhedron;
        offObject.convert_to_Polyhedron(outputPolyhedron);
        outStream << outputPolyhedron;
    }
    else
    {
        Mesh outputMesh;
        // Third argument: triangulate_all_faces.
        CGAL::convert_nef_polyhedron_to_polygon_mesh(offObject, outputMesh, true);
        outStream << outputMesh;
    }
    outStream.close();
    return outStream ? 0 : 1;
}
// Builds the difference of a cube and a scaled, translated copy of it, then
// saves the result through both export modalities.
// Returns 0 if both files were written, otherwise the first failing status.
int main()
{
    //construct nef object #1
    Polyhedron_3 cube1;
    fill_cube_1(cube1);
    Nef_polyhedron nefObject1(cube1);
    //construct nef object #2
    Nef_polyhedron nefObject2(cube1);
    // NOTE(review): the trailing 2 is the homogenizing factor hw; per the CGAL
    // documentation the matrix entries are divided by it, so this is a uniform
    // scaling by 1/2 - confirm that this is the intended factor.
    Aff_transformation_3 scale2(1, 0, 0,
                                0, 1, 0,
                                0, 0, 1,
                                2);
    nefObject2.transform(scale2);
    Aff_transformation_3 translation2(CGAL::TRANSLATION, Vector_3(-0.5, -0.5, -0.5));
    nefObject2.transform(translation2);
    //construct nef object #3
    Nef_polyhedron nefObject3;
    nefObject3 = nefObject1 - nefObject2;
    //save results into .off file (both exports are always attempted)
    const int retPolyhedron = saveNefObjectInOffFile(nefObject3, "out1.off", SAVE_AS_POLYHEDRON_3);
    const int retMesh = saveNefObjectInOffFile(nefObject3, "out2.off", SAVE_AS_SURFACE_MESH);
    return (retPolyhedron != 0) ? retPolyhedron : retMesh;
}
and the screenshots of the visualization of the two files: saving as Polyhedron_3 and saving as Surface_mesh. As you can see, it seems as if some faces were missing.
My question is: "Why the results are visualized different?"
The output to Polyhedron_3 is triangulated while the output to Surface_mesh is not. I guess there is a bug in MeshLab when displaying non-convex faces.
If you look at the documentation, you will see that there is a Boolean parameter that controls whether the output is triangulated.

Approximation of 1-exp(-mu*t) when mu*t is very small

I am working on some fairly simple linear attenuation and absorption calculations and from high school math I seem to remember that there is an approximation of:
1-exp(-mu*t)
When
mu*t << 1
Does this approximation exist? I thought it was a taylor series expansion but could not convince myself after looking through old math textbooks.
Any help or direction is greatly appreciated.
mu*t plus O((mu*t)^2)
To see why, try rewriting this as f(u) = 1-exp(-u), and taking a Taylor series expansion at the point u=0.
If you are using C++11, for example, it has this function as part of the standard library: expm1.
In your case, you would call it as -expm1(-mu*t).
Otherwise, you can derive the Maclaurin series for expm1 easily from the Maclaurin series for exp(x) by simply dropping the first 1. One implementation is given below in expm1_maclaurin.
Comparing this with the built-in expm1:
#include <cmath>
#include <iostream>
#include <limits>
using namespace std;
// Approximates exp(x) - 1 with the degree-10 Maclaurin polynomial
//   x + x^2/2! + ... + x^10/10!
// evaluated in nested form, from the innermost (highest-order) factor
// outwards: x * (1 + x/2 * (1 + x/3 * ( ... (1 + x/10)))).
double expm1_maclaurin( double x )
{
    const int highestOrder = 10;
    double nested = 1.0;
    int divisor = highestOrder;
    while( divisor > 1 )
    {
        nested = 1.0 + x*nested/divisor;
        --divisor;
    }
    return x*nested;
}
// Prints, for x = 1, 1/2, 1/4, ..., 2^-31 and finally 0, the library expm1(x),
// the Maclaurin approximation, and whether the two are bit-for-bit equal.
int main()
{
    std::cout.precision(std::numeric_limits<double>::digits10);
    for( int exponent = 0 ; exponent <= 32 ; ++exponent )
    {
        // The last iteration (exponent == 32) probes x == 0.
        double x = 0;
        if( exponent < 32 )
            x = 1.0 / (1u<<exponent);
        const double reference = std::expm1(x);
        const double approximation = expm1_maclaurin(x);
        const bool identical = ( reference == approximation );
        std::cout << "x=" << x << ' '
                  << reference << ' '
                  << approximation << ' '
                  << identical << std::endl;
    }
    return 0;
}
Output:
x=1 1.71828182845905 1.71828180114638 0
x=0.5 0.648721270700128 0.648721270687366 0
x=0.25 0.284025416687742 0.284025416687735 0
x=0.125 0.133148453066826 0.133148453066826 1
x=0.0625 0.0644944589178594 0.0644944589178594 1
x=0.03125 0.0317434074991027 0.0317434074991027 1
...
For all positive x <= 1/8 the result is equal to full double precision of expm1.

CUFFT: trying to implement row by row fft of a matrix

I am trying to replicate matlab fft functionality, where it does a row by row (or column by column) fft of a matrix. Each row would be one of the batches in the cufft plan.
I can get it working using cufftExecC2C (the commented out part in the code below works), but not cufftExecR2C. My code is using cufftPlan1d, but ideally I want to implement it using cufftPlanMany.
I am wondering what I'm doing wrong, and if there is a better way of doing this. Thank you.
// linker -> input -> additional dependencies -> add 'cufft.lib'
// VC++ Directories -> include directories - > add 'C:\ProgramData\NVIDIA Corporation\CUDA Samples\v6.0\common\inc'
#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <cuda_runtime.h>
#include <iostream>
#define NX 6
#define NY 5
void printArray(float *my_array);
void printComplexArray(float2 *my_array);
int main(){
/************************************************************ C2C ************************************************************/
/*
float2 *initial_array = (float2 *)malloc(sizeof(float2) * NX * NY);
for (int h = 0; h < NX; h++){
for (int w = 0; w < NY; w++){
initial_array[NY * h + w].x = 0;
initial_array[NY * h + w].y = 0;
}
}
initial_array[NY*3 + 0].x = 1;
initial_array[NY*5 + 0].x = 1;
printComplexArray(initial_array);
float2 *transformed_array= (float2 *)malloc(sizeof(float2) * NX * NY);
cufftComplex *gpu_initial_array;
cufftComplex *gpu_transformed_array;
cudaMalloc((void **)&gpu_initial_array, NX*NY*sizeof(cufftComplex));
cudaMalloc((void **)&gpu_transformed_array, NX*NY*sizeof(cufftComplex));
cudaMemcpy(gpu_initial_array, initial_array, NX*NY*sizeof(float2), cudaMemcpyHostToDevice);
cufftHandle plan;
cufftPlan1d(&plan, NY, CUFFT_C2C, NX);
cufftExecC2C(plan, gpu_initial_array, gpu_transformed_array, CUFFT_FORWARD);
cudaMemcpy(transformed_array, gpu_transformed_array, NX*NY*sizeof(cufftComplex), cudaMemcpyDeviceToHost);
printComplexArray(transformed_array);
*/
/************************************************************ C2C ************************************************************/
/************************************************************ R2C ************************************************************/
float *initial_array = (float *)malloc(sizeof(float) * NX * NY);
for (int h = 0; h < NX; h++){
for (int w = 0; w < NY; w++)
initial_array[NY * h + w] = 0;
}
initial_array[NY*3 + 0] = 1;
printArray(initial_array);
float2 *transformed_array= (float2 *)malloc(sizeof(float2) * (NY/2+1) * NX);
cufftReal *gpu_initial_array;
cufftComplex *gpu_transformed_array;
cudaMalloc((void **)&gpu_initial_array, NX*NY*sizeof(cufftReal));
cudaMalloc((void **)&gpu_transformed_array, (NY/2+1)*NX*sizeof(cufftComplex));
cudaMemcpy(gpu_initial_array, initial_array, NX*NY*sizeof(float), cudaMemcpyHostToDevice);
cufftHandle plan;
cufftPlan1d(&plan, NY, CUFFT_R2C, NX);
// ***** cufftPlanMany *****
//int n[2] = {NX, NY};
//cufftPlanMany(&plan,1,n,NULL,1,0,NULL,1,0,CUFFT_R2C,NX);
cufftExecR2C(plan, gpu_initial_array, gpu_transformed_array);
cudaMemcpy(transformed_array, gpu_transformed_array, NX*(NY/2+1)*sizeof(cufftComplex), cudaMemcpyDeviceToHost);
printComplexArray(transformed_array);
/************************************************************ R2C ************************************************************/
cufftDestroy(plan);
free(initial_array);
free(transformed_array);
cudaFree(gpu_initial_array);
cudaFree(gpu_transformed_array);
std::system("pause");
return 0;
}
// Prints an NX x NY row-major array of floats, one row per line with " | "
// after every value, followed by a blank line.
void printArray(float *my_array){
    for (int idx = 0; idx < NX * NY; idx++){
        std::cout << my_array[idx] << " | ";
        // Break the line after the last column of each row.
        if (idx % NY == NY - 1)
            std::cout << std::endl;
    }
    std::cout << std::endl;
}
// Prints an NX x NY row-major array of complex values as "re + im | ", one
// row per line, followed by a blank line. The row stride is NY, so the
// argument must hold NX*NY elements.
void printComplexArray(float2 *my_array){
    for (int row = 0; row < NX; row++){
        const float2 *rowStart = my_array + NY * row;
        for (int col = 0; col < NY; col++){
            const float2 &value = rowStart[col];
            std::cout << value.x << " + " << value.y << " | ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}
It seems that your issue resides in the way you print out the result. You cannot use the same routine to print for the two cases of CUFFT_R2C and CUFFT_C2C. In the former case, you have a (NY/2+1)*NX sized output, while in the latter case you have a NY*NX sized output. The fixed code below should work.
Also, it would be also good to add proper CUDA error check and CUFFT error check, which I have also added to the code below.
#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <cuda_runtime.h>
#include <assert.h>
#include <iostream>
#define NX 6
#define NY 5
void printArray(float *my_array);
void printComplexSymmetricArray(float2 *my_array);
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/*********************/
/* CUFFT ERROR CHECK */
/*********************/
// Maps a cufftResult to the name of its enumerator; values not listed here
// yield "<unknown>".
static const char *_cudaGetErrorEnum(cufftResult error)
{
    const char *name = "<unknown>";
    switch (error)
    {
        case CUFFT_SUCCESS:        name = "CUFFT_SUCCESS";        break;
        case CUFFT_INVALID_PLAN:   name = "CUFFT_INVALID_PLAN";   break;
        case CUFFT_ALLOC_FAILED:   name = "CUFFT_ALLOC_FAILED";   break;
        case CUFFT_INVALID_TYPE:   name = "CUFFT_INVALID_TYPE";   break;
        case CUFFT_INVALID_VALUE:  name = "CUFFT_INVALID_VALUE";  break;
        case CUFFT_INTERNAL_ERROR: name = "CUFFT_INTERNAL_ERROR"; break;
        case CUFFT_EXEC_FAILED:    name = "CUFFT_EXEC_FAILED";    break;
        case CUFFT_SETUP_FAILED:   name = "CUFFT_SETUP_FAILED";   break;
        case CUFFT_INVALID_SIZE:   name = "CUFFT_INVALID_SIZE";   break;
        case CUFFT_UNALIGNED_DATA: name = "CUFFT_UNALIGNED_DATA"; break;
    }
    return name;
}
// Wraps a cuFFT call and terminates with a diagnostic if it fails.
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
// Reports a failed cuFFT call with the caller's file and line (passed in by
// the macro) plus the numeric and symbolic error code, resets the device and
// terminates. exit() follows assert() so the program also stops when
// assertions are compiled out with NDEBUG.
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
    if( CUFFT_SUCCESS != err) {
        fprintf(stderr, "CUFFT error in file '%s', line %d\nerror %d: %s\nterminating!\n",
                file, line, (int)err, _cudaGetErrorEnum(err));
        cudaDeviceReset();
        assert(0);
        exit(EXIT_FAILURE);
    }
}
/********/
/* MAIN */
/********/
int main(){
float *initial_array = (float *)malloc(sizeof(float) * NX * NY);
for (int h = 0; h < NX; h++){
for (int w = 0; w < NY; w++)
initial_array[NY * h + w] = 0;
}
initial_array[NY*3 + 0] = 1;
printArray(initial_array);
float2 *transformed_array= (float2 *)malloc(sizeof(float2) * (NY/2+1) * NX);
cufftReal *gpu_initial_array;
cufftComplex *gpu_transformed_array;
gpuErrchk(cudaMalloc((void **)&gpu_initial_array, NX*NY*sizeof(cufftReal)));
gpuErrchk(cudaMalloc((void **)&gpu_transformed_array, (NY/2+1)*NX*sizeof(cufftComplex)));
gpuErrchk(cudaMemcpy(gpu_initial_array, initial_array, NX*NY*sizeof(float), cudaMemcpyHostToDevice));
cufftHandle plan;
cufftSafeCall(cufftPlan1d(&plan, NY, CUFFT_R2C, NX));
cufftSafeCall(cufftExecR2C(plan, (cufftReal*)gpu_initial_array, (cufftComplex*)gpu_transformed_array));
gpuErrchk(cudaMemcpy(transformed_array, gpu_transformed_array, NX*(NY/2+1)*sizeof(cufftComplex), cudaMemcpyDeviceToHost));
printComplexSymmetricArray(transformed_array);
cufftSafeCall(cufftDestroy(plan));
free(initial_array);
free(transformed_array);
gpuErrchk(cudaFree(gpu_initial_array));
gpuErrchk(cudaFree(gpu_transformed_array));
std::system("pause");
return 0;
}
/***********************/
/* PRINTOUT REAL ARRAY */
/***********************/
// Prints an NX x NY row-major float array: each value is followed by " | ",
// each row ends with a newline, and a blank line closes the table.
void printArray(float *my_array){
    const float *cursor = my_array;
    for (int row = 0; row < NX; row++){
        for (int col = 0; col < NY; col++, cursor++)
            std::cout << *cursor << " | ";
        std::cout << std::endl;
    }
    std::cout << std::endl;
}
/************************************/
/* PRINTOUT COMPLEX SYMMETRIC ARRAY */
/************************************/
// Prints the half-spectrum of NX rows, NY/2+1 complex bins per row, as
// "re + im | ". The row stride is NY/2+1.
void printComplexSymmetricArray(float2 *my_array){
    const int binsPerRow = NY/2+1;
    for (int row = 0; row < NX; row++){
        const float2 *rowBins = my_array + binsPerRow * row;
        for (int bin = 0; bin < binsPerRow; bin++)
            std::cout << rowBins[bin].x << " + " << rowBins[bin].y << " | ";
        std::cout << std::endl;
    }
    std::cout << std::endl;
}

Carefully deleting N items from a "circular" vector (or perhaps just an NSMutableArray)

Imagine a std:vector, say, with 100 things on it (0 to 99) currently. You are treating it as a loop. So the 105th item is index 4; forward 7 from index 98 is 5.
You want to delete N items after index position P.
So, delete 5 items after index 50; easy.
Or 5 items after index 99: as you delete 0 five times, or 4 through 0, noting that position at 99 will be erased from existence.
Worst, 5 items after index 97 - you have to deal with both modes of deletion.
What's the elegant and solid approach?
Here's a boring routine I wrote
// Removes up to `desired` objects from `original`, starting with the object
// that follows index `nn`. When the array is treated as a loop, removal wraps
// around to index 0 once the end of the array has been consumed.
// NOTE(review): the loop/non-loop test below is a placeholder in the original
// and must be replaced by a real condition before this compiles.
-(void)knotRemovalHelper:(NSMutableArray*)original
after:(NSInteger)nn howManyToDelete:(NSInteger)desired
{
// ORCO is the current element count, re-evaluated at every use because the
// array shrinks as objects are removed.
#define ORCO ((NSInteger)[original count])
// NOTE(review): these are static, so they are shared across calls - looks
// unintentional for loop counters; confirm.
static NSInteger kount, howManyUntilLoop, howManyExtraAferLoop;
if ( ... our array is NOT a loop ... )
// trivial, if messy...
{
// Keep removing the object right after nn; stop early once nothing follows it.
for ( kount = 1; kount<=desired; ++kount )
{
if ( (nn+1) >= ORCO )
return;
[original removeObjectAtIndex:( nn+1 )];
}
return;
}
else // our array is a loop
// messy, confusing and inelegant. how to improve?
// here we go...
{
// Number of objects between nn and the end of the array.
howManyUntilLoop = (ORCO-1) - nn;
// Everything to delete lies before the end: no wrap-around needed.
if ( howManyUntilLoop > desired )
{
for ( kount = 1; kount<=desired; ++kount )
[original removeObjectAtIndex:( nn+1 )];
return;
}
// Otherwise remove the tail after nn first, then the remainder from the front.
howManyExtraAferLoop = desired - howManyUntilLoop;
for ( kount = 1; kount<=howManyUntilLoop; ++kount )
[original removeObjectAtIndex:( nn+1 )];
for ( kount = 1; kount<=howManyExtraAferLoop; ++kount )
[original removeObjectAtIndex:0];
return;
}
#undef ORCO
}
Update!
InVariant's second answer leads to the following excellent solution. "starting with" is much better than "starting after". So the routine now uses "start with". Invariant's second answer leads to this very simple solution...
N times do if P < currentsize remove P else remove 0
// Removes `countToDelete` objects from `ra`, treating it as circular: removal
// starts WITH the object at index `removeThisOneFirst` and wraps to index 0
// once the end of the array has been consumed.
// Raises NSRangeException if the start index is outside the array or if more
// objects are requested than the array holds.
-(void)removeLoopilyFrom:(NSMutableArray*)ra
startingWithThisOne:(NSInteger)removeThisOneFirst
howManyToDelete:(NSInteger)countToDelete
{
    const NSInteger initialCount = (NSInteger)[ra count];
    // exception if removeThisOneFirst > ra highestIndex
    if ( removeThisOneFirst < 0 || removeThisOneFirst >= initialCount )
        [NSException raise:NSRangeException
                    format:@"start index %ld is outside 0..%ld",
                           (long)removeThisOneFirst, (long)(initialCount - 1)];
    // exception if countToDelete is > ra size
    if ( countToDelete < 0 || countToDelete > initialCount )
        [NSException raise:NSRangeException
                    format:@"cannot delete %ld of %ld objects",
                           (long)countToDelete, (long)initialCount];
    // so easy thanks to Invariant:
    // while the start index still exists, the next victim has slid into it;
    // once the tail is gone, continue from the front.
    for ( NSInteger removed = 0; removed < countToDelete; ++removed )
    {
        if ( removeThisOneFirst < (NSInteger)[ra count] )
            [ra removeObjectAtIndex:(NSUInteger)removeThisOneFirst];
        else
            [ra removeObjectAtIndex:0];
    }
}
Update!
Toolbox has pointed out the excellent idea of working to a new array - super KISS.
Here's an idea off the top of my head.
First, generate an array of integers representing the indices to remove. So "remove 5 from index 97" would generate [97,98,99,0,1]. This can be done with the application of a simple modulus operator.
Then, sort this array descending giving [99,98,97,1,0] and then remove the entries in that order.
Should work in all cases.
This solution seems to work, and it copies all remaining elements in the vector only once (to their final destination).
Assume kNumElements, kStartIndex, and kNumToRemove are defined as const size_t values.
vector<int> my_vec(kNumElements);
for (size_t i = 0; i < my_vec.size(); ++i) {
my_vec[i] = i;
}
for (size_t i = 0, cur = 0; i < my_vec.size(); ++i) {
// What is the "distance" from the current index to the start, taking
// into account the wrapping behavior?
size_t distance = (i + kNumElements - kStartIndex) % kNumElements;
// If it's not one of the ones to remove, then we keep it by copying it
// into its proper place.
if (distance >= kNumToRemove) {
my_vec[cur++] = my_vec[i];
}
}
my_vec.resize(kNumElements - kNumToRemove);
There's nothing wrong with two loop solutions as long as they're readable and don't do anything redundant. I don't know Objective-C syntax, but here's the pseudocode approach I'd take:
// Remove howManyToDelete items that follow index `after` in a circular array of length Len.
// Only Len - 1 - after items exist behind `after`; anything beyond that wraps to the front.
lastIdx = Len - 1
if (Len <= after + howManyToDelete) //will have a second loop
firstpass = lastIdx - after; //handle end in the first loop, beginning in second
else
firstpass = howManyToDelete; //the first loop will get them all
for (kount = 0; kount < firstpass; kount++)
remove after+1
for ( ; kount < howManyToDelete; kount++) //if firstpass < howManyToDelete, clean up leftovers
remove 0
This solution doesn't use mod, does the limit calculation outside the loop, and touches the relevant samples once each. The second for loop won't execute if all the samples were handled in the first loop.
The common way to do this in DSP is with a circular buffer. This is just a fixed length buffer with two associated counters:
//make sure BUFSIZE is a power of 2 for quick mod trick
#define BUFSIZE 1024
/* Storage plus two ever-increasing counters: total samples written (InCtr)
   and total samples read (OutCtr). A counter masked with BUFSIZE - 1 gives
   the position inside CircBuf. */
int CircBuf[BUFSIZE];
int InCtr, OutCtr;
/* Appends `count` samples from Buf at the write position, wrapping to the
   start of CircBuf when the end is reached. Expects count <= BUFSIZE. */
void PutData(int *Buf, int count) {
    int srcCtr;
    int destCtr = InCtr & (BUFSIZE - 1); // if BUFSIZE is a power of 2, equivalent to and faster than destCtr = InCtr % BUFSIZE
    /* First pass: from the write position up to the end of the buffer. */
    for (srcCtr = 0; (srcCtr < count) && (destCtr < BUFSIZE); srcCtr++, destCtr++)
        CircBuf[destCtr] = Buf[srcCtr];
    /* Second pass: whatever is left continues at the start of the buffer. */
    for (destCtr = 0; srcCtr < count; srcCtr++, destCtr++)
        CircBuf[destCtr] = Buf[srcCtr];
    InCtr += count;
}
/* Copies `count` samples from the read position into Buf, wrapping to the
   start of CircBuf when the end is reached. Expects count <= BUFSIZE. */
void GetData(int *Buf, int count) {
    int srcCtr = OutCtr & (BUFSIZE - 1);
    int destCtr = 0;
    /* First pass: from the read position up to the end of the buffer. */
    for (destCtr = 0; (srcCtr < BUFSIZE) && (destCtr < count); srcCtr++, destCtr++)
        Buf[destCtr] = CircBuf[srcCtr];
    /* Second pass: only the samples still missing. The bound is on destCtr;
       bounding on srcCtr copied `count` extra samples and overran Buf. */
    for (srcCtr = 0; destCtr < count; srcCtr++, destCtr++)
        Buf[destCtr] = CircBuf[srcCtr];
    OutCtr += count;
}
/* Returns non-zero when more samples have been written than read by more
   than BUFSIZE, i.e. unread data has been overwritten. */
int BufferOverflow() {
    return ((InCtr - OutCtr) > BUFSIZE);
}
This is pretty lightweight, but effective. And aside from the ctr = BigCtr & (SIZE-1) stuff, I'd argue it's highly readable. The only reason for the & trick is in old DSP environments, mod was an expensive operation so for something that ran often, like every time a buffer was ready for processing, you'd find ways to remove stuff like that. And if you were doing FFT's, your buffers were probably a power of 2 anyway.
These days, of course, you have 1 GHz processors and magically resizing arrays. You kids get off my lawn.
Another method:
N times do {remove entry at index P mod max(ArraySize, P)}
Example:
N=5, P=97, ArraySize=100
1: max(100, 97)=100 so remove at 97%100 = 97
2: max(99, 97)=99 so remove at 97%99 = 97 // array size is now 99
3: max(98, 97)=98 so remove at 97%98 = 97
4: max(97, 97)=97 so remove at 97%97 = 0
5: max(96, 97)=97 so remove at 97%97 = 0
I don't program for the iPhone, so I imagine a std::vector; with that it's quite easy, simple and elegant enough:
#include <iostream>
using std::cout;
#include <vector>
using std::vector;
#include <cassert> //no need for using, assert is macro
// Erases `count` consecutive elements from `vec`, treating it as circular:
// erasure starts at index `position` (taken modulo the size) and wraps to the
// front when it runs past the end.
// Precondition (asserted): count <= vec.size().
// Each contiguous range is erased in one call so the remaining elements are
// moved at most twice.
template<typename T>
void eraseCircularVector(std::vector<T> & vec, size_t position, size_t count)
{
    assert(count <= vec.size());
    if (count == 0 || vec.empty())
        return;
    // Erasing every element: begin and end of the circular range coincide,
    // which the range logic below would mistake for an empty range.
    if (count >= vec.size())
    {
        vec.clear();
        return;
    }
    position %= vec.size(); //normalize position
    const size_t positionEnd = (position + count) % vec.size();
    if (positionEnd < position)
    {
        // Wrapped range: drop the tail first so front indices stay valid.
        vec.erase(vec.begin() + position, vec.end());
        vec.erase(vec.begin(), vec.begin() + positionEnd);
    }
    else
        vec.erase(vec.begin() + position, vec.begin() + positionEnd);
}
// Demonstrates eraseCircularVector on the values 0..9: prints the vector,
// erases one element at index 5 and then five elements starting at circular
// index 16, and prints what is left.
int main()
{
    std::vector<int> values;
    for (int v = 0; v < 10; ++v)
        values.push_back(v);

    std::cout << "Values: ";
    for (std::vector<int>::size_type idx = 0; idx < values.size(); ++idx)
        std::cout << values[idx] << ' ';
    std::cout << '\n';

    eraseCircularVector(values, 5, 1); //remains 9: 0,1,2,3,4,6,7,8,9
    eraseCircularVector(values, 16, 5); //remains 4: 3,4,6,7

    std::cout << "Values: ";
    for (std::vector<int>::size_type idx = 0; idx < values.size(); ++idx)
        std::cout << values[idx] << ' ';
    std::cout << '\n';
    return 0;
}
However, you might consider:
creating new loop_vector class, if you use this kind of functionality enough
using list if you perform many deletions (or few deletions (not from end, that's simple pop_back) but large array)
If your container (NSMutableArray or whatever) is not list, but vector (i.e. resizable array), you most definitely don't want to delete items one by one, but whole range (e.g. std::vector's erase(begin, end)!
Edit: reacting to comment, to fully realize what must be done by vector, if you erase element other than the last one: it must copy all values after that element (e.g. 1000 items in array, you erase first, 999x copying (moving) of item, that is very costly).
Example:
#include <iostream>
#include <vector>
#include <ctime>
using namespace std;
// Times two ways of emptying a vector of 64K ints: erasing the first element
// repeatedly (every erase shifts all remaining elements) versus erasing the
// whole range in one call. Times are reported in milliseconds of CPU time.
int main()
{
    std::clock_t start, end;
    std::vector<int> vec;
    const int items = 64 * 1024;
    std::cout << "using " << items << " items in vector\n";
    for (int i = 0; i < items; ++i) vec.push_back(i);
    start = std::clock();
    while (!vec.empty()) vec.erase(vec.begin());
    end = std::clock();
    // clock() ticks / CLOCKS_PER_SEC is seconds; scale by 1000 so the value
    // matches the "ms" label.
    std::cout << "Inefficient method took: "
              << (end - start) * 1000.0 / CLOCKS_PER_SEC << " ms\n";
    for (int i = 0; i < items; ++i) vec.push_back(i);
    start = std::clock();
    vec.erase(vec.begin(), vec.end());
    end = std::clock();
    std::cout << "Efficient method took: "
              << (end - start) * 1000.0 / CLOCKS_PER_SEC << " ms\n";
    return 0;
}
Produces output:
using 65536 items in vector
Inefficient method took: 1.705 ms
Efficient method took: 0 ms
Note that it's very easy to become inefficient; have a look, e.g., at http://www.cplusplus.com/reference/stl/vector/erase/

form a number using consecutive numbers

I was puzzled with one of the question in Microsoft interview which is as given below:
A function should accept a range( 3 - 21 ) and it should print all the consecutive numbers combinations to form each number as given below:
3 = 1+2
5 = 2+3
6 = 1+2+3
7 = 3+4
9 = 4+5
10 = 1+2+3+4
11 = 5+6
12 = 3+4+5
13 = 6+7
14 = 2+3+4+5
15 = 1+2+3+4+5
17 = 8+9
18 = 5+6+7
19 = 9+10
20 = 2+3+4+5+6
21 = 10+11
21 = 1+2+3+4+5+6
could you please help me in forming this sequence in C#?
Thanks,
Mahesh
So here is a straightforward/naive answer (in C++, and not tested; but you should be able to translate). It uses the fact that
1 + 2 + ... + n = n(n+1)/2,
which you have probably seen before. There are lots of easy optimisations that can be made here which I have omitted for clarity.
// Prints every way of writing n as a sum of two or more consecutive positive
// integers, one per line, e.g. "9 = 2 + 3 + 4".
// Uses 1 + 2 + ... + k = k(k+1)/2, so (i+1) + ... + j = j(j+1)/2 - i(i+1)/2.
void WriteAsSums (int n)
{
    for (int i = 0; i < n; i++)
    {
        for (int j = i + 1; j < n; j++)
        {
            // 64-bit arithmetic: j*(j+1) can overflow int for large n.
            const long long sum =
                (static_cast<long long>(j) * (j + 1) - static_cast<long long>(i) * (i + 1)) / 2;
            // The sum only grows with j, so nothing further can match.
            if (sum > n)
                break;
            if (sum != n)
                continue;
            // then n = (i+1) + (i+2) + ... + (j-1) + j
            std::cout << n << " = ";
            for (int k = i + 1; k <= j; k++)
            {
                std::cout << k;
                if (k != j) // separator between terms, none after the last
                    std::cout << " + ";
            }
            std::cout << std::endl;
        }
    }
}
This is some pseudo code to find all the combinations if any exists:
// Finds a run of consecutive descending integers that sums to n.
// On entry m is the largest candidate term; inside the loop m is reused as
// the running sum of `list`. Returns the run, or [] if none can be formed.
function consecutive_numbers(n, m)
list = [] // empty list
list.push_back(m)
while m != n
if m > n
// sum too large: drop the largest term (the front of the list)
first = list.remove_first
m -= first
else
// sum too small: extend the run downwards by one
last = list.last_element
if last <= 1
// cannot go below 1, so no run exists
return []
end
list.push_back(last - 1)
m += last - 1
end
end
return list
end
// Prints every run of consecutive integers that sums to n, starting with the
// largest possible first term and working downwards.
function all_consecutive_numbers(n)
// no run of two or more terms can start above n / 2 + 1
m = n / 2 + 1
a = consecutive_numbers(n, m)
while a != []
print_combination(n, a)
// continue the search just below the largest term of the run found
m = a.first - 1
a = consecutive_numbers(n, m)
end
end
// Prints "n = t1 + t2 + ..." for the terms in a (a is consumed).
function print_combination(n, a)
print(n + " = ")
print(a.remove_first)
foreach element in a
print(" + " + element)
end
print("\n")
end
A call to all_consecutive_numbers(21) would print:
21 = 11 + 10
21 = 8 + 7 + 6
21 = 6 + 5 + 4 + 3 + 2 + 1
I tested it in ruby (code here) and it seems to work. I'm sure the basic idea could easily be implemented in C# as well.
I like this problem. Here is a slick and slightly mysterious O(n) solution:
// Prints "n = a + (a+1) + ... + b" without a trailing newline.
void DisplaySum (int n, int a, int b)
{
    std::cout << n << " = ";
    for (int i = a; i < b; i++) std::cout << i << " + ";
    std::cout << b;
}
// Prints every way of writing n as a sum of two or more consecutive positive
// integers, one per line.
//
// If n = a + (a+1) + ... + b with i = b - a + 1 terms and j = a + b, then
// i * j = 2n and i + j = 2b + 1 is odd. So each divisor i of 2n whose
// cofactor j has the opposite parity gives a = (j-i+1)/2 and b = (j+i-1)/2.
// Requiring i >= 2 drops the trivial one-term sum, and i < j keeps a
// positive, so only divisors up to sqrt(2n) need to be tried.
void WriteAsSums (int n)
{
    const long long N = 2LL * n;  // 64-bit: 2*n can overflow int
    for (long long i = 2; i <= N / i; i++)
    {
        if (N % i != 0)
            continue;
        const long long j = N / i;
        if ((j + i) % 2 == 0)  // same parity (includes i == j): no integer solution
            continue;
        const int a = static_cast<int>((j - i + 1) / 2);
        const int b = static_cast<int>((j + i - 1) / 2);
        if (a > 0 && a < b) // exclude trivial & negative solutions
        {
            DisplaySum(n, a, b);
            std::cout << std::endl;
        }
    }
}
Here's something in Groovy, you should be able to understand what's going on. It's not the most efficient code and doesn't create the answers in the order you cite in your question (you seem to be missing some though) but it might give you a start.
// For every number i in a..b, prints each way of writing i as a sum of
// consecutive integers, in the form "i=j+(j+1)+...".
def f(a,b) {
for (i in a..b) {
// j is the candidate first term of the sum
for (j in 1..i/2) {
// sum: running total, str: terms seen so far as "+t1+t2...", k: next term
def (sum, str, k) = [ 0, "", j ]
// keep adding consecutive terms until the total reaches or passes i
while (sum < i) {
sum += k
str += "+$k"
k++
}
// exact hit: print the terms, dropping the leading '+' from str
if (sum == i) println "$i=${str[1..-1]}"
}
}
}
Output for f(3,21) is:
3=1+2
5=2+3
6=1+2+3
7=3+4
9=2+3+4
9=4+5
10=1+2+3+4
11=5+6
12=3+4+5
13=6+7
14=2+3+4+5
15=1+2+3+4+5
15=4+5+6
15=7+8
17=8+9
18=3+4+5+6
18=5+6+7
19=9+10
20=2+3+4+5+6
21=1+2+3+4+5+6
21=6+7+8
21=10+11
Hope this helps. It kind of conforms to the tenet of doing the simplest thing that could possibly work.
if we split a into 2 terms, then a = b + (b+1) = 2*b + (0+1)
if we split a into 3 terms, then a = b + (b+1) + (b+2) = 3*b + (0+1+2)
...
if we split a into n terms, then a = b + (b+1) + ... + (b+n-1) = n*b + (0+1+...+(n-1))
so the general result is a = n*b + n*(n-1)/2, where a, b, n are all ints.
so O(N) Algorithm is:
// Enumerates the ways to write `a` as a sum of n >= 2 consecutive positive
// integers b + (b+1) + ... + (b+n-1), using a = n*b + n*(n-1)/2.
// For each n the candidate is b = (a - n*(n-1)/2) / n; it is a solution when
// the division is exact. The loop ends as soon as b would drop below 1, so it
// runs about sqrt(2*a) times.
void seq_sum(int a)
{
    // start from 2 terms
    int n=2;
    while(1)
    {
        int value = a-n*(n-1)/2;
        // value == 0 would give b == 0, a sum starting at 0 that merely
        // repeats a shorter solution, so it ends the search too.
        if(value <= 0)
            break;
        // n*b == value must have an integer solution
        if( value%n == 0 )
        {
            int b=value/n;
            // omit the print stage
            print("......");
        }
        n++;
    }
}