OpenCL 1D strided convolution performance - filtering

For downsampling a signal, I use a FIR filter + decimation stage (that's practical a strided convolution). The big advantage of combining filtering and decimation is the reduced computational cost (by the decimation factor).
With a straight forward OpenCL implementation, I am not able to benefit from the decimation. Quite to the contrary: The convolution with a decimation factor of 4 is 25% slower than the full convolution.
Kernel Code:
__kernel void decimation(__constant float *input,
__global float *output,
__constant float *coefs,
const int taps,
const int decimationFactor) {
int posOutput = get_global_id(0);
float result = 0;
for (int tap=0; tap<taps; tap++) {
int posInput = (posOutput * decimationFactor) - tap;
result += input[posInput] * coefs[tap];
}
output[posOutput] = result;
}
I guess it is due to the uncoalesced memory access. Though I can not think of a solution to fix the problem. Any ideas?
Edit: I tried Dithermaster's solution to split the problem into coalesced reads to shared local memory and convolution from local memory:
__kernel void decimation(__constant float *input,
__global float *output,
__constant float *coefs,
const int taps,
const int decimationFactor,
const int bufferSize,
__local float *localInput) {
const int posOutput = get_global_id(0);
const int localSize = get_local_size(0);
const int localId = get_local_id(0);
const int groupId = get_group_id(0);
const int localInputOffset = taps-1;
const int localInputOverlap = taps-decimationFactor;
const int localInputSize = localInputOffset + localSize * decimationFactor;
// 1. transfer global input data to local memory
// read global input to local input (only overlap)
if (localId < localInputOverlap) {
int posInputStart = ((groupId*localSize) * decimationFactor) - (taps-1);
int posInput = posInputStart + localId;
int posLocalInput = localId;
localInput[posLocalInput] = 0.0f;
if (posInput >= 0)
localInput[posLocalInput] = input[posInput];
}
// read remaining global input to local input
// 1. alternative: strided read
// for (int i=0; i<decimationFactor; i++) {
// int posInputStart = (groupId*localSize) * decimationFactor;
// int posInput = posInputStart + localId * decimationFactor - i;
// int posLocalInput = localInputOffset + localId * decimationFactor - i;
// localInput[posLocalInput] = 0.0f;
// if ((posInput >= 0) && (posInput < bufferSize*decimationFactor))
// localInput[posLocalInput] = input[posInput];
// }
// 2. alternative: coalesced read (in blocks of localSize)
for (int i=0; i<decimationFactor; i++) {
int posInputStart = (groupId*localSize) * decimationFactor;
int posInput = posInputStart - (decimationFactor-1) + i*localSize + localId;
int posLocalInput = localInputOffset - (decimationFactor-1) + i*localSize + localId;
localInput[posLocalInput] = 0.0f;
if ((posInput >= 0) && (posInput < bufferSize*decimationFactor))
localInput[posLocalInput] = input[posInput];
}
// 2. wait until every thread completed
barrier(CLK_LOCAL_MEM_FENCE);
// 3. convolution
if (posOutput < bufferSize) {
float result = 0.0f;
for (int tap=0; tap<taps; tap++) {
int posLocalInput = localInputOffset + (localId * decimationFactor) - tap;
result += localInput[posLocalInput] * coefs[tap];
}
output[posOutput] = result;
}
}
Big improvement! But still, the performance does not correlate with the overall operations (not proportional to the decimation factor):
speedup for full convolution compared to first approach: ~12 %
computatoin time for decimation compared to full convolution:
decimation factor 2: 61 %
decimation factor 4: 46 %
decimation factor 8: 53 %
decimation factor 16: 68 %
The performance has a optimum for a decimation factor of 4. Why is that? Any ideas for further improvements?
Edit 2: Diagram with shared local memory:
Edit 3: Comparison of the performance for the 3 different implementations

Due to the amount of data overlap (66%), this could benefit from sharing data read from memory between work items, within a workgroup. You could get rid of redundant reads and also make coalesced reads. Break you kernel up into two parts: The first part does coalesced reads for all the data needed within the work group, into shared local memory. Then a memory barrier to synchronize. Then in the second part do the convolutions using reads from shared local memory.
P.S. Thanks for the diagram, it helped me understand your goal more quickly than trying to read code.

Related

Cellular automata with compute shader & race condition problem

I'm trying to make a simple "falling sand" cellular automata, following these two guides:
https://www.youtube.com/watch?v=5Ka3tbbT-9E&t=29s
https://www.youtube.com/watch?v=VLZjd_Y1gJ8&t=62s
I'm using compute shader, along with Unity to calculate the position of next frame's particles.
#pragma kernel UpTex
#define AIR -1
#define SAND 0
struct blockInfo
{
int id;
int lifeTime;
float velocity;
float4 col;
int updated;
};
struct cellInf
{
int xPos;
int yPos;
};
RWStructuredBuffer<blockInfo> readBlock, writeBlock;
int sizeX, sizeY;
RWStructuredBuffer<cellInf> location;
int ToOneDi(int i, int j)
{
int newX = sizeY - 1 - j, newY = i;
return (sizeX * newX + (newY));
}
[numthreads(16,1,1)]
void UpTex (uint3 id : SV_DispatchThreadID)
{
int2 pos = int2(location[id.x].xPos, location[id.x].yPos);
int cur = ToOneDi(pos.x, pos.y);
blockInfo inf = readBlock[cur];
if (inf.updated == 1) {return; } //updated then skip
inf.updated = 1;
writeBlock[cur] = inf; //preemptive sets
if (pos.y == 0 || pos.x == 0 || pos.x == sizeX - 1) {return;} //base level
if (inf.id == SAND)
{
int left = ToOneDi(pos.x-1, pos.y-1);
int mid = ToOneDi(pos.x, pos.y-1);
int right = ToOneDi(pos.x+1, pos.y-1);
blockInfo leftBlock = readBlock[left];
blockInfo midBlock = readBlock[mid];
blockInfo rightBlock = readBlock[right];
//checking -> double check
if (midBlock.id == AIR)
{
writeBlock[mid] = inf;
writeBlock[cur] = midBlock;
}
else if (leftBlock.id == AIR)
{
writeBlock[left] = inf;
writeBlock[cur] = leftBlock;
}
else if (rightBlock.id == AIR)
{
writeBlock[right] = inf;
writeBlock[cur] = rightBlock;
}
else
{
writeBlock[cur] = inf;
}
}
//writeBlock[cur].col = float4(1, 0, 0, 1); //test
}
Currently I'm only simulating air and sand. readBlock is the previous frame info, writeBlock is current frame info (which would be utilized in another compute shader to apply to a texture and location just contained the 2D position of the cell to be updated.
With this code, it's clear that 2 sand block of the previous frame can cause a race condition by accessing the same cell (when falling into that cell), causing the color displayed to be changed.
I've tried to fix the problem by creating different groups to be processed linearly, and the cells in each groups separated enough to not cause a race condition to each other.
The new problem is, as the video shown, if I tried to move a particle (like sand) more than one block at a time (basically adding velocity to it), the grouping would have to be changed again. At some point it'd probably cause some form of diminishing returns if I have to create too many groups.
Any ideas how to fix the race conditions other than extra groupings?
Further note:
Both youtubers linked above and even the devs of the game that they use as reference (Noita - https://www.youtube.com/watch?v=prXuyMCgbTc) seems to agree that brute-force with a single thread, then using chunking and other techniques for optimization is just more bug-free and less headache.
The 2 recreations use C/Java, and the devs create their own custom engine with C++ (and probably OpenGL from what I can piece together) which from my experience already ran much faster. As such I haven't been able to find any other fix to my problem.
Links:
https://www.reddit.com/r/Games/comments/d7cqjz/comment/f0z8hu8/
https://www.reddit.com/r/gamedev/comments/d93op6/noita_pixel_simulation_any_tip_about_how_is_it/

Implementing convolution in C++ using fftw 3

UPDATE
See my fundamental based question on DSP stackexchange here
UPDATE
I am still experiencing crackling in the output. These crackles are now less pronounced and are only audible when the volume is turned up
UPDATE
Following the advice given here has removed the crackling sound from my output. I will test with other available HRIRs to see if the convolution is indeed working properly and will answer this question once I've verified that my code now works
UPDATE
I have made some progress, but I still think there is an issue with my convolution implementation.
The following is my revised program:
#define HRIR_LENGTH 512
#define WAV_SAMPLE_SIZE 256
while (signal_input_wav.read(&signal_input_buffer[0], WAV_SAMPLE_SIZE) >= WAV_SAMPLE_SIZE)
{
#ifdef SKIP_CONVOLUTION
// Copy the input buffer over
std::copy(signal_input_buffer.begin(),
signal_input_buffer.begin() + WAV_SAMPLE_SIZE,
signal_output_buffer.begin());
signal_output_wav.write(&signal_output_buffer[0], WAV_SAMPLE_SIZE);
#else
// Copy the first segment into the buffer
// with zero padding
for (int i = 0; i < HRIR_LENGTH; ++i)
{
if (i < WAV_SAMPLE_SIZE)
{
signal_buffer_fft_in[i] = signal_input_buffer[i];
}
else
{
signal_buffer_fft_in[i] = 0; // zero pad
}
}
// Dft of the signal segment
fftw_execute(signal_fft);
// Convolve in the frequency domain by multiplying filter kernel with dft signal
for (int i = 0; i < HRIR_LENGTH; ++i)
{
signal_buffer_ifft_in[i] = signal_buffer_fft_out[i] * left_hrir_fft_out[i]
- signal_buffer_fft_out[HRIR_LENGTH - i] * left_hrir_fft_out[HRIR_LENGTH - i];
signal_buffer_ifft_in[HRIR_LENGTH - i] = signal_buffer_fft_out[i] * left_hrir_fft_out[HRIR_LENGTH - i]
+ signal_buffer_fft_out[HRIR_LENGTH - i] * left_hrir_fft_out[i];
//double re = signal_buffer_out[i];
//double im = signal_buffer_out[BLOCK_OUTPUT_SIZE - i];
}
// inverse dft back to time domain
fftw_execute(signal_ifft);
// Normalize the data
for (int i = 0; i < HRIR_LENGTH; ++i)
{
signal_buffer_ifft_out[i] = signal_buffer_ifft_out[i] / HRIR_LENGTH;
}
// Overlap-add method
for (int i = 0; i < HRIR_LENGTH; ++i)
{
if (i < WAV_SAMPLE_SIZE)
{
signal_output_buffer[i] = signal_overlap_buffer[i] + signal_buffer_ifft_out[i];
}
else
{
signal_output_buffer[i] = signal_buffer_ifft_out[i];
signal_overlap_buffer[i] = signal_output_buffer[i]; // record into the overlap buffer
}
}
// Write the block to the output file
signal_output_wav.write(&signal_output_buffer[0], HRIR_LENGTH);
#endif
}
The resulting output sound file contains crackling sounds; presumably artefacts left from the buggy fftw implementation. Also, writing blocks of 512 (HRIR_LENGTH) seems to result in some aliasing, with the soundfile upon playback sounding like a vinyl record being slowed down. Writing out blocks of size WAV_SAMPLE_SIZE (256, half of the fft output) seems to playback at normal speed.
However, irrespective of this the crackling sound remains.
ORIGINAL
I'm trying to implement convolution using the fftw library in C++.
I can load my filter perfectly fine, and am zero padding both the filter (of length 512) and the input signal (of length 513) in order to get a signal output block of 1024 and using this as the fft size.
Here is my code:
#define BLOCK_OUTPUT_SIZE 1024
#define HRIR_LENGTH 512
#define WAV_SAMPLE_SIZE 513
#define INPUT_SHIFT 511
while (signal_input_wav.read(&signal_input_buffer[0], WAV_SAMPLE_SIZE) >= WAV_SAMPLE_SIZE)
{
#ifdef SKIP_CONVOLUTION
// Copy the input buffer over
std::copy(signal_input_buffer.begin(),
signal_input_buffer.begin() + WAV_SAMPLE_SIZE,
signal_output_buffer.begin());
signal_output_wav.write(&signal_output_buffer[0], WAV_SAMPLE_SIZE);
#else
// Zero pad input
for (int i = 0; i < INPUT_SHIFT; ++i)
signal_input_buffer[WAV_SAMPLE_SIZE + i] = 0;
// Copy to the signal convolve buffer
for (int i = 0; i < BLOCK_OUTPUT_SIZE; ++i)
{
signal_buffer_in[i] = signal_input_buffer[i];
}
// Dft of the signal segment
fftw_execute(signal_fft);
// Convolve in the frequency domain by multiplying filter kernel with dft signal
for (int i = 1; i < BLOCK_OUTPUT_SIZE; ++i)
{
signal_buffer_out[i] = signal_buffer_in[i] * left_hrir_fft_in[i]
- signal_buffer_in[BLOCK_OUTPUT_SIZE - i] * left_hrir_fft_in[BLOCK_OUTPUT_SIZE - i];
signal_buffer_out[BLOCK_OUTPUT_SIZE - i]
= signal_buffer_in[BLOCK_OUTPUT_SIZE - i] * left_hrir_fft_in[i]
+ signal_buffer_in[i] * left_hrir_fft_in[BLOCK_OUTPUT_SIZE - i];
double re = signal_buffer_out[i];
double im = signal_buffer_out[BLOCK_OUTPUT_SIZE - i];
}
// inverse dft back to time domain
fftw_execute(signal_ifft);
// Normalize the data
for (int i = 0; i < BLOCK_OUTPUT_SIZE; ++i)
{
signal_buffer_out[i] = signal_buffer_out[i] / i;
}
// Overlap and add with the previous block
if (first_block)
{
first_block = !first_block;
for (int i = 0; i < BLOCK_OUTPUT_SIZE; ++i)
{
signal_output_buffer[i] = signal_buffer_out[i];
}
}
else
{
for (int i = WAV_SAMPLE_SIZE; i < BLOCK_OUTPUT_SIZE; ++i)
{
signal_output_buffer[i] = signal_output_buffer[i] + signal_buffer_out[i];
}
}
// Write the block to the output file
signal_output_wav.write(&signal_output_buffer[0], BLOCK_OUTPUT_SIZE);
#endif
}
In the end, the resulting output file contains garbage, but is not all zeros.
Things I have tried:
1) Using the standard complex interface fftw_plan_dft_1d with the appropriate fftw_complex type. Same issues arise.
2) Using a smaller input sample size and iterating over the zero padded blocks (overlap-add).
I also note that its not a fault of libsndfile; toggling SKIP_CONVOLUTION does successfully result in copying the input file to the output file.

How does the reversebits function of HLSL SM5 work?

I am trying to implement an inverse FFT in a HLSL compute shader and don't understand how the new inversebits function works. The shader is run under Unity3D, but that shouldn't make a difference.
The problem is, that the resulting texture remains black with the exception of the leftmost one or two pixels in every row. It seems to me, as if the reversebits function wouldn't return the correct indexes.
My very simple code is as following:
#pragma kernel BitReverseHorizontal
Texture2D<float4> inTex;
RWTexture2D<float4> outTex;
uint2 getTextureThreadPosition(uint3 groupID, uint3 threadID) {
uint2 pos;
pos.x = (groupID.x * 16) + threadID.x;
pos.y = (groupID.y * 16) + threadID.y;
return pos;
}
[numthreads(16,16,1)]
void BitReverseHorizontal (uint3 threadID : SV_GroupThreadID, uint3 groupID : SV_GroupID)
{
uint2 pos = getTextureThreadPosition(groupID, threadID);
uint xPos = reversebits(pos.x);
uint2 revPos = uint2(xPos, pos.y);
float4 values;
values.x = inTex[pos].x;
values.y = inTex[pos].y;
values.z = inTex[revPos].z;
values.w = 0.0f;
outTex[revPos] = values;
}
I played around with this for quite a while and found out, that if I replace the reversebits line with this one here:
uint xPos = reversebits(pos.x << 23);
it works. Although I have no idea why. Could be just coincidence. Could someone please explain to me, how I have to use the reversebits function correctly?
Are you sure you want to reverse the bits?
x = 0: reversed: x = 0
x = 1: reversed: x = 2,147,483,648
x = 2: reversed: x = 1,073,741,824
etc....
If you fetch texels from a texture using coordinates exceeding the width of the texture then you're going to get black. Unless the texture is > 1 billion texels wide (it isn't) then you're fetching well outside the border.
I am doing the same and came to the same problem and these answers actually answered it for me but i'll give you the explanation and a whole solution.
So the solution with variable length buffers in HLSL is:
uint reversedIndx;
uint bits = 32 - log2(xLen); // sizeof(uint) - log2(numberOfIndices);
for (uint j = 0; j < xLen; j ++)
reversedIndx = reversebits(j << bits);
And what you found/noticed essentially pushes out all the leading 0 of your index so you are just reversing the least significant or rightmost bits up until the max bits we want.
for example:
int length = 8;
int bits = 32 - 3; // because 1 << 3 is 0b1000 and we want the inverse like a mask
int j = 6;
and since the size of an int is generally 32bits in binary j would be
j = 0b00000000000000000000000000000110;
and reversed it would be (AKA reversebits(j);)
j = 0b01100000000000000000000000000000;
Which was our error, so j bit shifted by bits would be
j = 0b11000000000000000000000000000000;
and then reversed and what we want would be
j = 0b00000000000000000000000000000011;

Perform autocorrelation with vDSP_conv from Apple Accelerate Framework

I need to perform the autocorrelation of an array (vector) but I am having trouble finding the correct way to do so. I believe that I need the method "vDSP_conv" from the Accelerate Framework, but I can't follow how to successfully set it up. The thing throwing me off the most is the need for 2 inputs. Perhaps I have the wrong function, but I couldn't find one that operated on a single vector.
The documentation can be found here
Copied from the site
vDSP_conv
Performs either correlation or convolution on two vectors; single
precision.
void vDSP_conv ( const float __vDSP_signal[], vDSP_Stride
__vDSP_signalStride, const float __vDSP_filter[], vDSP_Stride __vDSP_strideFilter, float __vDSP_result[], vDSP_Stride __vDSP_strideResult, vDSP_Length __vDSP_lenResult, vDSP_Length __vDSP_lenFilter );
Parameters
__vDSP_signal
Input vector A. The length of this vector must be at least __vDSP_lenResult + __vDSP_lenFilter - 1.
__vDSP_signalStride
The stride through __vDSP_signal.
__vDSP_filter
Input vector B.
__vDSP_strideFilter
The stride through __vDSP_filter.
__vDSP_result
Output vector C.
__vDSP_strideResult
The stride through __vDSP_result.
__vDSP_lenResult
The length of __vDSP_result.
__vDSP_lenFilter
The length of __vDSP_filter.
For an example, just assume you have an array of float x = [1.0, 2.0, 3.0, 4.0, 5.0]. How would I take the autocorrelation of that?
The output should be something similar to float y = [5.0, 14.0, 26.0, 40.0, 55.0, 40.0, 26.0, 14.0, 5.0] //generated using Matlab's xcorr(x) function
performing autocorrelation simply means you take the cross-correlation of one vector with itself. There is nothing fancy about it.
so in your case, do:
vDSP_conv(x, 1, x, 1, result, 1, 2*len_X-1, len_X);
check a sample code for more details: (which does a convolution)
http://disanji.net/iOS_Doc/#documentation/Performance/Conceptual/vDSP_Programming_Guide/SampleCode/SampleCode.html
EDIT: This borders on ridiculous, but you need to offset the x value by a specific number of zeros, which is just crazy.
the following is a working code, just set filter to the value of x you desire, and it will put the rest in the correct position:
float *signal, *filter, *result;
int32_t signalStride, filterStride, resultStride;
uint32_t lenSignal, filterLength, resultLength;
uint32_t i;
filterLength = 5;
resultLength = filterLength*2 -1;
lenSignal = ((filterLength + 3) & 0xFFFFFFFC) + resultLength;
signalStride = filterStride = resultStride = 1;
printf("\nConvolution ( resultLength = %d, "
"filterLength = %d )\n\n", resultLength, filterLength);
/* Allocate memory for the input operands and check its availability. */
signal = (float *) malloc(lenSignal * sizeof(float));
filter = (float *) malloc(filterLength * sizeof(float));
result = (float *) malloc(resultLength * sizeof(float));
for (i = 0; i < filterLength; i++)
filter[i] = (float)(i+1);
for (i = 0; i < resultLength; i++)
if (i >=resultLength- filterLength)
signal[i] = filter[i - filterLength+1];
/* Correlation. */
vDSP_conv(signal, signalStride, filter, filterStride,
result, resultStride, resultLength, filterLength);
printf("signal: ");
for (i = 0; i < lenSignal; i++)
printf("%2.1f ", signal[i]);
printf("\n filter: ");
for (i = 0; i < filterLength; i++)
printf("%2.1f ", filter[i]);
printf("\n result: ");
for (i = 0; i < resultLength; i++)
printf("%2.1f ", result[i]);
/* Free allocated memory. */
free(signal);
free(filter);
free(result);

Help with IIR Comb Filter

Reverb.m
#define D 1000
OSStatus MusicPlayerCallback(
void* inRefCon,
AudioUnitRenderActionFlags * ioActionFlags,
const AudioTimeStamp * inTimeStamp,
UInt32 inBusNumber,
UInt32 inNumberFrames
AudioBufferList * ioData){
MusicPlaybackState *musicPlaybackState = (MusicPlaybackState*) inRefCon;
//Sample Rate 44.1
float a0,a1;
double y0, sampleinp;
//Delay Gain
a0 = 1;
a1 = 0.5;
for (int i = 0; i< ioData->mNumberBuffers; i++){
AudioBuffer buffer = ioData->mBuffers[i];
SIn16 *outSampleBuffer = buffer.mData;
for (int j = 0; j < inNumberFrames*2; j++) {
//Delay Left Channel
sampleinp = *musicPlaybackState->samplePtr++;
/* IIR equation of Comb Filter
y[n] = (a*x[n])+ (b*x[n-D])
*/
y0 = (a0*sampleinp) + (a1*sampleinp-D);
outSample[j] = fmax(fmin(y0, 32767.0), -32768.0);
j++;
//Delay Right Channel
sampleinp = *musicPlaybackState->samplePtr++;
y0 = (a0*sampleinp) + (a1*sampleinp-D);
outSample[j] = fmax(fmin(y0, 32767.0), -32768.0);
}
}
}
Ok, I got a lot of info but I'm having trouble implementing it. Can someone help, it's probably something really easy i'm forgeting. It's just playing back as normal with a little boost but no delays.
Your treatment of the x0[] variables doesn't look right -- the way you have it, the left and right channels will be intermingled. You assign to x0[j] for the left channel, then
overwrite x0[j] with the right channel data. So the delayed signal x0[j-D] will
always correspond to the right channel, with the delayed left channel data being lost.
You didn't say what your sample rate is, but for a typical audio application, a
three-sample delay might not have much of an audible effect. At 44.1 ksamp/sec,
with a 3-sample delay the peaks and troughs of the filter response will be at
multiples of 14,700 Hz. All you'll get is a single peak in the audio frequency
range, in a part of the spectrum where there's hardly any power (assuming the
signal is speech or music).