Segfault while running openMp on CMSIS NN function - neural-network

I'm trying to execute through multiple threads the following segment of code, which is part of CMSIS NN lib from ARM, but I'm observing segfault when adding the proper pragmas with openMP. The code is available here
/*
 * Fragment: builds a q15 im2col buffer for each output pixel of an RGB
 * (3-channel) convolution and hands it to arm_nn_mat_mult_kernel_q7_q15()
 * once two output pixels' worth of data (2 * 3 * dim_kernel * dim_kernel
 * values) has been gathered.
 *
 * NOTE(review): pBuffer is firstprivate, so every thread gets its own copy of
 * the pointer holding the same starting value. bufferA does not appear in any
 * data-sharing clause; if it is a single buffer declared outside the region,
 * all threads write into the same storage - confirm against the enclosing
 * function.
 * NOTE(review): i_ker_y and i_ker_x are not listed in any private clause; if
 * they are declared outside the parallel region they are shared between
 * threads - confirm.
 */
#pragma omp parallel for collapse(2) shared(pOut) firstprivate(pBuffer, dim_im_out, stride, padding, dim_kernel, dim_im_in, out_shift, bias, ch_im_out)
for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
/* Walk the dim_kernel x dim_kernel input window that feeds output pixel (i_out_x, i_out_y). */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
/* Debug trace: prints the id of the thread handling this kernel tap. */
printf("_%d",omp_get_thread_num());
/* Taps outside the input image are zero padding. */
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* Equivalent to arm_fill_q15(0, pBuffer, ch_im_in) with assumption: ch_im_in = 3 */
*__SIMD32(pBuffer) = 0x0;
*(pBuffer + 2) = 0;
pBuffer += 3;
} else
{
/*
* Equivalent to:
* arm_q7_to_q15_no_shift( (q7_t*)Im_in+(i_ker_y*dim_im_in+i_ker_x)*3, pBuffer, 3);
*/
const q7_t *pPixel = Im_in + (i_ker_y * dim_im_in + i_ker_x) * 3;
q31_t buf = arm_nn_read_q7x4(pPixel);
union arm_nnword top;
union arm_nnword bottom;
top.word = __SXTB16(buf);
bottom.word = __SXTB16(__ROR(buf, 8));
/* Three values are stored per pixel: one single store, then one paired 32-bit store. */
*pBuffer++ = top.half_words[0];
*__SIMD32(pBuffer) = __PKHBT(bottom.word, top.word, 0);
pBuffer += 2;
}
}
}
/*
 * The critical section covers only the if statement below. The test compares
 * this thread's private pBuffer with the end of a two-pixel block in bufferA.
 * NOTE(review): pOut is shared and advanced by whichever thread gets here
 * first, so the order of the output rows presumably depends on thread
 * scheduling - confirm.
 */
#pragma omp critical
if (pBuffer == bufferA + 2 * 3 * dim_kernel * dim_kernel)
{
pOut =
arm_nn_mat_mult_kernel_q7_q15(wt, bufferA,
ch_im_out,
3 * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
It looks like the execution goes fine till certain point where the ARM CPU gets lost while scheduling the number of threads...
... Application end!
[Parallel] RUN: Startup Convolution - Layer 1
_0_0_0_0_0_2_2_2_2_2_1_1_1_1_1_3_3_3_3_3_1_1_1_1_1_3_3_3_3_3_0_0_0_0_0_3_3_3_3_3_1_1_1_1_1_2_2_3_3_3_3_3_1_1_1_1_1_2_2_2_2_2_0_0_0_0_0_3_3_3_3_3_2_2_2_2_2_0_0_0_0_0_1_1_1_1_1_0_0_0_0_0_2_2_2_2_2_0_0_0_0_0_2_2_2_2_2_0_0_0_0_0_1_1_1_1_1_0_0_0_0_0_1_1_1_1_1_0_0_0_0_0_1_1_1_1_1_2_2_2_2_2_0_0_0_0_0_2_2_2_2_2_1_1_1_1_1_3_3_2_2_2_2_2_0_0_0_0_0_1_1_1_1_1_0_0_0_0_0_3_3_3_3_3_0_0_0_0_0_3_3_3_3_3_2_2_2_2_2_0_0_0_0_0_3_3_3_3_3_0_0_0_0_0_3_3_3_3_3_1_1_1_1_1_3_3_3_3_3_2_2_2_2_2_1_1_1_1_1_3_3_3_3_3_2_2_2_2_2_0_0_0_0_0_1_1_1_1_1_2_2_2_2_2_1_1_1_1_1_3_3_3_3_3_2_2_2_2_2_3_3_3_3_3_1_1_1_1_1_0_0_0_0_0_2_2_2_2_2_1_1_1_1_1_3_3_3_3_3Segmentation fault

Related

Why is the voltage collected by stm32adc inaccurate?

I am using the STM32F103C8T6. I use the ADC for multi-channel voltage acquisition, and a potentiometer to control the voltage change. I find that the ADC value changes fluctuate. Why?
This is part of my ADC code:
uint32_t ADC_Get_Average(uint8_t ch,uint8_t times)
{
ADC_ChannelConfTypeDef sConfig;
uint32_t value_sum=0;
uint8_t i;
switch(ch)
{
case 1:sConfig.Channel = ADC_CHANNEL_1;break;
case 2:sConfig.Channel = ADC_CHANNEL_2;break;
case 3:sConfig.Channel = ADC_CHANNEL_3;break;
}
sConfig.SamplingTime = ADC_SAMPLETIME_1CYCLE_5;
sConfig.Rank = 1;
HAL_ADC_ConfigChannel(&hadc1,&sConfig);
for(i=0;i<times;i++)
{
HAL_ADC_Start(&hadc1);
HAL_ADC_PollForConversion(&hadc1,5);
value_sum += HAL_ADC_GetValue(&hadc1);
HAL_ADC_Stop(&hadc1);
}
return value_sum/times;
}
/*
 * Read channels 1-3 (five conversions each), convert the averaged raw values
 * with value / 4096.0 * 3.3, and print each result as "Vn:x.xxV" on the OLED.
 */
void ADC_PROC(void)
{
    /* Divisor and multiplier applied to every averaged raw reading. */
    const double adc_full_scale = 4096.0;
    const double vref_volts = 3.3;

    /* All three channels are sampled before anything is drawn. */
    ADC_value1 = ADC_Get_Average(1, 5) / adc_full_scale * vref_volts;
    ADC_value2 = ADC_Get_Average(2, 5) / adc_full_scale * vref_volts;
    ADC_value3 = ADC_Get_Average(3, 5) / adc_full_scale * vref_volts;

    /* One text line per channel; the second argument steps 0, 2, 4. */
    sprintf(adcbuff1, "V1:%.2fV", ADC_value1);
    oled_show_string(24, 0, adcbuff1, 2);

    sprintf(adcbuff2, "V2:%.2fV", ADC_value2);
    oled_show_string(24, 2, adcbuff2, 2);

    sprintf(adcbuff3, "V3:%.2fV", ADC_value3);
    oled_show_string(24, 4, adcbuff3, 2);
}
enter image description here
I have tried changing the ADC's sampling time (the length of the acquisition cycle), but the result is the same: the readings still fluctuate severely.

How to find the first value of Bollinger Bands when bar open

Actually, the Bollinger Bands code is:
//@version=4
study(title="AAAA", shorttitle="AAAA", overlay=true)
// Lookback length and standard-deviation multiplier of the bands.
len = 5
multi = 2
// Middle band: simple moving average of the close over len bars.
bb5med = sma(close, len)
// Band half-width: multiplier times the standard deviation of the close.
devBB5 = multi * stdev(close, len)
bb5top = bb5med + devBB5
bb5bot = bb5med - devBB5
I would want to find the first value of those 3 lines when the new bar comes, means, when close==open.
Also, I need it to work when I change the len to 20, 50 and/or when I change the multi to 3
Please help me. Thank you.
//@version=5
indicator("BB Open", overlay = true)
len = input.int(20)
mult = input.float(2.000)
// Mean of the previous len - 1 closes plus the current bar's open.
basis = (math.sum(close, len - 1)[1] + open) / len
// Sum of squared deviations from that mean over the same len values.
float dev_sum = 0.0
for i = 1 to len - 1
    dev_sum += math.pow(basis - close[i], 2)
dev_sum += math.pow(basis - open, 2)
stdev = math.sqrt(dev_sum / len)
up = basis + stdev * mult
dn = basis - stdev * mult
plot(basis, color = color.yellow)
plot(up)
plot(dn)
Function :
// Bollinger Bands as they stand when a bar opens: the current bar's open is
// used in place of its close. Returns [basis, upper band, lower band].
f_BBopen(_close, _open, _len, _mult) =>
    // Mean of the previous _len - 1 closes plus the current open.
    _basis = (math.sum(_close, _len - 1)[1] + _open) / _len
    // Sum of squared deviations from that mean over the same _len values.
    float _dev_sum = 0.0
    for i = 1 to _len - 1
        _dev_sum += math.pow(_basis - _close[i], 2)
    _dev_sum += math.pow(_basis - _open, 2)
    _stdev = math.sqrt(_dev_sum / _len)
    _up = _basis + _stdev * _mult
    _dn = _basis - _stdev * _mult
    [_basis, _up, _dn]
[basis, up, dn] = f_BBopen(close, open, len, mult)

What causes the retired instructions to increase?

I have a 496*O(N^3) loop. I am performing a blocking optimization technique where I'm operating 2 images at a time instead of 1. In raw terms, I am unrolling the outer loop. (The non-unrolled version of the code is as shown below: ) b.t.w I'm using Intel Xeon X5365 machine that has 8 cores and it has 3GHz clock, 1333MHz bus frequency, Shared 8MB L2( 4 MB shared between every 2 core), L1-I 32KB,L1-D 32KB .
/*
 * Fragment: for each of 496 images, visit every voxel of a 256 x 256 x 256
 * grid, project it into the image with the 12 coefficients A_n[0..11],
 * bilinearly interpolate the image at the projected position, and accumulate
 * the result, weighted by 1 / w_n^2, into f_L.
 *
 * NOTE(review): the body shown never uses imageNo (it reads threadCopy[0] and
 * A_n directly), and the closing brace of the imageNo loop is not part of
 * this excerpt.
 */
for(imageNo =0; imageNo<496;imageNo++){
for (unsigned int k=0; k<256; k++)
{
/* Voxel coordinate along k: origin O_L plus index times spacing R_L. */
double z = O_L + (double)k * R_L;
for (unsigned int j=0; j<256; j++)
{
double y = O_L + (double)j * R_L;
for (unsigned int i=0; i<256; i++)
{
double x[1] = {O_L + (double)i * R_L} ;
/* w_n is the common divisor; u_n and v_n are the projected image coordinates. */
double w_n = (A_n[2] * x[0] + A_n[5] * y + A_n[8] * z + A_n[11]) ;
double u_n = ((A_n[0] * x[0] + A_n[3] * y + A_n[6] * z + A_n[9] ) / w_n);
double v_n = ((A_n[1] * x[0] + A_n[4] * y + A_n[7] * z + A_n[10]) / w_n);
/* Single-iteration loop: split (u_n, v_n) into integer pixel and fractional parts. */
for(int loop=0; loop<1;loop++)
{
px_x[loop] = (int) floor(u_n);
px_y[loop] = (int) floor(v_n);
alpha[loop] = u_n - px_x[loop] ;
beta[loop] = v_n - px_y[loop] ;
}
/*
 * Fetch the four neighbouring pixels. pixel_1[0] = (x, y), pixel_1[2] = (x+1, y),
 * pixel_1[1] = (x, y+1), pixel_1[3] = (x+1, y+1). Any neighbour outside
 * [0, S_x) x [0, S_y) reads as 0.0.
 */
if(px_y[0]>=0 && px_y[0]<(int)threadCopy[0].S_y)
{
if (px_x[0]>=0 && px_x[0]<(int)threadCopy[0].S_x )
///////////////////(i,j) pixels ///////////////////////////////
pixel_1[0] = threadCopy[0].I_n[px_y[0] * threadCopy[0].S_x + px_x[0]];
else
pixel_1[0] =0.0;
if (px_x[0]+1>=0 && px_x[0]+1<(int)threadCopy[0].S_x)
/////////////////// (i+1, j) pixels/////////////////////////
pixel_1[2] = threadCopy[0].I_n[px_y[0] * threadCopy[0].S_x + (px_x[0]+1)];
else
pixel_1[2] = 0.0;
}
else{
pixel_1[0] =0.0;
pixel_1[2] =0.0;
}
/* Same bounds checks for the row below (px_y + 1). */
if( px_y[0]+1>=0 && px_y[0]+1<(int)threadCopy[0].S_y)
{
if (px_x[0]>=0 && px_x[0]<(int)threadCopy[0].S_x)
pixel_1[1] = threadCopy[0].I_n[(px_y[0]+1) * threadCopy[0].S_x + px_x[0]];
else
pixel_1[1] = 0.0;
if (px_x[0]+1>=0 && px_x[0]+1<(int)threadCopy[0].S_x)
pixel_1[3] = threadCopy[0].I_n[(px_y[0]+1) * threadCopy[0].S_x + (px_x[0]+1)];
else
pixel_1[3] = 0.0;
}
else{
pixel_1[1] = 0.0;
pixel_1[3] = 0.0;
}
/* Bilinear blend of the four neighbours using the fractional parts alpha and beta. */
pix_1 = (1.0 - alpha[0]) * (1.0 - beta[0]) * pixel_1[0] + (1.0 - alpha[0]) * beta[0] * pixel_1[1]
+ alpha[0] * (1.0 - beta[0]) * pixel_1[2] + alpha[0] * beta[0] * pixel_1[3];
/* Accumulate into the voxel; the index uses L, presumably equal to the loop bound 256 - confirm. */
f_L[k * L * L + j * L + i] += (float)(1.0 / (w_n * w_n) * pix_1);
}
}
}
I profiled the results using Intel Vtune-2013 (Using binary created from gcc-4.1) and I can see that there is 40% reduction in memory bandwidth usage which was expected because 2 images are being processed for every iteration.(f_L store operation causes 8 bytes of traffic for every voxel). This accounts to 11.7% reduction in bus cycles! Also, since the block size is increased in the inner loop, the resource stalls decrease by 25.5%. These 2 accounts for 18% reduction in response time.
The mystery is: why did the number of instructions retired increase by 7.9%? (This accounts for a 6.51% increase in response time.) The only possible reason I can think of is:
1. Since the number of branch instructions increase inside the block (and core architecture has 8 bit global history) retired branch instruction increased by 2.5%( Although, mis-prediction remained the same! I know, smells fishy right?!!). But I am still missing answer for the rest 5.4%! Could anyone please shed me light in any direction? I'm completely out of options and No way to think. Thanks a lot!!

Alsa mixer and GtkVolumeButton

I wrote this code to get and set the ALSA mixer volume:
/* Mixer element that all volume calls below operate on; must be non-NULL before use. */
snd_mixer_elem_t *elem = NULL;
/* alsa_min / alsa_max: bounds of the element's raw volume range; alsa_vol: cached volume, kept in percent (0..100). */
long alsa_min, alsa_max, alsa_vol;
/**
 * Read the current playback volume of the mixer element.
 *
 * For a mono element the raw mixer value is returned unchanged. Otherwise the
 * raw values of all existing playback channels are averaged, alsa_vol is
 * refreshed when the mixer no longer matches it, and alsa_vol (percent) is
 * returned.
 *
 * @return Raw value for mono elements, percentage for multi-channel elements,
 *         0 if no channel could be read.
 */
int alsa_get_volume( void )
{
    long val = 0;

    assert (elem);

    if (snd_mixer_selem_is_playback_mono(elem)) {
        snd_mixer_selem_get_playback_volume(elem, SND_MIXER_SCHN_MONO, &val);
        return val;
    } else {
        const long range = alsa_max - alsa_min;
        int c, n = 0;
        long sum = 0;

        for (c = 0; c <= SND_MIXER_SCHN_LAST; c++) {
            if (snd_mixer_selem_has_playback_channel(elem, c)) {
                /* Read channel c itself, not front-left for every channel. */
                if (snd_mixer_selem_get_playback_volume(elem, c, &val) < 0) {
                    continue;
                }
                sum += val;
                n++;
            }
        }
        if (! n) {
            return 0;
        }
        if (range == 0) {
            /* Degenerate range: no percentage can be derived from it. */
            return alsa_vol;
        }

        /* Offset from alsa_min, mirroring alsa_set_volume which adds it back. */
        val = sum / n - alsa_min;
        /* Raw offset that the cached percentage corresponds to. */
        sum = (long)((double)(alsa_vol * range) / 100. + 0.5);
        if (sum != val) {
            /* The mixer was changed behind our back: resync the cache. */
            alsa_vol = (long)(((val * 100.) / range) + 0.5);
        }
        return alsa_vol;
    }
}
/*
 * Change the volume by percentdiff percentage points relative to the current
 * level, clamp the result to 0..100, write it to every playback channel and
 * switch playback on. Returns the new percentage.
 */
int alsa_set_volume( int percentdiff )
{
    long target;
    long span;
    long raw_offset;

    /* Called for its side effect on alsa_vol; the return value is not used. */
    alsa_get_volume();

    target = alsa_vol + percentdiff;
    if (target > 100) {
        target = 100;
    } else if (target < 0) {
        target = 0;
    }
    alsa_vol = target;

    /* Percent -> raw offset inside the mixer range, rounded to nearest. */
    span = alsa_max - alsa_min;
    raw_offset = (long)((alsa_vol * span / 100.) + 0.5);

    snd_mixer_selem_set_playback_volume_all(elem, raw_offset + alsa_min);
    snd_mixer_selem_set_playback_switch_all(elem, 1);

    muted = 0;
    mutecount = 0;

    return alsa_vol;
}
I want the ALSA mixer volume to be changed by a GtkVolumeButton. I tried the code below, but whenever the value of the GTK button is moved up or down, the ALSA mixer always jumps to 100%:
/* Return the scale button's value multiplied by 100 and truncated to an int. */
int gtk_volume_button_get_value (GtkWidget *button)
{
    const gdouble scaled = gtk_scale_button_get_value(GTK_SCALE_BUTTON(button)) * 100;

    return (int) scaled;
}
/* Set the scale button to value / 100 (the inverse of the getter's scaling). */
void gtk_volume_button_set_value (GtkWidget *button, int value)
{
    const gdouble fraction = (gdouble) value / 100;

    gtk_scale_button_set_value(GTK_SCALE_BUTTON(button), fraction);
}
/*
 * Signal handler: reads the percentage from the global volume_button (not
 * from the button argument) and passes it to alsa_set_volume().
 */
void volume_value_changed_cb(GtkVolumeButton *button, gpointer user_data)
{
    const int percent = gtk_volume_button_get_value(volume_button);
    const int vol = (int)(percent + 0.5);

    alsa_set_volume(vol);
}
Please help me write correct code for the GtkVolumeButton.
Your problem has nothing to do with GtkVolume. In fact, it comes from you using two different approaches to handle volume. alsa_get_volume gives you an absolute sound level, which is an integer. One would expect alsa_set_volume to accept the same kind of value range. And that's how you use it in volume_value_changed_cb: « get the volume level of the volume control, between 0 and 100, and set it as current volume. ».
However, the implementation is completely different. It is implemented as if you wanted to tell it « add or subtract x% of the current sound volume ». You get the current volume level and add that percentage, so you are computing a relative sound level, not an absolute one. So, if your initial sound level is 50% and you want to lower it to 45%, one would expect to call alsa_set_volume (45) to do it. But currently, calling alsa_set_volume (45) will set alsa_vol to 50 + 45 = 95%.
So you need to use absolute volume, not relative.
/*
 * Set the playback volume to an absolute level.
 *
 * newvol: Desired volume level in the [0;100] range; values outside the
 *         range are clamped.
 * Returns the percentage that was actually applied.
 */
int alsa_set_volume (int newvol)
{
    long volume;

    /* Use the parameter itself; the original referred to an undeclared name. */
    alsa_vol = CLAMP(newvol, 0, 100);
    /* Map the percentage onto the mixer's raw range [alsa_min, alsa_max]. */
    volume = (long)((alsa_vol * (alsa_max - alsa_min) / 100.) + alsa_min);
    snd_mixer_selem_set_playback_volume_all(elem, volume);
    snd_mixer_selem_set_playback_switch_all(elem, 1);
    muted = 0;
    mutecount = 0;
    return alsa_vol;
}

How to encode using the FFMpeg in Android (using H263)

I am trying to follow the sample code on encoding in the ffmpeg document and successfully build a application to encode and generate a mp4 file but I face the following problems:
1) I am using the H263 for encoding but I can only set the width and height of the AVCodecContext to 176x144, for other case (like 720x480 or 640x480) it will return fail.
2) I can't play the output mp4 file with the default Android player. Doesn't it support H263 mp4 files? P.S. I can play the file with other players.
3) Is there any sample code on encoding other video frame to make a new video (which mean decode the video and encode it back in different quality setting, also i would like to modify the frame content)?
Here is my code, thanks!
/* Write one encoder output chunk; returns 0 on success (or nothing to write), -1 on a short write. */
static int ffenc_write_chunk(FILE *file, const uint8_t *data, int len)
{
    if (len <= 0) {
        return 0;
    }
    return fwrite(data, 1, (size_t)len, file) == (size_t)len ? 0 : -1;
}

/**
 * JNI entry point: encode a synthetic moving test pattern with the H.263
 * encoder and write the raw encoder output to a file.
 *
 * @param env       JNI environment.
 * @param thiz      Calling Java object (unused).
 * @param filename  Path of the output file.
 * @return 0 on success, -1 when any setup, encode or write step fails.
 *         Every resource acquired up to the failure is released.
 */
JNIEXPORT jint JNICALL Java_com_ffmpeg_encoder_FFEncoder_nativeEncoder(JNIEnv* env, jobject thiz, jstring filename){
    AVCodec *codec = NULL;
    AVCodecContext *codecCtx = NULL;
    AVFrame *picture = NULL;
    FILE *file = NULL;
    uint8_t *output_buffer = NULL;
    uint8_t *picture_buffer = NULL;
    const char *mfileName = NULL;
    int codec_opened = 0;
    jint result = -1;
    int i;
    int l;
    int x;
    int y;
    int size;
    int out_size = 0;
    int output_buffer_size;
    /* Manual Variables */
    const int fps = 30;
    const int videoLength = 5;

    (void)thiz;

    LOGI("nativeEncoder()");
    avcodec_register_all();
    avcodec_init();
    av_register_all();

    /* find the H263 video encoder */
    codec = avcodec_find_encoder(CODEC_ID_H263);
    if (!codec) {
        LOGI("avcodec_find_encoder() run fail.");
        goto cleanup;
    }

    codecCtx = avcodec_alloc_context();
    picture = avcodec_alloc_frame();
    if (!codecCtx || !picture) {
        LOGI("avcodec_alloc_context()/avcodec_alloc_frame() run fail.");
        goto cleanup;
    }

    /* put sample parameters */
    codecCtx->bit_rate = 400000;
    /* resolution must be a multiple of two */
    codecCtx->width = 176;
    codecCtx->height = 144;
    /* frames per second */
    codecCtx->time_base = (AVRational){1,fps};
    codecCtx->pix_fmt = PIX_FMT_YUV420P;
    codecCtx->codec_id = CODEC_ID_H263;
    codecCtx->codec_type = AVMEDIA_TYPE_VIDEO;

    /* open it */
    if (avcodec_open(codecCtx, codec) < 0) {
        LOGI("avcodec_open() run fail.");
        goto cleanup;
    }
    codec_opened = 1;

    mfileName = (*env)->GetStringUTFChars(env, filename, 0);
    if (!mfileName) {
        LOGI("GetStringUTFChars() run fail.");
        goto cleanup;
    }
    file = fopen(mfileName, "wb");
    /* The path is only needed for fopen(); release it on both outcomes. */
    (*env)->ReleaseStringUTFChars(env, filename, mfileName);
    if (!file) {
        LOGI("fopen() run fail.");
        goto cleanup;
    }

    /* alloc image and output buffer */
    output_buffer_size = 100000;
    output_buffer = malloc(output_buffer_size);
    size = codecCtx->width * codecCtx->height;
    picture_buffer = malloc((size * 3) / 2); /* size for YUV 420 */
    if (!output_buffer || !picture_buffer) {
        LOGI("malloc() run fail.");
        goto cleanup;
    }

    /* Planar layout: full-size Y plane followed by quarter-size Cb and Cr planes. */
    picture->data[0] = picture_buffer;
    picture->data[1] = picture->data[0] + size;
    picture->data[2] = picture->data[1] + size / 4;
    picture->linesize[0] = codecCtx->width;
    picture->linesize[1] = codecCtx->width / 2;
    picture->linesize[2] = codecCtx->width / 2;

    for (l = 0; l < videoLength; l++) {
        /* encode 1 second of video */
        for (i = 0; i < fps; i++) {
            /* prepare a dummy image YCbCr */
            /* Y */
            for (y = 0; y < codecCtx->height; y++) {
                for (x = 0; x < codecCtx->width; x++) {
                    picture->data[0][y * picture->linesize[0] + x] = x + y + i * 3;
                }
            }
            /* Cb and Cr */
            for (y = 0; y < codecCtx->height / 2; y++) {
                for (x = 0; x < codecCtx->width / 2; x++) {
                    picture->data[1][y * picture->linesize[1] + x] = 128 + y + i * 2;
                    picture->data[2][y * picture->linesize[2] + x] = 64 + x + i * 5;
                }
            }

            /* encode the image; a negative size is an error, not a byte count */
            out_size = avcodec_encode_video(codecCtx, output_buffer, output_buffer_size, picture);
            if (out_size < 0) {
                LOGI("avcodec_encode_video() run fail.");
                goto cleanup;
            }
            if (ffenc_write_chunk(file, output_buffer, out_size) != 0) {
                LOGI("fwrite() run fail.");
                goto cleanup;
            }
        }

        /* get the delayed frames; stop on 0 (drained) and bail out on error */
        while (out_size > 0) {
            out_size = avcodec_encode_video(codecCtx, output_buffer, output_buffer_size, NULL);
            if (out_size < 0) {
                LOGI("avcodec_encode_video() run fail.");
                goto cleanup;
            }
            if (ffenc_write_chunk(file, output_buffer, out_size) != 0) {
                LOGI("fwrite() run fail.");
                goto cleanup;
            }
        }
    }

    /* add sequence end code to have a real mpeg file */
    output_buffer[0] = 0x00;
    output_buffer[1] = 0x00;
    output_buffer[2] = 0x01;
    output_buffer[3] = 0xb7;
    if (ffenc_write_chunk(file, output_buffer, 4) != 0) {
        LOGI("fwrite() run fail.");
        goto cleanup;
    }

    result = 0;

cleanup:
    /* fclose() flushes buffered data, so its failure means the file is incomplete. */
    if (file && fclose(file) != 0) {
        LOGI("fclose() run fail.");
        result = -1;
    }
    free(picture_buffer);
    free(output_buffer);
    if (codec_opened) {
        avcodec_close(codecCtx);
    }
    av_free(codecCtx);
    av_free(picture);
    LOGI("finish");
    return result;
}
H263 accepts only certain resolutions:
128 x 96
176 x 144
352 x 288
704 x 576
1408 x 1152
It will fail with anything else.
The code supplied in the question (I used it myself at first) seems to only generate a very rudimentary, if any, container format.
I found that this example, http://cekirdek.pardus.org.tr/~ismail/ffmpeg-docs/output-example_8c-source.html, worked much better as it creates a real container for the video and audio streams. My video is now displayable on the Android device.