What causes the retired instructions to increase? - cpu-architecture

I have a 496*O(N^3) loop. I am performing a blocking optimization technique where I'm operating 2 images at a time instead of 1. In raw terms, I am unrolling the outer loop. (The non-unrolled version of the code is as shown below: ) b.t.w I'm using Intel Xeon X5365 machine that has 8 cores and it has 3GHz clock, 1333MHz bus frequency, Shared 8MB L2( 4 MB shared between every 2 core), L1-I 32KB,L1-D 32KB .
for(imageNo =0; imageNo<496;imageNo++){
for (unsigned int k=0; k<256; k++)
{
double z = O_L + (double)k * R_L;
for (unsigned int j=0; j<256; j++)
{
double y = O_L + (double)j * R_L;
for (unsigned int i=0; i<256; i++)
{
double x[1] = {O_L + (double)i * R_L} ;
double w_n = (A_n[2] * x[0] + A_n[5] * y + A_n[8] * z + A_n[11]) ;
double u_n = ((A_n[0] * x[0] + A_n[3] * y + A_n[6] * z + A_n[9] ) / w_n);
double v_n = ((A_n[1] * x[0] + A_n[4] * y + A_n[7] * z + A_n[10]) / w_n);
for(int loop=0; loop<1;loop++)
{
px_x[loop] = (int) floor(u_n);
px_y[loop] = (int) floor(v_n);
alpha[loop] = u_n - px_x[loop] ;
beta[loop] = v_n - px_y[loop] ;
}
if(px_y[0]>=0 && px_y[0]<(int)threadCopy[0].S_y)
{
if (px_x[0]>=0 && px_x[0]<(int)threadCopy[0].S_x )
///////////////////(i,j) pixels ///////////////////////////////
pixel_1[0] = threadCopy[0].I_n[px_y[0] * threadCopy[0].S_x + px_x[0]];
else
pixel_1[0] =0.0;
if (px_x[0]+1>=0 && px_x[0]+1<(int)threadCopy[0].S_x)
/////////////////// (i+1, j) pixels/////////////////////////
pixel_1[2] = threadCopy[0].I_n[px_y[0] * threadCopy[0].S_x + (px_x[0]+1)];
else
pixel_1[2] = 0.0;
}
else{
pixel_1[0] =0.0;
pixel_1[2] =0.0;
}
if( px_y[0]+1>=0 && px_y[0]+1<(int)threadCopy[0].S_y)
{
if (px_x[0]>=0 && px_x[0]<(int)threadCopy[0].S_x)
pixel_1[1] = threadCopy[0].I_n[(px_y[0]+1) * threadCopy[0].S_x + px_x[0]];
else
pixel_1[1] = 0.0;
if (px_x[0]+1>=0 && px_x[0]+1<(int)threadCopy[0].S_x)
pixel_1[3] = threadCopy[0].I_n[(px_y[0]+1) * threadCopy[0].S_x + (px_x[0]+1)];
else
pixel_1[3] = 0.0;
}
else{
pixel_1[1] = 0.0;
pixel_1[3] = 0.0;
}
pix_1 = (1.0 - alpha[0]) * (1.0 - beta[0]) * pixel_1[0] + (1.0 - alpha[0]) * beta[0] * pixel_1[1]
+ alpha[0] * (1.0 - beta[0]) * pixel_1[2] + alpha[0] * beta[0] * pixel_1[3];
f_L[k * L * L + j * L + i] += (float)(1.0 / (w_n * w_n) * pix_1);
}
}
}
I profiled the results using Intel Vtune-2013 (Using binary created from gcc-4.1) and I can see that there is 40% reduction in memory bandwidth usage which was expected because 2 images are being processed for every iteration.(f_L store operation causes 8 bytes of traffic for every voxel). This accounts to 11.7% reduction in bus cycles! Also, since the block size is increased in the inner loop, the resource stalls decrease by 25.5%. These 2 accounts for 18% reduction in response time.
The mystery question is, why are instruction retired increased by 7.9%? (Which accounts for increase in response time by 6.51%) - Possible reason I could this of is:
1. Since the number of branch instructions increase inside the block (and core architecture has 8 bit global history) retired branch instruction increased by 2.5%( Although, mis-prediction remained the same! I know, smells fishy right?!!). But I am still missing answer for the rest 5.4%! Could anyone please shed me light in any direction? I'm completely out of options and No way to think. Thanks a lot!!

Related

Segfault while running openMp on CMSIS NN function

I'm trying to execute through multiple threads the following segment of code, which is part of CMSIS NN lib from ARM, but I'm observing segfault when adding the proper pragmas with openMP. The code is available here
#pragma omp parallel for collapse(2) shared(pOut) firstprivate(pBuffer, dim_im_out, stride, padding, dim_kernel, dim_im_in, out_shift, bias, ch_im_out)
for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
printf("_%d",omp_get_thread_num());
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* Equivalent to arm_fill_q15(0, pBuffer, ch_im_in) with assumption: ch_im_in = 3 */
*__SIMD32(pBuffer) = 0x0;
*(pBuffer + 2) = 0;
pBuffer += 3;
} else
{
/*
* Equivalent to:
* arm_q7_to_q15_no_shift( (q7_t*)Im_in+(i_ker_y*dim_im_in+i_ker_x)*3, pBuffer, 3);
*/
const q7_t *pPixel = Im_in + (i_ker_y * dim_im_in + i_ker_x) * 3;
q31_t buf = arm_nn_read_q7x4(pPixel);
union arm_nnword top;
union arm_nnword bottom;
top.word = __SXTB16(buf);
bottom.word = __SXTB16(__ROR(buf, 8));
*pBuffer++ = top.half_words[0];
*__SIMD32(pBuffer) = __PKHBT(bottom.word, top.word, 0);
pBuffer += 2;
}
}
}
#pragma omp critical
if (pBuffer == bufferA + 2 * 3 * dim_kernel * dim_kernel)
{
pOut =
arm_nn_mat_mult_kernel_q7_q15(wt, bufferA,
ch_im_out,
3 * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
It looks like the execution goes fine till certain point where the ARM CPU gets lost while scheduling the number of threads...
... Application end!
[Parallel] RUN: Startup Convolution - Layer 1
_0_0_0_0_0_2_2_2_2_2_1_1_1_1_1_3_3_3_3_3_1_1_1_1_1_3_3_3_3_3_0_0_0_0_0_3_3_3_3_3_1_1_1_1_1_2_2_3_3_3_3_3_1_1_1_1_1_2_2_2_2_2_0_0_0_0_0_3_3_3_3_3_2_2_2_2_2_0_0_0_0_0_1_1_1_1_1_0_0_0_0_0_2_2_2_2_2_0_0_0_0_0_2_2_2_2_2_0_0_0_0_0_1_1_1_1_1_0_0_0_0_0_1_1_1_1_1_0_0_0_0_0_1_1_1_1_1_2_2_2_2_2_0_0_0_0_0_2_2_2_2_2_1_1_1_1_1_3_3_2_2_2_2_2_0_0_0_0_0_1_1_1_1_1_0_0_0_0_0_3_3_3_3_3_0_0_0_0_0_3_3_3_3_3_2_2_2_2_2_0_0_0_0_0_3_3_3_3_3_0_0_0_0_0_3_3_3_3_3_1_1_1_1_1_3_3_3_3_3_2_2_2_2_2_1_1_1_1_1_3_3_3_3_3_2_2_2_2_2_0_0_0_0_0_1_1_1_1_1_2_2_2_2_2_1_1_1_1_1_3_3_3_3_3_2_2_2_2_2_3_3_3_3_3_1_1_1_1_1_0_0_0_0_0_2_2_2_2_2_1_1_1_1_1_3_3_3_3_3Segmentation fault

How to find the first value of Bollinger Bands when bar open

Actually, the Bollinger Bands code is:
//#version=4
study(title="AAAA", shorttitle="AAAA", overlay=true)
len = 5
multi = 2
bb5med = sma(close, len)
devBB5 = mult2 * stdev(close, len)
bb5top = bb5med + devBB5
bb5bot = bb5med - devBB5
I would want to find the first value of those 3 lines when the new bar comes, means, when close==open.
Also, I need it to work when I change the len to 20, 50 and/or when I change the multi to 3
Please help me. Thank you.
//#version=5
indicator("BB Open", overlay = true)
len = input.int(20)
mult = input.float(2.000)
basis = (math.sum(close, len - 1)[1] + open) / len
float dev_sum = 0.0
for i = 1 to len - 1
dev_sum += math.pow(basis - close[i], 2)
dev_sum += math.pow(basis - open, 2)
stdev = math.sqrt(dev_sum / len)
up = basis + stdev * mult
dn = basis - stdev * mult
plot(basis, color = color.yellow)
plot(up)
plot(dn)
Function :
f_BBopen(_close, _open, _len, _mult) =>
_basis = (math.sum(_close, _len - 1)[1] + _open) / _len
float _dev_sum = 0.0
for i = 1 to _len - 1
_dev_sum += math.pow(_basis - _close[i], 2)
_dev_sum += math.pow(_basis - _open, 2)
_stdev = math.sqrt(_dev_sum / _len)
_up = _basis + _stdev * _mult
_dn = _basis - _stdev * _mult
[_basis, _up, _dn]
[basis, up, dn] = f_BBopen(close, open, len, mult)

Improve speed on joining multiple images

What I'm trying to do is take numerous(up to 48) 1024x768 images that are color coded images(weather maps, the precip overlay) and add up the precip to fall over the course of time. When I run into non-precip I want to take a box 5x5 around the pixel in question and average the value and use that value as the value of the pixel in question.
I can do this but it takes a long time to accomplish it. I have heard numpy could improve the speed but I still haven't been able to wrap my mind around how it going to improve the speed given the sequence of events that have to take place. It seems like I would still have to do it pixel by pixel. I've included an idea of the code I'm using to accomplish this SLOWLY.
I have this actually as two separate program, one to download the images and the other does the image processing(working up toward merging the two programs in the near future, just trying to get all the bugs worked out before the merger.) Hence some of the download coding may look a little strange. I figure I could probably write the file straight to a variable but I haven't been doing it that way so I stuck with a bit longer approach.
Is there anyway of increasing the speed? I don't see anyway of avoiding pixel by pixel due to the color coding scheme in place(look at the color bar in the lower left it shows the full color scheme...I only included part of it for demo purposes in the coding below.) Some of the coding may be a bit rough since I chopped from the two programs and put the important parts in here...it shows what I'm currently doing and gives the full idea of how I'm going about doing it.
Also, if you happen to see this three to four or more days after it was posted you would need to change the date in the download link to the current date. The files are only kept on the server for 3-4 days before they are removed.
from PIL import Image
import time
import urllib
import os
pathstr = '/'
url = 'http://mag.ncep.noaa.gov/GemPakTier/MagGemPakImages/gfs/20140216/00/gfs_namer_006_1000_500_thick.gif'
urllib.urlretrieve(url,str(pathstr + '20140216006.gif'))
url = 'http://mag.ncep.noaa.gov/GemPakTier/MagGemPakImages/gfs/20140216/00/gfs_namer_012_1000_500_thick.gif'
urllib.urlretrieve(url,str(pathstr + '20140216012.gif'))
url = 'http://mag.ncep.noaa.gov/GemPakTier/MagGemPakImages/gfs/20140216/00/gfs_namer_018_1000_500_thick.gif'
urllib.urlretrieve(url,str(pathstr + '20140216018.gif'))
url = 'http://mag.ncep.noaa.gov/GemPakTier/MagGemPakImages/gfs/20140216/00/gfs_namer_024_1000_500_thick.gif'
urllib.urlretrieve(url,str(pathstr + '20140216024.gif'))
class Convert():
def __init__(self):
self.colorscale2 = [(255,255,255),(127,255,0),(0,205,0),(145,44,238),(16,78,139),
(30,144,255),(0,178,238),(0,238,238),(137,104,205),(0,139,0),
(139,0,139),(139,0,0),(205,0,0),(238,64,0),(255,127,0),(205,133,0),
(255,215,0),(238,238,0),(255,255,0),(139,71,38),(255,0,0),(0,0,255),(0,0,0)]
self.x = 0
self.y = 0
self.grid = 0
self.moist = 0
self.scan = 0
self.turn = 0
self.precip = {}
start = time.time()
for i in range(6, 30, 6):
if i < 10:
filename = '/2014021600' + str(i) + '.gif'
else:
filename = '/201402160' + str(i) + '.gif'
self.im1 = Image.open(filename).convert('RGB')
self.image = self.im1.getdata()
self.size = width, height = self.im1.size
self.coordinates = self.x,self.y = width, height
self.getprecip()
self.turn = 1
print (time.time()-start)
def getprecip(self):
for self.x in range(81, 950):
for self.y in range(29, 749):
if self.turn == 0:
self.moist = 0
else:
self.moist = self.precip[self.x,self.y]
self.coordinates = self.x,self.y
self.scan = 0
self.imagescan()
if self.turn == 0:
self.precip[self.x,self.y] = self.moist
else:
self.precip[self.x,self.y] += self.moist
def imagescan(self):
if self.image[(self.y * 1024) + self.x] == self.colorscale2[0]:
self.moist =0
self.grid -=1
elif self.image[(self.y * 1024) + self.x] == self.colorscale2[1]:
self.moist =.01
elif self.image[(self.y * 1024) + self.x] == self.colorscale2[2]:
self.moist =.1
elif self.image[(self.y * 1024) + self.x] == self.colorscale2[3]:
self.moist =.25
elif self.image[(self.y * 1024) + self.x] == self.colorscale2[4]:
self.moist =.5
#on and on through self.colorscale2[18]
if self.scan == 1:
self.grid += 1
if self.scan == 0:
x = self.x
y = self.y
self.deliso540()
self.x = x
self.y = y
def deliso540(self):
self.grid = 1
self.scan = 1
for p in range(self.x-2,self.x+2):
for q in range(self.y-2,self.y+2):
self.x = p
self.y = q
self.imagescan()
self.moist = self.moist / self.grid

Calculate IRR (Internal Rate Return) and NPV programmatically in Objective-C

I am developing a financial app and require IRR (in-built functionality of Excel) calculation and found such great tutorials in C here and such answer in C# here.
I implemented code of the C language above, but it gives a perfect result when IRR is in positive. It is not returning a negative value when it should be. Whereas in Excel =IRR(values,guessrate) returns negative IRR as well for some values.
I have referred to code in above C# link too, and it seems that it follows good procedures and returns errors and also hope that it returns negative IRR too, the same as Excel. But I am not familiar with C#, so I am not able to implement the same code in Objective-C or C.
I am writing C code from the above link which I have implemented for helping you guys.
#define LOW_RATE 0.01
#define HIGH_RATE 0.5
#define MAX_ITERATION 1000
#define PRECISION_REQ 0.00000001
double computeIRR(double cf[], int numOfFlows)
{
int i = 0, j = 0;
double m = 0.0;
double old = 0.00;
double new = 0.00;
double oldguessRate = LOW_RATE;
double newguessRate = LOW_RATE;
double guessRate = LOW_RATE;
double lowGuessRate = LOW_RATE;
double highGuessRate = HIGH_RATE;
double npv = 0.0;
double denom = 0.0;
for (i=0; i<MAX_ITERATION; i++)
{
npv = 0.00;
for (j=0; j<numOfFlows; j++)
{
denom = pow((1 + guessRate),j);
npv = npv + (cf[j]/denom);
}
/* Stop checking once the required precision is achieved */
if ((npv > 0) && (npv < PRECISION_REQ))
break;
if (old == 0)
old = npv;
else
old = new;
new = npv;
if (i > 0)
{
if (old < new)
{
if (old < 0 && new < 0)
highGuessRate = newguessRate;
else
lowGuessRate = newguessRate;
}
else
{
if (old > 0 && new > 0)
lowGuessRate = newguessRate;
else
highGuessRate = newguessRate;
}
}
oldguessRate = guessRate;
guessRate = (lowGuessRate + highGuessRate) / 2;
newguessRate = guessRate;
}
return guessRate;
}
I have attached the result for some value which are different in Excel and the above C language code.
Values: Output of Excel: -33.5%
1 = -18.5, Output of C code: 0.010 or say (1.0%)
2 = -18.5,
3 = -18.5,
4 = -18.5,
5 = -18.5,
6 = 32.0
Guess rate: 0.1
Since low_rate and high_rate are both positive, you're not able to get a negative score. You have to change:
#define LOW_RATE 0.01
to, for example,
#define LOW_RATE -0.5

Alsa mixer and GtkVolumeButton

I make code to get and set alsa mixer volume:
snd_mixer_elem_t *elem = NULL;
long alsa_min, alsa_max, alsa_vol;
int alsa_get_volume( void )
{
long val;
assert (elem);
if (snd_mixer_selem_is_playback_mono(elem)) {
snd_mixer_selem_get_playback_volume(elem, SND_MIXER_SCHN_MONO, &val);
return val;
} else {
int c, n = 0;
long sum = 0;
for (c = 0; c <= SND_MIXER_SCHN_LAST; c++) {
if (snd_mixer_selem_has_playback_channel(elem, c)) {
snd_mixer_selem_get_playback_volume(elem, SND_MIXER_SCHN_FRONT_LEFT, &val);
sum += val;
n++;
}
}
if (! n) {
return 0;
}
val = sum / n;
sum = (long)((double)(alsa_vol * (alsa_max - alsa_min)) / 100. + 0.5);
if (sum != val) {
alsa_vol = (long)(((val * 100.) / (alsa_max - alsa_min)) + 0.5);
}
return alsa_vol;
}
}
int alsa_set_volume( int percentdiff )
{
long volume;
alsa_get_volume();
alsa_vol += percentdiff;
if( alsa_vol > 100 ) alsa_vol = 100;
if( alsa_vol < 0 ) alsa_vol = 0;
volume = (long)((alsa_vol * (alsa_max - alsa_min) / 100.) + 0.5);
snd_mixer_selem_set_playback_volume_all(elem, volume + alsa_min);
snd_mixer_selem_set_playback_switch_all(elem, 1);
muted = 0;
mutecount = 0;
return alsa_vol;
}
I wont to make alsa mixer volume to changed by GtkVolumeButton. Tried this but when value from gtk button is changed up or down, alsa mixer always jumps to 100 %:
int gtk_volume_button_get_value (GtkWidget *button)
{
return (int) (gtk_scale_button_get_value(GTK_SCALE_BUTTON(button)) * 100);
}
void gtk_volume_button_set_value (GtkWidget *button, int value)
{
gtk_scale_button_set_value(GTK_SCALE_BUTTON(button), (gdouble) value / 100);
}
void volume_value_changed_cb(GtkVolumeButton *button, gpointer user_data)
{
int vol = (int)(gtk_volume_button_get_value(volume_button) + 0.5);
alsa_set_volume(vol);
}
Please help me to write a corect code for GtkVolumeButton.
Your problem has nothing to do with GtkVolume. In fact, it comes from you using two different approaches to handle volume. alsa_get_volume gives you an absolute sound level, which is an integer. One would expect alsa_set_volume to accept the same kind of value range. And that's how you use it in volume_value_changed_cb: « get the volume level of the volume control, between 0 and 100, and set it as current volume. ».
However, the implementation is completely different. It's implemented as if you wanted to tell it « add or substract x% of the current sound volume ». You get the current volume level and add that percentage, thus you're computing a relative sound level, not an absolute one. So, if your initial sound level is 50%, and you want to lower it to 45%, one would expect you'd call alsa_set_volume (45) to do it. But currently, calling alsa_set_volume (45) will set alsa_vol to 50 + 45 = 95%.
So you need to use absolute volume, not relative.
/* newvol: Desired volume level in the [0;100] range */
int alsa_set_volume (int newvol)
{
long volume;
alsa_vol = CLAMP(absvol, 0, 100);
volume = (long)((alsa_vol * (alsa_max - alsa_min) / 100.) + alsa_min);
snd_mixer_selem_set_playback_volume_all(elem, volume);
snd_mixer_selem_set_playback_switch_all(elem, 1);
muted = 0;
mutecount = 0;
return alsa_vol;
}