Summarized question: Does unrolling a loop affect the accuracy of the computations performed within the loop? And if so, why?
Elaboration and background: I am writing a compute shader in HLSL for use in a Unity project (2021.2.9f1). Parts of my code include numerical procedures and highly oscillatory functions, meaning that high computational accuracy is essential.
When comparing my results with an equivalent procedure in Python, I noticed deviations on the order of 1e-5. This was concerning, as I did not expect such large errors to be the result of precision differences, e.g., the float precision of trigonometric or power functions in HLSL.
Ultimately, after much debugging, I now believe the choice of unrolling or not unrolling a loop to be the cause of the deviation. However, I find this strange, as I cannot find any sources indicating that unrolling a loop affects accuracy, beyond the usual "space–time tradeoff".
For clarification: if I take my Python results as the correct solution, unrolling the loop in HLSL gives me better results than not unrolling it.
Minimal working example: Below is an MWE consisting of a C# script for Unity, the corresponding compute shader where the computations are performed, and a screenshot of my console when running in Unity (2021.2.9f1). Forgive me for a somewhat messy implementation of Newton's method, but I chose to keep it since I believe it might be a cause of this deviation. That is, if I simply compute cos(x), there is no difference between the unrolled and not-unrolled versions. Nonetheless, I still fail to understand how the simple addition of [unroll(N)] in the testing kernel changes the result...
// C# for Unity
using UnityEngine;
public class UnrollTest : MonoBehaviour
{
[SerializeField] ComputeShader CS;
ComputeBuffer CBUnrolled, CBNotUnrolled;
readonly int N = 3;
private void Start()
{
CBUnrolled = new ComputeBuffer(N, sizeof(double));
CBNotUnrolled = new ComputeBuffer(N, sizeof(double));
CS.SetBuffer(0, "_CBUnrolled", CBUnrolled);
CS.SetBuffer(0, "_CBNotUnrolled", CBNotUnrolled);
CS.Dispatch(0, (int)((N + (64 - 1)) / 64), 1, 1);
double[] ansUnrolled = new double[N];
double[] ansNotUnrolled = new double[N];
CBUnrolled.GetData(ansUnrolled);
CBNotUnrolled.GetData(ansNotUnrolled);
for (int i = 0; i < N; i++)
{
Debug.Log("Unrolled ans = " + ansUnrolled[i] +
" - Not Unrolled ans = " + ansNotUnrolled[i] +
" -- Difference is: " + (ansUnrolled[i] - ansNotUnrolled[i]));
}
CBUnrolled.Release();
CBNotUnrolled.Release();
}
}
// Compute shader (HLSL)
#pragma kernel CSMain
RWStructuredBuffer<double> _CBUnrolled, _CBNotUnrolled;
// Dummy function for Newtons method
double fDummy(double k, double fnh, double h, double theta)
{
return fnh * fnh * k * h * cos(theta) * cos(theta) - (double) tanh(k * h);
}
// Derivative of Dummy function above using a central finite difference scheme.
double dfDummy(double k, double fnh, double h, double theta)
{
return (fDummy(k + (double) 1e-3, fnh, h, theta) - fDummy(k - (double) 1e-3, fnh, h, theta)) / (double) 2e-3;
}
// Function to solve.
double f(double fnh, double h, double theta)
{
// Solved using Newton's method.
int max_iter = 50;
double epsilon = 1e-8;
double fxn, dfxn;
// Define initial guess for k, hereafter denoted xn.
double xn = 10.0;
for (int n = 0; n < max_iter; n++)
{
fxn = fDummy(xn, fnh, h, theta);
if (abs(fxn) < epsilon) // A solution is found.
return xn;
dfxn = dfDummy(xn, fnh, h, theta);
if (dfxn == 0.0) // No solution found.
return xn;
xn = xn - fxn / dfxn;
}
// No solution found.
return xn;
}
[numthreads(64,1,1)]
void CSMain(uint3 threadID : SV_DispatchThreadID)
{
int N = 3;
// ---------------
double fnh = 0.9, h = 4.53052, theta = -0.161, dtheta = 0.01; // Example values.
for (int i = 0; i < N; i++) // Not being unrolled
{
_CBNotUnrolled[i] = f(fnh, h, theta);
theta += dtheta;
}
// ---------------
fnh = 0.9, h = 4.53052, theta = -0.161, dtheta = 0.01; // Example values.
[unroll(N)] for (int j = 0; j < N; j++) // Being unrolled.
{
_CBUnrolled[j] = f(fnh, h, theta);
theta += dtheta;
}
}
Image of Unity console when running the above
Edit: After some more testing, the deviation has been narrowed down to the following code, which gives a difference of about 1e-17 between the exact same code unrolled vs. not unrolled. Despite the small difference, I still consider it a valid example of the issue, as I believe the two results should be equal.
[numthreads(64, 1, 1)]
void CSMain(uint3 threadID : SV_DispatchThreadID)
{
if ((int) threadID.x != 1)
return;
int N = 3;
double k = 1.0;
// ---------------
double fnh = 0.9, h = 4.53052, theta = -0.161, dtheta = 0.01; // Example values.
for (int i = 0; i < N; i++) // Not being unrolled
{
_CBNotUnrolled[i] = (k + (double) 1e-3) * theta - (k - (double) 1e-3) * theta;
theta += dtheta;
}
// ---------------
fnh = 0.9, h = 4.53052, theta = -0.161, dtheta = 0.01; // Example values.
[unroll(N)]
for (int j = 0; j < N; j++) // Being unrolled.
{
_CBUnrolled[j] = (k + (double) 1e-3) * theta - (k - (double) 1e-3) * theta;
theta += dtheta;
}
}
Image of Unity console when running the edited script above
Edit 2: The following is the compiled code for the kernel given in Edit 1. Unfortunately, my experience with assembly language is limited, and I am not capable of spotting whether this listing shows any errors, or whether it is useful to the problem at hand.
**** Platform Direct3D 11:
Compiled code for kernel CSMain
keywords: <none>
binary blob size 648:
//
// Generated by Microsoft (R) D3D Shader Disassembler
//
//
// Note: shader requires additional functionality:
// Double-precision floating point
//
//
// Input signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// no Input
//
// Output signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// no Output
cs_5_0
dcl_globalFlags refactoringAllowed | enableDoublePrecisionFloatOps
dcl_uav_structured u0, 8
dcl_uav_structured u1, 8
dcl_input vThreadID.x
dcl_temps 2
dcl_thread_group 64, 1, 1
0: ine r0.x, vThreadID.x, l(1)
1: if_nz r0.x
2: ret
3: endif
4: dmov r0.xy, d(-0.161000l, 0.000000l)
5: mov r0.z, l(0)
6: loop
7: ige r0.w, r0.z, l(3)
8: breakc_nz r0.w
9: dmul r1.xyzw, r0.xyxy, d(1.001000l, 0.999000l)
10: dadd r1.xy, -r1.zwzw, r1.xyxy
11: store_structured u1.xy, r0.z, l(0), r1.xyxx
12: dadd r0.xy, r0.xyxy, d(0.010000l, 0.000000l)
13: iadd r0.z, r0.z, l(1)
14: endloop
15: store_structured u0.xy, l(0), l(0), l(-0.000000,-0.707432,0,0)
16: store_structured u0.xy, l(1), l(0), l(0.000000,-0.702312,0,0)
17: store_structured u0.xy, l(2), l(0), l(-918250586112.000000,-0.697192,0,0)
18: ret
// Approximately 0 instruction slots used
Edit 3: After reaching out to Microsoft (see https://learn.microsoft.com/en-us/an...nrolling-a-loop-affect-the-accuracy-of-t.html), they stated that the problem is more about Unity, because
"The pragma unroll [(n)] is keil compiler which Unity uses topic"
Answer: This is driver, hardware, compiler, and Unity dependent.
In essence, the HLSL specification has somewhat looser guarantees for rounding behavior of mathematical operations than regular IEEE-754 floating point.
First, it is implementation-dependent whether operations round up or down.
IEEE-754 requires floating-point operations to produce a result that
is the nearest representable value to an infinitely-precise result,
known as round-to-nearest-even. Direct3D 10, however, defines a looser
requirement: 32-bit floating-point operations produce a result that is
within one unit-last-place (1 ULP) of the infinitely-precise result.
This means that, for example, hardware is allowed to truncate results
to 32-bit rather than perform round-to-nearest-even, as that would
result in error of at most one ULP.
See https://learn.microsoft.com/en-us/windows/win32/direct3d10/d3d10-graphics-programming-guide-resources-float-rules#32-bit-floating-point-rules
Going one step further, the HLSL compiler itself has many fast-math optimizations that can violate IEEE-754 float conformance; see, for example:
D3DCOMPILE_IEEE_STRICTNESS - Forces strict compile, which might not allow for legacy syntax. By default, the compiler disables strictness on deprecated syntax.
D3DCOMPILE_OPTIMIZATION_LEVEL3 - Directs the compiler to use the highest optimization level. If you set this constant, the compiler produces the best possible code but might take significantly longer to do so. Set this constant for final builds of an application when performance is the most important factor.
D3DCOMPILE_PARTIAL_PRECISION - Directs the compiler to perform all computations with partial precision. If you set this constant, the compiled code might run faster on some hardware.
Source: https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/d3dcompile-constants
This particularly matters for your scenario, because if optimizations are enabled, the presence of loop unrolling can trigger constant-folding optimizations that reduce the computational cost of your code and change the precision of its results (potentially even improving them). Note that when constant folding occurs, the compiler has to decide how to perform rounding, and that may disagree with what your hardware FPUs would do.
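In fact, the disassembly in Edit 2 shows exactly this: the not-unrolled loop still executes dmul/dadd on the GPU at runtime (instructions 6-14, writing u1, i.e. _CBNotUnrolled), whereas the unrolled loop has been folded away entirely and u0 (_CBUnrolled) is filled with literal constants computed by the compiler (instructions 15-17), so the rounding of those values was done by the compiler rather than by the GPU. As a rough illustration of why the two can disagree, here is a minimal CPU-side C# sketch (my own example, not what the GPU actually executes) comparing the Edit 1 expression as written against its constant-folded form:
using System;

class FoldingSketch
{
    static void Main()
    {
        double k = 1.0, theta = -0.161;

        // As written: two products, each rounded to double, then a subtraction.
        double asWritten = (k + 1e-3) * theta - (k - 1e-3) * theta;

        // What a constant-folding compiler may emit instead: one product, rounded once.
        double folded = 2e-3 * theta;

        // The printed difference may be zero or a few ULP, depending on how the
        // intermediate products were rounded.
        Console.WriteLine($"as written: {asWritten:R}");
        Console.WriteLine($"folded:     {folded:R}");
        Console.WriteLine($"difference: {asWritten - folded:R}");
    }
}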
Oh, and note that IEEE-754 does not place constraints on the precision, let alone require implementation, of "additional operations" (e.g. sin, cos, tanh, atan, ln, etc.); it purely recommends them.
See a very common case where this goes wrong: sin gets quantized to 4 different values on Intel integrated graphics, but otherwise has reasonable precision on other hardware: sin(x) only returns 4 different values for moderately large input on GLSL fragment shader, Intel HD4000
Also, note that Unity does not guarantee that a float in a shader is actually a 32-bit float; on certain hardware (e.g. mobile), it can even be backed by a 16-bit half or an 11-bit fixed.
High precision: float
Highest precision floating point value; generally 32 bits (just like float from regular programming languages).
...
One complication of float/half/fixed data type usage is that PC GPUs are always high precision. That is, for all the PC (Windows/Mac/Linux) GPUs, it does not matter whether you write float, half or fixed data types in your shaders. They always compute everything in full 32-bit floating point precision.
The half and fixed types only become relevant when targeting mobile
GPUs, where these types primarily exist for power (and sometimes
performance) constraints. Keep in mind that you need to test your
shaders on mobile to see whether or not you are running into
precision/numerical issues.
Even on mobile GPUs, the different precision support varies between
GPU families.
Source: https://docs.unity3d.com/Manual/SL-DataTypesAndPrecision.html
I don't believe Unity exposes compiler flags to developers; you are at its whim as to what optimizations it passes to dxc/fxc. Given it's primarily used for games, you can bet they enable optimizations.
Source: https://forum.unity.com/threads/possible-to-set-directx-compiler-flags-in-shaders.453790/
Finally, check out "Floating-Point Determinism" by Bruce Dawson if you want an in-depth dive into this topic. I will add that this problem also exists if you want consistent results between languages (since languages can implement math functions in software rather than using hardware intrinsics, e.g. for better precision), when cross-compiling (since different compilers/backends can optimize differently or use different system libraries), or when running managed code across different runtimes (e.g. the JIT can apply different optimizations).
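If you want to quantify deviations like the ~1e-17 difference from Edit 1 in terms of representable doubles rather than absolute error, a ULP-distance helper is useful. Below is a minimal C# sketch (the helper names are mine; it relies only on BitConverter.DoubleToInt64Bits) that you could apply to the arrays read back in the MWE:
using System;

static class UlpUtil
{
    // Map a double to an integer such that consecutive representable doubles
    // map to consecutive integers (standard bit-twiddling trick; not valid for NaN).
    static long ToOrderedBits(double d)
    {
        long bits = BitConverter.DoubleToInt64Bits(d);
        return bits >= 0 ? bits : long.MinValue - bits;
    }

    // Number of representable doubles between a and b: 0 means numerically
    // identical, 1 means adjacent values, and so on.
    public static long UlpDistance(double a, double b)
    {
        return Math.Abs(ToOrderedBits(a) - ToOrderedBits(b));
    }
}

// Example usage with the buffers from the MWE:
// long ulps = UlpUtil.UlpDistance(ansUnrolled[i], ansNotUnrolled[i]);
A difference of only a few ULP is exactly the kind of discrepancy that the 1-ULP rule and constant folding described above can produce.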
Why am I getting the wrong answer (err2) from GoodnessOfFit.StandardError? In the code below, I do the computation myself and get the right answer (err3). I also get the right answer from GoodnessOfFit.RSquared. Note: esttime and phrf are double[], both of length 63.
Tuple<double, double> p = Fit.Line(esttime, phrf);
double ss = 0.0;
for (int j = 0; j < esttime.Length; j++)
{
est[j] = p.Item1 + p.Item2 * esttime[j];
ss += Math.Pow(est[j] - phrf[j], 2);
}
double err2 = GoodnessOfFit.StandardError(est, phrf, phrf.Length - 2);
Console.WriteLine(err2.ToString()); //writes 70.91 which is wrong
double err3 = Math.Sqrt(ss / est.Length - 2);
Console.WriteLine(err3.ToString()); // writes 12.56 which is correct
Answer: The third argument, Degrees of Freedom, is actually the number of degrees of freedom lost in the regression. So in the example it should be 2 and not phrf.Length - 2. Even so, it does not match my calculation, and Excel's, exactly.
I get the exact same result with Excel and Math.NET for a polynomial fit when I set the degrees of freedom to be the order of the polynomial fit + 1. So, in the following example:
double[] paramsPoly = MathNet.Numerics.Fit.Polynomial(X.ToArray(),Y.ToArray(),6);
The StandardError function must receive 6 + 1 as its last parameter:
double sey = MathNet.Numerics.GoodnessOfFit.StandardError(Yest, Y, 7);
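Applied to the straight-line fit from the original question, this means passing 2 (intercept and slope are the two parameters estimated by Fit.Line), not phrf.Length - 2. A minimal sketch reusing the variable names from the question:
// 2 degrees of freedom are lost by Fit.Line (intercept + slope).
double err = MathNet.Numerics.GoodnessOfFit.StandardError(est, phrf, 2);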
I have a 900×1 vector of values (in MATLAB). Each 9 consecutive values should be averaged, without overlap, resulting in a 100×1 vector of values. The problem is that the averaging should be weighted based on a weighting vector of [1 2 1;2 4 2;1 2 1]. Is there any efficient way to do that averaging? I've heard about the conv function in MATLAB; is it helpful?
conv works by sliding a kernel through your data. But in your case, you need the mask to jump through your data, so I don't think conv will work for you.
If you want to use existing MATLAB functions, you can do this (I have to assume your weighting matrix has only one dimension):
kernel = [1;2;1;2;4;2;1;2;1];
in_matrix = reshape(in_matrix, 9, 100);
base = sum(kernel);
out_matrix = bsxfun(@times, in_matrix, kernel);
result = sum(out_matrix,1)/base;
I don't know if there is any clever way to speed this up. bsxfun allows singleton expansion, but maybe not dimension reduction.
A faster way would be to use MEX. Open a new file in the editor, paste the following code, and save the file as weighted_average.c.
#include "mex.h"
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
double *in_matrix, *kernel, *out_matrix, base;
int niter;
size_t nrows_data, nrows_kernel;
/* Get number of element along first dimension of input matrix. */
nrows_kernel = mxGetM(prhs[1]);
nrows_data = mxGetM(prhs[0]);
/* Create output matrix*/
plhs[0] = mxCreateDoubleMatrix((mwSize)nrows_data/nrows_kernel,1,mxREAL);
/* Get a pointer to the real data */
in_matrix = mxGetPr(prhs[0]);
kernel = mxGetPr(prhs[1]);
out_matrix = mxGetPr(plhs[0]);
/* Sum the elements in weighting array */
base = 0;
for (int i = 0; i < nrows_kernel; i +=1)
{
base += kernel[i];
}
/* Perform calculation */
niter = nrows_data/nrows_kernel;
for (int i = 0; i < niter ; i += 1)
{
for (int j = 0; j < nrows_kernel; j += 1)
{
out_matrix[i] += in_matrix[i*nrows_kernel+j]*kernel[j];
}
out_matrix[i] /= base;
}
}
Then, in the command window, type:
mex weighted_average.c
To use it:
result = weighted_average(input, kernel);
Note that both input and kernel have to be M-by-1 matrices. On my computer, the first method took 0.0012 seconds and the second method took 0.00007 seconds. That's an order of magnitude faster than the first method.
I am analysing Gafchromic filters in ImageJ, a piece of freeware which uses a simplified form of Java to write macros.
I have a set of data points that I have successfully fitted with different methods, and I have decided that a third-degree polynomial fits the data best. However, I need to work with the actual curve, so I need to somehow extract the equation/formula of said polynomial. This should be possible, as the variables defining the polynomial are listed on the generated graph; however, I can't seem to find a way to extract them in the code.
Here's my code so far:
n = nResults();
x = newArray(n);
for (i=0; i<x.length; i++)
{
x[i] = getResult("Grays ", i);
}
y = newArray(n);
for (i=0; i<y.length; i++)
{
y[i] = getResult("Mean ", i);
}
// Do all possible fits, plot them and add the plots to a stack
setBatchMode(true);
for (i = 0; i < Fit.nEquations; i++) {
Fit.doFit(i, x, y);
Fit.plot();
if (i == 0)
stack = getImageID;
else {
run("Copy");
close();
selectImage(stack);
run("Add Slice");
run("Paste");
}
Fit.getEquation(i, name, formula);
print(""); print(name+ " ["+formula+"]");
print(" R^2="+d2s(Fit.rSquared,3));
for (j=0; j<Fit.nParams; j++)
print(" p["+j+"]="+d2s(Fit.p(j),6));
}
setBatchMode(false);
run("Select None");
rename("Curve Fits");
As hinted above, I already got an answer elsewhere. Nonetheless, I'd like to also keep it here for the record.
Basically, the answer is already included in the original post, as it prints the individual variables into the "Log" window.
For the third-degree polynomial, I could have just used:
Fit.doFit(2, x, y); // 2 is 3rd Degree Polynomial
Fit.plot();
rename("Calibrating curve");
And then the coefficients can be extracted easily, like this:
a = Fit.p(0);
b = Fit.p(1);
c = Fit.p(2);
d = Fit.p(3);
Currently I am trying to implement Resilient Propagation for my network. I'm doing this based on the encog implementation, but there is one thing I don't understand:
The documentation for RPROP and iRPROP+ says when change > 0: weightChange = -sign(gradient) * delta
The source code in lines 298 and 366 does not have a minus!
Since I assume both are in some case correct: Why is there a difference between the two?
And concerning the gradient: I'm using tanh as the activation function in the output layer. Is this the correct calculation of the gradient?
gradientOutput = (1 - lastOutput[j] * lastOutput[j]) * (target[j] - lastOutput[j]);
After re-reading the relevant papers and looking it up in a textbook, I think the documentation of encog is not correct at this point. Why don't you just try it out by temporarily adding the minus signs in the source code? If you use the same initial weights, you should get exactly the same results, provided the documentation is correct. But in the end, it only matters how you use the weightUpdate variable. If the author of the documentation is used to subtracting the weightUpdate from the weights instead of adding it, this will work.
Edit: I revisited the part about the gradient calculation in my original answer.
Here is a brief explanation of how you can imagine the gradient for the weights in your output layer. First, you calculate the error between your outputs and the target values.
What you are now trying to do is to "blame" those neurons in the previous layer, which were active. Imagine the output neuron saying "Well, I have an error here, who is responsible?". Responsible are the neurons of the previous layer. Depending on the output being too small or too large compared to the target value, it will increase or decrease the weights to each of the neurons in the previous layers depending on how active they have been.
x is the activation of a neuron in the hidden layer.
o is the activation of the output neuron.
φ is the activation function of the output neuron, φ' its derivative.
Edit 2: Corrected the part below and added matrix-style computation of backpropagation.
The error at each output neuron j is:
(1) δ_out,j = φ'(o_j) · (t_j - o_j)
The gradient for the weight connecting hidden neuron i with output neuron j is:
(2) grad_i,j = x_i · δ_out,j
The backpropagated error at each hidden neuron i, with weights w, is:
(3) δ_hid,i = φ'(x_i) · Σ_j w_i,j · δ_out,j
By repeatedly applying formulas (2) and (3), you can backpropagate all the way to the input layer.
Written as loops, for one training sample:
The error at each output neuron j is:
for(int j=0; j < numOutNeurons; j++) {
errorOut[j] = activationDerivative(o[j])*(t[j] - o[j]);
}
The gradient for the weight connecting the hidden neuron i with the output neuron j:
for(int i=0; i < numHidNeurons; i++) {
for(int j=0; j < numOutNeurons; j++) {
grad[i][j] = x[i] * errorOut[j];
}
}
The backpropagated error at each hidden neuron i:
for(int i=0; i < numHidNeurons; i++) {
// Sum the contributions from all output neurons, then apply the derivative once.
errorHid[i] = 0;
for(int j=0; j < numOutNeurons; j++) {
errorHid[i] += weights[i][j] * errorOut[j];
}
errorHid[i] *= activationDerivative(x[i]);
}
In fully connected multilayer perceptrons without convolution or anything like that, you can use standard matrix operations, which is a lot faster.
Assuming each of your samples is a row in your input matrix and the columns are its attributes, you can propagate the input through your network like this:
activations[0] = input;
for(int i=0; i < numWeightMatrices; i++){
activations[i+1] = activations[i].dot(weightMatrices[i]);
activations[i+1] = activationFunction(activations[i+1]);
}
Backpropagation then becomes:
n = numWeightMatrices;
error = activationDerivative(activations[n]) * (target - activations[n]);
for (int l=n-1; l >= 0; l--){
gradient[l] = activations[l].transposed().dot(error);
if (l > 0) {
error = error.dot(weightMatrices[l].transposed());
error = activationDerivative(activations[l])*error;
}
}
I omitted the bias neuron in the above explanations. In the literature, it is recommended to model the bias neuron as an additional column in each activation matrix which is always 1.0. You will need to deal with some slice assignments. When using the matrix backpropagation loop, do not forget to set the error at the position of the bias to 0 before each step!
private float resilientPropagation(int i, int j){
float gradientSignChange = sign(prevGradient[i][j]*gradient[i][j]);
float delta = 0;
if(gradientSignChange > 0){
float change = Math.min((prevChange[i][j]*increaseFactor), maxDelta);
delta = sign(gradient[i][j])*change;
prevChange[i][j] = change;
prevGradient[i][j] = gradient[i][j];
}
else if(gradientSignChange < 0){
float change = Math.max((prevChange[i][j]*decreaseFactor), minDelta);
prevChange[i][j] = change;
delta = -prevDelta[i][j];
prevGradient[i][j] = 0;
}
else if(gradientSignChange == 0){
float change = prevChange[i][j];
delta = sign(gradient[i][j])*change;
prevGradient[i][j] = gradient[i][j];
}
prevDelta[i][j] = delta;
return delta;
}
gradient[i][j] = error[j]*layerInput[i];
weights[i][j]= weights[i][j]+resilientPropagation(i,j);