tdf#137679 Use KahanSum for SSE2
Change-Id: I97970cbb7a9562081f9a84b1d81423c80ed7f7f7
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/115113
Tested-by: Jenkins
Reviewed-by: Tomaž Vajngerl <quikee@gmail.com>
diff --git a/sc/source/core/tool/arraysumSSE2.cxx b/sc/source/core/tool/arraysumSSE2.cxx
index 8946753..e69f672 100644
--- a/sc/source/core/tool/arraysumSSE2.cxx
+++ b/sc/source/core/tool/arraysumSSE2.cxx
@@ -27,25 +27,65 @@ double ArraySumFunctor::executeSSE2(size_t& i, const double* pCurrent) const
__m128d sum3 = _mm_setzero_pd();
__m128d sum4 = _mm_setzero_pd();
__m128d err1 = _mm_setzero_pd();
__m128d err2 = _mm_setzero_pd();
__m128d err3 = _mm_setzero_pd();
__m128d err4 = _mm_setzero_pd();
__m128d y, t;
for (; i < nUnrolledSize; i += 8)
{
// Kahan sum 1
__m128d load1 = _mm_load_pd(pCurrent);
sum1 = _mm_add_pd(sum1, load1);
y = _mm_sub_pd(load1, err1);
t = _mm_add_pd(sum1, y);
err1 = _mm_sub_pd(_mm_sub_pd(t, sum1), y);
sum1 = t;
pCurrent += 2;
// Kahan sum 2
__m128d load2 = _mm_load_pd(pCurrent);
sum2 = _mm_add_pd(sum2, load2);
y = _mm_sub_pd(load2, err2);
t = _mm_add_pd(sum2, y);
err2 = _mm_sub_pd(_mm_sub_pd(t, sum2), y);
sum2 = t;
pCurrent += 2;
// Kahan sum 3
__m128d load3 = _mm_load_pd(pCurrent);
sum3 = _mm_add_pd(sum3, load3);
y = _mm_sub_pd(load3, err3);
t = _mm_add_pd(sum3, y);
err3 = _mm_sub_pd(_mm_sub_pd(t, sum3), y);
sum3 = t;
pCurrent += 2;
// Kahan sum 4
__m128d load4 = _mm_load_pd(pCurrent);
sum4 = _mm_add_pd(sum4, load4);
y = _mm_sub_pd(load4, err4);
t = _mm_add_pd(sum4, y);
err4 = _mm_sub_pd(_mm_sub_pd(t, sum4), y);
sum4 = t;
pCurrent += 2;
}
sum1 = _mm_add_pd(_mm_add_pd(sum1, sum2), _mm_add_pd(sum3, sum4));
// Now we combine pairwise summation with Kahan summation
// sum 1 + sum 2
y = _mm_sub_pd(sum2, err1);
t = _mm_add_pd(sum1, y);
err1 = _mm_sub_pd(_mm_sub_pd(t, sum1), y);
sum1 = t;
// sum 3 + sum 4
y = _mm_sub_pd(sum4, err3);
t = _mm_add_pd(sum3, y);
sum3 = t;
// sum 1 + sum 3
y = _mm_sub_pd(sum3, err1);
t = _mm_add_pd(sum1, y);
sum1 = t;
double temp;
@@ -62,4 +102,4 @@ double ArraySumFunctor::executeSSE2(size_t& i, const double* pCurrent) const
return 0.0;
#endif
}
}
\ No newline at end of file
}