tdf#137679 Use KahanSum for SSE2

Change-Id: I97970cbb7a9562081f9a84b1d81423c80ed7f7f7
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/115113
Tested-by: Jenkins
Reviewed-by: Tomaž Vajngerl <quikee@gmail.com>
diff --git a/sc/source/core/tool/arraysumSSE2.cxx b/sc/source/core/tool/arraysumSSE2.cxx
index 8946753..e69f672 100644
--- a/sc/source/core/tool/arraysumSSE2.cxx
+++ b/sc/source/core/tool/arraysumSSE2.cxx
@@ -27,25 +27,65 @@ double ArraySumFunctor::executeSSE2(size_t& i, const double* pCurrent) const
        __m128d sum3 = _mm_setzero_pd();
        __m128d sum4 = _mm_setzero_pd();

        __m128d err1 = _mm_setzero_pd();
        __m128d err2 = _mm_setzero_pd();
        __m128d err3 = _mm_setzero_pd();
        __m128d err4 = _mm_setzero_pd();

        __m128d y, t;

        for (; i < nUnrolledSize; i += 8)
        {
            // Kahan sum 1
            __m128d load1 = _mm_load_pd(pCurrent);
            sum1 = _mm_add_pd(sum1, load1);
            y = _mm_sub_pd(load1, err1);
            t = _mm_add_pd(sum1, y);
            err1 = _mm_sub_pd(_mm_sub_pd(t, sum1), y);
            sum1 = t;
            pCurrent += 2;

            // Kahan sum 2
            __m128d load2 = _mm_load_pd(pCurrent);
            sum2 = _mm_add_pd(sum2, load2);
            y = _mm_sub_pd(load2, err2);
            t = _mm_add_pd(sum2, y);
            err2 = _mm_sub_pd(_mm_sub_pd(t, sum2), y);
            sum2 = t;
            pCurrent += 2;

            // Kahan sum 3
            __m128d load3 = _mm_load_pd(pCurrent);
            sum3 = _mm_add_pd(sum3, load3);
            y = _mm_sub_pd(load3, err3);
            t = _mm_add_pd(sum3, y);
            err3 = _mm_sub_pd(_mm_sub_pd(t, sum3), y);
            sum3 = t;
            pCurrent += 2;

            // Kahan sum 4
            __m128d load4 = _mm_load_pd(pCurrent);
            sum4 = _mm_add_pd(sum4, load4);
            y = _mm_sub_pd(load4, err4);
            t = _mm_add_pd(sum4, y);
            err4 = _mm_sub_pd(_mm_sub_pd(t, sum4), y);
            sum4 = t;
            pCurrent += 2;
        }
        sum1 = _mm_add_pd(_mm_add_pd(sum1, sum2), _mm_add_pd(sum3, sum4));

        // Now we combine pairwise summation with Kahan summation

        // sum 1 + sum 2
        y = _mm_sub_pd(sum2, err1);
        t = _mm_add_pd(sum1, y);
        err1 = _mm_sub_pd(_mm_sub_pd(t, sum1), y);
        sum1 = t;

        // sum 3 + sum 4
        y = _mm_sub_pd(sum4, err3);
        t = _mm_add_pd(sum3, y);
        sum3 = t;

        // sum 1 + sum 3
        y = _mm_sub_pd(sum3, err1);
        t = _mm_add_pd(sum1, y);
        sum1 = t;

        double temp;

@@ -62,4 +102,4 @@ double ArraySumFunctor::executeSSE2(size_t& i, const double* pCurrent) const
    return 0.0;
#endif
}
}
\ No newline at end of file
}