GPU Calc: separate out parallel reduction from DynamicKernelSlidingArgument

Create a new class ParallelReductionVectorRef to straighten out code
generation and marshaling logic between sequential and parallel code
generation alternatives.

Change-Id: Id029ad441f80712f8e7396dcd985e3363ce08ff8
diff --git a/sc/source/core/opencl/formulagroupcl.cxx b/sc/source/core/opencl/formulagroupcl.cxx
index abd3230..388605c 100644
--- a/sc/source/core/opencl/formulagroupcl.cxx
+++ b/sc/source/core/opencl/formulagroupcl.cxx
@@ -409,6 +409,7 @@ protected:

/// Handling a Double Vector that is used as a sliding window input
/// to either a sliding window average or sum-of-products
/// Generate a sequential loop for reductions
class OpSum; // Forward Declaration
class OpAverage; // Forward Declaration
class OpMin; // Forward Declaration
@@ -430,79 +431,8 @@ public:
        bIsStartFixed = mpDVR->IsStartFixed();
        bIsEndFixed = mpDVR->IsEndFixed();
    }
    virtual bool NeedParallelReduction(void) const
    {
        if ((dynamic_cast<OpSum*>(mpCodeGen.get())
            && !dynamic_cast<OpAverage*>(mpCodeGen.get())) ||
            dynamic_cast<OpMin*>(mpCodeGen.get()) ||
            dynamic_cast<OpMax*>(mpCodeGen.get()) ||
            dynamic_cast<OpSumIfs*>(mpCodeGen.get()))
            return GetWindowSize()> 100 &&
                ( (GetStartFixed() && GetEndFixed()) ||
                  (!GetStartFixed() && !GetEndFixed())  ) ;
        else
            return false;
    }
    virtual void GenSlidingWindowFunction(std::stringstream &ss) {
        if (!dynamic_cast<OpSumIfs*>(mpCodeGen.get())
            && NeedParallelReduction())
        {
            std::string name = Base::GetName();
            ss << "__kernel void "<<name;
            ss << "_reduction(__global double* A, "
                "__global double *result,int arrayLength,int windowSize){\n";
            ss << "    double tmp, current_result =" <<
                mpCodeGen->GetBottom();
            ss << ";\n";
            ss << "    int writePos = get_group_id(1);\n";
            ss << "    int lidx = get_local_id(0);\n";
            ss << "    __local double shm_buf[256];\n";
            if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
                ss << "    int offset = 0;\n";
            else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
                ss << "    int offset = get_group_id(1);\n";
            else
                throw Unhandled();
            ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
            ss << "    int loop = arrayLength/512 + 1;\n";
            ss << "    for (int l=0; l<loop; l++){\n";
            ss << "    tmp = "<< mpCodeGen->GetBottom() << ";\n";
            ss << "    int loopOffset = l*512;\n";
            ss << "    if((loopOffset + lidx + offset + 256) < min( offset + windowSize, arrayLength))\n";
            ss << "        tmp = ";
            ss << mpCodeGen->Gen2(
                    std::string(
                        "legalize(A[loopOffset + lidx + offset], ")+
                    mpCodeGen->GetBottom() +")",
                    std::string(
                        "legalize(A[loopOffset + lidx + offset + 256], ")+
                    mpCodeGen->GetBottom() +")"
                    );
            ss << ";";
            ss << "    else if ((loopOffset + lidx + offset) < min(offset + windowSize, arrayLength))\n";
            ss << "        tmp = legalize(A[loopOffset + lidx + offset],";
            ss << mpCodeGen->GetBottom() << ");\n";
            ss << "    shm_buf[lidx] = tmp;\n";
            ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
            ss << "    for (int i = 128; i >0; i/=2) {\n";
            ss << "        if (lidx < i)\n";
            ss << "            shm_buf[lidx] = ";
            ss << mpCodeGen->Gen2("shm_buf[lidx]", "shm_buf[lidx + i]");
            ss << ";";
            ss << "        barrier(CLK_LOCAL_MEM_FENCE);\n";
            ss << "    }\n";
            ss << "    if (lidx == 0)\n";
            ss << "        current_result =";
            ss << mpCodeGen->Gen2("current_result", "shm_buf[0]");
            ss << ";\n";
            ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
            ss << "    }\n";
            ss << "    if (lidx == 0)\n";
            ss << "        result[writePos] = current_result;\n";
            ss << "}\n";
        }
     }

    virtual void GenSlidingWindowFunction(std::stringstream &) {}

    virtual std::string GenSlidingWindowDeclRef(bool=false) const
    {
@@ -519,26 +449,7 @@ public:
    {
        assert(mpDVR);
        size_t nCurWindowSize = mpDVR->GetRefRowSize();
        if (!dynamic_cast<OpSumIfs*>(mpCodeGen.get())
                && NeedParallelReduction())
        {
            if ((!bIsStartFixed && !bIsEndFixed) ||
                (bIsStartFixed && bIsEndFixed))
            {
                // set 100 as a temporary threshold for invoking reduction
                // kernel in NeedParalleLReduction function
                if (NeedParallelReduction())
                {
                    std::string temp = Base::GetName() + "[gid0]";
                    ss << "tmp = ";
                    ss << mpCodeGen->Gen2(temp, "tmp");
                    ss << ";\n\t";
                    needBody = false;
                    return nCurWindowSize;
                }
            }
        }
// original for loop
        // original for loop
#ifndef UNROLLING
        needBody = true;
        // No need to generate a for-loop for degenerated cases
@@ -586,8 +497,6 @@ public:
return nCurWindowSize;
#endif



#ifdef UNROLLING
        {
            if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) {
@@ -709,13 +618,134 @@ return nCurWindowSize;
        }
#endif
}
    ~DynamicKernelSlidingArgument()
    {
        if (mpClmem2)
        {
            clReleaseMemObject(mpClmem2);
            mpClmem2 = NULL;
        }
    }

    size_t GetArrayLength(void) const {return mpDVR->GetArrayLength(); }

    size_t GetWindowSize(void) const {return mpDVR->GetRefRowSize(); }

    size_t GetStartFixed(void) const {return bIsStartFixed; }

    size_t GetEndFixed(void) const {return bIsEndFixed; }

protected:
    bool bIsStartFixed, bIsEndFixed;
    const formula::DoubleVectorRefToken *mpDVR;
    // from parent nodes
    boost::shared_ptr<SlidingFunctionBase> mpCodeGen;
    // controls whether to invoke the reduction kernel during marshaling or not
    cl_mem mpClmem2;
};

/// Handling a Double Vector that is used as a sliding window input
/// Performs parallel reduction based on given operator
template<class Base>
class ParallelReductionVectorRef: public Base
{
public:
    ParallelReductionVectorRef(const std::string &s,
        FormulaTreeNodeRef ft, boost::shared_ptr<SlidingFunctionBase> &CodeGen,
        int index=0):
        Base(s, ft, index), mpCodeGen(CodeGen), mpClmem2(NULL)
    {
        FormulaToken *t = ft->GetFormulaToken();
        if (t->GetType() != formula::svDoubleVectorRef)
            throw Unhandled();
        mpDVR = dynamic_cast<const formula::DoubleVectorRefToken *>(t);
        assert(mpDVR);
        bIsStartFixed = mpDVR->IsStartFixed();
        bIsEndFixed = mpDVR->IsEndFixed();
    }
    /// Emit the definition for the auxiliary reduction kernel
    virtual void GenSlidingWindowFunction(std::stringstream &ss) {
        std::string name = Base::GetName();
        ss << "__kernel void "<<name;
        ss << "_reduction(__global double* A, "
            "__global double *result,int arrayLength,int windowSize){\n";
        ss << "    double tmp, current_result =" <<
            mpCodeGen->GetBottom();
        ss << ";\n";
        ss << "    int writePos = get_group_id(1);\n";
        ss << "    int lidx = get_local_id(0);\n";
        ss << "    __local double shm_buf[256];\n";
        if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
            ss << "    int offset = 0;\n";
        else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
            ss << "    int offset = get_group_id(1);\n";
        else
            throw Unhandled();
        ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
        ss << "    int loop = arrayLength/512 + 1;\n";
        ss << "    for (int l=0; l<loop; l++){\n";
        ss << "    tmp = "<< mpCodeGen->GetBottom() << ";\n";
        ss << "    int loopOffset = l*512;\n";
        ss << "    if((loopOffset + lidx + offset + 256) < min( offset + windowSize, arrayLength))\n";
        ss << "        tmp = ";
        ss << mpCodeGen->Gen2(
                std::string(
                    "legalize(A[loopOffset + lidx + offset], ")+
                mpCodeGen->GetBottom() +")",
                std::string(
                    "legalize(A[loopOffset + lidx + offset + 256], ")+
                mpCodeGen->GetBottom() +")"
                );
        ss << ";\n";
        ss << "    else if ((loopOffset + lidx + offset) < min(offset + windowSize, arrayLength))\n";
        ss << "        tmp = legalize(A[loopOffset + lidx + offset],";
        ss << mpCodeGen->GetBottom() << ");\n";
        ss << "    shm_buf[lidx] = tmp;\n";
        ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
        ss << "    for (int i = 128; i >0; i/=2) {\n";
        ss << "        if (lidx < i)\n";
        ss << "            shm_buf[lidx] = ";
        ss << mpCodeGen->Gen2("shm_buf[lidx]", "shm_buf[lidx + i]");
        ss << ";";
        ss << "        barrier(CLK_LOCAL_MEM_FENCE);\n";
        ss << "    }\n";
        ss << "    if (lidx == 0)\n";
        ss << "        current_result =";
        ss << mpCodeGen->Gen2("current_result", "shm_buf[0]");
        ss << ";\n";
        ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
        ss << "    }\n";
        ss << "    if (lidx == 0)\n";
        ss << "        result[writePos] = current_result;\n";
        ss << "}\n";
     }


    virtual std::string GenSlidingWindowDeclRef(bool=false) const
    {
        std::stringstream ss;
        if (!bIsStartFixed && !bIsEndFixed)
            ss << Base::GetName() << "[i + gid0]";
        else
            ss << Base::GetName() << "[i]";
        return ss.str();
    }
    /// Controls how the elements in the DoubleVectorRef are traversed
    virtual size_t GenReductionLoopHeader(
        std::stringstream &ss, bool &needBody)
    {
        assert(mpDVR);
        size_t nCurWindowSize = mpDVR->GetRefRowSize();
        std::string temp = Base::GetName() + "[gid0]";
        ss << "tmp = ";
        ss << mpCodeGen->Gen2(temp, "tmp");
        ss << ";\n\t";
        needBody = false;
        return nCurWindowSize;
    }

    virtual size_t Marshal(cl_kernel k, int argno, int w, cl_program mpProgram)
    {
        if (!NeedParallelReduction() ||
            dynamic_cast<OpSumIfs*>(mpCodeGen.get()))
            return Base::Marshal(k, argno, w, mpProgram);

        assert(Base::mpClmem == NULL);
        // Obtain cl context
        KernelEnv kEnv;
@@ -780,7 +810,7 @@ return nCurWindowSize;
            throw OpenCLError(err);
        return 1;
    }
    ~DynamicKernelSlidingArgument()
    ~ParallelReductionVectorRef()
    {
        if (mpClmem2)
        {
@@ -806,6 +836,8 @@ protected:
    cl_mem mpClmem2;
};



/// Abstract class for code generation

class Reduction: public SlidingFunctionBase
@@ -813,6 +845,7 @@ class Reduction: public SlidingFunctionBase
public:
    typedef DynamicKernelSlidingArgument<VectorRef> NumericRange;
    typedef DynamicKernelSlidingArgument<DynamicKernelStringArgument> StringRange;
    typedef ParallelReductionVectorRef<VectorRef> ParallelNumericRange;

    virtual void GenSlidingWindowFunction(std::stringstream &ss,
            const std::string sSymName, SubArguments &vSubArguments)
@@ -834,13 +867,23 @@ public:
        size_t nItems = 0;
        while (i--)
        {
            if (NumericRange *NR = dynamic_cast<NumericRange *> (vSubArguments[i].get()))
            if (NumericRange *NR =
                    dynamic_cast<NumericRange *> (vSubArguments[i].get()))
            {
                bool needBody;
                nItems += NR->GenReductionLoopHeader(ss, needBody);
                if (needBody == false) continue;
            }
            else if (StringRange *SR = dynamic_cast<StringRange *> (vSubArguments[i].get()))
            else if (ParallelNumericRange *PNR =
                    dynamic_cast<ParallelNumericRange *> (vSubArguments[i].get()))
            {
                //did not handle yet
                bool needBody;
                nItems += PNR->GenReductionLoopHeader(ss, needBody);
                if (needBody == false) continue;
            }
            else if (StringRange *SR =
                    dynamic_cast<StringRange *> (vSubArguments[i].get()))
            {
                //did not handle yet
                bool needBody;
@@ -1497,6 +1540,46 @@ boost::shared_ptr<DynamicKernelArgument> SoPHelper(
    return boost::shared_ptr<DynamicKernelArgument>(new DynamicKernelSoPArguments(ts, ft, pCodeGen));
}

template<class Base>
DynamicKernelArgument *VectorRefFactory(const std::string &s,
        const FormulaTreeNodeRef& ft,
        boost::shared_ptr<SlidingFunctionBase> &pCodeGen,
        int index)
{
    //Black lists ineligible classes here ..
    // SUMIFS does not perform parallel reduction at DoubleVectorRef level
    if (dynamic_cast<OpSumIfs*>(pCodeGen.get())) {
        return new DynamicKernelSlidingArgument<Base>(s, ft, pCodeGen, index);
    }
    // AVERAGE is not supported yet
    else if (dynamic_cast<OpAverage*>(pCodeGen.get()))
    {
        return new DynamicKernelSlidingArgument<Base>(s, ft, pCodeGen, index);
    }
    // COUNT is not supported yet
    else if (dynamic_cast<OpCount*>(pCodeGen.get()))
    {
        return new DynamicKernelSlidingArgument<Base>(s, ft, pCodeGen, index);
    }
    // Only child class of Reduction is supported
    else if (!dynamic_cast<Reduction*>(pCodeGen.get()))
    {
        return new DynamicKernelSlidingArgument<Base>(s, ft, pCodeGen, index);
    }

    const formula::DoubleVectorRefToken* pDVR =
        dynamic_cast< const formula::DoubleVectorRefToken* >(
                ft->GetFormulaToken());
    // Window being too small to justify a parallel reduction
    if (pDVR->GetRefRowSize() < 100)
        return new DynamicKernelSlidingArgument<Base>(s, ft, pCodeGen, index);
    if ((pDVR->IsStartFixed() && pDVR->IsEndFixed()) ||
            (!pDVR->IsStartFixed() && !pDVR->IsEndFixed()))
        return new ParallelReductionVectorRef<Base>(s, ft, pCodeGen, index);
    else // Other cases are not supported as well
        return new DynamicKernelSlidingArgument<Base>(s, ft, pCodeGen, index);
}

DynamicKernelSoPArguments::DynamicKernelSoPArguments(
    const std::string &s, const FormulaTreeNodeRef& ft, SlidingFunctionBase* pCodeGen) :
    DynamicKernelArgument(s, ft), mpCodeGen(pCodeGen)
@@ -1523,12 +1606,11 @@ DynamicKernelSoPArguments::DynamicKernelSoPArguments(
                    {
                        if (pDVR->GetArrays()[j].mpNumericArray)
                            mvSubArguments.push_back(
                                    SubArgument(new DynamicKernelSlidingArgument
                                        <VectorRef>(
                                    SubArgument(VectorRefFactory<VectorRef>(
                                            ts, ft->Children[i], mpCodeGen, j)));
                        else
                            mvSubArguments.push_back(
                                    SubArgument(new DynamicKernelSlidingArgument
                                    SubArgument(VectorRefFactory
                                        <DynamicKernelStringArgument>(
                                            ts, ft->Children[i], mpCodeGen, j)));
                    }