vcl: parallelize image scaling.

(cherry picked from commit ffdf0ed4cd76188e780eceee4333f90a00217f9d)

Change-Id: Ia452487c0c8c66a35c4b9fba225348bdef1a27f7
diff --git a/include/comphelper/threadpool.hxx b/include/comphelper/threadpool.hxx
index 1aa4733..88375ab 100644
--- a/include/comphelper/threadpool.hxx
+++ b/include/comphelper/threadpool.hxx
@@ -45,6 +45,9 @@ public:
    /// wait until all queued tasks are completed
    void        waitUntilEmpty();

    /// return the number of live worker threads
    sal_Int32   getWorkerCount() const { return maWorkers.size(); }

private:
    ThreadPool(const ThreadPool&) SAL_DELETED_FUNCTION;
    ThreadPool& operator=(const ThreadPool&) SAL_DELETED_FUNCTION;
diff --git a/vcl/source/bitmap/bitmapscalesuper.cxx b/vcl/source/bitmap/bitmapscalesuper.cxx
index 196123a..f2e0929 100644
--- a/vcl/source/bitmap/bitmapscalesuper.cxx
+++ b/vcl/source/bitmap/bitmapscalesuper.cxx
@@ -20,7 +20,9 @@
#include <vcl/bmpacc.hxx>
#include <vcl/bitmapscalesuper.hxx>

#include <algorithm>
#include <boost/scoped_array.hpp>
#include <comphelper/threadpool.hxx>

namespace {

@@ -71,11 +73,37 @@ struct ScaleContext {
    }
};

#define SCALE_THREAD_STRIP 32
struct ScaleRangeContext {
    ScaleContext &mrCtx;
    long mnStartY, mnEndY;
    ScaleRangeContext( ScaleContext &rCtx, long nStartY )
        : mrCtx( rCtx ), mnStartY( nStartY ),
          mnEndY( nStartY + SCALE_THREAD_STRIP ) {}
};

typedef void (*ScaleRangeFn)(ScaleContext &rCtx, long nStartY, long nEndY);

class ScaleTask : public comphelper::ThreadTask
{
    ScaleRangeFn mpFn;
    std::vector< ScaleRangeContext > maStrips;
public:
    ScaleTask( ScaleRangeFn pFn ) : mpFn( pFn ) {}
    void push( ScaleRangeContext &aRC ) { maStrips.push_back( aRC ); }
    virtual void doWork() SAL_OVERRIDE
    {
        std::vector< ScaleRangeContext >::iterator it;
        for (it = maStrips.begin(); it != maStrips.end(); ++it)
            mpFn( it->mrCtx, it->mnStartY, it->mnEndY );
    }
};

void scalePallete8bit(ScaleContext &rCtx, long nStartY, long nEndY)
{
    const long nStartX = 0, nEndX = rCtx.mnDestW - 1L;

    for( long nY = nStartY, nYDst = 0L; nY <= nEndY; nY++, nYDst++ )
    for( long nY = nStartY; nY <= nEndY; nY++ )
    {
        long nTempY = rCtx.mpMapIY[ nY ];
        long nTempFY = rCtx.mpMapFY[ nY ];
@@ -103,7 +131,7 @@ void scalePallete8bit(ScaleContext &rCtx, long nStartY, long nEndY)
            BitmapColor aColRes( MAP( cR0, cR1, nTempFY ),
                    MAP( cG0, cG1, nTempFY ),
                    MAP( cB0, cB1, nTempFY ) );
            rCtx.mpDest->SetPixel( nYDst, nXDst++, aColRes );
            rCtx.mpDest->SetPixel( nY, nXDst++, aColRes );
        }
    }
}
@@ -112,7 +140,7 @@ void scalePalleteGeneral(ScaleContext &rCtx, long nStartY, long nEndY)
{
    const long nStartX = 0, nEndX = rCtx.mnDestW - 1L;

    for( long nY = nStartY, nYDst = 0L; nY <= nEndY; nY++, nYDst++ )
    for( long nY = nStartY; nY <= nEndY; nY++ )
    {
        long nTempY = rCtx.mpMapIY[ nY ];
        long nTempFY = rCtx.mpMapFY[ nY ];
@@ -137,7 +165,7 @@ void scalePalleteGeneral(ScaleContext &rCtx, long nStartY, long nEndY)
            BitmapColor aColRes( MAP( cR0, cR1, nTempFY ),
                    MAP( cG0, cG1, nTempFY ),
                    MAP( cB0, cB1, nTempFY ) );
            rCtx.mpDest->SetPixel( nYDst, nXDst++, aColRes );
            rCtx.mpDest->SetPixel( nY, nXDst++, aColRes );
        }
    }
}
@@ -146,7 +174,7 @@ void scale24bitBGR(ScaleContext &rCtx, long nStartY, long nEndY)
{
    const long nStartX = 0, nEndX = rCtx.mnDestW - 1L;

    for( long nY = nStartY, nYDst = 0L; nY <= nEndY; nY++, nYDst++ )
    for( long nY = nStartY; nY <= nEndY; nY++ )
    {
        long nTempY = rCtx.mpMapIY[ nY ];
        long nTempFY = rCtx.mpMapFY[ nY ];
@@ -176,7 +204,7 @@ void scale24bitBGR(ScaleContext &rCtx, long nStartY, long nEndY)
            BitmapColor aColRes( MAP( cR0, cR1, nTempFY ),
                    MAP( cG0, cG1, nTempFY ),
                    MAP( cB0, cB1, nTempFY ) );
            rCtx.mpDest->SetPixel( nYDst, nXDst++, aColRes );
            rCtx.mpDest->SetPixel( nY, nXDst++, aColRes );
        }
    }
}
@@ -185,7 +213,7 @@ void scale24bitRGB(ScaleContext &rCtx, long nStartY, long nEndY)
{
    const long nStartX = 0, nEndX = rCtx.mnDestW - 1L;

    for( long nY = nStartY, nYDst = 0L; nY <= nEndY; nY++, nYDst++ )
    for( long nY = nStartY; nY <= nEndY; nY++ )
    {
        long nTempY = rCtx.mpMapIY[ nY ];
        long nTempFY = rCtx.mpMapFY[ nY ];
@@ -215,7 +243,7 @@ void scale24bitRGB(ScaleContext &rCtx, long nStartY, long nEndY)
            BitmapColor aColRes( MAP( cR0, cR1, nTempFY ),
                    MAP( cG0, cG1, nTempFY ),
                    MAP( cB0, cB1, nTempFY ) );
            rCtx.mpDest->SetPixel( nYDst, nXDst++, aColRes );
            rCtx.mpDest->SetPixel( nY, nXDst++, aColRes );
        }
    }
}
@@ -224,7 +252,7 @@ void scaleNonPalleteGeneral(ScaleContext &rCtx, long nStartY, long nEndY)
{
    const long nStartX = 0, nEndX = rCtx.mnDestW - 1L;

    for( long nY = nStartY, nYDst = 0L; nY <= nEndY; nY++, nYDst++ )
    for( long nY = nStartY; nY <= nEndY; nY++ )
    {
        long nTempY = rCtx.mpMapIY[ nY ];
        long nTempFY = rCtx.mpMapFY[ nY ];
@@ -249,7 +277,7 @@ void scaleNonPalleteGeneral(ScaleContext &rCtx, long nStartY, long nEndY)
            BitmapColor aColRes( MAP( cR0, cR1, nTempFY ),
                    MAP( cG0, cG1, nTempFY ),
                    MAP( cB0, cB1, nTempFY ) );
            rCtx.mpDest->SetPixel( nYDst, nXDst++, aColRes );
            rCtx.mpDest->SetPixel( nY, nXDst++, aColRes );
        }
    }
}
@@ -259,7 +287,7 @@ void scalePallete8bit2(ScaleContext &rCtx, long nStartY, long nEndY)
    const long nStartX = 0, nEndX = rCtx.mnDestW - 1L;
    const long nMax = 1 << 7L;

    for( long nY = nStartY , nYDst = 0L; nY <= nEndY; nY++, nYDst++ )
    for( long nY = nStartY; nY <= nEndY; nY++ )
    {
        long nTop = rCtx.mbVMirr ? ( nY + 1 ) : nY;
        long nBottom = rCtx.mbVMirr ? nY : ( nY + 1 ) ;
@@ -374,7 +402,7 @@ void scalePallete8bit2(ScaleContext &rCtx, long nStartY, long nEndY)
            }

            BitmapColor aColRes((sal_uInt8)nSumR, (sal_uInt8)nSumG, (sal_uInt8)nSumB);
            rCtx.mpDest->SetPixel( nYDst, nXDst++, aColRes );
            rCtx.mpDest->SetPixel( nY, nXDst++, aColRes );
        }
    }
}
@@ -384,7 +412,7 @@ void scalePalleteGeneral2(ScaleContext &rCtx, long nStartY, long nEndY)
    const long nStartX = 0, nEndX = rCtx.mnDestW - 1L;
    const long nMax = 1 << 7L;

    for( long nY = nStartY , nYDst = 0L; nY <= nEndY; nY++, nYDst++ )
    for( long nY = nStartY; nY <= nEndY; nY++ )
    {
        long nTop = rCtx.mbVMirr ? ( nY + 1 ) : nY;
        long nBottom = rCtx.mbVMirr ? nY : ( nY + 1 ) ;
@@ -501,7 +529,7 @@ void scalePalleteGeneral2(ScaleContext &rCtx, long nStartY, long nEndY)
            }

            BitmapColor aColRes((sal_uInt8)nSumR, (sal_uInt8)nSumG, (sal_uInt8)nSumB);
            rCtx.mpDest->SetPixel( nYDst, nXDst++, aColRes );
            rCtx.mpDest->SetPixel( nY, nXDst++, aColRes );
        }
    }
}
@@ -511,7 +539,7 @@ void scale24bitBGR2(ScaleContext &rCtx, long nStartY, long nEndY)
    const long nStartX = 0, nEndX = rCtx.mnDestW - 1L;
    const long nMax = 1 << 7L;

    for( long nY = nStartY , nYDst = 0L; nY <= nEndY; nY++, nYDst++ )
    for( long nY = nStartY; nY <= nEndY; nY++ )
    {
        long nTop = rCtx.mbVMirr ? ( nY + 1 ) : nY;
        long nBottom = rCtx.mbVMirr ? nY : ( nY + 1 ) ;
@@ -624,7 +652,7 @@ void scale24bitBGR2(ScaleContext &rCtx, long nStartY, long nEndY)
                nSumB /= nTotalWeightY;
            }
            BitmapColor aColRes((sal_uInt8)nSumR, (sal_uInt8)nSumG, (sal_uInt8)nSumB);
            rCtx.mpDest->SetPixel( nYDst, nXDst++, aColRes );
            rCtx.mpDest->SetPixel( nY, nXDst++, aColRes );
        }
    }
}
@@ -634,7 +662,7 @@ void scale24bitRGB2(ScaleContext &rCtx, long nStartY, long nEndY)
    const long nStartX = 0, nEndX = rCtx.mnDestW - 1L;
    const long nMax = 1 << 7L;

    for( long nY = nStartY , nYDst = 0L; nY <= nEndY; nY++, nYDst++ )
    for( long nY = nStartY; nY <= nEndY; nY++ )
    {
        long nTop = rCtx.mbVMirr ? ( nY + 1 ) : nY;
        long nBottom = rCtx.mbVMirr ? nY : ( nY + 1 ) ;
@@ -745,7 +773,7 @@ void scale24bitRGB2(ScaleContext &rCtx, long nStartY, long nEndY)
                nSumB /= nTotalWeightY;
            }
            BitmapColor aColRes((sal_uInt8)nSumR, (sal_uInt8)nSumG, (sal_uInt8)nSumB);
            rCtx.mpDest->SetPixel( nYDst, nXDst++, aColRes );
            rCtx.mpDest->SetPixel( nY, nXDst++, aColRes );
        }
    }
}
@@ -755,7 +783,7 @@ void scaleNonPalleteGeneral2(ScaleContext &rCtx, long nStartY, long nEndY)
    const long nStartX = 0, nEndX = rCtx.mnDestW - 1L;
    const long nMax = 1 << 7L;

    for( long nY = nStartY , nYDst = 0L; nY <= nEndY; nY++, nYDst++ )
    for( long nY = nStartY; nY <= nEndY; nY++ )
    {
        long nTop = rCtx.mbVMirr ? ( nY + 1 ) : nY;
        long nBottom = rCtx.mbVMirr ? nY : ( nY + 1 ) ;
@@ -871,7 +899,7 @@ void scaleNonPalleteGeneral2(ScaleContext &rCtx, long nStartY, long nEndY)
            }

            BitmapColor aColRes((sal_uInt8)nSumR, (sal_uInt8)nSumG, (sal_uInt8)nSumB);
            rCtx.mpDest->SetPixel( nYDst, nXDst++, aColRes );
            rCtx.mpDest->SetPixel( nY, nXDst++, aColRes );
        }
    }
}
@@ -916,6 +944,7 @@ bool BitmapScaleSuper::filter(Bitmap& rBitmap)

    if (pReadAccess && pWriteAccess)
    {
        ScaleRangeFn pScaleRangeFn;
        ScaleContext aContext( pReadAccess.get(),
                               pWriteAccess.get(),
                               pReadAccess->Width(),
@@ -923,7 +952,6 @@ bool BitmapScaleSuper::filter(Bitmap& rBitmap)
                               pReadAccess->Height(),
                               pWriteAccess->Height(),
                               bVMirr, bHMirr );
        void (*scaleRangeFn)(ScaleContext &rCtx, long nStartY, long nEndY);

        bool bScaleUp = fScaleX >= fScaleThresh && fScaleY >= fScaleThresh;
        if( pReadAccess->HasPalette() )
@@ -931,10 +959,10 @@ bool BitmapScaleSuper::filter(Bitmap& rBitmap)
            switch( pReadAccess->GetScanlineFormat() )
            {
            case BMP_FORMAT_8BIT_PAL:
                scaleRangeFn = bScaleUp ? scalePallete8bit : scalePallete8bit2;
                pScaleRangeFn = bScaleUp ? scalePallete8bit : scalePallete8bit2;
                break;
            default:
                scaleRangeFn = bScaleUp ? scalePalleteGeneral
                pScaleRangeFn = bScaleUp ? scalePalleteGeneral
                                        : scalePalleteGeneral2;
                break;
            }
@@ -944,18 +972,57 @@ bool BitmapScaleSuper::filter(Bitmap& rBitmap)
            switch( pReadAccess->GetScanlineFormat() )
            {
            case BMP_FORMAT_24BIT_TC_BGR:
                scaleRangeFn = bScaleUp ? scale24bitBGR : scale24bitBGR2;
                pScaleRangeFn = bScaleUp ? scale24bitBGR : scale24bitBGR2;
                break;
            case BMP_FORMAT_24BIT_TC_RGB:
                scaleRangeFn = bScaleUp ? scale24bitRGB : scale24bitRGB2;
                pScaleRangeFn = bScaleUp ? scale24bitRGB : scale24bitRGB2;
                break;
            default:
                scaleRangeFn = bScaleUp ? scaleNonPalleteGeneral
                pScaleRangeFn = bScaleUp ? scaleNonPalleteGeneral
                                        : scaleNonPalleteGeneral2;
                break;
            }
        }
        scaleRangeFn( aContext, nStartY, nEndY );

        // We want to thread - only if there is a lot of work to do:
        // We work hard when there is a large destination image, or
        // A large source image.
        bool bHorizontalWork = pReadAccess->Width() > 512 || pWriteAccess->Width() > 512;

        static bool bDisableThreadedScaling = getenv ("VCL_NO_THREAD_SCALE");
        if ( bDisableThreadedScaling || !bHorizontalWork ||
             nEndY - nStartY < SCALE_THREAD_STRIP )
        {
            SAL_INFO("vcl.gdi", "Scale in main thread");
            pScaleRangeFn( aContext, nStartY, nEndY );
        }
        else
        {
            // partition and queue work
            comphelper::ThreadPool &rShared = comphelper::ThreadPool::getSharedOptimalPool();
            sal_uInt32 nThreads = rShared.getWorkerCount();
            assert( nThreads > 0 );
            sal_uInt32 nStrips = ((nEndY - nStartY) + SCALE_THREAD_STRIP - 1) / SCALE_THREAD_STRIP;
            sal_uInt32 nStripsPerThread = nStrips / nThreads;
            SAL_INFO("vcl.gdi", "Scale in " << nStrips << " strips " << nStripsPerThread << " per thread" << " we have " << nThreads << " CPU threads ");
            long nStripY = nStartY;
            for ( sal_uInt32 t = 0; t < nThreads - 1; t++ )
            {
                ScaleTask *pTask = new ScaleTask( pScaleRangeFn );
                for ( sal_uInt32 j = 0; j < nStripsPerThread; j++ )
                {
                    ScaleRangeContext aRC( aContext, nStripY );
                    pTask->push( aRC );
                    nStripY += SCALE_THREAD_STRIP;
                }
                rShared.pushTask( pTask );
            }
            // finish any remaining bits here
            pScaleRangeFn( aContext, nStripY, nEndY );

            rShared.waitUntilEmpty();
            SAL_INFO("vcl.gdi", "All threaded scaling tasks complete");
        }

        bRet = true;
    }