tdf#109158 improve sorting when loading large autocorrect file

reduces time from 2.0s to 1.7s

reduce work by
(*) reserving some arrays
(*) pre-sorting with a cheaper comparator
(*) don't copy when returning result, just return a const&
(*) flattening the data-structures to reduce pointer-chasing

Change-Id: I972bd7ffdbf2121c2d38c24aca9618ca708e920c
Reviewed-on: https://gerrit.libreoffice.org/79119
Tested-by: Jenkins
Reviewed-by: Noel Grandin <noel.grandin@collabora.co.uk>
Signed-off-by: Xisco Fauli <xiscofauli@libreoffice.org>
Reviewed-on: https://gerrit.libreoffice.org/79377
diff --git a/cui/source/tabpages/autocdlg.cxx b/cui/source/tabpages/autocdlg.cxx
index 20028e1..6573e3b 100644
--- a/cui/source/tabpages/autocdlg.cxx
+++ b/cui/source/tabpages/autocdlg.cxx
@@ -743,12 +743,14 @@
        std::vector<SvxAutocorrWord> aDeleteWords;
        std::vector<SvxAutocorrWord> aNewWords;

        aDeleteWords.reserve( rStringChangeList.aDeletedEntries.size() );
        for (DoubleString & deleteEntry : rStringChangeList.aDeletedEntries)
        {
            SvxAutocorrWord aDeleteWord( deleteEntry.sShort, deleteEntry.sLong );
            aDeleteWords.push_back( aDeleteWord );
        }

        aNewWords.reserve( rStringChangeList.aNewEntries.size() );
        for (DoubleString & newEntry : rStringChangeList.aNewEntries)
        {
            //fdo#67697 if the user data is set then we want to retain the
@@ -835,10 +837,10 @@
    {
        SvxAutoCorrect* pAutoCorrect = SvxAutoCorrCfg::Get().GetAutoCorrect();
        SvxAutocorrWordList* pWordList = pAutoCorrect->LoadAutocorrWordList(eLang);
        SvxAutocorrWordList::Content aContent = pWordList->getSortedContent();
        m_xReplaceTLB->bulk_insert_for_each(aContent.size(), [this, &aContent](weld::TreeIter& rIter, int nIndex) {
            auto const& elem = aContent[nIndex];
            bool bTextOnly = elem->IsTextOnly();
        const SvxAutocorrWordList::AutocorrWordSetType & rContent = pWordList->getSortedContent();
        m_xReplaceTLB->bulk_insert_for_each(rContent.size(), [this, rContent](weld::TreeIter& rIter, int nIndex) {
            auto const& elem = rContent[nIndex];
            bool bTextOnly = elem.IsTextOnly();
            // formatted text is only in Writer
            if (bSWriter || bTextOnly)
            {
@@ -848,12 +850,12 @@
                    OUString sId = OUString::number(reinterpret_cast<sal_Int64>(m_xTextOnlyCB.get()));
                    m_xReplaceTLB->set_id(rIter, sId);
                }
                m_xReplaceTLB->set_text(rIter, elem->GetShort(), 0);
                m_xReplaceTLB->set_text(rIter, elem->GetLong(), 1);
                m_xReplaceTLB->set_text(rIter, elem.GetShort(), 0);
                m_xReplaceTLB->set_text(rIter, elem.GetLong(), 1);
            }
            else
            {
                aFormatText.insert(elem->GetShort());
                aFormatText.insert(elem.GetShort());
            }
        }, &m_aReplaceFixedWidths);
        m_xNewReplacePB->set_sensitive(false);
diff --git a/editeng/source/misc/SvXMLAutoCorrectExport.cxx b/editeng/source/misc/SvXMLAutoCorrectExport.cxx
index 8ebefcc..4dadeca 100644
--- a/editeng/source/misc/SvXMLAutoCorrectExport.cxx
+++ b/editeng/source/misc/SvXMLAutoCorrectExport.cxx
@@ -51,15 +51,15 @@
                   GetNamespaceMap_().GetNameByKey ( XML_NAMESPACE_BLOCKLIST ) );
    {
        SvXMLElementExport aRoot (*this, XML_NAMESPACE_BLOCKLIST, XML_BLOCK_LIST, true, true);
        SvxAutocorrWordList::Content aContent = pAutocorr_List->getSortedContent();
        for (auto const& content : aContent)
        const SvxAutocorrWordList::AutocorrWordSetType& rContent = pAutocorr_List->getSortedContent();
        for (auto const& content : rContent)
        {
            AddAttribute( XML_NAMESPACE_BLOCKLIST,
                          XML_ABBREVIATED_NAME,
                          content->GetShort());
                          content.GetShort());
            AddAttribute( XML_NAMESPACE_BLOCKLIST,
                          XML_NAME,
                          content->IsTextOnly() ? content->GetLong() : content->GetShort());
                          content.IsTextOnly() ? content.GetLong() : content.GetShort());

            SvXMLElementExport aBlock( *this, XML_NAMESPACE_BLOCKLIST, XML_BLOCK, true, true);
        }
diff --git a/editeng/source/misc/svxacorr.cxx b/editeng/source/misc/svxacorr.cxx
index 01a4f27..7ac9398 100644
--- a/editeng/source/misc/svxacorr.cxx
+++ b/editeng/source/misc/svxacorr.cxx
@@ -2488,10 +2488,10 @@
    {
        for (SvxAutocorrWord & aWordToDelete : aDeleteEntries)
        {
            std::unique_ptr<SvxAutocorrWord> pFoundEntry = pAutocorr_List->FindAndRemove( &aWordToDelete );
            if( pFoundEntry )
            boost::optional<SvxAutocorrWord> xFoundEntry = pAutocorr_List->FindAndRemove( &aWordToDelete );
            if( xFoundEntry )
            {
                if( !pFoundEntry->IsTextOnly() )
                if( !xFoundEntry->IsTextOnly() )
                {
                    OUString aName( aWordToDelete.GetShort() );
                    if (xStorage->IsOLEStorage())
@@ -2510,24 +2510,24 @@

        for (SvxAutocorrWord & aNewEntrie : aNewEntries)
        {
            std::unique_ptr<SvxAutocorrWord> pWordToAdd(new SvxAutocorrWord( aNewEntrie.GetShort(), aNewEntrie.GetLong(), true ));
            std::unique_ptr<SvxAutocorrWord> pRemoved = pAutocorr_List->FindAndRemove( pWordToAdd.get() );
            if( pRemoved )
            SvxAutocorrWord aWordToAdd(aNewEntrie.GetShort(), aNewEntrie.GetLong(), true );
            boost::optional<SvxAutocorrWord> xRemoved = pAutocorr_List->FindAndRemove( &aWordToAdd );
            if( xRemoved )
            {
                if( !pRemoved->IsTextOnly() )
                if( !xRemoved->IsTextOnly() )
                {
                    // Still have to remove the Storage
                    OUString sStorageName( pWordToAdd->GetShort() );
                    OUString sStorageName( aWordToAdd.GetShort() );
                    if (xStorage->IsOLEStorage())
                        sStorageName = EncryptBlockName_Imp(sStorageName);
                    else
                        GeneratePackageName ( pWordToAdd->GetShort(), sStorageName);
                        GeneratePackageName ( aWordToAdd.GetShort(), sStorageName);

                    if( xStorage->IsContained( sStorageName ) )
                        xStorage->Remove( sStorageName );
                }
            }
            bRet = pAutocorr_List->Insert( std::move(pWordToAdd) );
            bRet = pAutocorr_List->Insert( std::move(aWordToAdd) );

            if ( !bRet )
            {
@@ -2556,11 +2556,11 @@
    // Update the word list
    if( bRet )
    {
        std::unique_ptr<SvxAutocorrWord> pNew(new SvxAutocorrWord( rShort, rLong, true ));
        std::unique_ptr<SvxAutocorrWord> pRemove = pAutocorr_List->FindAndRemove( pNew.get() );
        if( pRemove )
        SvxAutocorrWord aNew(rShort, rLong, true );
        boost::optional<SvxAutocorrWord> xRemove = pAutocorr_List->FindAndRemove( &aNew );
        if( xRemove )
        {
            if( !pRemove->IsTextOnly() )
            if( !xRemove->IsTextOnly() )
            {
                // Still have to remove the Storage
                OUString sStgNm( rShort );
@@ -2574,7 +2574,7 @@
            }
        }

        if( pAutocorr_List->Insert( std::move(pNew) ) )
        if( pAutocorr_List->Insert( std::move(aNew) ) )
        {
            bRet = MakeBlocklist_Imp( *xStg );
            xStg = nullptr;
@@ -2605,8 +2605,7 @@
        // Update the word list
        if( bRet )
        {
            std::unique_ptr<SvxAutocorrWord> pNew(new SvxAutocorrWord( rShort, sLong, false ));
            if( pAutocorr_List->Insert( std::move(pNew) ) )
            if( pAutocorr_List->Insert( SvxAutocorrWord(rShort, sLong, false) ) )
            {
                tools::SvRef<SotStorage> xStor = new SotStorage( sUserAutoCorrFile, StreamMode::READWRITE );
                MakeBlocklist_Imp( *xStor );
@@ -2619,20 +2618,18 @@
}

// Keep the list sorted ...
struct CompareSvxAutocorrWordList
struct SvxAutocorrWordList::CompareSvxAutocorrWordList
{
    bool operator()( SvxAutocorrWord* const& lhs, SvxAutocorrWord* const& rhs ) const
    bool operator()( SvxAutocorrWord const & lhs, SvxAutocorrWord const & rhs ) const
    {
        CollatorWrapper& rCmp = ::GetCollatorWrapper();
        return rCmp.compareString( lhs->GetShort(), rhs->GetShort() ) < 0;
        return rCmp.compareString( lhs.GetShort(), rhs.GetShort() ) < 0;
    }
};

namespace {

// can't use std::unique_ptr until we have C++14
typedef std::set<SvxAutocorrWord*, CompareSvxAutocorrWordList> AutocorrWordSetType;
typedef std::unordered_map<OUString, std::unique_ptr<SvxAutocorrWord>> AutocorrWordHashType;
typedef std::unordered_map<OUString, SvxAutocorrWord> AutocorrWordHashType;

}

@@ -2640,16 +2637,14 @@
{

    // only one of these contains the data
    mutable AutocorrWordSetType maSet;
    // maSortedVector is manually sorted so we can optimise data movement
    mutable AutocorrWordSetType maSortedVector;
    mutable AutocorrWordHashType maHash; // key is 'Short'

    void DeleteAndDestroyAll()
    {
        maHash.clear();

        for (auto const& elem : maSet)
            delete elem;
        maSet.clear();
        maSortedVector.clear();
    }
};

@@ -2657,7 +2652,6 @@

SvxAutocorrWordList::~SvxAutocorrWordList()
{
    mpImpl->DeleteAndDestroyAll();
}

void SvxAutocorrWordList::DeleteAndDestroyAll()
@@ -2666,70 +2660,89 @@
}

// returns true if inserted
bool SvxAutocorrWordList::Insert(std::unique_ptr<SvxAutocorrWord> pWord) const
const SvxAutocorrWord* SvxAutocorrWordList::Insert(SvxAutocorrWord aWord) const
{
    if ( mpImpl->maSet.empty() ) // use the hash
    if ( mpImpl->maSortedVector.empty() ) // use the hash
    {
        OUString aShort( pWord->GetShort() );
        return mpImpl->maHash.insert( std::pair<OUString, std::unique_ptr<SvxAutocorrWord> >( aShort, std::move(pWord) ) ).second;
        OUString aShort = aWord.GetShort();
        auto [it,inserted] = mpImpl->maHash.emplace( std::move(aShort), std::move(aWord) );
        if (inserted)
            return &(it->second);
        return nullptr;
    }
    else
        return mpImpl->maSet.insert( pWord.release() ).second;
    {
        auto it = std::lower_bound(mpImpl->maSortedVector.begin(), mpImpl->maSortedVector.end(), aWord, CompareSvxAutocorrWordList());
        if (it != mpImpl->maSortedVector.end() && !CompareSvxAutocorrWordList()(aWord, *it))
        {
            it = mpImpl->maSortedVector.insert(it, std::move(aWord));
            return &*it;
        }
        return nullptr;
    }
}

void SvxAutocorrWordList::LoadEntry(const OUString& sWrong, const OUString& sRight, bool bOnlyTxt)
{
    std::unique_ptr<SvxAutocorrWord> pNew(new SvxAutocorrWord( sWrong, sRight, bOnlyTxt ));
    (void)Insert(std::move(pNew));
    (void)Insert(SvxAutocorrWord( sWrong, sRight, bOnlyTxt ));
}

bool SvxAutocorrWordList::empty() const
{
    return mpImpl->maHash.empty() && mpImpl->maSet.empty();
    return mpImpl->maHash.empty() && mpImpl->maSortedVector.empty();
}

std::unique_ptr<SvxAutocorrWord> SvxAutocorrWordList::FindAndRemove(SvxAutocorrWord *pWord)
boost::optional<SvxAutocorrWord> SvxAutocorrWordList::FindAndRemove(SvxAutocorrWord *pWord)
{
    std::unique_ptr<SvxAutocorrWord> pMatch;

    if ( mpImpl->maSet.empty() ) // use the hash
    if ( mpImpl->maSortedVector.empty() ) // use the hash
    {
        AutocorrWordHashType::iterator it = mpImpl->maHash.find( pWord->GetShort() );
        if( it != mpImpl->maHash.end() )
        {
            pMatch = std::move(it->second);
            SvxAutocorrWord pMatch = std::move(it->second);
            mpImpl->maHash.erase (it);
            return pMatch;
        }
    }
    else
    {
        AutocorrWordSetType::iterator it = mpImpl->maSet.find( pWord );
        if( it != mpImpl->maSet.end() )
        auto it = std::lower_bound(mpImpl->maSortedVector.begin(), mpImpl->maSortedVector.end(), *pWord, CompareSvxAutocorrWordList());
        if (it != mpImpl->maSortedVector.end() && !CompareSvxAutocorrWordList()(*pWord, *it))
        {
            pMatch = std::unique_ptr<SvxAutocorrWord>(*it);
            mpImpl->maSet.erase (it);
            SvxAutocorrWord pMatch = std::move(*it);
            mpImpl->maSortedVector.erase (it);
            return pMatch;
        }
    }
    return pMatch;
    return boost::optional<SvxAutocorrWord>();
}

// return the sorted contents - defer sorting until we have to.
SvxAutocorrWordList::Content SvxAutocorrWordList::getSortedContent() const
const SvxAutocorrWordList::AutocorrWordSetType& SvxAutocorrWordList::getSortedContent() const
{
    Content aContent;

    // convert from hash to set permanently
    if ( mpImpl->maSet.empty() )
    if ( mpImpl->maSortedVector.empty() )
    {
        // This beast has some O(N log(N)) in a terribly slow ICU collate fn.
        for (auto & elem : mpImpl->maHash)
            mpImpl->maSet.insert( elem.second.release() );
        std::vector<SvxAutocorrWord> tmp;
        tmp.reserve(mpImpl->maHash.size());
        for (auto & rPair : mpImpl->maHash)
            tmp.emplace_back(std::move(rPair.second));
        mpImpl->maHash.clear();
        // sort twice - this gets the list into mostly-sorted order, which
        // reduces the number of times we need to invoke the expensive ICU collate fn.
        std::sort(tmp.begin(), tmp.end(),
            [] ( SvxAutocorrWord const & lhs, SvxAutocorrWord const & rhs )
            {
                return lhs.GetShort() < rhs.GetShort();
            });
        // This beast has some O(N log(N)) in a terribly slow ICU collate fn.
        // stable_sort is twice as fast as sort in this situation because it does
        // fewer comparison operations.
        std::stable_sort(tmp.begin(), tmp.end(), CompareSvxAutocorrWordList());
        mpImpl->maSortedVector = std::move(tmp);
    }
    for (auto const& elem : mpImpl->maSet)
        aContent.push_back(elem);

    return aContent;
    return mpImpl->maSortedVector;
}

const SvxAutocorrWord* SvxAutocorrWordList::WordMatches(const SvxAutocorrWord *pFnd,
@@ -2776,8 +2789,7 @@
                OUString left_pattern = rTxt.copy(rStt, nEndPos - rStt - rChk.getLength() + left_wildcard);
                // avoid double spaces before simple "word" replacement
                left_pattern += (left_pattern.getLength() == 0 && pFnd->GetLong()[0] == 0x20) ? pFnd->GetLong().copy(1) : pFnd->GetLong();
                SvxAutocorrWord* pNew = new SvxAutocorrWord(rTxt.copy(rStt, nEndPos - rStt), left_pattern);
                if( Insert( std::unique_ptr<SvxAutocorrWord>(pNew) ) )
                if( const SvxAutocorrWord* pNew = Insert( SvxAutocorrWord(rTxt.copy(rStt, nEndPos - rStt), left_pattern) ) )
                    return pNew;
            }
        } else
@@ -2843,8 +2855,7 @@
                        buf.append(std::u16string_view(rTxt).substr(nFndPos, nEndPos - nFndPos));
                    aLong = buf.makeStringAndClear();
                }
                SvxAutocorrWord* pNew = new SvxAutocorrWord(aShort, aLong);
                if ( Insert( std::unique_ptr<SvxAutocorrWord>(pNew) ) )
                if ( const SvxAutocorrWord* pNew = Insert( SvxAutocorrWord(aShort, aLong) ) )
                {
                    if ( (rTxt.getLength() > nEndPos && IsWordDelim(rTxt[nEndPos])) || rTxt.getLength() == nEndPos )
                        return pNew;
@@ -2860,14 +2871,14 @@
{
    for (auto const& elem : mpImpl->maHash)
    {
        if( const SvxAutocorrWord *aTmp = WordMatches( elem.second.get(), rTxt, rStt, nEndPos ) )
            return aTmp;
        if( const SvxAutocorrWord *pTmp = WordMatches( &elem.second, rTxt, rStt, nEndPos ) )
            return pTmp;
    }

    for (auto const& elem : mpImpl->maSet)
    for (auto const& elem : mpImpl->maSortedVector)
    {
        if( const SvxAutocorrWord *aTmp = WordMatches( elem, rTxt, rStt, nEndPos ) )
            return aTmp;
        if( const SvxAutocorrWord *pTmp = WordMatches( &elem, rTxt, rStt, nEndPos ) )
            return pTmp;
    }
    return nullptr;
}
diff --git a/include/editeng/svxacorr.hxx b/include/editeng/svxacorr.hxx
index 52cae8f..e6477f2 100644
--- a/include/editeng/svxacorr.hxx
+++ b/include/editeng/svxacorr.hxx
@@ -28,6 +28,7 @@
#include <editeng/swafopt.hxx>
#include <editeng/editengdllapi.h>

#include <boost/optional.hpp>
#include <map>
#include <memory>

@@ -153,13 +154,14 @@
                           // free any objects still in the set
                           ~SvxAutocorrWordList();
    void                   DeleteAndDestroyAll();
    bool                   Insert(std::unique_ptr<SvxAutocorrWord> pWord) const;
    std::unique_ptr<SvxAutocorrWord> FindAndRemove(SvxAutocorrWord *pWord);
    const SvxAutocorrWord* Insert(SvxAutocorrWord aWord) const;
    boost::optional<SvxAutocorrWord> FindAndRemove(SvxAutocorrWord *pWord);
    void                   LoadEntry(const OUString& sWrong, const OUString& sRight, bool bOnlyTxt);
    bool                   empty() const;

    typedef std::vector<SvxAutocorrWord *> Content;
    Content                getSortedContent() const;
    struct CompareSvxAutocorrWordList;
    typedef std::vector<SvxAutocorrWord> AutocorrWordSetType;
    const AutocorrWordSetType & getSortedContent() const;

    const SvxAutocorrWord* SearchWordsInList(const OUString& rTxt, sal_Int32& rStt, sal_Int32 nEndPos) const;
};