tdf#109158 improve sorting when loading large autocorrect file
reduces time from 2.0s to 1.7s
reduce work by
(*) reserving some arrays
(*) pre-sorting with a cheaper comparator
(*) don't copy when returning result, just return a const&
(*) flattening the data-structures to reduce pointer-chasing
Change-Id: I972bd7ffdbf2121c2d38c24aca9618ca708e920c
Reviewed-on: https://gerrit.libreoffice.org/79119
Tested-by: Jenkins
Reviewed-by: Noel Grandin <noel.grandin@collabora.co.uk>
Signed-off-by: Xisco Fauli <xiscofauli@libreoffice.org>
Reviewed-on: https://gerrit.libreoffice.org/79377
diff --git a/cui/source/tabpages/autocdlg.cxx b/cui/source/tabpages/autocdlg.cxx
index 20028e1..6573e3b 100644
--- a/cui/source/tabpages/autocdlg.cxx
+++ b/cui/source/tabpages/autocdlg.cxx
@@ -743,12 +743,14 @@
std::vector<SvxAutocorrWord> aDeleteWords;
std::vector<SvxAutocorrWord> aNewWords;
aDeleteWords.reserve( rStringChangeList.aDeletedEntries.size() );
for (DoubleString & deleteEntry : rStringChangeList.aDeletedEntries)
{
SvxAutocorrWord aDeleteWord( deleteEntry.sShort, deleteEntry.sLong );
aDeleteWords.push_back( aDeleteWord );
}
aNewWords.reserve( rStringChangeList.aNewEntries.size() );
for (DoubleString & newEntry : rStringChangeList.aNewEntries)
{
//fdo#67697 if the user data is set then we want to retain the
@@ -835,10 +837,10 @@
{
SvxAutoCorrect* pAutoCorrect = SvxAutoCorrCfg::Get().GetAutoCorrect();
SvxAutocorrWordList* pWordList = pAutoCorrect->LoadAutocorrWordList(eLang);
SvxAutocorrWordList::Content aContent = pWordList->getSortedContent();
m_xReplaceTLB->bulk_insert_for_each(aContent.size(), [this, &aContent](weld::TreeIter& rIter, int nIndex) {
auto const& elem = aContent[nIndex];
bool bTextOnly = elem->IsTextOnly();
const SvxAutocorrWordList::AutocorrWordSetType & rContent = pWordList->getSortedContent();
m_xReplaceTLB->bulk_insert_for_each(rContent.size(), [this, rContent](weld::TreeIter& rIter, int nIndex) {
auto const& elem = rContent[nIndex];
bool bTextOnly = elem.IsTextOnly();
// formatted text is only in Writer
if (bSWriter || bTextOnly)
{
@@ -848,12 +850,12 @@
OUString sId = OUString::number(reinterpret_cast<sal_Int64>(m_xTextOnlyCB.get()));
m_xReplaceTLB->set_id(rIter, sId);
}
m_xReplaceTLB->set_text(rIter, elem->GetShort(), 0);
m_xReplaceTLB->set_text(rIter, elem->GetLong(), 1);
m_xReplaceTLB->set_text(rIter, elem.GetShort(), 0);
m_xReplaceTLB->set_text(rIter, elem.GetLong(), 1);
}
else
{
aFormatText.insert(elem->GetShort());
aFormatText.insert(elem.GetShort());
}
}, &m_aReplaceFixedWidths);
m_xNewReplacePB->set_sensitive(false);
diff --git a/editeng/source/misc/SvXMLAutoCorrectExport.cxx b/editeng/source/misc/SvXMLAutoCorrectExport.cxx
index 8ebefcc..4dadeca 100644
--- a/editeng/source/misc/SvXMLAutoCorrectExport.cxx
+++ b/editeng/source/misc/SvXMLAutoCorrectExport.cxx
@@ -51,15 +51,15 @@
GetNamespaceMap_().GetNameByKey ( XML_NAMESPACE_BLOCKLIST ) );
{
SvXMLElementExport aRoot (*this, XML_NAMESPACE_BLOCKLIST, XML_BLOCK_LIST, true, true);
SvxAutocorrWordList::Content aContent = pAutocorr_List->getSortedContent();
for (auto const& content : aContent)
const SvxAutocorrWordList::AutocorrWordSetType& rContent = pAutocorr_List->getSortedContent();
for (auto const& content : rContent)
{
AddAttribute( XML_NAMESPACE_BLOCKLIST,
XML_ABBREVIATED_NAME,
content->GetShort());
content.GetShort());
AddAttribute( XML_NAMESPACE_BLOCKLIST,
XML_NAME,
content->IsTextOnly() ? content->GetLong() : content->GetShort());
content.IsTextOnly() ? content.GetLong() : content.GetShort());
SvXMLElementExport aBlock( *this, XML_NAMESPACE_BLOCKLIST, XML_BLOCK, true, true);
}
diff --git a/editeng/source/misc/svxacorr.cxx b/editeng/source/misc/svxacorr.cxx
index 01a4f27..7ac9398 100644
--- a/editeng/source/misc/svxacorr.cxx
+++ b/editeng/source/misc/svxacorr.cxx
@@ -2488,10 +2488,10 @@
{
for (SvxAutocorrWord & aWordToDelete : aDeleteEntries)
{
std::unique_ptr<SvxAutocorrWord> pFoundEntry = pAutocorr_List->FindAndRemove( &aWordToDelete );
if( pFoundEntry )
boost::optional<SvxAutocorrWord> xFoundEntry = pAutocorr_List->FindAndRemove( &aWordToDelete );
if( xFoundEntry )
{
if( !pFoundEntry->IsTextOnly() )
if( !xFoundEntry->IsTextOnly() )
{
OUString aName( aWordToDelete.GetShort() );
if (xStorage->IsOLEStorage())
@@ -2510,24 +2510,24 @@
for (SvxAutocorrWord & aNewEntrie : aNewEntries)
{
std::unique_ptr<SvxAutocorrWord> pWordToAdd(new SvxAutocorrWord( aNewEntrie.GetShort(), aNewEntrie.GetLong(), true ));
std::unique_ptr<SvxAutocorrWord> pRemoved = pAutocorr_List->FindAndRemove( pWordToAdd.get() );
if( pRemoved )
SvxAutocorrWord aWordToAdd(aNewEntrie.GetShort(), aNewEntrie.GetLong(), true );
boost::optional<SvxAutocorrWord> xRemoved = pAutocorr_List->FindAndRemove( &aWordToAdd );
if( xRemoved )
{
if( !pRemoved->IsTextOnly() )
if( !xRemoved->IsTextOnly() )
{
// Still have to remove the Storage
OUString sStorageName( pWordToAdd->GetShort() );
OUString sStorageName( aWordToAdd.GetShort() );
if (xStorage->IsOLEStorage())
sStorageName = EncryptBlockName_Imp(sStorageName);
else
GeneratePackageName ( pWordToAdd->GetShort(), sStorageName);
GeneratePackageName ( aWordToAdd.GetShort(), sStorageName);
if( xStorage->IsContained( sStorageName ) )
xStorage->Remove( sStorageName );
}
}
bRet = pAutocorr_List->Insert( std::move(pWordToAdd) );
bRet = pAutocorr_List->Insert( std::move(aWordToAdd) );
if ( !bRet )
{
@@ -2556,11 +2556,11 @@
// Update the word list
if( bRet )
{
std::unique_ptr<SvxAutocorrWord> pNew(new SvxAutocorrWord( rShort, rLong, true ));
std::unique_ptr<SvxAutocorrWord> pRemove = pAutocorr_List->FindAndRemove( pNew.get() );
if( pRemove )
SvxAutocorrWord aNew(rShort, rLong, true );
boost::optional<SvxAutocorrWord> xRemove = pAutocorr_List->FindAndRemove( &aNew );
if( xRemove )
{
if( !pRemove->IsTextOnly() )
if( !xRemove->IsTextOnly() )
{
// Still have to remove the Storage
OUString sStgNm( rShort );
@@ -2574,7 +2574,7 @@
}
}
if( pAutocorr_List->Insert( std::move(pNew) ) )
if( pAutocorr_List->Insert( std::move(aNew) ) )
{
bRet = MakeBlocklist_Imp( *xStg );
xStg = nullptr;
@@ -2605,8 +2605,7 @@
// Update the word list
if( bRet )
{
std::unique_ptr<SvxAutocorrWord> pNew(new SvxAutocorrWord( rShort, sLong, false ));
if( pAutocorr_List->Insert( std::move(pNew) ) )
if( pAutocorr_List->Insert( SvxAutocorrWord(rShort, sLong, false) ) )
{
tools::SvRef<SotStorage> xStor = new SotStorage( sUserAutoCorrFile, StreamMode::READWRITE );
MakeBlocklist_Imp( *xStor );
@@ -2619,20 +2618,18 @@
}
// Keep the list sorted ...
struct CompareSvxAutocorrWordList
struct SvxAutocorrWordList::CompareSvxAutocorrWordList
{
bool operator()( SvxAutocorrWord* const& lhs, SvxAutocorrWord* const& rhs ) const
bool operator()( SvxAutocorrWord const & lhs, SvxAutocorrWord const & rhs ) const
{
CollatorWrapper& rCmp = ::GetCollatorWrapper();
return rCmp.compareString( lhs->GetShort(), rhs->GetShort() ) < 0;
return rCmp.compareString( lhs.GetShort(), rhs.GetShort() ) < 0;
}
};
namespace {
// can't use std::unique_ptr until we have C++14
typedef std::set<SvxAutocorrWord*, CompareSvxAutocorrWordList> AutocorrWordSetType;
typedef std::unordered_map<OUString, std::unique_ptr<SvxAutocorrWord>> AutocorrWordHashType;
typedef std::unordered_map<OUString, SvxAutocorrWord> AutocorrWordHashType;
}
@@ -2640,16 +2637,14 @@
{
// only one of these contains the data
mutable AutocorrWordSetType maSet;
// maSortedVector is manually sorted so we can optimise data movement
mutable AutocorrWordSetType maSortedVector;
mutable AutocorrWordHashType maHash; // key is 'Short'
void DeleteAndDestroyAll()
{
maHash.clear();
for (auto const& elem : maSet)
delete elem;
maSet.clear();
maSortedVector.clear();
}
};
@@ -2657,7 +2652,6 @@
SvxAutocorrWordList::~SvxAutocorrWordList()
{
mpImpl->DeleteAndDestroyAll();
}
void SvxAutocorrWordList::DeleteAndDestroyAll()
@@ -2666,70 +2660,89 @@
}
// returns true if inserted
bool SvxAutocorrWordList::Insert(std::unique_ptr<SvxAutocorrWord> pWord) const
const SvxAutocorrWord* SvxAutocorrWordList::Insert(SvxAutocorrWord aWord) const
{
if ( mpImpl->maSet.empty() ) // use the hash
if ( mpImpl->maSortedVector.empty() ) // use the hash
{
OUString aShort( pWord->GetShort() );
return mpImpl->maHash.insert( std::pair<OUString, std::unique_ptr<SvxAutocorrWord> >( aShort, std::move(pWord) ) ).second;
OUString aShort = aWord.GetShort();
auto [it,inserted] = mpImpl->maHash.emplace( std::move(aShort), std::move(aWord) );
if (inserted)
return &(it->second);
return nullptr;
}
else
return mpImpl->maSet.insert( pWord.release() ).second;
{
auto it = std::lower_bound(mpImpl->maSortedVector.begin(), mpImpl->maSortedVector.end(), aWord, CompareSvxAutocorrWordList());
if (it != mpImpl->maSortedVector.end() && !CompareSvxAutocorrWordList()(aWord, *it))
{
it = mpImpl->maSortedVector.insert(it, std::move(aWord));
return &*it;
}
return nullptr;
}
}
void SvxAutocorrWordList::LoadEntry(const OUString& sWrong, const OUString& sRight, bool bOnlyTxt)
{
std::unique_ptr<SvxAutocorrWord> pNew(new SvxAutocorrWord( sWrong, sRight, bOnlyTxt ));
(void)Insert(std::move(pNew));
(void)Insert(SvxAutocorrWord( sWrong, sRight, bOnlyTxt ));
}
bool SvxAutocorrWordList::empty() const
{
return mpImpl->maHash.empty() && mpImpl->maSet.empty();
return mpImpl->maHash.empty() && mpImpl->maSortedVector.empty();
}
std::unique_ptr<SvxAutocorrWord> SvxAutocorrWordList::FindAndRemove(SvxAutocorrWord *pWord)
boost::optional<SvxAutocorrWord> SvxAutocorrWordList::FindAndRemove(SvxAutocorrWord *pWord)
{
std::unique_ptr<SvxAutocorrWord> pMatch;
if ( mpImpl->maSet.empty() ) // use the hash
if ( mpImpl->maSortedVector.empty() ) // use the hash
{
AutocorrWordHashType::iterator it = mpImpl->maHash.find( pWord->GetShort() );
if( it != mpImpl->maHash.end() )
{
pMatch = std::move(it->second);
SvxAutocorrWord pMatch = std::move(it->second);
mpImpl->maHash.erase (it);
return pMatch;
}
}
else
{
AutocorrWordSetType::iterator it = mpImpl->maSet.find( pWord );
if( it != mpImpl->maSet.end() )
auto it = std::lower_bound(mpImpl->maSortedVector.begin(), mpImpl->maSortedVector.end(), *pWord, CompareSvxAutocorrWordList());
if (it != mpImpl->maSortedVector.end() && !CompareSvxAutocorrWordList()(*pWord, *it))
{
pMatch = std::unique_ptr<SvxAutocorrWord>(*it);
mpImpl->maSet.erase (it);
SvxAutocorrWord pMatch = std::move(*it);
mpImpl->maSortedVector.erase (it);
return pMatch;
}
}
return pMatch;
return boost::optional<SvxAutocorrWord>();
}
// return the sorted contents - defer sorting until we have to.
SvxAutocorrWordList::Content SvxAutocorrWordList::getSortedContent() const
const SvxAutocorrWordList::AutocorrWordSetType& SvxAutocorrWordList::getSortedContent() const
{
Content aContent;
// convert from hash to set permanently
if ( mpImpl->maSet.empty() )
if ( mpImpl->maSortedVector.empty() )
{
// This beast has some O(N log(N)) in a terribly slow ICU collate fn.
for (auto & elem : mpImpl->maHash)
mpImpl->maSet.insert( elem.second.release() );
std::vector<SvxAutocorrWord> tmp;
tmp.reserve(mpImpl->maHash.size());
for (auto & rPair : mpImpl->maHash)
tmp.emplace_back(std::move(rPair.second));
mpImpl->maHash.clear();
// sort twice - this gets the list into mostly-sorted order, which
// reduces the number of times we need to invoke the expensive ICU collate fn.
std::sort(tmp.begin(), tmp.end(),
[] ( SvxAutocorrWord const & lhs, SvxAutocorrWord const & rhs )
{
return lhs.GetShort() < rhs.GetShort();
});
// This beast has some O(N log(N)) in a terribly slow ICU collate fn.
// stable_sort is twice as fast as sort in this situation because it does
// fewer comparison operations.
std::stable_sort(tmp.begin(), tmp.end(), CompareSvxAutocorrWordList());
mpImpl->maSortedVector = std::move(tmp);
}
for (auto const& elem : mpImpl->maSet)
aContent.push_back(elem);
return aContent;
return mpImpl->maSortedVector;
}
const SvxAutocorrWord* SvxAutocorrWordList::WordMatches(const SvxAutocorrWord *pFnd,
@@ -2776,8 +2789,7 @@
OUString left_pattern = rTxt.copy(rStt, nEndPos - rStt - rChk.getLength() + left_wildcard);
// avoid double spaces before simple "word" replacement
left_pattern += (left_pattern.getLength() == 0 && pFnd->GetLong()[0] == 0x20) ? pFnd->GetLong().copy(1) : pFnd->GetLong();
SvxAutocorrWord* pNew = new SvxAutocorrWord(rTxt.copy(rStt, nEndPos - rStt), left_pattern);
if( Insert( std::unique_ptr<SvxAutocorrWord>(pNew) ) )
if( const SvxAutocorrWord* pNew = Insert( SvxAutocorrWord(rTxt.copy(rStt, nEndPos - rStt), left_pattern) ) )
return pNew;
}
} else
@@ -2843,8 +2855,7 @@
buf.append(std::u16string_view(rTxt).substr(nFndPos, nEndPos - nFndPos));
aLong = buf.makeStringAndClear();
}
SvxAutocorrWord* pNew = new SvxAutocorrWord(aShort, aLong);
if ( Insert( std::unique_ptr<SvxAutocorrWord>(pNew) ) )
if ( const SvxAutocorrWord* pNew = Insert( SvxAutocorrWord(aShort, aLong) ) )
{
if ( (rTxt.getLength() > nEndPos && IsWordDelim(rTxt[nEndPos])) || rTxt.getLength() == nEndPos )
return pNew;
@@ -2860,14 +2871,14 @@
{
for (auto const& elem : mpImpl->maHash)
{
if( const SvxAutocorrWord *aTmp = WordMatches( elem.second.get(), rTxt, rStt, nEndPos ) )
return aTmp;
if( const SvxAutocorrWord *pTmp = WordMatches( &elem.second, rTxt, rStt, nEndPos ) )
return pTmp;
}
for (auto const& elem : mpImpl->maSet)
for (auto const& elem : mpImpl->maSortedVector)
{
if( const SvxAutocorrWord *aTmp = WordMatches( elem, rTxt, rStt, nEndPos ) )
return aTmp;
if( const SvxAutocorrWord *pTmp = WordMatches( &elem, rTxt, rStt, nEndPos ) )
return pTmp;
}
return nullptr;
}
diff --git a/include/editeng/svxacorr.hxx b/include/editeng/svxacorr.hxx
index 52cae8f..e6477f2 100644
--- a/include/editeng/svxacorr.hxx
+++ b/include/editeng/svxacorr.hxx
@@ -28,6 +28,7 @@
#include <editeng/swafopt.hxx>
#include <editeng/editengdllapi.h>
#include <boost/optional.hpp>
#include <map>
#include <memory>
@@ -153,13 +154,14 @@
// free any objects still in the set
~SvxAutocorrWordList();
void DeleteAndDestroyAll();
bool Insert(std::unique_ptr<SvxAutocorrWord> pWord) const;
std::unique_ptr<SvxAutocorrWord> FindAndRemove(SvxAutocorrWord *pWord);
const SvxAutocorrWord* Insert(SvxAutocorrWord aWord) const;
boost::optional<SvxAutocorrWord> FindAndRemove(SvxAutocorrWord *pWord);
void LoadEntry(const OUString& sWrong, const OUString& sRight, bool bOnlyTxt);
bool empty() const;
typedef std::vector<SvxAutocorrWord *> Content;
Content getSortedContent() const;
struct CompareSvxAutocorrWordList;
typedef std::vector<SvxAutocorrWord> AutocorrWordSetType;
const AutocorrWordSetType & getSortedContent() const;
const SvxAutocorrWord* SearchWordsInList(const OUString& rTxt, sal_Int32& rStt, sal_Int32 nEndPos) const;
};