tdf#91764: Combining marks from “complex” scripts can’t be searched for

Don’t skip search results that are in the middle of a grapheme cluster
(AKA cell in LO speak).

It is not clear why it was done like this, as these checks are present
all the way back to the first commit of this file:

commit 36eb193f4809221af42c01c5ac226a97cf74ec21
Author: Rüdiger Timm <rt@openoffice.org>
Date:   Tue Apr 8 15:01:00 2003 +0000

    INTEGRATION: CWS calc06 (1.1.2); FILE ADDED
    2003/03/26 15:54:42 er 1.1.2.1: #i3393# moved from i18n module, cleaned out tools module usage, and added support for regexp

But ignoring such results, and only for so-called “complex” scripts, seems
arbitrary, and as the linked issue shows, people want to be able to
search for combining marks. Furthermore, it prevents searching for a
base character followed by a combining mark, unless ignoring diacritics
is enabled.

Change-Id: I530788d928861ddfa18dd7b813d0a13f53c0b77b
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/138410
Tested-by: Jenkins
Reviewed-by: خالد حسني <khaled@aliftype.com>
diff --git a/i18npool/qa/cppunit/test_textsearch.cxx b/i18npool/qa/cppunit/test_textsearch.cxx
index 1d72a8d..f224e58 100644
--- a/i18npool/qa/cppunit/test_textsearch.cxx
+++ b/i18npool/qa/cppunit/test_textsearch.cxx
@@ -38,12 +38,14 @@ public:
    void testSearches();
    void testWildcardSearch();
    void testApostropheSearch();
    void testTdf138410();

    CPPUNIT_TEST_SUITE(TestTextSearch);
    CPPUNIT_TEST(testICU);
    CPPUNIT_TEST(testSearches);
    CPPUNIT_TEST(testWildcardSearch);
    CPPUNIT_TEST(testApostropheSearch);
    CPPUNIT_TEST(testTdf138410);
    CPPUNIT_TEST_SUITE_END();
private:
    uno::Reference<util::XTextSearch> m_xSearch;
@@ -402,6 +404,125 @@ void TestTextSearch::testApostropheSearch()
    CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
}

// Regression test for tdf#91764: with the ABSOLUTE (plain text) algorithm,
// a base letter, a base+mark sequence, and a lone combining mark must each be
// searchable. Every case is exercised forward and backward, first with no
// transliteration flags and then with IGNORE_DIACRITICS_CTL.
// NOTE(review): "138410" in the test name looks like the Gerrit change number
// rather than the bug id — confirm before relying on the name.
void TestTextSearch::testTdf138410()
{
    // Test text: five code units where U+064F follows the 1st and 2nd letters
    // (indices 1 and 3), a space at index 5, then the same three letters
    // without any U+064F at indices 6-8.
    OUString str(u"\u0643\u064f\u062a\u064f\u0628 \u0643\u062a\u0628");
    sal_Int32 startPos = 0, endPos = str.getLength();

    util::SearchOptions aOptions;
    aOptions.algorithmType = util::SearchAlgorithms_ABSOLUTE;

    util::SearchResult aRes;

    // a) base alone
    // The search string will be found whether it is followed by a mark in the
    // text or not, and whether IGNORE_DIACRITICS_CTL is set or not.

    // set options
    aOptions.searchString = u"\u0643";
    aOptions.transliterateFlags = 0;
    m_xSearch->setOptions(aOptions);

    // search forward: first occurrence, at the very start of the text
    aRes = m_xSearch->searchForward(str, startPos, endPos);
    CPPUNIT_ASSERT(aRes.subRegExpressions > 0);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aRes.startOffset[0]);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.endOffset[0]);

    // search backwards: last occurrence (index 6). Backward results are
    // reported inverted: startOffset is the end of the match and endOffset
    // its beginning, hence 7 and 6.
    aRes = m_xSearch->searchBackward(str, endPos, startPos);
    CPPUNIT_ASSERT(aRes.subRegExpressions > 0);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(7), aRes.startOffset[0]);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(6), aRes.endOffset[0]);

    // check with transliteration: same results expected as without it
    aOptions.transliterateFlags = static_cast<int>(TransliterationFlags::IGNORE_DIACRITICS_CTL);
    m_xSearch->setOptions(aOptions);

    // search forward
    aRes = m_xSearch->searchForward(str, startPos, endPos);
    CPPUNIT_ASSERT(aRes.subRegExpressions > 0);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aRes.startOffset[0]);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.endOffset[0]);

    // search backwards
    aRes = m_xSearch->searchBackward(str, endPos, startPos);
    CPPUNIT_ASSERT(aRes.subRegExpressions > 0);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(7), aRes.startOffset[0]);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(6), aRes.endOffset[0]);

    // b) base+mark
    // The search string will be found when followed by a mark in the text, or
    // when IGNORE_DIACRITICS_CTL is set whether it is followed by a mark or
    // not.

    // set options
    aOptions.searchString = u"\u0643\u064f";
    aOptions.transliterateFlags = 0;
    m_xSearch->setOptions(aOptions);

    // search forward: only the marked occurrence at indices 0-1 matches
    aRes = m_xSearch->searchForward(str, startPos, endPos);
    CPPUNIT_ASSERT(aRes.subRegExpressions > 0);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aRes.startOffset[0]);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(2), aRes.endOffset[0]);

    // search backwards: the unmarked occurrence at index 6 must not match, so
    // the same range is found (reported inverted, see above)
    aRes = m_xSearch->searchBackward(str, endPos, startPos);
    CPPUNIT_ASSERT(aRes.subRegExpressions > 0);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(2), aRes.startOffset[0]);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aRes.endOffset[0]);

    // check with transliteration
    aOptions.transliterateFlags = static_cast<int>(TransliterationFlags::IGNORE_DIACRITICS_CTL);
    m_xSearch->setOptions(aOptions);

    // search forward
    // NOTE(review): the expected range covers only the base letter (end 1,
    // not 2) — presumably because marks are stripped before matching; confirm.
    aRes = m_xSearch->searchForward(str, startPos, endPos);
    CPPUNIT_ASSERT(aRes.subRegExpressions > 0);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aRes.startOffset[0]);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.endOffset[0]);

    // search backwards: with marks ignored the unmarked occurrence at index 6
    // now matches as well
    aRes = m_xSearch->searchBackward(str, endPos, startPos);
    CPPUNIT_ASSERT(aRes.subRegExpressions > 0);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(7), aRes.startOffset[0]);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(6), aRes.endOffset[0]);

    // c) mark alone
    // The search string will be found only when IGNORE_DIACRITICS_CTL is not
    // set.

    // set options
    aOptions.searchString = u"\u064f";
    aOptions.transliterateFlags = 0;
    m_xSearch->setOptions(aOptions);

    // search forward: first mark, at index 1
    aRes = m_xSearch->searchForward(str, startPos, endPos);
    CPPUNIT_ASSERT(aRes.subRegExpressions > 0);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.startOffset[0]);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(2), aRes.endOffset[0]);

    // search backwards: last mark, at index 3 (reported inverted as 4 and 3)
    aRes = m_xSearch->searchBackward(str, endPos, startPos);
    CPPUNIT_ASSERT(aRes.subRegExpressions > 0);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(4), aRes.startOffset[0]);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(3), aRes.endOffset[0]);

    // with ignore marks the mark will not be found
    aOptions.transliterateFlags = static_cast<int>(TransliterationFlags::IGNORE_DIACRITICS_CTL);
    m_xSearch->setOptions(aOptions);

    // search forward: no match expected
    aRes = m_xSearch->searchForward(str, startPos, endPos);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aRes.subRegExpressions);

    // search backwards: no match expected
    aRes = m_xSearch->searchBackward(str, endPos, startPos);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aRes.subRegExpressions);
}

void TestTextSearch::setUp()
{
    BootstrapFixtureBase::setUp();
diff --git a/i18npool/source/search/textsearch.cxx b/i18npool/source/search/textsearch.cxx
index c80afc1..a16c3e1 100644
--- a/i18npool/source/search/textsearch.cxx
+++ b/i18npool/source/search/textsearch.cxx
@@ -214,13 +214,6 @@ void TextSearch::setOptions2( const SearchOptions2& rOptions )
                    aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
    }

    // When start or end of search string is a complex script type, we need to
    // make sure the result boundary is not located in the middle of cell.
    checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) ==
                ScriptType::COMPLEX));
    checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr,
                    sSrchStr.getLength()-1) == ScriptType::COMPLEX));

    if ( bReplaceApostrophe )
        sSrchStr = sSrchStr.replace(u'\u2019', '\'');

@@ -305,13 +298,6 @@ static sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 
    return static_cast<sal_Int32>(std::distance(rOff.begin(), pOff));
}

bool TextSearch::isCellStart(const OUString& searchStr, sal_Int32 nPos)
{
    sal_Int32 nDone;
    return nPos == xBreak->previousCharacters(searchStr, nPos+1,
            aSrchPara.Locale, CharacterIteratorMode::SKIPCELL, 1, nDone);
}

SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
{
    std::unique_lock g(m_aMutex);
@@ -737,11 +723,6 @@ SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startP
            nCmpIdx <= nEnd;
            nCmpIdx += GetDiff( searchStr[nCmpIdx + sSearchKey.getLength()-1]))
    {
        // if the match would be the completed cells, skip it.
        if ( (checkCTLStart && !isCellStart( searchStr, nCmpIdx )) || (checkCTLEnd
                    && !isCellStart( searchStr, nCmpIdx + sSearchKey.getLength())) )
            continue;

        nSuchIdx = sSearchKey.getLength() - 1;
        while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == searchStr[nCmpIdx + nSuchIdx])
        {
@@ -804,41 +785,28 @@ SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startP

    while (nCmpIdx >= nEnd)
    {
        // if the match would be the completed cells, skip it.
        if ( (!checkCTLStart || isCellStart( searchStr, nCmpIdx -
                        sSearchKey.getLength() )) && (!checkCTLEnd ||
                    isCellStart( searchStr, nCmpIdx)))
        nSuchIdx = 0;
        while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] ==
                searchStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] )
            nSuchIdx++;
        if( nSuchIdx >= sSearchKey.getLength() )
        {
            nSuchIdx = 0;
            while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] ==
                    searchStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] )
                nSuchIdx++;
            if( nSuchIdx >= sSearchKey.getLength() )
            if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
            {
                if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
                {
                    sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength();
                    bool bAtStart = !nFndStt;
                    bool bAtEnd = nCmpIdx == startPos;
                    bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nCmpIdx );
                    bool bDelimBefore = bAtStart || // begin of paragraph
                        IsDelimiter( searchStr, nFndStt-1 );
                    //  *       1 -> only one word in the paragraph
                    //  *       2 -> at begin of paragraph
                    //  *       3 -> at end of paragraph
                    //  *       4 -> inside the paragraph
                    if( ( bAtStart && bAtEnd ) ||           // 1
                            ( bAtStart && bDelimBehind ) ||     // 2
                            ( bAtEnd && bDelimBefore ) ||       // 3
                            ( bDelimBefore && bDelimBehind ))   // 4
                    {
                        aRet.subRegExpressions = 1;
                        aRet.startOffset = { nCmpIdx };
                        aRet.endOffset = { nCmpIdx - sSearchKey.getLength() };
                        return aRet;
                    }
                }
                else
                sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength();
                bool bAtStart = !nFndStt;
                bool bAtEnd = nCmpIdx == startPos;
                bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nCmpIdx );
                bool bDelimBefore = bAtStart || // begin of paragraph
                    IsDelimiter( searchStr, nFndStt-1 );
                //  *       1 -> only one word in the paragraph
                //  *       2 -> at begin of paragraph
                //  *       3 -> at end of paragraph
                //  *       4 -> inside the paragraph
                if( ( bAtStart && bAtEnd ) ||           // 1
                        ( bAtStart && bDelimBehind ) ||     // 2
                        ( bAtEnd && bDelimBefore ) ||       // 3
                        ( bDelimBefore && bDelimBehind ))   // 4
                {
                    aRet.subRegExpressions = 1;
                    aRet.startOffset = { nCmpIdx };
@@ -846,6 +814,13 @@ SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startP
                    return aRet;
                }
            }
            else
            {
                aRet.subRegExpressions = 1;
                aRet.startOffset = { nCmpIdx };
                aRet.endOffset = { nCmpIdx - sSearchKey.getLength() };
                return aRet;
            }
        }
        nSuchIdx = GetDiff( searchStr[nCmpIdx - sSearchKey.getLength()] );
        if( nCmpIdx < nSuchIdx )
diff --git a/i18npool/source/search/textsearch.hxx b/i18npool/source/search/textsearch.hxx
index 0a4da19..43a6435 100644
--- a/i18npool/source/search/textsearch.hxx
+++ b/i18npool/source/search/textsearch.hxx
@@ -130,10 +130,6 @@ class TextSearch: public cppu::WeakImplHelper

    bool IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const;

    bool checkCTLStart, checkCTLEnd;
    /// @throws css::uno::RuntimeException
    bool isCellStart(const OUString& searchStr, sal_Int32 nPos);

public:
    explicit TextSearch(
        const css::uno::Reference < css::uno::XComponentContext >& rxContext );