tdf#130984: use RegexMatcher::region to properly limit the search

This allows passing enough of the text into the matcher to provide the
context for anchors/assertions while, at the same time, controlling the
search region correctly in cases where the end position is not at the
end of the passed text, such as when searching only inside runs of text
having specified attributes.

Change-Id: I6d1ff379c61cec734c0aa2a1dd913b1a73c5b84d
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/89660
Tested-by: Jenkins
Reviewed-by: Mike Kaganski <mike.kaganski@collabora.com>
diff --git a/i18npool/source/search/textsearch.cxx b/i18npool/source/search/textsearch.cxx
index 964dc6c..0efa67b 100644
--- a/i18npool/source/search/textsearch.cxx
+++ b/i18npool/source/search/textsearch.cxx
@@ -896,9 +896,15 @@ void TextSearch::RESrchPrepare( const css::util::SearchOptions2& rOptions)
}


static bool lcl_findRegex( std::unique_ptr<icu::RegexMatcher> const & pRegexMatcher, sal_Int32 nStartPos, UErrorCode & rIcuErr )
static bool lcl_findRegex(std::unique_ptr<icu::RegexMatcher> const& pRegexMatcher,
                          sal_Int32 nStartPos, sal_Int32 nEndPos, UErrorCode& rIcuErr)
{
    if (!pRegexMatcher->find( nStartPos, rIcuErr))
    pRegexMatcher->region(nStartPos, nEndPos, rIcuErr);
    pRegexMatcher->useAnchoringBounds(false); // use whole text's anchoring bounds, not region's
    pRegexMatcher->useTransparentBounds(true); // take text outside of the region into account for
                                               // look-ahead/behind assertions

    if (!pRegexMatcher->find(rIcuErr))
    {
        /* TODO: future versions could pass the UErrorCode or translations
         * thereof to the caller, for example to inform the user of
@@ -930,7 +936,7 @@ SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr,
    // search until there is a valid match
    for(;;)
    {
        if (!lcl_findRegex( pRegexMatcher, startPos, nIcuErr))
        if (!lcl_findRegex( pRegexMatcher, startPos, endPos, nIcuErr))
            return aRet;

        // #i118887# ignore zero-length matches e.g. "a*" in "bc"
@@ -979,9 +985,10 @@ SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
    // TODO: use ICU's backward searching once it becomes available
    //       as its replacement using forward search is not as good as the real thing
    UErrorCode nIcuErr = U_ZERO_ERROR;
    const IcuUniString aSearchTargetStr( reinterpret_cast<const UChar*>(searchStr.getStr()), startPos);
    const IcuUniString aSearchTargetStr(reinterpret_cast<const UChar*>(searchStr.getStr()),
                                        searchStr.getLength());
    pRegexMatcher->reset( aSearchTargetStr);
    if (!lcl_findRegex( pRegexMatcher, endPos, nIcuErr))
    if (!lcl_findRegex( pRegexMatcher, endPos, startPos, nIcuErr))
        return aRet;

    // find the last match
@@ -1003,7 +1010,7 @@ SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
        bFirst = false;
        if( nFoundEnd == nLastPos)
            ++nFoundEnd;
    } while( lcl_findRegex( pRegexMatcher, nFoundEnd, nIcuErr));
    } while( lcl_findRegex( pRegexMatcher, nFoundEnd, startPos, nIcuErr));

    // Ignore all zero-length matches except "$" anchor on first match.
    if (nGoodPos == nGoodEnd)
@@ -1015,7 +1022,7 @@ SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
    }

    // find last match again to get its details
    lcl_findRegex( pRegexMatcher, nGoodPos, nIcuErr);
    lcl_findRegex( pRegexMatcher, nGoodPos, startPos, nIcuErr);

    // fill in the details of the last match
    const int nGroupCount = pRegexMatcher->groupCount();
diff --git a/sw/qa/extras/uiwriter/uiwriter.cxx b/sw/qa/extras/uiwriter/uiwriter.cxx
index 669471d..8c4123c 100644
--- a/sw/qa/extras/uiwriter/uiwriter.cxx
+++ b/sw/qa/extras/uiwriter/uiwriter.cxx
@@ -2302,6 +2302,31 @@ void SwUiWriterTest::testTextSearch()
    uno::Reference<container::XIndexAccess> xIndex2(xReplace->findAll(xSearchDes));
    CPPUNIT_ASSERT_EQUAL(sal_Int32(3), xIndex2->getCount());
    // regex tests
    xSearchDes->setPropertyValue("SearchRegularExpression", uno::makeAny(true));
    // regex: test correct matching combined with attributes like BOLD
    xSearchDes->setSearchString(".*"); // should match all bold words in the text
    xIndex.set(xReplace->findAll(xSearchDes), uno::UNO_SET_THROW);
    CPPUNIT_ASSERT_EQUAL(sal_Int32(3), xIndex->getCount());
    uno::Reference<text::XTextRange> xFound(xIndex->getByIndex(0), uno::UNO_QUERY_THROW);
    CPPUNIT_ASSERT_EQUAL(OUString("Hello"), xFound->getString());
    xFound.set(xIndex->getByIndex(1), uno::UNO_QUERY_THROW);
    CPPUNIT_ASSERT_EQUAL(OUString("This"), xFound->getString());
    xFound.set(xIndex->getByIndex(2), uno::UNO_QUERY_THROW);
    CPPUNIT_ASSERT_EQUAL(OUString("task"), xFound->getString());
    // regex: test anchor combined with attributes like BOLD
    xSearchDes->setSearchString("^.*|.*$"); // should match first and last words (they are bold)
    xIndex.set(xReplace->findAll(xSearchDes), uno::UNO_SET_THROW);
    CPPUNIT_ASSERT_EQUAL(sal_Int32(2), xIndex->getCount());
    xFound.set(xIndex->getByIndex(0), uno::UNO_QUERY_THROW);
    CPPUNIT_ASSERT_EQUAL(OUString("Hello"), xFound->getString());
    xFound.set(xIndex->getByIndex(1), uno::UNO_QUERY_THROW);
    CPPUNIT_ASSERT_EQUAL(OUString("task"), xFound->getString());
    // regex: test look-ahead/look-behind assertions outside of the bold text
    xSearchDes->setSearchString("(?<= ).*(?= )"); // should match second bold word
    xIndex.set(xReplace->findAll(xSearchDes), uno::UNO_SET_THROW);
    CPPUNIT_ASSERT_EQUAL(sal_Int32(1), xIndex->getCount());
    xFound.set(xIndex->getByIndex(0), uno::UNO_QUERY_THROW);
    CPPUNIT_ASSERT_EQUAL(OUString("This"), xFound->getString());
    xReplaceDes->setPropertyValue("SearchRegularExpression", uno::makeAny(true));
    // regex: test correct match of paragraph start
    xReplaceDes->setSearchString("^."); // should only match first character of the paragraph