Resolves: tdf#102846 CSV: Detect separator, limit preview line concatenations

In CSV import preview, if a line starts with a quote character and
none of the remembered last field separators used occur in data in
conjunction with a closing quote, then reading data tried to
concatenate line by line to form a data field to be presented in
the preview, worst case the entire file..

For the preview, detect one possible not yet selected separator if
used with a quoted field (similar to commit
c807e7ea7a0725a4d8375eda07d6f70870e0d50a for tdf#56910 space
separator) and limit the number of source lines that are tried to
be concatenated if no separator was encountered after a possibly
closing field quote.

Change-Id: Iefd37a8301161e72cb607cea88d4faadad47b4ae
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/120690
Reviewed-by: Eike Rathke <erack@redhat.com>
Tested-by: Jenkins
diff --git a/sc/source/ui/dbgui/scuiasciiopt.cxx b/sc/source/ui/dbgui/scuiasciiopt.cxx
index b664034..a95d8bd 100644
--- a/sc/source/ui/dbgui/scuiasciiopt.cxx
+++ b/sc/source/ui/dbgui/scuiasciiopt.cxx
@@ -44,6 +44,14 @@
//! TODO make dynamic
const SCSIZE ASCIIDLG_MAXROWS                = MAXROWCOUNT;

// Maximum number of source lines to concatenate while generating the preview
// for one logical line. This may result in a wrong preview if the actual
// number of embedded line feeds is greater, but a number too high would take
// too much time (loop excessively if unlimited and large data) if none of the
// selected separators are actually used in data but a field at start of line
// is quoted.
constexpr sal_uInt32 kMaxEmbeddedLinefeeds = 500;

using namespace com::sun::star::uno;

namespace {
@@ -293,7 +301,7 @@ ScImportAsciiDlg::ScImportAsciiDlg(weld::Window* pParent, const OUString& aDatNa
    , mnRowPosCount(0)
    , mcTextSep(ScAsciiOptions::cDefaultTextSep)
    , meCall(eCall)
    , mbDetectSpaceSep(eCall != SC_TEXTTOCOLUMNS)
    , mbDetectSep(eCall != SC_TEXTTOCOLUMNS)
    , mxFtCharSet(m_xBuilder->weld_label("textcharset"))
    , mxLbCharSet(new SvxTextEncodingBox(m_xBuilder->weld_combo_box("charset")))
    , mxFtCustomLang(m_xBuilder->weld_label("textlanguage"))
@@ -579,7 +587,7 @@ bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString &rText, sal_Unicode& r
                break;
            }
            rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators,
                    mcTextSep, rcDetectSep);
                    mcTextSep, rcDetectSep, kMaxEmbeddedLinefeeds);
            mnStreamPos = mpDatStream->Tell();
            mpRowPosArray[++mnRowPosCount] = mnStreamPos;
        } while (nLine >= mnRowPosCount && mpDatStream->good());
@@ -594,7 +602,7 @@ bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString &rText, sal_Unicode& r
    else
    {
        Seek( mpRowPosArray[nLine]);
        rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators, mcTextSep, rcDetectSep);
        rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators, mcTextSep, rcDetectSep, kMaxEmbeddedLinefeeds);
        mnStreamPos = mpDatStream->Tell();
    }

@@ -793,7 +801,11 @@ IMPL_LINK_NOARG(ScImportAsciiDlg, UpdateTextHdl, ScCsvTableBox&, void)
    // when the dialog wasn't already presented to the user.
    // As a side effect this has the benefit that the check is only done on the
    // first set of visible lines.
    sal_Unicode cDetectSep = (mbDetectSpaceSep && !mxRbFixed->get_active() && !mxCkbSpace->get_active() ? 0 : 0xffff);
    sal_Unicode cDetectSep = ((mbDetectSep && !mxRbFixed->get_active()
                && (!mxCkbTab->get_active() || !mxCkbSemicolon->get_active()
                    || !mxCkbComma->get_active() || !mxCkbSpace->get_active())) ? 0 : 0xffff);
    if (cDetectSep == 0xffff)
        mbDetectSep = false;

    sal_Int32 nBaseLine = mxTableBox->GetGrid().GetFirstVisLine();
    sal_Int32 nRead = mxTableBox->GetGrid().GetVisLineCount();
@@ -813,16 +825,22 @@ IMPL_LINK_NOARG(ScImportAsciiDlg, UpdateTextHdl, ScCsvTableBox&, void)
    for (; i < CSV_PREVIEW_LINES; i++)
        maPreviewLine[i].clear();

    if (mbDetectSpaceSep)
    if (mbDetectSep)
    {
        mbDetectSpaceSep = false;
        if (cDetectSep == ' ')
        mbDetectSep = false;
        if (cDetectSep)
        {
            // Expect space to be appended by now so all subsequent
            // Expect separator to be appended by now so all subsequent
            // GetLine()/ReadCsvLine() actually used it.
            assert(maFieldSeparators.endsWith(" "));
            // Preselect Space in UI.
            mxCkbSpace->set_active(true);
            assert(maFieldSeparators.endsWith(OUStringChar(cDetectSep)));
            // Preselect separator in UI.
            switch (cDetectSep)
            {
                case '\t':  mxCkbTab->set_active(true);         break;
                case ';':   mxCkbSemicolon->set_active(true);   break;
                case ',':   mxCkbComma->set_active(true);       break;
                case ' ':   mxCkbSpace->set_active(true);       break;
            }
        }
    }

diff --git a/sc/source/ui/docshell/impex.cxx b/sc/source/ui/docshell/impex.cxx
index 3a798b3..3c27726 100644
--- a/sc/source/ui/docshell/impex.cxx
+++ b/sc/source/ui/docshell/impex.cxx
@@ -607,17 +607,40 @@ static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* p
    // Due to broken CSV generators that don't double embedded quotes check if
    // a field separator immediately or with trailing spaces follows the quote,
    // only then end the field, or at end of string.
    const sal_Unicode cBlank = ' ';
    constexpr sal_Unicode cBlank = ' ';
    if (p[1] == cBlank && ScGlobal::UnicodeStrChr( pSeps, cBlank))
        return FIELDEND_QUOTE;
    // Detect a possible blank separator if it's not already in the list (which
    // was checked right above for p[1]==cBlank).
    if (p[1] == cBlank && !rcDetectSep && p[2] && p[2] != cBlank)
        rcDetectSep = cBlank;
    const bool bBlankSep = (p[1] == cBlank && !rcDetectSep && p[2] && p[2] != cBlank);
    while (p[1] == cBlank)
        ++p;
    if (!p[1] || ScGlobal::UnicodeStrChr( pSeps, p[1]))
        return FIELDEND_QUOTE;
    // Extended separator detection after a closing quote (with or without
    // blanks). Note that nQuotes is incremented *after* the call so is not yet
    // even here, and that with separator detection we reach here only if
    // lcl_isEscapedOrFieldEndQuote() did not already detect FIRST_QUOTE or
    // SECOND_QUOTE for an escaped embedded quote, thus nQuotes does not have
    // to be checked.
    if (!rcDetectSep)
    {
        constexpr sal_Unicode vSep[] = { ',', '\t', ';' };
        for (const sal_Unicode c : vSep)
        {
            if (p[1] == c)
            {
                rcDetectSep = c;
                return FIELDEND_QUOTE;
            }
        }
    }
    // Blank separator is least significant, after others.
    if (bBlankSep)
    {
        rcDetectSep = cBlank;
        return FIELDEND_QUOTE;
    }
    return DONTKNOW_QUOTE;
}

@@ -645,7 +668,7 @@ static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* p
static QuoteType lcl_isEscapedOrFieldEndQuote( sal_Int32 nQuotes, const sal_Unicode* p,
        const sal_Unicode* pSeps, sal_Unicode cStr, sal_Unicode& rcDetectSep )
{
    if ((nQuotes % 2) == 0)
    if ((nQuotes & 1) == 0)
    {
        if (p[-1] == cStr)
            return SECOND_QUOTE;
@@ -2481,7 +2504,7 @@ ScImportStringStream::ScImportStringStream( const OUString& rStr )
}

OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
        OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep )
        OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep, sal_uInt32 nMaxSourceLines )
{
    enum RetryState
    {
@@ -2506,6 +2529,8 @@ Label_RetryWithNewSep:

    if (bEmbeddedLineBreak)
    {
        sal_uInt32 nLine = 0;

        const sal_Unicode* pSeps = rFieldSeparators.getStr();

        QuoteType eQuoteState = FIELDEND_QUOTE;
@@ -2544,10 +2569,11 @@ Label_RetryWithNewSep:
                        {
                            eQuoteState = lcl_isEscapedOrFieldEndQuote( nQuotes, p, pSeps, cFieldQuote, rcDetectSep);

                            if (eRetryState == RetryState::ALLOW && rcDetectSep == ' ')
                            if (eRetryState == RetryState::ALLOW && rcDetectSep)
                            {
                                eRetryState = RetryState::RETRY;
                                rFieldSeparators += " ";
                                rFieldSeparators += OUStringChar(rcDetectSep);
                                pSeps = rFieldSeparators.getStr();
                                goto Label_RetryWithNewSep;
                            }

@@ -2593,10 +2619,14 @@ Label_RetryWithNewSep:
                ++p;
            }

            if (nQuotes % 2 == 0)
            if ((nQuotes & 1) == 0)
                // We still have a (theoretical?) problem here if due to
                // nArbitraryLineLengthLimit we split a string right between a
                // doubled quote pair.
                // nArbitraryLineLengthLimit (or nMaxSourceLines below) we
                // split a string right between a doubled quote pair.
                break;
            else if (++nLine >= nMaxSourceLines && nMaxSourceLines > 0)
                // Unconditionally increment nLine even if nMaxSourceLines==0
                // so it can be observed in debugger.
                break;
            else
            {
diff --git a/sc/source/ui/inc/impex.hxx b/sc/source/ui/inc/impex.hxx
index b334755..031a0ca 100644
--- a/sc/source/ui/inc/impex.hxx
+++ b/sc/source/ui/inc/impex.hxx
@@ -194,14 +194,25 @@ public:
    The quote character used.

    @param rcDetectSep
    If 0 then attempt to detect a possible space (blank) separator if
    If 0 then attempt to detect a possible separator if
    rFieldSeparators doesn't include it already. This can be necessary because
    of the "accept broken misquoted CSV fields" feature that tries to ignore
    trailing blanks after a quoted field and if no separator follows continues
    to add content to the field assuming the single double quote was in error.
    If this blank separator is detected it is added to rFieldSeparators and the
    It is also necessary if the only possible separator was not selected and
    not included in rFieldSeparators and a line starts with a quoted field, in
    which case appending lines is tried until end of file.
    If a separator is detected it is added to rFieldSeparators and the
    line is reread with the new separators

    @param nMaxSourceLines
    Maximum source lines to read and combine into one logical line for embedded
    new line purpose. Should be limited for the preview dialog because only
    non-matching separators selected otherwise would lead to trying to
    concatenate lines until file end.
    If 0 no limit other than the internal arbitrary resulting line length
    limit.

    check Stream::good() to detect IO problems during read

    @ATTENTION
@@ -222,6 +233,7 @@ public:

  */
SC_DLLPUBLIC OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
        OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep );
        OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep,
        sal_uInt32 nMaxSourceLines = 0 );

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/source/ui/inc/scuiasciiopt.hxx b/sc/source/ui/inc/scuiasciiopt.hxx
index 140ca09..eae2f2f 100644
--- a/sc/source/ui/inc/scuiasciiopt.hxx
+++ b/sc/source/ui/inc/scuiasciiopt.hxx
@@ -44,7 +44,7 @@ class ScImportAsciiDlg : public weld::GenericDialogController
    rtl_TextEncoding            meCharSet;          /// Selected char set.
    bool                        mbCharSetSystem;    /// Is System char set selected?
    ScImportAsciiCall           meCall;             /// How the dialog is called (see asciiopt.hxx)
    bool                        mbDetectSpaceSep;   /// Whether to detect a possible space separator.
    bool                        mbDetectSep;        /// Whether to detect a possible separator.

    std::unique_ptr<weld::Label> mxFtCharSet;
    std::unique_ptr<SvxTextEncodingBox> mxLbCharSet;