Resolves: tdf#102846 CSV: Detect separator, limit preview line concatenations
In CSV import preview, if a line starts with a quote character and
none of the remembered last field separators used occur in data in
conjunction with a closing quote, then reading data tried to
concatenate line by line to form a data field to be presented in
the preview, worst case the entire file..
For the preview, detect one possible not yet selected separator if
used with a quoted field (similar to commit
c807e7ea7a0725a4d8375eda07d6f70870e0d50a for tdf#56910 space
separator) and limit the number of source lines that are tried to
be concatenated if no separator was encountered after a possibly
closing field quote.
Change-Id: Iefd37a8301161e72cb607cea88d4faadad47b4ae
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/120690
Reviewed-by: Eike Rathke <erack@redhat.com>
Tested-by: Jenkins
(cherry picked from commit ff62e0165a0add7c7e3cb606df5b24b20c822d8a)
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/120717
Reviewed-by: Caolán McNamara <caolanm@redhat.com>
diff --git a/sc/source/ui/dbgui/scuiasciiopt.cxx b/sc/source/ui/dbgui/scuiasciiopt.cxx
index b664034..a95d8bd 100644
--- a/sc/source/ui/dbgui/scuiasciiopt.cxx
+++ b/sc/source/ui/dbgui/scuiasciiopt.cxx
@@ -44,6 +44,14 @@
//! TODO make dynamic
const SCSIZE ASCIIDLG_MAXROWS = MAXROWCOUNT;
// Maximum number of source lines to concatenate while generating the preview
// for one logical line. This may result in a wrong preview if the actual
// number of embedded line feeds is greater, but a number too high would take
// too much time (loop excessively if unlimited and large data) if none of the
// selected separators are actually used in data but a field at start of line
// is quoted.
constexpr sal_uInt32 kMaxEmbeddedLinefeeds = 500;
using namespace com::sun::star::uno;
namespace {
@@ -293,7 +301,7 @@ ScImportAsciiDlg::ScImportAsciiDlg(weld::Window* pParent, const OUString& aDatNa
, mnRowPosCount(0)
, mcTextSep(ScAsciiOptions::cDefaultTextSep)
, meCall(eCall)
, mbDetectSpaceSep(eCall != SC_TEXTTOCOLUMNS)
, mbDetectSep(eCall != SC_TEXTTOCOLUMNS)
, mxFtCharSet(m_xBuilder->weld_label("textcharset"))
, mxLbCharSet(new SvxTextEncodingBox(m_xBuilder->weld_combo_box("charset")))
, mxFtCustomLang(m_xBuilder->weld_label("textlanguage"))
@@ -579,7 +587,7 @@ bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString &rText, sal_Unicode& r
break;
}
rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators,
mcTextSep, rcDetectSep);
mcTextSep, rcDetectSep, kMaxEmbeddedLinefeeds);
mnStreamPos = mpDatStream->Tell();
mpRowPosArray[++mnRowPosCount] = mnStreamPos;
} while (nLine >= mnRowPosCount && mpDatStream->good());
@@ -594,7 +602,7 @@ bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString &rText, sal_Unicode& r
else
{
Seek( mpRowPosArray[nLine]);
rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators, mcTextSep, rcDetectSep);
rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators, mcTextSep, rcDetectSep, kMaxEmbeddedLinefeeds);
mnStreamPos = mpDatStream->Tell();
}
@@ -793,7 +801,11 @@ IMPL_LINK_NOARG(ScImportAsciiDlg, UpdateTextHdl, ScCsvTableBox&, void)
// when the dialog wasn't already presented to the user.
// As a side effect this has the benefit that the check is only done on the
// first set of visible lines.
sal_Unicode cDetectSep = (mbDetectSpaceSep && !mxRbFixed->get_active() && !mxCkbSpace->get_active() ? 0 : 0xffff);
sal_Unicode cDetectSep = ((mbDetectSep && !mxRbFixed->get_active()
&& (!mxCkbTab->get_active() || !mxCkbSemicolon->get_active()
|| !mxCkbComma->get_active() || !mxCkbSpace->get_active())) ? 0 : 0xffff);
if (cDetectSep == 0xffff)
mbDetectSep = false;
sal_Int32 nBaseLine = mxTableBox->GetGrid().GetFirstVisLine();
sal_Int32 nRead = mxTableBox->GetGrid().GetVisLineCount();
@@ -813,16 +825,22 @@ IMPL_LINK_NOARG(ScImportAsciiDlg, UpdateTextHdl, ScCsvTableBox&, void)
for (; i < CSV_PREVIEW_LINES; i++)
maPreviewLine[i].clear();
if (mbDetectSpaceSep)
if (mbDetectSep)
{
mbDetectSpaceSep = false;
if (cDetectSep == ' ')
mbDetectSep = false;
if (cDetectSep)
{
// Expect space to be appended by now so all subsequent
// Expect separator to be appended by now so all subsequent
// GetLine()/ReadCsvLine() actually used it.
assert(maFieldSeparators.endsWith(" "));
// Preselect Space in UI.
mxCkbSpace->set_active(true);
assert(maFieldSeparators.endsWith(OUStringChar(cDetectSep)));
// Preselect separator in UI.
switch (cDetectSep)
{
case '\t': mxCkbTab->set_active(true); break;
case ';': mxCkbSemicolon->set_active(true); break;
case ',': mxCkbComma->set_active(true); break;
case ' ': mxCkbSpace->set_active(true); break;
}
}
}
diff --git a/sc/source/ui/docshell/impex.cxx b/sc/source/ui/docshell/impex.cxx
index 6297fba..f271a70 100644
--- a/sc/source/ui/docshell/impex.cxx
+++ b/sc/source/ui/docshell/impex.cxx
@@ -606,17 +606,40 @@ static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* p
// Due to broken CSV generators that don't double embedded quotes check if
// a field separator immediately or with trailing spaces follows the quote,
// only then end the field, or at end of string.
const sal_Unicode cBlank = ' ';
constexpr sal_Unicode cBlank = ' ';
if (p[1] == cBlank && ScGlobal::UnicodeStrChr( pSeps, cBlank))
return FIELDEND_QUOTE;
// Detect a possible blank separator if it's not already in the list (which
// was checked right above for p[1]==cBlank).
if (p[1] == cBlank && !rcDetectSep && p[2] && p[2] != cBlank)
rcDetectSep = cBlank;
const bool bBlankSep = (p[1] == cBlank && !rcDetectSep && p[2] && p[2] != cBlank);
while (p[1] == cBlank)
++p;
if (!p[1] || ScGlobal::UnicodeStrChr( pSeps, p[1]))
return FIELDEND_QUOTE;
// Extended separator detection after a closing quote (with or without
// blanks). Note that nQuotes is incremented *after* the call so is not yet
// even here, and that with separator detection we reach here only if
// lcl_isEscapedOrFieldEndQuote() did not already detect FIRST_QUOTE or
// SECOND_QUOTE for an escaped embedded quote, thus nQuotes does not have
// to be checked.
if (!rcDetectSep)
{
constexpr sal_Unicode vSep[] = { ',', '\t', ';' };
for (const sal_Unicode c : vSep)
{
if (p[1] == c)
{
rcDetectSep = c;
return FIELDEND_QUOTE;
}
}
}
// Blank separator is least significant, after others.
if (bBlankSep)
{
rcDetectSep = cBlank;
return FIELDEND_QUOTE;
}
return DONTKNOW_QUOTE;
}
@@ -644,7 +667,7 @@ static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* p
static QuoteType lcl_isEscapedOrFieldEndQuote( sal_Int32 nQuotes, const sal_Unicode* p,
const sal_Unicode* pSeps, sal_Unicode cStr, sal_Unicode& rcDetectSep )
{
if ((nQuotes % 2) == 0)
if ((nQuotes & 1) == 0)
{
if (p[-1] == cStr)
return SECOND_QUOTE;
@@ -2480,7 +2503,7 @@ ScImportStringStream::ScImportStringStream( const OUString& rStr )
}
OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep )
OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep, sal_uInt32 nMaxSourceLines )
{
enum RetryState
{
@@ -2505,6 +2528,8 @@ Label_RetryWithNewSep:
if (bEmbeddedLineBreak)
{
sal_uInt32 nLine = 0;
const sal_Unicode* pSeps = rFieldSeparators.getStr();
QuoteType eQuoteState = FIELDEND_QUOTE;
@@ -2543,10 +2568,11 @@ Label_RetryWithNewSep:
{
eQuoteState = lcl_isEscapedOrFieldEndQuote( nQuotes, p, pSeps, cFieldQuote, rcDetectSep);
if (eRetryState == RetryState::ALLOW && rcDetectSep == ' ')
if (eRetryState == RetryState::ALLOW && rcDetectSep)
{
eRetryState = RetryState::RETRY;
rFieldSeparators += " ";
rFieldSeparators += OUStringChar(rcDetectSep);
pSeps = rFieldSeparators.getStr();
goto Label_RetryWithNewSep;
}
@@ -2592,10 +2618,14 @@ Label_RetryWithNewSep:
++p;
}
if (nQuotes % 2 == 0)
if ((nQuotes & 1) == 0)
// We still have a (theoretical?) problem here if due to
// nArbitraryLineLengthLimit we split a string right between a
// doubled quote pair.
// nArbitraryLineLengthLimit (or nMaxSourceLines below) we
// split a string right between a doubled quote pair.
break;
else if (++nLine >= nMaxSourceLines && nMaxSourceLines > 0)
// Unconditionally increment nLine even if nMaxSourceLines==0
// so it can be observed in debugger.
break;
else
{
diff --git a/sc/source/ui/inc/impex.hxx b/sc/source/ui/inc/impex.hxx
index b334755..031a0ca 100644
--- a/sc/source/ui/inc/impex.hxx
+++ b/sc/source/ui/inc/impex.hxx
@@ -194,14 +194,25 @@ public:
The quote character used.
@param rcDetectSep
If 0 then attempt to detect a possible space (blank) separator if
If 0 then attempt to detect a possible separator if
rFieldSeparators doesn't include it already. This can be necessary because
of the "accept broken misquoted CSV fields" feature that tries to ignore
trailing blanks after a quoted field and if no separator follows continues
to add content to the field assuming the single double quote was in error.
If this blank separator is detected it is added to rFieldSeparators and the
It is also necessary if the only possible separator was not selected and
not included in rFieldSeparators and a line starts with a quoted field, in
which case appending lines is tried until end of file.
If a separator is detected it is added to rFieldSeparators and the
line is reread with the new separators
@param nMaxSourceLines
Maximum source lines to read and combine into one logical line for embedded
new line purpose. Should be limited for the preview dialog because only
non-matching separators selected otherwise would lead to trying to
concatenate lines until file end.
If 0 no limit other than the internal arbitrary resulting line length
limit.
check Stream::good() to detect IO problems during read
@ATTENTION
@@ -222,6 +233,7 @@ public:
*/
SC_DLLPUBLIC OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep );
OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep,
sal_uInt32 nMaxSourceLines = 0 );
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/source/ui/inc/scuiasciiopt.hxx b/sc/source/ui/inc/scuiasciiopt.hxx
index 140ca09..eae2f2f 100644
--- a/sc/source/ui/inc/scuiasciiopt.hxx
+++ b/sc/source/ui/inc/scuiasciiopt.hxx
@@ -44,7 +44,7 @@ class ScImportAsciiDlg : public weld::GenericDialogController
rtl_TextEncoding meCharSet; /// Selected char set.
bool mbCharSetSystem; /// Is System char set selected?
ScImportAsciiCall meCall; /// How the dialog is called (see asciiopt.hxx)
bool mbDetectSpaceSep; /// Whether to detect a possible space separator.
bool mbDetectSep; /// Whether to detect a possible separator.
std::unique_ptr<weld::Label> mxFtCharSet;
std::unique_ptr<SvxTextEncodingBox> mxLbCharSet;