tdf#146048: detect UTF-16 without BOM

Change-Id: I3c1742cdf88dfa08cf35d9f95875d4d3d6af09db
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/126596
Tested-by: Jenkins
Reviewed-by: Mike Kaganski <mike.kaganski@collabora.com>
diff --git a/include/svtools/svparser.hxx b/include/svtools/svparser.hxx
index b0ba450..d54b4a9 100644
--- a/include/svtools/svparser.hxx
+++ b/include/svtools/svparser.hxx
@@ -62,7 +62,6 @@ protected:
    sal_uInt64          nNextChPos;
    sal_uInt32          nNextCh;            // current character codepoint in UTF32 for the "lex"

    bool                bUCS2BSrcEnc : 1;   // or as big-endian UCS2
    bool                bSwitchToUCS2 : 1;  // switching is allowed
    bool                bRTF_InTextRead : 1;  // only for RTF-Parser!!!

diff --git a/svtools/Library_svt.mk b/svtools/Library_svt.mk
index 59bd29ac..31685bb 100644
--- a/svtools/Library_svt.mk
+++ b/svtools/Library_svt.mk
@@ -61,6 +61,7 @@ $(eval $(call gb_Library_use_libraries,svt,\

$(eval $(call gb_Library_use_externals,svt,\
	boost_headers \
    icui18n \
    icuuc \
    icu_headers \
))
diff --git a/svtools/source/svrtf/svparser.cxx b/svtools/source/svrtf/svparser.cxx
index 3da3440..0fec7a97 100644
--- a/svtools/source/svrtf/svparser.cxx
+++ b/svtools/source/svrtf/svparser.cxx
@@ -25,6 +25,7 @@
#include <rtl/tencinfo.h>
#include <rtl/character.hxx>
#include <sal/log.hxx>
#include <unicode/ucsdet.h>

#include <vector>
#include <climits>
@@ -85,7 +86,6 @@ SvParser<T>::SvParser( SvStream& rIn, sal_uInt8 nStackSize )
    , eSrcEnc( RTL_TEXTENCODING_DONTKNOW )
    , nNextChPos(0)
    , nNextCh(0)
    , bUCS2BSrcEnc(false)
    , bSwitchToUCS2(false)
    , bRTF_InTextRead(false)
    , nTokenStackSize( nStackSize )
@@ -188,87 +188,66 @@ sal_uInt32 SvParser<T>::GetNextChar()
    // When reading multiple bytes, we don't have to care about the file
    // position when we run into the pending state. The file position is
    // maintained by SaveState/RestoreState.
    bool bErr;
    if( bSwitchToUCS2 && 0 == rInput.Tell() )
    {
        unsigned char c1;
        bool bSeekBack = true;

        rInput.ReadUChar( c1 );
        bErr = !rInput.good();
        if( !bErr )
        rInput.StartReadingUnicodeText(RTL_TEXTENCODING_DONTKNOW);
        if (rInput.good())
        {
            if( 0xff == c1 || 0xfe == c1 )
            sal_uInt64 nPos = rInput.Tell();
            if (nPos == 2)
                eSrcEnc = RTL_TEXTENCODING_UCS2;
            else if (nPos == 3)
                SetSrcEncoding(RTL_TEXTENCODING_UTF8);
            else // Try to detect encoding without BOM
            {
                unsigned char c2;
                rInput.ReadUChar( c2 );
                bErr = !rInput.good();
                if( !bErr )
                std::vector<char> buf(65535); // Arbitrarily chosen 64KiB buffer
                const size_t nSize = rInput.ReadBytes(buf.data(), buf.size());
                rInput.Seek(0);
                if (nSize > 0)
                {
                    if( 0xfe == c1 && 0xff == c2 )
                    UErrorCode uerr = U_ZERO_ERROR;
                    UCharsetDetector* ucd = ucsdet_open(&uerr);
                    ucsdet_setText(ucd, buf.data(), nSize, &uerr);
                    if (const UCharsetMatch* match = ucsdet_detect(ucd, &uerr))
                    {
                        eSrcEnc = RTL_TEXTENCODING_UCS2;
                        bUCS2BSrcEnc = true;
                        bSeekBack = false;
                    }
                    else if( 0xff == c1 && 0xfe == c2 )
                    {
                        eSrcEnc = RTL_TEXTENCODING_UCS2;
                        bUCS2BSrcEnc = false;
                        bSeekBack = false;
                    }
                }
            }
            else if( 0xef == c1 || 0xbb == c1 ) // check for UTF-8 BOM
            {
                unsigned char c2;
                rInput.ReadUChar( c2 );
                bErr = !rInput.good();
                if( !bErr )
                {
                    if( ( 0xef == c1 && 0xbb == c2 ) || ( 0xbb == c1 && 0xef == c2 ) )
                    {
                        unsigned char c3(0);
                        rInput.ReadUChar( c3 );
                        bErr = !rInput.good();
                        if( !bErr && ( 0xbf == c3 ) )
                        const char* pEncodingName = ucsdet_getName(match, &uerr);

                        if (U_SUCCESS(uerr))
                        {
                            SetSrcEncoding(RTL_TEXTENCODING_UTF8);
                            bSeekBack = false;
                            if (strcmp("UTF-8", pEncodingName) == 0)
                            {
                                SetSrcEncoding(RTL_TEXTENCODING_UTF8);
                            }
                            else if (strcmp("UTF-16LE", pEncodingName) == 0)
                            {
                                eSrcEnc = RTL_TEXTENCODING_UCS2;
                                rInput.SetEndian(SvStreamEndian::LITTLE);
                            }
                            else if (strcmp("UTF-16BE", pEncodingName) == 0)
                            {
                                eSrcEnc = RTL_TEXTENCODING_UCS2;
                                rInput.SetEndian(SvStreamEndian::BIG);
                            }
                        }
                    }

                    ucsdet_close(ucd);
                }
            }
        }
        if( bSeekBack )
            rInput.Seek( 0 );

        bSwitchToUCS2 = false;
    }

    bool bErr;
    nNextChPos = rInput.Tell();

    if( RTL_TEXTENCODING_UCS2 == eSrcEnc )
    {
        unsigned char c1, c2;

        rInput.ReadUChar( c1 ).ReadUChar( c2 );
        if( 2 == rInput.Tell() && rInput.good() &&
            ( (bUCS2BSrcEnc && 0xfe == c1 && 0xff == c2) ||
              (!bUCS2BSrcEnc && 0xff == c1 && 0xfe == c2) ) )
            rInput.ReadUChar( c1 ).ReadUChar( c2 );

        sal_Unicode cUC;
        rInput.ReadUtf16(cUC);
        bErr = !rInput.good();
        if( !bErr )
        {
            sal_Unicode cUC = USHRT_MAX;
            if( bUCS2BSrcEnc )
                cUC = (sal_Unicode(c1) << 8) | c2;
            else
                cUC = (sal_Unicode(c2) << 8) | c1;

            c = cUC;
        }
    }
    else
    {