tdf#140606 make PDF parsing more lenient and prevent a crash

If the external document can't be opened, it tried to continue
with the export anyway, which eventually lead to a crash. This
is fixed by handling this situation and prevent a crash, however
the part of the document in this case isn't exported.

The document couldn't be opened because of a parsing error - there
was a unexpected null character instead of a whitespace, which
made the parser panic. Fix this by making the parser more lenient
in such a situation when there is an unexpected null and try to
continue parsing.

Bug document seems to be created with a buggy PDF writer, but other
PDF readers don't complain when parsing the document so it looks to
be a valid. qpdf --check doesn't complain either.

Added a test that checks a document with a null parses.

Change-Id: I61eb281e821ccd195ef006d778556e25d1c7f5e3
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/111820
Tested-by: Jenkins
Reviewed-by: Tomaž Vajngerl <quikee@gmail.com>
diff --git a/vcl/inc/pdf/ExternalPDFStreams.hxx b/vcl/inc/pdf/ExternalPDFStreams.hxx
index dc484e1..7840217 100644
--- a/vcl/inc/pdf/ExternalPDFStreams.hxx
+++ b/vcl/inc/pdf/ExternalPDFStreams.hxx
@@ -34,21 +34,25 @@

    std::map<sal_Int32, sal_Int32>& getCopiedResources() { return maCopiedResources; }

    filter::PDFDocument& getPDFDocument()
    std::shared_ptr<filter::PDFDocument>& getPDFDocument()
    {
        if (!mpPDFDocument)
        {
            SvMemoryStream aPDFStream;
            aPDFStream.WriteBytes(maDataContainer.getData(), maDataContainer.getSize());
            aPDFStream.Seek(0);
            mpPDFDocument = std::make_shared<filter::PDFDocument>();
            if (!mpPDFDocument->Read(aPDFStream))
            auto pPDFDocument = std::make_shared<filter::PDFDocument>();
            if (!pPDFDocument->Read(aPDFStream))
            {
                SAL_WARN("vcl.pdfwriter",
                         "PDFWriterImpl::writeReferenceXObject: reading the PDF document failed");
            }
            else
            {
                mpPDFDocument = pPDFDocument;
            }
        }
        return *mpPDFDocument;
        return mpPDFDocument;
    }
};

diff --git a/vcl/qa/cppunit/PDFDocumentTest.cxx b/vcl/qa/cppunit/PDFDocumentTest.cxx
index 2246f8e..35af8e6 100644
--- a/vcl/qa/cppunit/PDFDocumentTest.cxx
+++ b/vcl/qa/cppunit/PDFDocumentTest.cxx
@@ -156,6 +156,18 @@
    }
}

CPPUNIT_TEST_FIXTURE(PDFDocumentTest, testParseDocumentWithNullAsWhitespace)
{
    // tdf#140606
    // Bug document contained a null, which cause the parser to panic,
    // but other PDF readers can handle the file well.

    OUString aURL = m_directories.getURLFromSrc(DATA_DIRECTORY) + "DocumentWithNull.pdf";
    vcl::filter::PDFDocument aDocument;
    SvFileStream aStream(aURL, StreamMode::READ);
    CPPUNIT_ASSERT(aDocument.Read(aStream));
}

namespace
{
vcl::filter::PDFObjectElement*
diff --git a/vcl/qa/cppunit/data/DocumentWithNull.pdf b/vcl/qa/cppunit/data/DocumentWithNull.pdf
new file mode 100644
index 0000000..f6d9269
--- /dev/null
+++ b/vcl/qa/cppunit/data/DocumentWithNull.pdf
Binary files differ
diff --git a/vcl/source/filter/ipdf/pdfdocument.cxx b/vcl/source/filter/ipdf/pdfdocument.cxx
index acf5849..5ebe05e 100644
--- a/vcl/source/filter/ipdf/pdfdocument.cxx
+++ b/vcl/source/filter/ipdf/pdfdocument.cxx
@@ -1324,12 +1324,18 @@
                }
                else
                {
                    if (!rtl::isAsciiWhiteSpace(static_cast<unsigned char>(ch)))
                    auto uChar = static_cast<unsigned char>(ch);
                    // Be more lenient and allow unexpected null char
                    if (!rtl::isAsciiWhiteSpace(uChar) && uChar != 0)
                    {
                        SAL_WARN("vcl.filter", "PDFDocument::Tokenize: unexpected character: "
                                                   << ch << " at byte position " << rStream.Tell());
                        SAL_WARN("vcl.filter",
                                 "PDFDocument::Tokenize: unexpected character with code "
                                     << sal_Int32(ch) << " at byte position " << rStream.Tell());
                        return false;
                    }
                    SAL_WARN_IF(uChar == 0, "vcl.filter",
                                "PDFDocument::Tokenize: unexpected null character at "
                                    << rStream.Tell() << " - ignoring");
                }
                break;
            }
diff --git a/vcl/source/gdi/pdfwriter_impl.cxx b/vcl/source/gdi/pdfwriter_impl.cxx
index 967dcf6..f7c6f82 100644
--- a/vcl/source/gdi/pdfwriter_impl.cxx
+++ b/vcl/source/gdi/pdfwriter_impl.cxx
@@ -8458,10 +8458,16 @@
        // object.
        if (rEmit.m_nExternalPDFDataIndex < 0)
            return;
        auto & rExternalPDFStream = m_aExternalPDFStreams.get(rEmit.m_nExternalPDFDataIndex);
        auto & rPDFDocument = rExternalPDFStream.getPDFDocument();
        auto& rExternalPDFStream = m_aExternalPDFStreams.get(rEmit.m_nExternalPDFDataIndex);
        auto& pPDFDocument = rExternalPDFStream.getPDFDocument();
        if (!pPDFDocument)
        {
            // Couldn't parse the document and can't continue
            SAL_WARN("vcl.pdfwriter", "PDFWriterImpl::writeReferenceXObject: failed to parse the document");
            return;
        }

        std::vector<filter::PDFObjectElement*> aPages = rPDFDocument.GetPages();
        std::vector<filter::PDFObjectElement*> aPages = pPDFDocument->GetPages();
        if (aPages.empty())
        {
            SAL_WARN("vcl.pdfwriter", "PDFWriterImpl::writeReferenceXObject: no pages");