tdf#107018 PDF export of PDF images: handle references in nested dictionaries

Also get rid of the GetKeyOffset() and GetKeyValueLength() calls when
copying dictionaries: the reference already knows its offset and length,
so no need to call them. This makes the dictionary and the array
handling more similar.

Change-Id: I65936acfaf857636a8d83da3a4cec69289eb89d8
Reviewed-on: https://gerrit.libreoffice.org/36282
Reviewed-by: Miklos Vajna <vmiklos@collabora.co.uk>
Tested-by: Jenkins <ci@libreoffice.org>
diff --git a/include/vcl/filter/pdfdocument.hxx b/include/vcl/filter/pdfdocument.hxx
index 595b4f0..d83cb83 100644
--- a/include/vcl/filter/pdfdocument.hxx
+++ b/include/vcl/filter/pdfdocument.hxx
@@ -71,6 +71,9 @@ class VCL_DLLPUBLIC PDFObjectElement : public PDFElement
    std::vector< std::unique_ptr<PDFElement> > m_aElements;
    /// Uncompressed buffer of an object in an object stream.
    std::unique_ptr<SvMemoryStream> m_pStreamBuffer;
    /// List of all reference elements inside this object's dictionary and
    /// nested dictionaries.
    std::vector<PDFReferenceElement*> m_aDictionaryReferences;

public:
    PDFObjectElement(PDFDocument& rDoc, double fObjectValue, double fGenerationValue);
@@ -88,8 +91,8 @@ public:
    PDFNumberElement* GetNumberElement() const;
    /// Get access to the parsed key-value items from the object dictionary.
    const std::map<OString, PDFElement*>& GetDictionaryItems();
    /// Same as GetDictionaryItems(), but entries are sorted by file offset.
    std::vector< std::pair<OString, PDFElement*> > GetDictionaryItemsByOffset();
    /// Get the reference elements collected from this object's dictionary and
    /// its nested dictionaries (see AddDictionaryReference()).
    const std::vector<PDFReferenceElement*>& GetDictionaryReferences() const;
    /// Record a reference element that was found inside this object's
    /// dictionary or one of its nested dictionaries.
    void AddDictionaryReference(PDFReferenceElement* pReference);
    void SetArray(PDFArrayElement* pArrayElement);
    void SetStream(PDFStreamElement* pStreamElement);
    /// Access to the stream of the object, if it has any.
diff --git a/vcl/qa/cppunit/pdfexport/data/tdf107018.odt b/vcl/qa/cppunit/pdfexport/data/tdf107018.odt
new file mode 100644
index 0000000..3bfc7b2
--- /dev/null
+++ b/vcl/qa/cppunit/pdfexport/data/tdf107018.odt
Binary files differ
diff --git a/vcl/qa/cppunit/pdfexport/pdfexport.cxx b/vcl/qa/cppunit/pdfexport/pdfexport.cxx
index 31d0dfb..aacf36b 100644
--- a/vcl/qa/cppunit/pdfexport/pdfexport.cxx
+++ b/vcl/qa/cppunit/pdfexport/pdfexport.cxx
@@ -53,6 +53,7 @@ public:
    void testTdf106972();
    void testTdf106972Pdf17();
    void testTdf107013();
    void testTdf107018();
#endif

    CPPUNIT_TEST_SUITE(PdfExportTest);
@@ -65,6 +66,7 @@ public:
    CPPUNIT_TEST(testTdf106972);
    CPPUNIT_TEST(testTdf106972Pdf17);
    CPPUNIT_TEST(testTdf107013);
    CPPUNIT_TEST(testTdf107018);
#endif
    CPPUNIT_TEST_SUITE_END();
};
@@ -402,6 +404,54 @@ void PdfExportTest::testTdf107013()
    // This failed, the reference to the image was created, but not the image.
    CPPUNIT_ASSERT(pXObject);
}

/// Regression test for tdf#107018: a reference that lives in a nested
/// dictionary (font -> Foo -> Bar) must still resolve to the right object
/// after the page stream of a PDF image has been copied.
void PdfExportTest::testTdf107018()
{
    vcl::filter::PDFDocument aDocument;
    // NOTE(review): load() is defined elsewhere in this file; presumably it
    // exports the document and parses the result into aDocument -- confirm.
    load("tdf107018.odt", aDocument);

    // Get access to the only image on the only page.
    std::vector<vcl::filter::PDFObjectElement*> aPages = aDocument.GetPages();
    CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), aPages.size());
    vcl::filter::PDFObjectElement* pResources = aPages[0]->LookupObject("Resources");
    CPPUNIT_ASSERT(pResources);
    auto pXObjects = dynamic_cast<vcl::filter::PDFDictionaryElement*>(pResources->Lookup("XObject"));
    CPPUNIT_ASSERT(pXObjects);
    CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), pXObjects->GetItems().size());
    vcl::filter::PDFObjectElement* pXObject = pXObjects->LookupObject(pXObjects->GetItems().begin()->first);
    CPPUNIT_ASSERT(pXObject);

    // Get access to the form object inside the image.
    auto pXObjectResources = dynamic_cast<vcl::filter::PDFDictionaryElement*>(pXObject->Lookup("Resources"));
    CPPUNIT_ASSERT(pXObjectResources);
    auto pXObjectForms = dynamic_cast<vcl::filter::PDFDictionaryElement*>(pXObjectResources->LookupElement("XObject"));
    CPPUNIT_ASSERT(pXObjectForms);
    // Guard the begin() dereference below: on an empty dictionary it would be
    // undefined behavior instead of a clean test failure.
    CPPUNIT_ASSERT(!pXObjectForms->GetItems().empty());
    vcl::filter::PDFObjectElement* pForm = pXObjectForms->LookupObject(pXObjectForms->GetItems().begin()->first);
    CPPUNIT_ASSERT(pForm);

    // Get access to Resources -> Font -> F1 of the form.
    auto pFormResources = dynamic_cast<vcl::filter::PDFDictionaryElement*>(pForm->Lookup("Resources"));
    CPPUNIT_ASSERT(pFormResources);
    auto pFonts = dynamic_cast<vcl::filter::PDFDictionaryElement*>(pFormResources->LookupElement("Font"));
    CPPUNIT_ASSERT(pFonts);
    auto pF1Ref = dynamic_cast<vcl::filter::PDFReferenceElement*>(pFonts->LookupElement("F1"));
    CPPUNIT_ASSERT(pF1Ref);
    vcl::filter::PDFObjectElement* pF1 = pF1Ref->LookupObject();
    CPPUNIT_ASSERT(pF1);

    // Check that Foo -> Bar of the font is of type Pages.
    auto pFontFoo = dynamic_cast<vcl::filter::PDFDictionaryElement*>(pF1->Lookup("Foo"));
    CPPUNIT_ASSERT(pFontFoo);
    auto pBar = dynamic_cast<vcl::filter::PDFReferenceElement*>(pFontFoo->LookupElement("Bar"));
    CPPUNIT_ASSERT(pBar);
    vcl::filter::PDFObjectElement* pObject = pBar->LookupObject();
    CPPUNIT_ASSERT(pObject);
    auto pName = dynamic_cast<vcl::filter::PDFNameElement*>(pObject->Lookup("Type"));
    CPPUNIT_ASSERT(pName);
    // This was "XObject": the reference in the nested dictionary wasn't
    // updated when copying the page stream of a PDF image.
    CPPUNIT_ASSERT_EQUAL(OString("Pages"), pName->GetValue());
}
#endif

CPPUNIT_TEST_SUITE_REGISTRATION(PdfExportTest);
diff --git a/vcl/source/filter/ipdf/pdfdocument.cxx b/vcl/source/filter/ipdf/pdfdocument.cxx
index 43d4248..b0bb8be 100644
--- a/vcl/source/filter/ipdf/pdfdocument.cxx
+++ b/vcl/source/filter/ipdf/pdfdocument.cxx
@@ -1071,10 +1071,14 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode, std::vector< s
                    }
                    else
                    {
                        rElements.push_back(std::unique_ptr<PDFElement>(new PDFReferenceElement(*this, *pObjectNumber, *pGenerationNumber)));
                        auto pReference = new PDFReferenceElement(*this, *pObjectNumber, *pGenerationNumber);
                        rElements.push_back(std::unique_ptr<PDFElement>(pReference));
                        if (pArray)
                            // Reference is part of a direct (non-dictionary) array, inform the array.
                            pArray->PushBack(rElements.back().get());
                        if (bInObject && nDictionaryDepth > 0 && pObject)
                            // Inform the object about a new in-dictionary reference.
                            pObject->AddDictionaryReference(pReference);
                    }
                    if (!rElements.back()->Read(rStream))
                    {
@@ -2512,23 +2516,14 @@ PDFNumberElement* PDFObjectElement::GetNumberElement() const
    return m_pNumberElement;
}

std::vector< std::pair<OString, PDFElement*> > PDFObjectElement::GetDictionaryItemsByOffset()
const std::vector<PDFReferenceElement*>& PDFObjectElement::GetDictionaryReferences() const
{
    std::vector< std::pair<OString, PDFElement*> > aRet;
    return m_aDictionaryReferences;
}

    for (const auto& rItem : m_aDictionary)
        aRet.push_back(rItem);

    PDFDictionaryElement* pDictionary = GetDictionary();
    if (!pDictionary)
        return aRet;

    std::sort(aRet.begin(), aRet.end(), [pDictionary](const std::pair<OString, PDFElement*>& a, const std::pair<OString, PDFElement*>& b) -> bool
    {
        return pDictionary->GetKeyOffset(a.first) < pDictionary->GetKeyOffset(b.first);
    });

    return aRet;
/// Appends pReference to the list returned by GetDictionaryReferences().
/// The pointer is stored as-is: no null check or de-duplication happens here.
void PDFObjectElement::AddDictionaryReference(PDFReferenceElement* pReference)
{
    m_aDictionaryReferences.push_back(pReference);
}

const std::map<OString, PDFElement*>& PDFObjectElement::GetDictionaryItems()
diff --git a/vcl/source/gdi/pdfwriter_impl.cxx b/vcl/source/gdi/pdfwriter_impl.cxx
index 8445377..d5c1f6e 100644
--- a/vcl/source/gdi/pdfwriter_impl.cxx
+++ b/vcl/source/gdi/pdfwriter_impl.cxx
@@ -10899,17 +10899,15 @@ sal_Int32 PDFWriterImpl::copyExternalResource(SvMemoryStream& rDocBuffer, filter
    OStringBuffer aLine;
    aLine.append(nObject);
    aLine.append(" 0 obj\n");
    if (filter::PDFDictionaryElement* pDictionary = rObject.GetDictionary())
    if (rObject.GetDictionary())
    {
        aLine.append("<<");

        // Complex case: can't copy the dictionary byte array as is, as it may contain references.
        bool bDone = false;
        std::vector< std::pair<OString, filter::PDFElement*> > aItems = rObject.GetDictionaryItemsByOffset();
        sal_uInt64 nCopyStart = 0;
        for (const auto& rItem : aItems)
        for (auto pReference : rObject.GetDictionaryReferences())
        {
            auto pReference = dynamic_cast<filter::PDFReferenceElement*>(rItem.second);
            if (pReference)
            {
                filter::PDFObjectElement* pReferenced = pReference->LookupObject();
@@ -10918,8 +10916,8 @@ sal_Int32 PDFWriterImpl::copyExternalResource(SvMemoryStream& rDocBuffer, filter
                    // Copy the referenced object.
                    sal_Int32 nRef = copyExternalResource(rDocBuffer, *pReferenced, rCopiedResources);

                    sal_uInt64 nReferenceStart = pDictionary->GetKeyOffset(rItem.first) + rItem.first.getLength();
                    sal_uInt64 nReferenceEnd = pDictionary->GetKeyOffset(rItem.first) + pDictionary->GetKeyValueLength(rItem.first);
                    sal_uInt64 nReferenceStart = pReference->GetObjectElement().GetLocation();
                    sal_uInt64 nReferenceEnd = pReference->GetOffset();
                    sal_uInt64 nOffset = 0;
                    if (nCopyStart == 0)
                        // Dict start -> reference start.