1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
11 #ifndef INCLUDED_VCL_FILTER_PDFDOCUMENT_HXX
12 #define INCLUDED_VCL_FILTER_PDFDOCUMENT_HXX
18 #include <tools/stream.hxx>
19 #include <vcl/dllapi.h>
20 #include <rtl/strbuf.hxx>
22 #include <vcl/filter/pdfobjectcontainer.hxx>
24 namespace com::sun::star::security
29 namespace com::sun::star::uno
31 template <class interface_type
> class Reference
;
41 class PDFTrailerElement
;
42 class PDFReferenceElement
;
44 class PDFDictionaryElement
;
45 class PDFArrayElement
;
46 class PDFStreamElement
;
47 class PDFNumberElement
;
49 /// A byte range in a PDF file.
50 class VCL_DLLPUBLIC PDFElement
52 bool m_bVisiting
= false;
53 bool m_bParsing
= false;
56 PDFElement() = default;
57 virtual bool Read(SvStream
& rStream
) = 0;
58 virtual ~PDFElement() = default;
59 void setVisiting(bool bVisiting
) { m_bVisiting
= bVisiting
; }
60 bool alreadyVisiting() const { return m_bVisiting
; }
61 void setParsing(bool bParsing
) { m_bParsing
= bParsing
; }
62 bool alreadyParsing() const { return m_bParsing
; }
64 virtual void writeString(OStringBuffer
& rBuffer
) = 0;
67 /// Indirect object: something with a unique ID.
// NOTE(review): this listing carries stray line-number prefixes and has lost
// lines of the class (braces, access specifiers, some members); restore them
// from the original header before compiling.
68 class VCL_DLLPUBLIC PDFObjectElement final
: public PDFElement
70 /// The document owning this element.
// NOTE(review): the member the comment above describes is absent from this
// listing; GetDocument() below returns a PDFDocument&.
/// Presumably the object number handed to the constructor as fObjectValue (TODO confirm).
72 double m_fObjectValue
;
/// Presumably the generation number handed to the constructor as fGenerationValue (TODO confirm).
73 double m_fGenerationValue
;
74 /// If set, the object contains this number element (outside any dictionary/array).
75 PDFNumberElement
* m_pNumberElement
;
76 /// Position after the '<<' token.
77 sal_uInt64 m_nDictionaryOffset
;
78 /// Length of the dictionary buffer till (before) the '>>' token.
79 sal_uInt64 m_nDictionaryLength
;
/// The dictionary element of this object; see GetDictionary() / SetDictionary().
80 PDFDictionaryElement
* m_pDictionaryElement
;
81 /// Position after the '[' token, if m_pArrayElement is set.
82 sal_uInt64 m_nArrayOffset
;
83 /// Length of the array buffer till (before) the ']' token.
84 sal_uInt64 m_nArrayLength
;
85 /// The contained direct array, if any.
86 PDFArrayElement
* m_pArrayElement
;
87 /// The stream of this object, used when this is an object stream.
88 PDFStreamElement
* m_pStreamElement
;
89 /// Objects of an object stream.
90 std::vector
<std::unique_ptr
<PDFObjectElement
>> m_aStoredElements
;
91 /// Elements of an object in an object stream.
92 std::vector
<std::unique_ptr
<PDFElement
>> m_aElements
;
93 /// Uncompressed buffer of an object in an object stream.
94 std::unique_ptr
<SvMemoryStream
> m_pStreamBuffer
;
95 /// List of all reference elements inside this object's dictionary and
96 /// nested dictionaries.
97 std::vector
<PDFReferenceElement
*> m_aDictionaryReferences
;
// NOTE(review): the name suggests parsing on first use; the behaviour is
// defined in the implementation file, not here.
101 void parseIfNecessary();
/// Creates an indirect object for document rDoc from the two given numbers.
104 PDFObjectElement(PDFDocument
& rDoc
, double fObjectValue
, double fGenerationValue
);
105 bool Read(SvStream
& rStream
) override
;
/// Looks up rDictionaryKey; presumably in this object's dictionary (TODO confirm).
106 PDFElement
* Lookup(const OString
& rDictionaryKey
);
/// Like Lookup(), but yields a PDFObjectElement; presumably follows a reference value (TODO confirm).
107 PDFObjectElement
* LookupObject(const OString
& rDictionaryKey
);
108 double GetObjectValue() const;
109 void SetDictionaryOffset(sal_uInt64 nDictionaryOffset
);
110 sal_uInt64
GetDictionaryOffset();
111 void SetDictionaryLength(sal_uInt64 nDictionaryLength
);
112 sal_uInt64
GetDictionaryLength();
113 PDFDictionaryElement
* GetDictionary();
114 void SetDictionary(PDFDictionaryElement
* pDictionaryElement
);
115 void SetNumberElement(PDFNumberElement
* pNumberElement
);
116 PDFNumberElement
* GetNumberElement() const;
117 /// Get access to the parsed key-value items from the object dictionary.
118 const std::map
<OString
, PDFElement
*>& GetDictionaryItems();
119 const std::vector
<PDFReferenceElement
*>& GetDictionaryReferences() const;
120 void AddDictionaryReference(PDFReferenceElement
* pReference
);
121 void SetArray(PDFArrayElement
* pArrayElement
);
122 void SetStream(PDFStreamElement
* pStreamElement
);
123 /// Access to the stream of the object, if it has any.
124 PDFStreamElement
* GetStream() const;
125 void SetArrayOffset(sal_uInt64 nArrayOffset
);
126 sal_uInt64
GetArrayOffset() const;
127 void SetArrayLength(sal_uInt64 nArrayLength
);
128 sal_uInt64
GetArrayLength() const;
129 PDFArrayElement
* GetArray();
130 /// Parse objects stored in this object stream.
131 void ParseStoredObjects();
// NOTE(review): the return type matches m_aElements (unique_ptr<PDFElement>),
// not m_aStoredElements (unique_ptr<PDFObjectElement>); confirm which vector
// the implementation hands out.
132 std::vector
<std::unique_ptr
<PDFElement
>>& GetStoredElements();
133 SvMemoryStream
* GetStreamBuffer() const;
// NOTE(review): the unique_ptr is taken by non-const lvalue reference;
// whether ownership is moved out of pStreamBuffer is decided by the
// implementation, so check before reusing the argument after the call.
134 void SetStreamBuffer(std::unique_ptr
<SvMemoryStream
>& pStreamBuffer
);
135 PDFDocument
& GetDocument();
/// Serialization is not implemented for indirect objects: the body only
/// asserts, so with assertions disabled the call does nothing.
137 void writeString(OStringBuffer
& /*rBuffer*/) override
{ assert(false && "not implemented"); }
140 /// Array object: a list.
// NOTE(review): this listing carries stray line-number prefixes and has lost
// the class braces / access specifiers; restore them from the original header.
141 class VCL_DLLPUBLIC PDFArrayElement final
: public PDFElement
/// The array items, kept as raw pointers (ownership is presumably elsewhere — TODO confirm).
143 std::vector
<PDFElement
*> m_aElements
;
144 /// The object that contains this array.
145 PDFObjectElement
* m_pObject
;
/// Creates an array that is contained in pObject.
148 PDFArrayElement(PDFObjectElement
* pObject
);
149 bool Read(SvStream
& rStream
) override
;
150 void PushBack(PDFElement
* pElement
);
151 const std::vector
<PDFElement
*>& GetElements() const;
/// Returns the item at nIndex using plain operator[]: no bounds check is
/// performed, so nIndex must be smaller than the number of items.
152 PDFElement
* GetElement(size_t nIndex
) const { return m_aElements
[nIndex
]; }
/// Appends "[ " and then the serialization of every item to rBuffer.
// NOTE(review): original lines 160 and 162 are absent from this listing;
// presumably they emit the item separator and the closing ']' (TODO confirm).
154 void writeString(OStringBuffer
& rBuffer
) override
156 rBuffer
.append("[ ");
157 for (auto& rElement
: m_aElements
)
159 rElement
->writeString(rBuffer
);
166 /// Reference object: refers to an indirect object by object and generation number.
// NOTE(review): this listing carries stray line-number prefixes and has lost
// lines of the class (braces, access specifiers, members on original lines
// 169-170); restore them from the original header.
167 class VCL_DLLPUBLIC PDFReferenceElement final
: public PDFElement
/// Generation number; note it is an int despite the 'f' prefix.
171 int m_fGenerationValue
;
172 /// Location after the 'R' token.
173 sal_uInt64 m_nOffset
= 0;
174 /// The element providing the object number.
175 PDFNumberElement
& m_rObject
;
/// Builds a reference from the number elements holding the object number
/// (rObject, kept by reference in m_rObject) and the generation number.
178 PDFReferenceElement(PDFDocument
& rDoc
, PDFNumberElement
& rObject
,
179 PDFNumberElement
const& rGeneration
);
180 bool Read(SvStream
& rStream
) override
;
181 /// Assuming the reference points to a number object, return its value.
182 double LookupNumber(SvStream
& rStream
) const;
183 /// Lookup referenced object, without assuming anything about its contents.
184 PDFObjectElement
* LookupObject();
185 int GetObjectValue() const;
186 int GetGenerationValue() const;
187 sal_uInt64
GetOffset() const;
188 PDFNumberElement
& GetObjectElement() const;
/// Appends the object number, the generation number and " R" to rBuffer.
// NOTE(review): original line 193 is absent from this listing; presumably it
// emits the separator between the two numbers (TODO confirm).
190 void writeString(OStringBuffer
& rBuffer
) override
192 rBuffer
.append(sal_Int32(GetObjectValue()));
194 rBuffer
.append(sal_Int32(GetGenerationValue()));
195 rBuffer
.append(" R");
199 /// Stream object: a byte array with a known length.
// NOTE(review): this listing carries stray line-number prefixes and has lost
// lines of the class (braces, access specifiers, original line 202); restore
// them from the original header.
200 class VCL_DLLPUBLIC PDFStreamElement final
: public PDFElement
/// Offset reported by GetOffset(); presumably where the stream bytes start in the input (TODO confirm).
203 sal_uInt64 m_nOffset
;
204 /// The byte array itself.
205 SvMemoryStream m_aMemory
;
/// Creates a stream element for nLength bytes.
208 explicit PDFStreamElement(size_t nLength
);
209 bool Read(SvStream
& rStream
) override
;
210 sal_uInt64
GetOffset() const;
/// Mutable access to the in-memory copy of the stream bytes.
211 SvMemoryStream
& GetMemory();
/// Appends "stream\n", the bytes of m_aMemory (GetData() / GetSize()) and
/// "\nendstream\n" to rBuffer.
213 void writeString(OStringBuffer
& rBuffer
) override
215 rBuffer
.append("stream\n");
216 rBuffer
.append(static_cast<const char*>(m_aMemory
.GetData()), m_aMemory
.GetSize());
217 rBuffer
.append("\nendstream\n");
221 /// Name object: a key string.
// NOTE(review): this listing carries stray line-number prefixes and has lost
// lines of the class (braces, access specifiers, the declaration of m_aValue);
// restore them from the original header.
222 class VCL_DLLPUBLIC PDFNameElement final
: public PDFElement
225 /// Offset after the '/' token.
226 sal_uInt64 m_nLocation
= 0;
230 bool Read(SvStream
& rStream
) override
;
/// Replaces the stored name with rValue.
231 void SetValue(const OString
& rValue
) { m_aValue
= rValue
; }
232 const OString
& GetValue() const;
233 sal_uInt64
GetLocation() const;
/// Length of the stored name, i.e. m_aValue.getLength().
234 sal_uInt64
GetLength() const { return m_aValue
.getLength(); }
/// Appends the stored name to rBuffer.
// NOTE(review): original line 238 is absent from this listing; presumably it
// emits the leading '/' (TODO confirm).
236 void writeString(OStringBuffer
& rBuffer
) override
239 rBuffer
.append(m_aValue
);
243 /// Dictionary object: a set of key-value pairs.
// NOTE(review): this listing carries stray line-number prefixes and has lost
// the class braces / access specifiers; restore them from the original header.
244 class VCL_DLLPUBLIC PDFDictionaryElement final
: public PDFElement
246 /// Key-value pairs when the dictionary is a nested value.
247 std::map
<OString
, PDFElement
*> m_aItems
;
248 /// Offset after the '<<' token.
249 sal_uInt64 m_nLocation
= 0;
250 /// Position after the '/' token.
251 std::map
<OString
, sal_uInt64
> m_aDictionaryKeyOffset
;
252 /// Length of the dictionary key and value, till (before) the next token.
253 std::map
<OString
, sal_uInt64
> m_aDictionaryKeyValueLength
;
256 PDFDictionaryElement();
257 bool Read(SvStream
& rStream
) override
;
/// Looks up rKey in the given key-value map rDictionary.
259 static PDFElement
* Lookup(const std::map
<OString
, PDFElement
*>& rDictionary
,
260 const OString
& rKey
);
261 void SetKeyOffset(const OString
& rKey
, sal_uInt64 nOffset
);
262 sal_uInt64
GetKeyOffset(const OString
& rKey
) const;
263 void SetKeyValueLength(const OString
& rKey
, sal_uInt64 nLength
);
264 sal_uInt64
GetKeyValueLength(const OString
& rKey
) const;
265 const std::map
<OString
, PDFElement
*>& GetItems() const;
266 /// Looks up an object which is only referenced in this dictionary.
267 PDFObjectElement
* LookupObject(const OString
& rDictionaryKey
);
268 /// Looks up an element which is contained in this dictionary.
269 PDFElement
* LookupElement(const OString
& rDictionaryKey
);
/// Returns the offset after the '<<' token.
270 sal_uInt64
GetLocation() const { return m_nLocation
; }
/// Adds pPDFElement under rKey. std::map::emplace is used, so a key that is
/// already present keeps its old value.
271 void insert(OString
const& rKey
, PDFElement
* pPDFElement
)
273 m_aItems
.emplace(rKey
, pPDFElement
);
/// Appends "<< ", then every key followed by the serialization of its value,
/// then ">>" to rBuffer.
// NOTE(review): original lines 281, 283 and 285 are absent from this listing;
// presumably they emit the '/' prefix and the separators (TODO confirm).
276 void writeString(OStringBuffer
& rBuffer
) override
278 rBuffer
.append("<< ");
279 for (auto& rPair
: m_aItems
)
282 rBuffer
.append(rPair
.first
);
284 rPair
.second
->writeString(rBuffer
);
287 rBuffer
.append(">>");
291 enum class TokenizeMode
/// Stop condition passed to PDFDocument::Tokenize() as its eMode argument.
// NOTE(review): the enumerator lines themselves are absent from this listing;
// only their comments survive. Restore them from the original header.
295 /// Till the first %%EOF token.
297 /// Till the end of the current object.
299 /// Same as END_OF_OBJECT, but for object streams (no endobj keyword).
303 /// The type column of an entry in a cross-reference stream.
304 enum class XRefEntryType
// NOTE(review): the enumerator lines themselves are absent from this listing;
// only two of their comments survive. XRefEntryType::NOT_COMPRESSED is used
// elsewhere in this header as the default type of an entry.
306 /// xref "f" or xref stream "0".
308 /// xref "n" or xref stream "1".
314 /// An entry in a cross-reference stream.
// NOTE(review): the class header line, braces and access specifiers are absent
// from this listing; the setters below are used as a type named XRefEntry by
// PDFDocument::m_aXRef. Restore the missing lines from the original header.
/// Type of the entry; a fresh entry is NOT_COMPRESSED.
317 XRefEntryType m_eType
= XRefEntryType::NOT_COMPRESSED
;
319 * Non-compressed: The byte offset of the object, starting from the
320 * beginning of the file.
321 * Compressed: The object number of the object stream in which this object is
324 sal_uInt64 m_nOffset
= 0;
325 /// Has this entry been changed as part of an incremental update?
326 bool m_bDirty
= false;
/// Stores the entry type.
331 void SetType(XRefEntryType eType
) { m_eType
= eType
; }
/// Returns the entry type.
333 XRefEntryType
GetType() const { return m_eType
; }
/// Stores the offset (see the description of m_nOffset for its meaning).
335 void SetOffset(sal_uInt64 nOffset
) { m_nOffset
= nOffset
; }
/// Returns the stored offset.
337 sal_uInt64
GetOffset() const { return m_nOffset
; }
/// Stores the dirty flag.
339 void SetDirty(bool bDirty
) { m_bDirty
= bDirty
; }
/// Returns the dirty flag (false for a fresh entry).
341 bool GetDirty() const { return m_bDirty
; }
344 /// Hex string: in <AABB> form.
// NOTE(review): this listing carries stray line-number prefixes and has lost
// lines of the class (braces, access specifiers, the declaration of m_aValue);
// restore them from the original header.
345 class VCL_DLLPUBLIC PDFHexStringElement final
: public PDFElement
350 bool Read(SvStream
& rStream
) override
;
351 const OString
& GetValue() const;
/// Appends the stored value to rBuffer.
// NOTE(review): original lines 355 and 357 are absent from this listing;
// presumably they emit the enclosing '<' and '>' (TODO confirm).
353 void writeString(OStringBuffer
& rBuffer
) override
356 rBuffer
.append(m_aValue
);
361 /// Literal string: in (asdf) form.
// NOTE(review): this listing carries stray line-number prefixes and has lost
// lines of the class (braces, access specifiers, the declaration of m_aValue);
// restore them from the original header.
362 class VCL_DLLPUBLIC PDFLiteralStringElement final
: public PDFElement
367 bool Read(SvStream
& rStream
) override
;
368 const OString
& GetValue() const;
/// Appends the stored value to rBuffer.
// NOTE(review): original lines 372 and 374 are absent from this listing;
// presumably they emit the enclosing '(' and ')' (TODO confirm).
370 void writeString(OStringBuffer
& rBuffer
) override
373 rBuffer
.append(m_aValue
);
378 /// Numeric object: an integer or a real.
// NOTE(review): this listing carries stray line-number prefixes and has lost
// lines of the class (braces, access specifiers, the declaration of m_fValue);
// restore them from the original header.
379 class VCL_DLLPUBLIC PDFNumberElement final
: public PDFElement
381 /// Input file start location.
382 sal_uInt64 m_nOffset
= 0;
383 /// Input file token length.
384 sal_uInt64 m_nLength
= 0;
389 bool Read(SvStream
& rStream
) override
;
390 double GetValue() const;
/// Replaces the stored number with fValue.
391 void SetValue(double fValue
) { m_fValue
= fValue
; }
393 sal_uInt64
GetLocation() const;
394 sal_uInt64
GetLength() const;
/// Appends the stored number (a double) to rBuffer.
396 void writeString(OStringBuffer
& rBuffer
) override
{ rBuffer
.append(m_fValue
); }
399 /// A one-liner comment.
// NOTE(review): this listing carries stray line-number prefixes and has lost
// lines of the class (braces, access specifiers, original lines 402-404);
// restore them from the original header.
400 class VCL_DLLPUBLIC PDFCommentElement final
: public PDFElement
/// Creates a comment element for document rDoc.
406 explicit PDFCommentElement(PDFDocument
& rDoc
);
407 bool Read(SvStream
& rStream
) override
;
/// Comments are not serialized: the body is empty.
408 void writeString(OStringBuffer
& /*rBuffer*/) override
{}
411 /// End of a dictionary: '>>'.
412 class VCL_DLLPUBLIC PDFEndDictionaryElement final
: public PDFElement
414 /// Offset before the '>>' token.
415 sal_uInt64 m_nLocation
= 0;
418 PDFEndDictionaryElement();
419 bool Read(SvStream
& rStream
) override
;
420 sal_uInt64
GetLocation() const;
422 void writeString(OStringBuffer
& /*rBuffer*/) override
{}
425 /// End of a stream: 'endstream' keyword.
426 class VCL_DLLPUBLIC PDFEndStreamElement final
: public PDFElement
429 bool Read(SvStream
& rStream
) override
;
431 void writeString(OStringBuffer
& /*rBuffer*/) override
{}
434 /// End of an object: 'endobj' keyword.
435 class VCL_DLLPUBLIC PDFEndObjectElement final
: public PDFElement
438 bool Read(SvStream
& rStream
) override
;
440 void writeString(OStringBuffer
& /*rBuffer*/) override
{}
443 /// End of an array: ']'.
444 class VCL_DLLPUBLIC PDFEndArrayElement final
: public PDFElement
446 /// Location before the ']' token.
447 sal_uInt64 m_nOffset
= 0;
450 PDFEndArrayElement();
451 bool Read(SvStream
& rStream
) override
;
452 sal_uInt64
GetOffset() const;
454 void writeString(OStringBuffer
& /*rBuffer*/) override
{}
457 /// Boolean object: a 'true' or a 'false'.
// NOTE(review): this listing carries stray line-number prefixes and has lost
// lines of the class (braces, access specifiers, the declaration of m_aValue
// and the rest of the constructor); restore them from the original header.
458 class VCL_DLLPUBLIC PDFBooleanElement final
: public PDFElement
/// Creates a boolean element from bValue.
463 explicit PDFBooleanElement(bool bValue
)
468 bool Read(SvStream
& rStream
) override
;
/// Appends "true" or "false" to rBuffer, depending on the stored value.
470 void writeString(OStringBuffer
& rBuffer
) override
472 rBuffer
.append(m_aValue
? "true" : "false");
476 /// Null object: the 'null' singleton.
477 class VCL_DLLPUBLIC PDFNullElement final
: public PDFElement
480 bool Read(SvStream
& rStream
) override
;
482 void writeString(OStringBuffer
& rBuffer
) override
{ rBuffer
.append("null"); }
486 * In-memory representation of an on-disk PDF document.
488 * The PDF element list is not meant to be saved back to disk, but some
489 * elements remember their source offset / length, and based on that it's
490 * possible to modify the input file.
492 class VCL_DLLPUBLIC PDFDocument final
: public PDFObjectContainer
// NOTE(review): this listing carries stray line-number prefixes and has lost
// lines of the class (braces, access specifiers, original lines 539-541 in
// front of the destructor); restore them from the original header.
494 /// This vector owns all elements.
495 std::vector
<std::unique_ptr
<PDFElement
>> m_aElements
;
496 /// Object ID <-> object offset map.
497 std::map
<size_t, XRefEntry
> m_aXRef
;
498 /// Object offset <-> Object pointer map.
499 std::map
<size_t, PDFObjectElement
*> m_aOffsetObjects
;
500 /// Object ID <-> Object pointer map.
501 std::map
<size_t, PDFObjectElement
*> m_aIDObjects
;
502 /// List of xref offsets we know.
503 std::vector
<size_t> m_aStartXRefs
;
504 /// Offsets of trailers, from latest to oldest.
505 std::vector
<size_t> m_aTrailerOffsets
;
506 /// Trailer offset <-> Trailer pointer map.
507 std::map
<size_t, PDFTrailerElement
*> m_aOffsetTrailers
;
508 /// List of EOF offsets we know.
509 std::vector
<size_t> m_aEOFs
;
/// Trailer of the document; nullptr as long as none has been set.
510 PDFTrailerElement
* m_pTrailer
= nullptr;
511 /// When m_pTrailer is nullptr, this can still have a dictionary.
512 PDFObjectElement
* m_pXRefStream
= nullptr;
513 /// All editing takes place in this buffer, if it happens.
514 SvMemoryStream m_aEditBuffer
;
516 /// Signature line in PDF format, to be consumed by the next Sign() invocation.
517 std::vector
<sal_Int8
> m_aSignatureLine
;
519 /// 0-based page number where m_aSignatureLine should be placed.
520 size_t m_nSignaturePage
= 0;
522 /// Suggest a minimal, yet free signature ID to use for the next signature.
523 sal_uInt32
GetNextSignature();
524 /// Write the signature object as part of signing.
// rLastByteRangeOffset and rContentOffset are non-const references, i.e.
// out-parameters filled in by the implementation.
525 sal_Int32
WriteSignatureObject(const OUString
& rDescription
, bool bAdES
,
526 sal_uInt64
& rLastByteRangeOffset
, sal_Int64
& rContentOffset
);
527 /// Write the appearance object as part of signing.
528 sal_Int32
WriteAppearanceObject(tools::Rectangle
& rSignatureRectangle
);
529 /// Write the annot object as part of signing.
530 sal_Int32
WriteAnnotObject(PDFObjectElement
const& rFirstPage
, sal_Int32 nSignatureId
,
531 sal_Int32 nAppearanceId
,
532 const tools::Rectangle
& rSignatureRectangle
);
533 /// Write the updated Page object as part of signing.
534 bool WritePageObject(PDFObjectElement
& rFirstPage
, sal_Int32 nAnnotId
);
535 /// Write the updated Catalog object as part of signing.
536 bool WriteCatalogObject(sal_Int32 nAnnotId
, PDFReferenceElement
*& pRoot
);
537 /// Write the updated cross-references as part of signing.
538 void WriteXRef(sal_uInt64 nXRefOffset
, PDFReferenceElement
const* pRoot
);
542 virtual ~PDFDocument();
// The document is not copyable: copy assignment and copy construction are deleted.
543 PDFDocument
& operator=(const PDFDocument
&) = delete;
544 PDFDocument(const PDFDocument
&) = delete;
545 /// @name Low-level functions, to be used by PDFElement subclasses.
547 /// Decode a hex dump.
548 static std::vector
<unsigned char> DecodeHexString(PDFHexStringElement
const* pElement
);
/// Decodes rElement into an OUString; UTF-16BE input according to the name (TODO confirm).
549 static OUString
DecodeHexStringUTF16BE(PDFHexStringElement
const& rElement
);
550 static OString
ReadKeyword(SvStream
& rStream
);
551 static size_t FindStartXRef(SvStream
& rStream
);
552 void ReadXRef(SvStream
& rStream
);
553 void ReadXRefStream(SvStream
& rStream
);
554 static void SkipWhitespace(SvStream
& rStream
);
555 /// Instead of all whitespace, just skip CR and NL characters.
556 static void SkipLineBreaks(SvStream
& rStream
);
557 size_t GetObjectOffset(size_t nIndex
) const;
558 const std::vector
<std::unique_ptr
<PDFElement
>>& GetElements() const;
559 std::vector
<PDFObjectElement
*> GetPages();
560 PDFObjectElement
* GetCatalog();
561 /// Remember the end location of an EOF token.
562 void PushBackEOF(size_t nOffset
);
563 /// Look up object based on object number, possibly by parsing object streams.
564 PDFObjectElement
* LookupObject(size_t nObjectNumber
);
565 /// Access to the input document, even after the input stream is gone.
566 SvMemoryStream
& GetEditBuffer();
567 /// Tokenize elements from current offset.
// The tokenized elements are collected in rElements; eMode selects where
// tokenizing stops (see TokenizeMode).
568 bool Tokenize(SvStream
& rStream
, TokenizeMode eMode
,
569 std::vector
<std::unique_ptr
<PDFElement
>>& rElements
,
570 PDFObjectElement
* pObjectElement
);
571 /// Register an object (owned directly or indirectly by m_aElements) as a provider for a given ID.
572 void SetIDObject(size_t nID
, PDFObjectElement
* pObject
);
575 /// @name High-level functions, to be used by others.
577 /// Read elements from the start of the stream till its end.
578 bool Read(SvStream
& rStream
);
579 /// Calls Read() first and if it fails it tries to fixup and then retry.
580 bool ReadWithPossibleFixup(SvStream
& rStream
);
/// Hands over the signature line (taken by rvalue reference) for the next Sign() call.
581 void SetSignatureLine(std::vector
<sal_Int8
>&& rSignatureLine
);
/// Sets the 0-based page on which the signature line should be placed.
582 void SetSignaturePage(size_t nPage
);
583 /// Sign the read document with xCertificate in the edit buffer.
584 bool Sign(const css::uno::Reference
<css::security::XCertificate
>& xCertificate
,
585 const OUString
& rDescription
, bool bAdES
);
586 /// Serializes the contents of the edit buffer.
587 bool Write(SvStream
& rStream
);
588 /// Get a list of signatures embedded into this document.
589 std::vector
<PDFObjectElement
*> GetSignatureWidgets();
590 /// Remove the nth signature from read document in the edit buffer.
591 bool RemoveSignature(size_t nPosition
);
594 /// See vcl::PDFObjectContainer::createObject().
595 sal_Int32
createObject() override
;
596 /// See vcl::PDFObjectContainer::updateObject().
597 bool updateObject(sal_Int32 n
) override
;
598 /// See vcl::PDFObjectContainer::writeBufferBytes().
599 bool writeBufferBytes(const void* pBuffer
, sal_uInt64 nBytes
) override
;
// The two stream-encryption hooks below are overridden with empty bodies.
600 void checkAndEnableStreamEncryption(sal_Int32
/*nObject*/) override
{}
601 void disableStreamEncryption() override
{}
604 /// The trailer singleton is at the end of the doc.
// NOTE(review): this listing carries stray line-number prefixes and has lost
// lines of the class (braces, access specifiers, original line 607); restore
// them from the original header.
605 class VCL_DLLPUBLIC PDFTrailerElement final
: public PDFElement
/// The dictionary of the trailer, as stored by SetDictionary().
608 PDFDictionaryElement
* m_pDictionaryElement
;
609 /// Location of the end of the trailer token.
610 sal_uInt64 m_nOffset
= 0;
/// Creates a trailer element for document rDoc.
613 explicit PDFTrailerElement(PDFDocument
& rDoc
);
614 bool Read(SvStream
& rStream
) override
;
/// Looks up rDictionaryKey; presumably in the trailer dictionary (TODO confirm).
615 PDFElement
* Lookup(const OString
& rDictionaryKey
);
616 sal_uInt64
GetLocation() const;
/// Stores pDictionaryElement as the trailer dictionary (pointer only, no copy).
618 void SetDictionary(PDFDictionaryElement
* pDictionaryElement
)
620 m_pDictionaryElement
= pDictionaryElement
;
/// Returns the pointer stored by SetDictionary().
623 PDFDictionaryElement
* GetDictionary() { return m_pDictionaryElement
; }
/// Serialization is not implemented for the trailer: the body only asserts,
/// so with assertions disabled the call does nothing.
625 void writeString(OStringBuffer
& /*rBuffer*/) override
{ assert(false && "not implemented"); }
628 class VCL_DLLPUBLIC PDFObjectParser final
630 const std::vector
<std::unique_ptr
<PDFElement
>>& mrElements
;
633 PDFObjectParser(std::vector
<std::unique_ptr
<PDFElement
>> const& rElements
)
634 : mrElements(rElements
)
638 size_t parse(PDFElement
* pParsingElement
, size_t nStartIndex
= 0, int nCurrentDepth
= 0);
641 } // namespace vcl::filter
643 #endif // INCLUDED_VCL_FILTER_PDFDOCUMENT_HXX
645 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */