1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
21 #include <pdfparse.hxx>
23 // boost using obsolete stuff
26 #pragma warning(disable:4996)
27 #pragma warning(disable:4503)
30 // workaround windows compiler: do not include multi_pass.hpp
31 #include <boost/spirit/include/classic_core.hpp>
32 #include <boost/spirit/include/classic_utility.hpp>
33 #include <boost/spirit/include/classic_error_handling.hpp>
34 #include <boost/spirit/include/classic_file_iterator.hpp>
35 #include <boost/bind/bind.hpp>
39 #include <o3tl/safeint.hxx>
40 #include <rtl/strbuf.hxx>
41 #include <rtl/ustrbuf.hxx>
42 #include <sal/log.hxx>
45 // disable warnings again because someone along the line has enabled them
46 // (we have included boost headers, what did you expect?)
49 #pragma warning(disable:4996)
50 #pragma warning(disable:4503)
54 using namespace boost::spirit::classic
;
55 using namespace pdfparse
;
59 class StringEmitContext
: public EmitContext
63 StringEmitContext() : m_aBuf(256) {}
65 virtual bool write( const void* pBuf
, unsigned int nLen
) noexcept override
67 m_aBuf
.append( static_cast<const char*>(pBuf
), nLen
);
70 virtual unsigned int getCurPos() noexcept override
{ return m_aBuf
.getLength(); }
71 virtual bool copyOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
) noexcept override
72 { return (nOrigOffset
+nLen
< o3tl::make_unsigned(m_aBuf
.getLength()) ) &&
73 write( m_aBuf
.getStr() + nOrigOffset
, nLen
); }
74 virtual unsigned int readOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
, void* pBuf
) noexcept override
76 if( nOrigOffset
+nLen
< o3tl::make_unsigned(m_aBuf
.getLength()) )
78 memcpy( pBuf
, m_aBuf
.getStr()+nOrigOffset
, nLen
);
84 OString
getString() { return m_aBuf
.makeStringAndClear(); }
87 template< class iteratorT
>
88 class PDFGrammar
: public grammar
< PDFGrammar
<iteratorT
> >
92 explicit PDFGrammar( iteratorT first
)
93 : m_fDouble( 0.0 ), m_aGlobalBegin(std::move( first
)) {}
96 if( !m_aObjectStack
.empty() )
97 delete m_aObjectStack
.front();
101 std::vector
< unsigned int > m_aUIntStack
;
102 std::vector
< PDFEntry
* > m_aObjectStack
;
103 OString m_aErrorString
;
104 iteratorT m_aGlobalBegin
;
107 struct pdf_string_parser
109 typedef nil_t result_t
;
110 template <typename ScannerT
>
112 operator()(ScannerT
const& scan
, result_t
&) const
114 std::ptrdiff_t len
= 0;
117 while( ! scan
.at_end() )
123 if( nBraceLevel
< 0 )
128 else if( c
== '\\' ) // ignore escaped braces
131 ++scan
.first
; // tdf#63054: avoid skipping spaces
132 if( scan
.first
== scan
.last
) // tdf#63054: avoid skipping spaces
138 return scan
.at_end() ? -1 : len
;
142 template< typename ScannerT
>
145 explicit definition( const PDFGrammar
<iteratorT
>& rSelf
)
147 using namespace boost::placeholders
;
149 PDFGrammar
<iteratorT
>* pSelf
= const_cast< PDFGrammar
<iteratorT
>* >( &rSelf
);
151 // workaround workshop compiler: comment_p doesn't work
152 // comment = comment_p("%")[boost::bind(&PDFGrammar::pushComment, pSelf, _1, _2 )];
153 comment
= lexeme_d
[ (ch_p('%') >> *(~ch_p('\r') & ~ch_p('\n')) >> eol_p
)[boost::bind(&PDFGrammar::pushComment
, pSelf
, _1
, _2
)] ];
155 boolean
= (str_p("true") | str_p("false"))[boost::bind(&PDFGrammar::pushBool
, pSelf
, _1
, _2
)];
157 // workaround workshop compiler: confix_p doesn't work
158 //stream = confix_p( "stream", *anychar_p, "endstream" )[boost::bind(&PDFGrammar::emitStream, pSelf, _1, _2 )];
159 stream
= (str_p("stream") >> *(anychar_p
- str_p("endstream")) >> str_p("endstream"))[boost::bind(&PDFGrammar::emitStream
, pSelf
, _1
, _2
)];
163 >> (*(anychar_p
-chset_p("\t\n\f\r ()<>[]{}/%")-ch_p('\0')))
164 [boost::bind(&PDFGrammar::pushName
, pSelf
, _1
, _2
)] ];
166 // workaround workshop compiler: confix_p doesn't work
167 //stringtype = ( confix_p("(",*anychar_p, ")") |
168 // confix_p("<",*xdigit_p, ">") )
169 // [boost::bind(&PDFGrammar::pushString,pSelf, _1, _2)];
171 stringtype
= ( ( ch_p('(') >> functor_parser
<pdf_string_parser
>() >> ch_p(')') ) |
172 ( ch_p('<') >> *xdigit_p
>> ch_p('>') ) )
173 [boost::bind(&PDFGrammar::pushString
,pSelf
, _1
, _2
)];
175 null_object
= str_p( "null" )[boost::bind(&PDFGrammar::pushNull
, pSelf
, _1
, _2
)];
177 #ifdef USE_ASSIGN_ACTOR
178 objectref
= ( uint_p
[push_back_a(pSelf
->m_aUIntStack
)]
179 >> uint_p
[push_back_a(pSelf
->m_aUIntStack
)]
182 )[boost::bind(&PDFGrammar::pushObjectRef
, pSelf
, _1
, _2
)];
184 objectref
= ( uint_p
[boost::bind(&PDFGrammar::push_back_action_uint
, pSelf
, _1
)]
185 >> uint_p
[boost::bind(&PDFGrammar::push_back_action_uint
, pSelf
, _1
)]
188 )[boost::bind(&PDFGrammar::pushObjectRef
, pSelf
, _1
, _2
)];
191 #ifdef USE_ASSIGN_ACTOR
192 simple_type
= objectref
| name
|
193 ( real_p
[assign_a(pSelf
->m_fDouble
)] >> eps_p
)
194 [boost::bind(&PDFGrammar::pushDouble
, pSelf
, _1
, _2
)]
195 | stringtype
| boolean
| null_object
;
197 simple_type
= objectref
| name
|
198 ( real_p
[boost::bind(&PDFGrammar::assign_action_double
, pSelf
, _1
)] >> eps_p
)
199 [boost::bind(&PDFGrammar::pushDouble
, pSelf
, _1
, _2
)]
200 | stringtype
| boolean
| null_object
;
203 dict_begin
= str_p( "<<" )[boost::bind(&PDFGrammar::beginDict
, pSelf
, _1
, _2
)];
204 dict_end
= str_p( ">>" )[boost::bind(&PDFGrammar::endDict
, pSelf
, _1
, _2
)];
206 array_begin
= str_p("[")[boost::bind(&PDFGrammar::beginArray
,pSelf
, _1
, _2
)];
207 array_end
= str_p("]")[boost::bind(&PDFGrammar::endArray
,pSelf
, _1
, _2
)];
209 #ifdef USE_ASSIGN_ACTOR
210 object_begin
= uint_p
[push_back_a(pSelf
->m_aUIntStack
)]
211 >> uint_p
[push_back_a(pSelf
->m_aUIntStack
)]
212 >> str_p("obj" )[boost::bind(&PDFGrammar::beginObject
, pSelf
, _1
, _2
)];
214 object_begin
= uint_p
[boost::bind(&PDFGrammar::push_back_action_uint
, pSelf
, _1
)]
215 >> uint_p
[boost::bind(&PDFGrammar::push_back_action_uint
, pSelf
, _1
)]
216 >> str_p("obj" )[boost::bind(&PDFGrammar::beginObject
, pSelf
, _1
, _2
)];
218 object_end
= str_p( "endobj" )[boost::bind(&PDFGrammar::endObject
, pSelf
, _1
, _2
)];
220 xref
= str_p( "xref" ) >> uint_p
>> uint_p
222 +( repeat_p(10)[digit_p
]
224 >> repeat_p(5)[digit_p
]
226 >> ( ch_p('n') | ch_p('f') )
227 >> repeat_p(2)[space_p
]
230 dict_element
= dict_begin
| comment
| simple_type
231 | array_begin
| array_end
| dict_end
;
233 object
= object_begin
238 trailer
= str_p( "trailer" )[boost::bind(&PDFGrammar::beginTrailer
,pSelf
,_1
,_2
)]
240 >> str_p("startxref")
242 >> str_p("%%EOF")[boost::bind(&PDFGrammar::endTrailer
,pSelf
,_1
,_2
)];
244 #ifdef USE_ASSIGN_ACTOR
245 pdfrule
= ! (lexeme_d
[
247 >> uint_p
[push_back_a(pSelf
->m_aUIntStack
)]
249 >> uint_p
[push_back_a(pSelf
->m_aUIntStack
)]
250 >> *((~ch_p('\r') & ~ch_p('\n')))
252 ])[boost::bind(&PDFGrammar::haveFile
,pSelf
, _1
, _2
)]
253 >> *( comment
| object
| ( xref
>> trailer
) );
255 pdfrule
= ! (lexeme_d
[
257 >> uint_p
[boost::bind(&PDFGrammar::push_back_action_uint
, pSelf
, _1
)]
259 >> uint_p
[boost::bind(&PDFGrammar::push_back_action_uint
, pSelf
, _1
)]
260 >> *(~ch_p('\r') & ~ch_p('\n'))
262 ])[boost::bind(&PDFGrammar::haveFile
,pSelf
, _1
, _2
)]
263 >> *( comment
| object
| ( xref
>> trailer
) );
266 rule
< ScannerT
> comment
, stream
, boolean
, name
, stringtype
, null_object
, simple_type
,
267 objectref
, array
, value
, dict_element
, dict_begin
, dict_end
,
268 array_begin
, array_end
, object
, object_begin
, object_end
,
269 xref
, trailer
, pdfrule
;
271 const rule
< ScannerT
>& start() const { return pdfrule
; }
274 #ifndef USE_ASSIGN_ACTOR
275 void push_back_action_uint( unsigned int i
)
277 m_aUIntStack
.push_back( i
);
279 void assign_action_double( double d
)
285 static void parseError( const char* pMessage
, iteratorT pLocation
)
287 throw_( pLocation
, pMessage
);
290 OString
iteratorToString( iteratorT first
, iteratorT last
) const
292 OStringBuffer
aStr( 32 );
293 while( first
!= last
)
295 aStr
.append( *first
);
298 return aStr
.makeStringAndClear();
301 void haveFile( iteratorT pBegin
, SAL_UNUSED_PARAMETER iteratorT
/*pEnd*/ )
303 if( m_aObjectStack
.empty() )
305 PDFFile
* pFile
= new PDFFile();
306 pFile
->m_nMinor
= m_aUIntStack
.back();
307 m_aUIntStack
.pop_back();
308 pFile
->m_nMajor
= m_aUIntStack
.back();
309 m_aUIntStack
.pop_back();
310 m_aObjectStack
.push_back( pFile
);
313 parseError( "found file header in unusual place", pBegin
);
316 void pushComment( iteratorT first
, iteratorT last
)
318 // add a comment to the current stack element
319 PDFComment
* pComment
=
320 new PDFComment(iteratorToString(first
,last
));
321 if( m_aObjectStack
.empty() )
322 m_aObjectStack
.push_back( new PDFPart() );
323 PDFContainer
* pContainer
= dynamic_cast<PDFContainer
*>(m_aObjectStack
.back());
324 if( pContainer
== nullptr )
325 parseError( "comment without container", first
);
326 pContainer
->m_aSubElements
.emplace_back( pComment
);
329 void insertNewValue( std::unique_ptr
<PDFEntry
> pNewValue
, iteratorT pPos
)
331 PDFContainer
* pContainer
= nullptr;
332 const char* pMsg
= nullptr;
333 if( ! m_aObjectStack
.empty() )
335 pContainer
= dynamic_cast<PDFContainer
*>(m_aObjectStack
.back());
338 if( dynamic_cast<PDFDict
*>(pContainer
) == nullptr &&
339 dynamic_cast<PDFArray
*>(pContainer
) == nullptr )
341 PDFObject
* pObj
= dynamic_cast<PDFObject
*>(pContainer
);
344 if( pObj
->m_pObject
== nullptr )
345 pObj
->m_pObject
= pNewValue
.get();
348 pMsg
= "second value for object";
349 pContainer
= nullptr;
352 else if( dynamic_cast<PDFDict
*>(pNewValue
.get()) )
354 PDFTrailer
* pTrailer
= dynamic_cast<PDFTrailer
*>(pContainer
);
357 if( pTrailer
->m_pDict
== nullptr )
358 pTrailer
->m_pDict
= dynamic_cast<PDFDict
*>(pNewValue
.get());
360 pContainer
= nullptr;
363 pContainer
= nullptr;
366 pContainer
= nullptr;
371 pContainer
->m_aSubElements
.emplace_back( std::move(pNewValue
) );
376 if( dynamic_cast<PDFContainer
*>(pNewValue
.get()) )
377 pMsg
= "array without container";
379 pMsg
= "value without container";
381 parseError( pMsg
, pPos
);
385 void pushName( iteratorT first
, iteratorT last
)
387 insertNewValue( std::make_unique
<PDFName
>(iteratorToString(first
,last
)), first
);
390 void pushDouble( iteratorT first
, SAL_UNUSED_PARAMETER iteratorT
/*last*/ )
392 insertNewValue( std::make_unique
<PDFNumber
>(m_fDouble
), first
);
395 void pushString( iteratorT first
, iteratorT last
)
397 insertNewValue( std::make_unique
<PDFString
>(iteratorToString(first
,last
)), first
);
400 void pushBool( iteratorT first
, iteratorT last
)
402 insertNewValue( std::make_unique
<PDFBool
>( last
-first
== 4 ), first
);
405 void pushNull( iteratorT first
, SAL_UNUSED_PARAMETER iteratorT
)
407 insertNewValue( std::make_unique
<PDFNull
>(), first
);
411 void beginObject( iteratorT first
, SAL_UNUSED_PARAMETER iteratorT
/*last*/ )
413 if( m_aObjectStack
.empty() )
414 m_aObjectStack
.push_back( new PDFPart() );
416 unsigned int nGeneration
= m_aUIntStack
.back();
417 m_aUIntStack
.pop_back();
418 unsigned int nObject
= m_aUIntStack
.back();
419 m_aUIntStack
.pop_back();
421 PDFObject
* pObj
= new PDFObject( nObject
, nGeneration
);
422 pObj
->m_nOffset
= first
- m_aGlobalBegin
;
424 PDFContainer
* pContainer
= dynamic_cast<PDFContainer
*>(m_aObjectStack
.back());
426 ( dynamic_cast<PDFFile
*>(pContainer
) ||
427 dynamic_cast<PDFPart
*>(pContainer
) ) )
429 pContainer
->m_aSubElements
.emplace_back( pObj
);
430 m_aObjectStack
.push_back( pObj
);
433 parseError( "object in wrong place", first
);
436 void endObject( iteratorT first
, SAL_UNUSED_PARAMETER iteratorT
)
438 if( m_aObjectStack
.empty() )
439 parseError( "endobj without obj", first
);
440 else if( dynamic_cast<PDFObject
*>(m_aObjectStack
.back()) == nullptr )
441 parseError( "spurious endobj", first
);
443 m_aObjectStack
.pop_back();
446 void pushObjectRef( iteratorT first
, SAL_UNUSED_PARAMETER iteratorT
)
448 unsigned int nGeneration
= m_aUIntStack
.back();
449 m_aUIntStack
.pop_back();
450 unsigned int nObject
= m_aUIntStack
.back();
451 m_aUIntStack
.pop_back();
452 insertNewValue( std::make_unique
<PDFObjectRef
>(nObject
,nGeneration
), first
);
455 void beginDict( iteratorT first
, SAL_UNUSED_PARAMETER iteratorT
)
457 PDFDict
* pDict
= new PDFDict();
458 pDict
->m_nOffset
= first
- m_aGlobalBegin
;
460 insertNewValue( std::unique_ptr
<PDFEntry
>(pDict
), first
);
461 // will not come here if insertion fails (exception)
462 m_aObjectStack
.push_back( pDict
);
464 void endDict( iteratorT first
, SAL_UNUSED_PARAMETER iteratorT
)
466 PDFDict
* pDict
= nullptr;
467 if( m_aObjectStack
.empty() )
468 parseError( "dictionary end without begin", first
);
469 else if( (pDict
= dynamic_cast<PDFDict
*>(m_aObjectStack
.back())) == nullptr )
470 parseError( "spurious dictionary end", first
);
472 m_aObjectStack
.pop_back();
474 PDFEntry
* pOffender
= pDict
->buildMap();
477 StringEmitContext aCtx
;
478 aCtx
.write( "offending dictionary element: ", 30 );
479 pOffender
->emit( aCtx
);
480 m_aErrorString
= aCtx
.getString();
481 parseError( m_aErrorString
.getStr(), first
);
485 void beginArray( iteratorT first
, SAL_UNUSED_PARAMETER iteratorT
)
487 PDFArray
* pArray
= new PDFArray();
488 pArray
->m_nOffset
= first
- m_aGlobalBegin
;
490 insertNewValue( std::unique_ptr
<PDFEntry
>(pArray
), first
);
491 // will not come here if insertion fails (exception)
492 m_aObjectStack
.push_back( pArray
);
495 void endArray( iteratorT first
, SAL_UNUSED_PARAMETER iteratorT
)
497 if( m_aObjectStack
.empty() )
498 parseError( "array end without begin", first
);
499 else if( dynamic_cast<PDFArray
*>(m_aObjectStack
.back()) == nullptr )
500 parseError( "spurious array end", first
);
502 m_aObjectStack
.pop_back();
505 void emitStream( iteratorT first
, iteratorT last
)
507 if( m_aObjectStack
.empty() )
508 parseError( "stream without object", first
);
509 PDFObject
* pObj
= dynamic_cast<PDFObject
*>(m_aObjectStack
.back());
510 if( pObj
&& pObj
->m_pObject
)
512 if( pObj
->m_pStream
)
513 parseError( "multiple streams in object", first
);
515 PDFDict
* pDict
= dynamic_cast<PDFDict
*>(pObj
->m_pObject
);
518 PDFStream
* pStream
= new PDFStream( first
- m_aGlobalBegin
, last
- m_aGlobalBegin
, pDict
);
520 pObj
->m_pStream
= pStream
;
521 pObj
->m_aSubElements
.emplace_back( pStream
);
525 parseError( "stream without object", first
);
528 void beginTrailer( iteratorT first
, SAL_UNUSED_PARAMETER iteratorT
)
530 if( m_aObjectStack
.empty() )
531 m_aObjectStack
.push_back( new PDFPart() );
533 PDFTrailer
* pTrailer
= new PDFTrailer();
534 pTrailer
->m_nOffset
= first
- m_aGlobalBegin
;
536 PDFContainer
* pContainer
= dynamic_cast<PDFContainer
*>(m_aObjectStack
.back());
538 ( dynamic_cast<PDFFile
*>(pContainer
) ||
539 dynamic_cast<PDFPart
*>(pContainer
) ) )
541 pContainer
->m_aSubElements
.emplace_back( pTrailer
);
542 m_aObjectStack
.push_back( pTrailer
);
545 parseError( "trailer in wrong place", first
);
548 void endTrailer( iteratorT first
, SAL_UNUSED_PARAMETER iteratorT
)
550 if( m_aObjectStack
.empty() )
551 parseError( "%%EOF without trailer", first
);
552 else if( dynamic_cast<PDFTrailer
*>(m_aObjectStack
.back()) == nullptr )
553 parseError( "spurious %%EOF", first
);
555 m_aObjectStack
.pop_back();
561 std::unique_ptr
<PDFEntry
> PDFReader::read( const char* pFileName
)
563 file_iterator
<> file_start( pFileName
);
566 file_iterator
<> file_end
= file_start
.make_end();
567 PDFGrammar
< file_iterator
<> > aGrammar( file_start
);
571 #if OSL_DEBUG_LEVEL > 0
572 boost::spirit::classic::parse_info
< file_iterator
<> > aInfo
=
574 boost::spirit::classic::parse( file_start
,
577 boost::spirit::classic::space_p
);
578 #if OSL_DEBUG_LEVEL > 0
579 SAL_INFO("sdext.pdfimport.pdfparse", "parseinfo: stop at offset = " << aInfo
.stop
- file_start
<< ", hit = " << (aInfo
.hit
? "true" : "false") << ", full = " << (aInfo
.full
? "true" : "false") << ", length = " << aInfo
.length
);
582 catch( const parser_error
< const char*, file_iterator
<> >& rError
)
584 SAL_WARN("sdext.pdfimport.pdfparse", "parse error: " << rError
.descriptor
<< " at buffer pos " << rError
.where
- file_start
);
585 #if OSL_DEBUG_LEVEL > 0
587 unsigned int nElem
= aGrammar
.m_aObjectStack
.size();
588 for( unsigned int i
= 0; i
< nElem
; i
++ )
591 aTmp
.appendAscii(typeid( *(aGrammar
.m_aObjectStack
[i
]) ).name());
593 SAL_WARN("sdext.pdfimport.pdfparse", "parse error object stack: " << aTmp
.makeStringAndClear());
597 std::unique_ptr
<PDFEntry
> pRet
;
598 unsigned int nEntries
= aGrammar
.m_aObjectStack
.size();
601 pRet
.reset(aGrammar
.m_aObjectStack
.back());
602 aGrammar
.m_aObjectStack
.pop_back();
604 else if( nEntries
> 1 )
606 // It is possible that there are multiple trailers, which is OK.
607 // But still keep the warnings, just in case.
608 SAL_WARN("sdext.pdfimport.pdfparse", "error got " << nEntries
<< " stack objects in parse");
611 PDFEntry
* pEntry
= aGrammar
.m_aObjectStack
.back();
612 aGrammar
.m_aObjectStack
.pop_back();
613 SAL_WARN("sdext.pdfimport.pdfparse", typeid(*pEntry
).name());
614 PDFObject
* pObj
= dynamic_cast<PDFObject
*>(pEntry
);
616 SAL_WARN("sdext.pdfimport.pdfparse", " -> object " << pObj
->m_nNumber
<< " generation " << pObj
->m_nGeneration
);
617 if (aGrammar
.m_aObjectStack
.empty())
619 pRet
.reset(pEntry
); // The first entry references all others - see PDFGrammar dtor
627 #if defined(_MSC_VER)
631 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */