1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
21 #include <pdfparse.hxx>
23 #include <boost/spirit/include/classic.hpp>
24 #include <boost/bind/bind.hpp>
28 #include <o3tl/char16_t2wchar_t.hxx>
29 #include <o3tl/safeint.hxx>
30 #include <osl/thread.h>
31 #include <rtl/strbuf.hxx>
32 #include <rtl/ustrbuf.hxx>
33 #include <sal/log.hxx>
37 using namespace boost::spirit::classic
;
38 using namespace pdfparse
;
42 class StringEmitContext
: public EmitContext
46 StringEmitContext() : m_aBuf(256) {}
48 virtual bool write( const void* pBuf
, unsigned int nLen
) noexcept override
50 m_aBuf
.append( static_cast<const char*>(pBuf
), nLen
);
53 virtual unsigned int getCurPos() noexcept override
{ return m_aBuf
.getLength(); }
54 virtual bool copyOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
) noexcept override
55 { return (nOrigOffset
+nLen
< o3tl::make_unsigned(m_aBuf
.getLength()) ) &&
56 write( m_aBuf
.getStr() + nOrigOffset
, nLen
); }
57 virtual unsigned int readOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
, void* pBuf
) noexcept override
59 if( nOrigOffset
+nLen
< o3tl::make_unsigned(m_aBuf
.getLength()) )
61 memcpy( pBuf
, m_aBuf
.getStr()+nOrigOffset
, nLen
);
67 OString
getString() { return m_aBuf
.makeStringAndClear(); }
// Boost.Spirit Classic grammar for PDF files. Its semantic actions (the
// member functions further down) build the parsed entries on m_aObjectStack.
70 template< class iteratorT
>
71 class PDFGrammar
: public grammar
< PDFGrammar
<iteratorT
> >
// Keeps the start of the input so that actions can convert an iterator into
// an offset (first - m_aGlobalBegin).
75 explicit PDFGrammar( iteratorT first
)
76 : m_fDouble( 0.0 ), m_aGlobalBegin(std::move( first
)) {}
// NOTE(review): presumably the destructor body (its signature line is not
// visible here): only the bottom-most stack entry is deleted.
79 if( !m_aObjectStack
.empty() )
80 delete m_aObjectStack
.front();
// Unsigned integers collected by the uint_p actions; the actions that need
// them pop them again in reverse order.
84 std::vector
< unsigned int > m_aUIntStack
;
// Stack of the entries currently being parsed.
85 std::vector
< PDFEntry
* > m_aObjectStack
;
// Holds the text of the message built in endDict before it is passed to parseError.
86 OString m_aErrorString
;
// Start of the parsed input, used as base for all stored offsets.
87 iteratorT m_aGlobalBegin
;
// Functor used through functor_parser<> in the stringtype rule to scan the
// content between the parentheses of a PDF string.
// NOTE(review): several lines of the loop body are not visible in this
// excerpt; the comments below only describe what is shown.
90 struct pdf_string_parser
92 typedef nil_t result_t
;
93 template <typename ScannerT
>
// Returns the number of consumed characters, or -1 (no match) when the
// scanner is at its end once scanning stops.
95 operator()(ScannerT
const& scan
, result_t
&) const
97 std::ptrdiff_t len
= 0;
100 while( ! scan
.at_end() )
106 if( nBraceLevel
< 0 )
111 else if( c
== '\\' ) // ignore escaped braces
// The iterator is advanced and compared directly instead of going through
// the scanner, see the tdf#63054 remarks on these lines.
114 ++scan
.first
; // tdf#63054: avoid skipping spaces
115 if( scan
.first
== scan
.last
) // tdf#63054: avoid skipping spaces
121 return scan
.at_end() ? -1 : len
;
// Rule definitions of the grammar. Each rule binds its match to a member
// function of PDFGrammar via boost::bind; actions bound with (_1, _2) receive
// the begin and end iterator of the match, actions bound with (_1) receive
// the parsed value.
125 template< typename ScannerT
>
128 explicit definition( const PDFGrammar
<iteratorT
>& rSelf
)
130 using namespace boost::placeholders
;
// The actions modify the grammar object, so constness is cast away here.
132 PDFGrammar
<iteratorT
>* pSelf
= const_cast< PDFGrammar
<iteratorT
>* >( &rSelf
);
134 // workaround workshop compiler: comment_p doesn't work
135 // comment = comment_p("%")[boost::bind(&PDFGrammar::pushComment, pSelf, _1, _2 )];
136 comment
= lexeme_d
[ (ch_p('%') >> *(~ch_p('\r') & ~ch_p('\n')) >> eol_p
)[boost::bind(&PDFGrammar::pushComment
, pSelf
, _1
, _2
)] ];
138 boolean
= (str_p("true") | str_p("false"))[boost::bind(&PDFGrammar::pushBool
, pSelf
, _1
, _2
)];
140 // workaround workshop compiler: confix_p doesn't work
141 //stream = confix_p( "stream", *anychar_p, "endstream" )[boost::bind(&PDFGrammar::emitStream, pSelf, _1, _2 )];
142 stream
= (str_p("stream") >> *(anychar_p
- str_p("endstream")) >> str_p("endstream"))[boost::bind(&PDFGrammar::emitStream
, pSelf
, _1
, _2
)];
// NOTE(review): the beginning of this rule is not visible in this excerpt;
// judging by the bound action it is the remainder of the name rule.
146 >> (*(anychar_p
-chset_p("\t\n\f\r ()<>[]{}/%")-ch_p('\0')))
147 [boost::bind(&PDFGrammar::pushName
, pSelf
, _1
, _2
)] ];
149 // workaround workshop compiler: confix_p doesn't work
150 //stringtype = ( confix_p("(",*anychar_p, ")") |
151 // confix_p("<",*xdigit_p, ">") )
152 // [boost::bind(&PDFGrammar::pushString,pSelf, _1, _2)];
// Either a parenthesised string (scanned by pdf_string_parser) or a hex
// string between angle brackets.
154 stringtype
= ( ( ch_p('(') >> functor_parser
<pdf_string_parser
>() >> ch_p(')') ) |
155 ( ch_p('<') >> *xdigit_p
>> ch_p('>') ) )
156 [boost::bind(&PDFGrammar::pushString
,pSelf
, _1
, _2
)];
158 null_object
= str_p( "null" )[boost::bind(&PDFGrammar::pushNull
, pSelf
, _1
, _2
)];
// NOTE(review): each USE_ASSIGN_ACTOR conditional below is followed by two
// variants of the same rule; the directives separating them are not visible
// in this excerpt.
160 #ifdef USE_ASSIGN_ACTOR
161 objectref
= ( uint_p
[push_back_a(pSelf
->m_aUIntStack
)]
162 >> uint_p
[push_back_a(pSelf
->m_aUIntStack
)]
165 )[boost::bind(&PDFGrammar::pushObjectRef
, pSelf
, _1
, _2
)];
167 objectref
= ( uint_p
[boost::bind(&PDFGrammar::push_back_action_uint
, pSelf
, _1
)]
168 >> uint_p
[boost::bind(&PDFGrammar::push_back_action_uint
, pSelf
, _1
)]
171 )[boost::bind(&PDFGrammar::pushObjectRef
, pSelf
, _1
, _2
)];
174 #ifdef USE_ASSIGN_ACTOR
175 simple_type
= objectref
| name
|
176 ( real_p
[assign_a(pSelf
->m_fDouble
)] >> eps_p
)
177 [boost::bind(&PDFGrammar::pushDouble
, pSelf
, _1
, _2
)]
178 | stringtype
| boolean
| null_object
;
180 simple_type
= objectref
| name
|
181 ( real_p
[boost::bind(&PDFGrammar::assign_action_double
, pSelf
, _1
)] >> eps_p
)
182 [boost::bind(&PDFGrammar::pushDouble
, pSelf
, _1
, _2
)]
183 | stringtype
| boolean
| null_object
;
186 dict_begin
= str_p( "<<" )[boost::bind(&PDFGrammar::beginDict
, pSelf
, _1
, _2
)];
187 dict_end
= str_p( ">>" )[boost::bind(&PDFGrammar::endDict
, pSelf
, _1
, _2
)];
189 array_begin
= str_p("[")[boost::bind(&PDFGrammar::beginArray
,pSelf
, _1
, _2
)];
190 array_end
= str_p("]")[boost::bind(&PDFGrammar::endArray
,pSelf
, _1
, _2
)];
192 #ifdef USE_ASSIGN_ACTOR
193 object_begin
= uint_p
[push_back_a(pSelf
->m_aUIntStack
)]
194 >> uint_p
[push_back_a(pSelf
->m_aUIntStack
)]
195 >> str_p("obj" )[boost::bind(&PDFGrammar::beginObject
, pSelf
, _1
, _2
)];
197 object_begin
= uint_p
[boost::bind(&PDFGrammar::push_back_action_uint
, pSelf
, _1
)]
198 >> uint_p
[boost::bind(&PDFGrammar::push_back_action_uint
, pSelf
, _1
)]
199 >> str_p("obj" )[boost::bind(&PDFGrammar::beginObject
, pSelf
, _1
, _2
)];
201 object_end
= str_p( "endobj" )[boost::bind(&PDFGrammar::endObject
, pSelf
, _1
, _2
)];
// The xref rule has no semantic action attached in the visible lines.
203 xref
= str_p( "xref" ) >> uint_p
>> uint_p
205 +( repeat_p(10)[digit_p
]
207 >> repeat_p(5)[digit_p
]
209 >> ( ch_p('n') | ch_p('f') )
210 >> repeat_p(2)[space_p
]
// Dictionaries and arrays are not parsed as nested rules: their begin and
// end tokens are ordinary elements, and the actions keep track of nesting.
213 dict_element
= dict_begin
| comment
| simple_type
214 | array_begin
| array_end
| dict_end
;
216 object
= object_begin
221 trailer
= str_p( "trailer" )[boost::bind(&PDFGrammar::beginTrailer
,pSelf
,_1
,_2
)]
223 >> str_p("startxref")
225 >> str_p("%%EOF")[boost::bind(&PDFGrammar::endTrailer
,pSelf
,_1
,_2
)];
// Start rule: an optional file header (reported through haveFile) followed
// by any number of comments, objects and xref/trailer pairs.
227 #ifdef USE_ASSIGN_ACTOR
228 pdfrule
= ! (lexeme_d
[
230 >> uint_p
[push_back_a(pSelf
->m_aUIntStack
)]
232 >> uint_p
[push_back_a(pSelf
->m_aUIntStack
)]
233 >> *((~ch_p('\r') & ~ch_p('\n')))
235 ])[boost::bind(&PDFGrammar::haveFile
,pSelf
, _1
, _2
)]
236 >> *( comment
| object
| ( xref
>> trailer
) );
238 pdfrule
= ! (lexeme_d
[
240 >> uint_p
[boost::bind(&PDFGrammar::push_back_action_uint
, pSelf
, _1
)]
242 >> uint_p
[boost::bind(&PDFGrammar::push_back_action_uint
, pSelf
, _1
)]
243 >> *(~ch_p('\r') & ~ch_p('\n'))
245 ])[boost::bind(&PDFGrammar::haveFile
,pSelf
, _1
, _2
)]
246 >> *( comment
| object
| ( xref
>> trailer
) );
249 rule
< ScannerT
> comment
, stream
, boolean
, name
, stringtype
, null_object
, simple_type
,
250 objectref
, array
, value
, dict_element
, dict_begin
, dict_end
,
251 array_begin
, array_end
, object
, object_begin
, object_end
,
252 xref
, trailer
, pdfrule
;
// Entry point of the grammar.
254 const rule
< ScannerT
>& start() const { return pdfrule
; }
257 #ifndef USE_ASSIGN_ACTOR
258 void push_back_action_uint( unsigned int i
)
260 m_aUIntStack
.push_back( i
);
// Semantic action bound to real_p in the simple_type rule; receives the
// parsed floating point value.
// NOTE(review): the body is not visible in this excerpt - presumably it
// stores d in m_fDouble, which pushDouble reads; confirm.
262 void assign_action_double( double d
)
268 [[noreturn
]] static void parseError( const char* pMessage
, const iteratorT
& pLocation
)
270 throw_( pLocation
, pMessage
);
273 OString
iteratorToString( iteratorT first
, const iteratorT
& last
) const
275 OStringBuffer
aStr( 32 );
276 while( first
!= last
)
278 aStr
.append( *first
);
281 return aStr
.makeStringAndClear();
284 void haveFile( const iteratorT
& pBegin
, SAL_UNUSED_PARAMETER iteratorT
/*pEnd*/ )
286 if( m_aObjectStack
.empty() )
288 PDFFile
* pFile
= new PDFFile();
289 pFile
->m_nMinor
= m_aUIntStack
.back();
290 m_aUIntStack
.pop_back();
291 pFile
->m_nMajor
= m_aUIntStack
.back();
292 m_aUIntStack
.pop_back();
293 m_aObjectStack
.push_back( pFile
);
296 parseError( "found file header in unusual place", pBegin
);
299 void pushComment(const iteratorT
& first
, const iteratorT
& last
)
301 // add a comment to the current stack element
302 PDFComment
* pComment
=
303 new PDFComment(iteratorToString(first
,last
));
304 if( m_aObjectStack
.empty() )
305 m_aObjectStack
.push_back( new PDFPart() );
306 PDFContainer
* pContainer
= dynamic_cast<PDFContainer
*>(m_aObjectStack
.back());
307 if( pContainer
== nullptr )
308 parseError( "comment without container", first
);
309 pContainer
->m_aSubElements
.emplace_back( pComment
);
// Attaches a freshly parsed value to the container on top of the object
// stack, or reports a parse error at pPos if there is no suitable container.
// On success ownership of pNewValue moves into the container's
// m_aSubElements.
// NOTE(review): several condition and else lines of this function are not
// visible in this excerpt; the comments below describe only what is shown.
312 void insertNewValue( std::unique_ptr
<PDFEntry
> pNewValue
, const iteratorT
& pPos
)
// pContainer is reset to nullptr wherever the value must be rejected; pMsg
// may carry a specific error message for that case.
314 PDFContainer
* pContainer
= nullptr;
315 const char* pMsg
= nullptr;
316 if( ! m_aObjectStack
.empty() )
318 pContainer
= dynamic_cast<PDFContainer
*>(m_aObjectStack
.back());
// Containers that are neither a dictionary nor an array get special treatment.
321 if( dynamic_cast<PDFDict
*>(pContainer
) == nullptr &&
322 dynamic_cast<PDFArray
*>(pContainer
) == nullptr )
324 PDFObject
* pObj
= dynamic_cast<PDFObject
*>(pContainer
);
// An object remembers its value in m_pObject (non-owning pointer) if that
// slot is still free.
327 if( pObj
->m_pObject
== nullptr )
328 pObj
->m_pObject
= pNewValue
.get();
331 pMsg
= "second value for object";
332 pContainer
= nullptr;
// A dictionary as new value: a trailer remembers it in m_pDict if that slot
// is still free.
335 else if( dynamic_cast<PDFDict
*>(pNewValue
.get()) )
337 PDFTrailer
* pTrailer
= dynamic_cast<PDFTrailer
*>(pContainer
);
340 if( pTrailer
->m_pDict
== nullptr )
341 pTrailer
->m_pDict
= dynamic_cast<PDFDict
*>(pNewValue
.get());
343 pContainer
= nullptr;
346 pContainer
= nullptr;
349 pContainer
= nullptr;
// Accepted: the container takes ownership of the value.
354 pContainer
->m_aSubElements
.emplace_back( std::move(pNewValue
) );
// Rejected: choose the message depending on whether the value itself is a container.
359 if( dynamic_cast<PDFContainer
*>(pNewValue
.get()) )
360 pMsg
= "array without container";
362 pMsg
= "value without container";
364 parseError( pMsg
, pPos
);
368 void pushName(const iteratorT
& first
, const iteratorT
& last
)
370 insertNewValue( std::make_unique
<PDFName
>(iteratorToString(first
,last
)), first
);
373 void pushDouble( const iteratorT
& first
, SAL_UNUSED_PARAMETER
const iteratorT
& /*last*/ )
375 insertNewValue( std::make_unique
<PDFNumber
>(m_fDouble
), first
);
378 void pushString( const iteratorT
& first
, const iteratorT
& last
)
380 insertNewValue( std::make_unique
<PDFString
>(iteratorToString(first
,last
)), first
);
383 void pushBool( const iteratorT
& first
, const iteratorT
& last
)
385 insertNewValue( std::make_unique
<PDFBool
>( last
-first
== 4 ), first
);
388 void pushNull( const iteratorT
& first
, SAL_UNUSED_PARAMETER iteratorT
)
390 insertNewValue( std::make_unique
<PDFNull
>(), first
);
393 void beginObject( const iteratorT
& first
, SAL_UNUSED_PARAMETER
const iteratorT
& /*last*/ )
395 if( m_aObjectStack
.empty() )
396 m_aObjectStack
.push_back( new PDFPart() );
398 unsigned int nGeneration
= m_aUIntStack
.back();
399 m_aUIntStack
.pop_back();
400 unsigned int nObject
= m_aUIntStack
.back();
401 m_aUIntStack
.pop_back();
403 PDFObject
* pObj
= new PDFObject( nObject
, nGeneration
);
404 pObj
->m_nOffset
= first
- m_aGlobalBegin
;
406 PDFContainer
* pContainer
= dynamic_cast<PDFContainer
*>(m_aObjectStack
.back());
408 ( dynamic_cast<PDFFile
*>(pContainer
) ||
409 dynamic_cast<PDFPart
*>(pContainer
) ) )
411 pContainer
->m_aSubElements
.emplace_back( pObj
);
412 m_aObjectStack
.push_back( pObj
);
415 parseError( "object in wrong place", first
);
418 void endObject( const iteratorT
& first
, SAL_UNUSED_PARAMETER iteratorT
)
420 if( m_aObjectStack
.empty() )
421 parseError( "endobj without obj", first
);
422 else if( dynamic_cast<PDFObject
*>(m_aObjectStack
.back()) == nullptr )
423 parseError( "spurious endobj", first
);
425 m_aObjectStack
.pop_back();
428 void pushObjectRef( const iteratorT
& first
, SAL_UNUSED_PARAMETER iteratorT
)
430 unsigned int nGeneration
= m_aUIntStack
.back();
431 m_aUIntStack
.pop_back();
432 unsigned int nObject
= m_aUIntStack
.back();
433 m_aUIntStack
.pop_back();
434 insertNewValue( std::make_unique
<PDFObjectRef
>(nObject
,nGeneration
), first
);
437 void beginDict( const iteratorT
& first
, SAL_UNUSED_PARAMETER iteratorT
)
439 PDFDict
* pDict
= new PDFDict();
440 pDict
->m_nOffset
= first
- m_aGlobalBegin
;
442 insertNewValue( std::unique_ptr
<PDFEntry
>(pDict
), first
);
443 // will not come here if insertion fails (exception)
444 m_aObjectStack
.push_back( pDict
);
// Action for ">>": closes the PDFDict on top of the object stack and lets it
// build its map; an element reported by buildMap() leads to a parse error
// whose message contains that element.
// NOTE(review): the lines guarding the error branch are not visible in this
// excerpt - presumably the branch is only taken when pOffender is non-null.
447 void endDict( const iteratorT
& first
, SAL_UNUSED_PARAMETER iteratorT
)
449 PDFDict
* pDict
= nullptr;
450 if( m_aObjectStack
.empty() )
451 parseError( "dictionary end without begin", first
);
452 else if( (pDict
= dynamic_cast<PDFDict
*>(m_aObjectStack
.back())) == nullptr )
453 parseError( "spurious dictionary end", first
);
455 m_aObjectStack
.pop_back();
457 PDFEntry
* pOffender
= pDict
->buildMap();
// Render the offending element as text for the error message; 30 is the
// length of the prefix literal.
460 StringEmitContext aCtx
;
461 aCtx
.write( "offending dictionary element: ", 30 );
462 pOffender
->emit( aCtx
);
// The text is kept in a member - presumably so that the pointer handed to
// parseError() stays valid while the exception propagates.
463 m_aErrorString
= aCtx
.getString();
464 parseError( m_aErrorString
.getStr(), first
);
468 void beginArray( const iteratorT
& first
, SAL_UNUSED_PARAMETER iteratorT
)
470 PDFArray
* pArray
= new PDFArray();
471 pArray
->m_nOffset
= first
- m_aGlobalBegin
;
473 insertNewValue( std::unique_ptr
<PDFEntry
>(pArray
), first
);
474 // will not come here if insertion fails (exception)
475 m_aObjectStack
.push_back( pArray
);
478 void endArray( const iteratorT
& first
, SAL_UNUSED_PARAMETER iteratorT
)
480 if( m_aObjectStack
.empty() )
481 parseError( "array end without begin", first
);
482 else if( dynamic_cast<PDFArray
*>(m_aObjectStack
.back()) == nullptr )
483 parseError( "spurious array end", first
);
485 m_aObjectStack
.pop_back();
// Action for "stream ... endstream": attaches a PDFStream covering the
// matched range to the PDFObject on top of the object stack. The object must
// already have a value (m_pObject) and must not have a stream yet.
// NOTE(review): some condition lines are not visible in this excerpt; the
// comments describe only what is shown.
488 void emitStream(const iteratorT
& first
, const iteratorT
& last
)
490 if( m_aObjectStack
.empty() )
491 parseError( "stream without object", first
);
492 PDFObject
* pObj
= dynamic_cast<PDFObject
*>(m_aObjectStack
.back());
493 if( pObj
&& pObj
->m_pObject
)
495 if( pObj
->m_pStream
)
496 parseError( "multiple streams in object", first
);
// The object's value is handed to the stream as its dictionary.
498 PDFDict
* pDict
= dynamic_cast<PDFDict
*>(pObj
->m_pObject
);
// Begin and end of the stream are stored as offsets from the start of the input.
501 PDFStream
* pStream
= new PDFStream( first
- m_aGlobalBegin
, last
- m_aGlobalBegin
, pDict
);
// m_pStream is a shortcut pointer; the stream is also added to the object's sub elements.
503 pObj
->m_pStream
= pStream
;
504 pObj
->m_aSubElements
.emplace_back( pStream
);
508 parseError( "stream without object", first
);
511 void beginTrailer( const iteratorT
& first
, SAL_UNUSED_PARAMETER iteratorT
)
513 if( m_aObjectStack
.empty() )
514 m_aObjectStack
.push_back( new PDFPart() );
516 PDFTrailer
* pTrailer
= new PDFTrailer();
517 pTrailer
->m_nOffset
= first
- m_aGlobalBegin
;
519 PDFContainer
* pContainer
= dynamic_cast<PDFContainer
*>(m_aObjectStack
.back());
521 ( dynamic_cast<PDFFile
*>(pContainer
) ||
522 dynamic_cast<PDFPart
*>(pContainer
) ) )
524 pContainer
->m_aSubElements
.emplace_back( pTrailer
);
525 m_aObjectStack
.push_back( pTrailer
);
528 parseError( "trailer in wrong place", first
);
531 void endTrailer(const iteratorT
& first
, SAL_UNUSED_PARAMETER iteratorT
)
533 if( m_aObjectStack
.empty() )
534 parseError( "%%EOF without trailer", first
);
535 else if( dynamic_cast<PDFTrailer
*>(m_aObjectStack
.back()) == nullptr )
536 parseError( "spurious %%EOF", first
);
538 m_aObjectStack
.pop_back();
// Parses the file named aFileName with PDFGrammar (skipping white space via
// space_p) and returns the entry left on the grammar's object stack.
// Parser errors are caught and logged.
// NOTE(review): try, preprocessor and several branch lines are not visible
// in this excerpt; the comments describe only what is shown.
544 std::unique_ptr
<PDFEntry
> PDFReader::read(std::u16string_view aFileName
)
// Two alternative ways to open the file: from a wide string, or from a byte
// string in the thread's text encoding (the selecting condition is not
// visible here).
547 file_iterator
<> file_start(std::wstring(o3tl::toW(aFileName
)));
549 file_iterator
<> file_start(
550 std::string(OUStringToOString(aFileName
, osl_getThreadTextEncoding())));
554 file_iterator
<> file_end
= file_start
.make_end();
// The grammar remembers the start of the input for offset computations.
555 PDFGrammar
< file_iterator
<> > aGrammar( file_start
);
559 #if OSL_DEBUG_LEVEL > 0
560 boost::spirit::classic::parse_info
< file_iterator
<> > aInfo
=
562 boost::spirit::classic::parse( file_start
,
565 boost::spirit::classic::space_p
);
566 #if OSL_DEBUG_LEVEL > 0
567 SAL_INFO("sdext.pdfimport.pdfparse", "parseinfo: stop at offset = " << aInfo
.stop
- file_start
<< ", hit = " << (aInfo
.hit
? "true" : "false") << ", full = " << (aInfo
.full
? "true" : "false") << ", length = " << aInfo
.length
);
// Errors raised by the grammar's parseError() arrive here.
570 catch( const parser_error
< const char*, file_iterator
<> >& rError
)
572 SAL_WARN("sdext.pdfimport.pdfparse", "parse error: " << rError
.descriptor
<< " at buffer pos " << rError
.where
- file_start
);
// Debug builds additionally log the types of the entries left on the stack.
573 #if OSL_DEBUG_LEVEL > 0
575 unsigned int nElem
= aGrammar
.m_aObjectStack
.size();
576 for( unsigned int i
= 0; i
< nElem
; i
++ )
579 aTmp
.appendAscii(typeid( *(aGrammar
.m_aObjectStack
[i
]) ).name());
581 SAL_WARN("sdext.pdfimport.pdfparse", "parse error object stack: " << aTmp
.makeStringAndClear());
585 std::unique_ptr
<PDFEntry
> pRet
;
586 unsigned int nEntries
= aGrammar
.m_aObjectStack
.size();
// The entry is removed from the stack when pRet takes ownership of it.
589 pRet
.reset(aGrammar
.m_aObjectStack
.back());
590 aGrammar
.m_aObjectStack
.pop_back();
592 else if( nEntries
> 1 )
594 // It is possible that there are multiple trailers, which is OK.
595 // But still keep the warnings, just in case.
596 SAL_WARN("sdext.pdfimport.pdfparse", "error got " << nEntries
<< " stack objects in parse");
// Entries are popped and logged one by one.
599 PDFEntry
* pEntry
= aGrammar
.m_aObjectStack
.back();
600 aGrammar
.m_aObjectStack
.pop_back();
601 SAL_WARN("sdext.pdfimport.pdfparse", typeid(*pEntry
).name());
602 PDFObject
* pObj
= dynamic_cast<PDFObject
*>(pEntry
);
604 SAL_WARN("sdext.pdfimport.pdfparse", " -> object " << pObj
->m_nNumber
<< " generation " << pObj
->m_nGeneration
);
// Only the entry that empties the stack is handed to pRet.
605 if (aGrammar
.m_aObjectStack
.empty())
607 pRet
.reset(pEntry
); // The first entry references all others - see PDFGrammar dtor
615 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */