Update git submodules
[LibreOffice.git] / sdext / source / pdfimport / pdfparse / pdfparse.cxx
blob a340d0579e49fc230fd092abedc7389aa86f0531
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
21 #include <pdfparse.hxx>
23 #include <boost/spirit/include/classic.hpp>
24 #include <boost/bind/bind.hpp>
26 #include <string.h>
28 #include <o3tl/char16_t2wchar_t.hxx>
29 #include <o3tl/safeint.hxx>
30 #include <osl/thread.h>
31 #include <rtl/strbuf.hxx>
32 #include <rtl/ustrbuf.hxx>
33 #include <sal/log.hxx>
34 #include <utility>
37 using namespace boost::spirit::classic;
38 using namespace pdfparse;
40 namespace {
42 class StringEmitContext : public EmitContext
44 OStringBuffer m_aBuf;
45 public:
46 StringEmitContext() : m_aBuf(256) {}
48 virtual bool write( const void* pBuf, unsigned int nLen ) noexcept override
50 m_aBuf.append( static_cast<const char*>(pBuf), nLen );
51 return true;
53 virtual unsigned int getCurPos() noexcept override { return m_aBuf.getLength(); }
54 virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) noexcept override
55 { return (nOrigOffset+nLen < o3tl::make_unsigned(m_aBuf.getLength()) ) &&
56 write( m_aBuf.getStr() + nOrigOffset, nLen ); }
57 virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) noexcept override
59 if( nOrigOffset+nLen < o3tl::make_unsigned(m_aBuf.getLength()) )
61 memcpy( pBuf, m_aBuf.getStr()+nOrigOffset, nLen );
62 return nLen;
64 return 0;
67 OString getString() { return m_aBuf.makeStringAndClear(); }
70 template< class iteratorT >
71 class PDFGrammar : public grammar< PDFGrammar<iteratorT> >
73 public:
75 explicit PDFGrammar( iteratorT first )
76 : m_fDouble( 0.0 ), m_aGlobalBegin(std::move( first )) {}
77 ~PDFGrammar()
79 if( !m_aObjectStack.empty() )
80 delete m_aObjectStack.front();
83 double m_fDouble;
84 std::vector< unsigned int > m_aUIntStack;
85 std::vector< PDFEntry* > m_aObjectStack;
86 OString m_aErrorString;
87 iteratorT m_aGlobalBegin;
89 public:
90 struct pdf_string_parser
92 typedef nil_t result_t;
93 template <typename ScannerT>
94 std::ptrdiff_t
95 operator()(ScannerT const& scan, result_t&) const
97 std::ptrdiff_t len = 0;
99 int nBraceLevel = 0;
100 while( ! scan.at_end() )
102 char c = *scan;
103 if( c == ')' )
105 nBraceLevel--;
106 if( nBraceLevel < 0 )
107 break;
109 else if( c == '(' )
110 nBraceLevel++;
111 else if( c == '\\' ) // ignore escaped braces
113 ++len;
114 ++scan.first; // tdf#63054: avoid skipping spaces
115 if( scan.first == scan.last ) // tdf#63054: avoid skipping spaces
116 break;
118 ++len;
119 ++scan;
121 return scan.at_end() ? -1 : len;
125 template< typename ScannerT >
126 struct definition
128 explicit definition( const PDFGrammar<iteratorT>& rSelf )
130 using namespace boost::placeholders;
132 PDFGrammar<iteratorT>* pSelf = const_cast< PDFGrammar<iteratorT>* >( &rSelf );
134 // workaround workshop compiler: comment_p doesn't work
135 // comment = comment_p("%")[boost::bind(&PDFGrammar::pushComment, pSelf, _1, _2 )];
136 comment = lexeme_d[ (ch_p('%') >> *(~ch_p('\r') & ~ch_p('\n')) >> eol_p)[boost::bind(&PDFGrammar::pushComment, pSelf, _1, _2 )] ];
138 boolean = (str_p("true") | str_p("false"))[boost::bind(&PDFGrammar::pushBool, pSelf, _1, _2)];
140 // workaround workshop compiler: confix_p doesn't work
141 //stream = confix_p( "stream", *anychar_p, "endstream" )[boost::bind(&PDFGrammar::emitStream, pSelf, _1, _2 )];
142 stream = (str_p("stream") >> *(anychar_p - str_p("endstream")) >> str_p("endstream"))[boost::bind(&PDFGrammar::emitStream, pSelf, _1, _2 )];
144 name = lexeme_d[
145 ch_p('/')
146 >> (*(anychar_p-chset_p("\t\n\f\r ()<>[]{}/%")-ch_p('\0')))
147 [boost::bind(&PDFGrammar::pushName, pSelf, _1, _2)] ];
149 // workaround workshop compiler: confix_p doesn't work
150 //stringtype = ( confix_p("(",*anychar_p, ")") |
151 // confix_p("<",*xdigit_p, ">") )
152 // [boost::bind(&PDFGrammar::pushString,pSelf, _1, _2)];
154 stringtype = ( ( ch_p('(') >> functor_parser<pdf_string_parser>() >> ch_p(')') ) |
155 ( ch_p('<') >> *xdigit_p >> ch_p('>') ) )
156 [boost::bind(&PDFGrammar::pushString,pSelf, _1, _2)];
158 null_object = str_p( "null" )[boost::bind(&PDFGrammar::pushNull, pSelf, _1, _2)];
160 #ifdef USE_ASSIGN_ACTOR
161 objectref = ( uint_p[push_back_a(pSelf->m_aUIntStack)]
162 >> uint_p[push_back_a(pSelf->m_aUIntStack)]
163 >> ch_p('R')
164 >> eps_p
165 )[boost::bind(&PDFGrammar::pushObjectRef, pSelf, _1, _2)];
166 #else
167 objectref = ( uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
168 >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
169 >> ch_p('R')
170 >> eps_p
171 )[boost::bind(&PDFGrammar::pushObjectRef, pSelf, _1, _2)];
172 #endif
174 #ifdef USE_ASSIGN_ACTOR
175 simple_type = objectref | name |
176 ( real_p[assign_a(pSelf->m_fDouble)] >> eps_p )
177 [boost::bind(&PDFGrammar::pushDouble, pSelf, _1, _2)]
178 | stringtype | boolean | null_object;
179 #else
180 simple_type = objectref | name |
181 ( real_p[boost::bind(&PDFGrammar::assign_action_double, pSelf, _1)] >> eps_p )
182 [boost::bind(&PDFGrammar::pushDouble, pSelf, _1, _2)]
183 | stringtype | boolean | null_object;
184 #endif
186 dict_begin = str_p( "<<" )[boost::bind(&PDFGrammar::beginDict, pSelf, _1, _2)];
187 dict_end = str_p( ">>" )[boost::bind(&PDFGrammar::endDict, pSelf, _1, _2)];
189 array_begin = str_p("[")[boost::bind(&PDFGrammar::beginArray,pSelf, _1, _2)];
190 array_end = str_p("]")[boost::bind(&PDFGrammar::endArray,pSelf, _1, _2)];
192 #ifdef USE_ASSIGN_ACTOR
193 object_begin= uint_p[push_back_a(pSelf->m_aUIntStack)]
194 >> uint_p[push_back_a(pSelf->m_aUIntStack)]
195 >> str_p("obj" )[boost::bind(&PDFGrammar::beginObject, pSelf, _1, _2)];
196 #else
197 object_begin= uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
198 >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
199 >> str_p("obj" )[boost::bind(&PDFGrammar::beginObject, pSelf, _1, _2)];
200 #endif
201 object_end = str_p( "endobj" )[boost::bind(&PDFGrammar::endObject, pSelf, _1, _2)];
203 xref = str_p( "xref" ) >> uint_p >> uint_p
204 >> lexeme_d[
205 +( repeat_p(10)[digit_p]
206 >> blank_p
207 >> repeat_p(5)[digit_p]
208 >> blank_p
209 >> ( ch_p('n') | ch_p('f') )
210 >> repeat_p(2)[space_p]
211 ) ];
213 dict_element= dict_begin | comment | simple_type
214 | array_begin | array_end | dict_end;
216 object = object_begin
217 >> *dict_element
218 >> !stream
219 >> object_end;
221 trailer = str_p( "trailer" )[boost::bind(&PDFGrammar::beginTrailer,pSelf,_1,_2)]
222 >> *dict_element
223 >> str_p("startxref")
224 >> uint_p
225 >> str_p("%%EOF")[boost::bind(&PDFGrammar::endTrailer,pSelf,_1,_2)];
227 #ifdef USE_ASSIGN_ACTOR
228 pdfrule = ! (lexeme_d[
229 str_p( "%PDF-" )
230 >> uint_p[push_back_a(pSelf->m_aUIntStack)]
231 >> ch_p('.')
232 >> uint_p[push_back_a(pSelf->m_aUIntStack)]
233 >> *((~ch_p('\r') & ~ch_p('\n')))
234 >> eol_p
235 ])[boost::bind(&PDFGrammar::haveFile,pSelf, _1, _2)]
236 >> *( comment | object | ( xref >> trailer ) );
237 #else
238 pdfrule = ! (lexeme_d[
239 str_p( "%PDF-" )
240 >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
241 >> ch_p('.')
242 >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
243 >> *(~ch_p('\r') & ~ch_p('\n'))
244 >> eol_p
245 ])[boost::bind(&PDFGrammar::haveFile,pSelf, _1, _2)]
246 >> *( comment | object | ( xref >> trailer ) );
247 #endif
249 rule< ScannerT > comment, stream, boolean, name, stringtype, null_object, simple_type,
250 objectref, array, value, dict_element, dict_begin, dict_end,
251 array_begin, array_end, object, object_begin, object_end,
252 xref, trailer, pdfrule;
254 const rule< ScannerT >& start() const { return pdfrule; }
257 #ifndef USE_ASSIGN_ACTOR
258 void push_back_action_uint( unsigned int i )
260 m_aUIntStack.push_back( i );
262 void assign_action_double( double d )
264 m_fDouble = d;
266 #endif
268 [[noreturn]] static void parseError( const char* pMessage, const iteratorT& pLocation )
270 throw_( pLocation, pMessage );
273 OString iteratorToString( iteratorT first, const iteratorT& last ) const
275 OStringBuffer aStr( 32 );
276 while( first != last )
278 aStr.append( *first );
279 ++first;
281 return aStr.makeStringAndClear();
284 void haveFile( const iteratorT& pBegin, SAL_UNUSED_PARAMETER iteratorT /*pEnd*/ )
286 if( m_aObjectStack.empty() )
288 PDFFile* pFile = new PDFFile();
289 pFile->m_nMinor = m_aUIntStack.back();
290 m_aUIntStack.pop_back();
291 pFile->m_nMajor = m_aUIntStack.back();
292 m_aUIntStack.pop_back();
293 m_aObjectStack.push_back( pFile );
295 else
296 parseError( "found file header in unusual place", pBegin );
299 void pushComment(const iteratorT& first, const iteratorT& last)
301 // add a comment to the current stack element
302 PDFComment* pComment =
303 new PDFComment(iteratorToString(first,last));
304 if( m_aObjectStack.empty() )
305 m_aObjectStack.push_back( new PDFPart() );
306 PDFContainer* pContainer = dynamic_cast<PDFContainer*>(m_aObjectStack.back());
307 if( pContainer == nullptr )
308 parseError( "comment without container", first );
309 pContainer->m_aSubElements.emplace_back( pComment );
312 void insertNewValue( std::unique_ptr<PDFEntry> pNewValue, const iteratorT& pPos )
314 PDFContainer* pContainer = nullptr;
315 const char* pMsg = nullptr;
316 if( ! m_aObjectStack.empty() )
318 pContainer = dynamic_cast<PDFContainer*>(m_aObjectStack.back());
319 if (pContainer)
321 if( dynamic_cast<PDFDict*>(pContainer) == nullptr &&
322 dynamic_cast<PDFArray*>(pContainer) == nullptr )
324 PDFObject* pObj = dynamic_cast<PDFObject*>(pContainer);
325 if( pObj )
327 if( pObj->m_pObject == nullptr )
328 pObj->m_pObject = pNewValue.get();
329 else
331 pMsg = "second value for object";
332 pContainer = nullptr;
335 else if( dynamic_cast<PDFDict*>(pNewValue.get()) )
337 PDFTrailer* pTrailer = dynamic_cast<PDFTrailer*>(pContainer);
338 if( pTrailer )
340 if( pTrailer->m_pDict == nullptr )
341 pTrailer->m_pDict = dynamic_cast<PDFDict*>(pNewValue.get());
342 else
343 pContainer = nullptr;
345 else
346 pContainer = nullptr;
348 else
349 pContainer = nullptr;
353 if( pContainer )
354 pContainer->m_aSubElements.emplace_back( std::move(pNewValue) );
355 else
357 if( ! pMsg )
359 if( dynamic_cast<PDFContainer*>(pNewValue.get()) )
360 pMsg = "array without container";
361 else
362 pMsg = "value without container";
364 parseError( pMsg, pPos );
368 void pushName(const iteratorT& first, const iteratorT& last )
370 insertNewValue( std::make_unique<PDFName>(iteratorToString(first,last)), first );
373 void pushDouble( const iteratorT& first, SAL_UNUSED_PARAMETER const iteratorT& /*last*/ )
375 insertNewValue( std::make_unique<PDFNumber>(m_fDouble), first );
378 void pushString( const iteratorT& first, const iteratorT& last )
380 insertNewValue( std::make_unique<PDFString>(iteratorToString(first,last)), first );
383 void pushBool( const iteratorT& first, const iteratorT& last )
385 insertNewValue( std::make_unique<PDFBool>( last-first == 4 ), first );
388 void pushNull( const iteratorT& first, SAL_UNUSED_PARAMETER iteratorT )
390 insertNewValue( std::make_unique<PDFNull>(), first );
393 void beginObject( const iteratorT& first, SAL_UNUSED_PARAMETER const iteratorT& /*last*/ )
395 if( m_aObjectStack.empty() )
396 m_aObjectStack.push_back( new PDFPart() );
398 unsigned int nGeneration = m_aUIntStack.back();
399 m_aUIntStack.pop_back();
400 unsigned int nObject = m_aUIntStack.back();
401 m_aUIntStack.pop_back();
403 PDFObject* pObj = new PDFObject( nObject, nGeneration );
404 pObj->m_nOffset = first - m_aGlobalBegin;
406 PDFContainer* pContainer = dynamic_cast<PDFContainer*>(m_aObjectStack.back());
407 if( pContainer &&
408 ( dynamic_cast<PDFFile*>(pContainer) ||
409 dynamic_cast<PDFPart*>(pContainer) ) )
411 pContainer->m_aSubElements.emplace_back( pObj );
412 m_aObjectStack.push_back( pObj );
414 else
415 parseError( "object in wrong place", first );
418 void endObject( const iteratorT& first, SAL_UNUSED_PARAMETER iteratorT )
420 if( m_aObjectStack.empty() )
421 parseError( "endobj without obj", first );
422 else if( dynamic_cast<PDFObject*>(m_aObjectStack.back()) == nullptr )
423 parseError( "spurious endobj", first );
424 else
425 m_aObjectStack.pop_back();
428 void pushObjectRef( const iteratorT& first, SAL_UNUSED_PARAMETER iteratorT )
430 unsigned int nGeneration = m_aUIntStack.back();
431 m_aUIntStack.pop_back();
432 unsigned int nObject = m_aUIntStack.back();
433 m_aUIntStack.pop_back();
434 insertNewValue( std::make_unique<PDFObjectRef>(nObject,nGeneration), first );
437 void beginDict( const iteratorT& first, SAL_UNUSED_PARAMETER iteratorT )
439 PDFDict* pDict = new PDFDict();
440 pDict->m_nOffset = first - m_aGlobalBegin;
442 insertNewValue( std::unique_ptr<PDFEntry>(pDict), first );
443 // will not come here if insertion fails (exception)
444 m_aObjectStack.push_back( pDict );
447 void endDict( const iteratorT& first, SAL_UNUSED_PARAMETER iteratorT )
449 PDFDict* pDict = nullptr;
450 if( m_aObjectStack.empty() )
451 parseError( "dictionary end without begin", first );
452 else if( (pDict = dynamic_cast<PDFDict*>(m_aObjectStack.back())) == nullptr )
453 parseError( "spurious dictionary end", first );
454 else
455 m_aObjectStack.pop_back();
457 PDFEntry* pOffender = pDict->buildMap();
458 if( pOffender )
460 StringEmitContext aCtx;
461 aCtx.write( "offending dictionary element: ", 30 );
462 pOffender->emit( aCtx );
463 m_aErrorString = aCtx.getString();
464 parseError( m_aErrorString.getStr(), first );
468 void beginArray( const iteratorT& first, SAL_UNUSED_PARAMETER iteratorT )
470 PDFArray* pArray = new PDFArray();
471 pArray->m_nOffset = first - m_aGlobalBegin;
473 insertNewValue( std::unique_ptr<PDFEntry>(pArray), first );
474 // will not come here if insertion fails (exception)
475 m_aObjectStack.push_back( pArray );
478 void endArray( const iteratorT& first, SAL_UNUSED_PARAMETER iteratorT )
480 if( m_aObjectStack.empty() )
481 parseError( "array end without begin", first );
482 else if( dynamic_cast<PDFArray*>(m_aObjectStack.back()) == nullptr )
483 parseError( "spurious array end", first );
484 else
485 m_aObjectStack.pop_back();
488 void emitStream(const iteratorT& first, const iteratorT& last)
490 if( m_aObjectStack.empty() )
491 parseError( "stream without object", first );
492 PDFObject* pObj = dynamic_cast<PDFObject*>(m_aObjectStack.back());
493 if( pObj && pObj->m_pObject )
495 if( pObj->m_pStream )
496 parseError( "multiple streams in object", first );
498 PDFDict* pDict = dynamic_cast<PDFDict*>(pObj->m_pObject);
499 if( pDict )
501 PDFStream* pStream = new PDFStream( first - m_aGlobalBegin, last - m_aGlobalBegin, pDict );
503 pObj->m_pStream = pStream;
504 pObj->m_aSubElements.emplace_back( pStream );
507 else
508 parseError( "stream without object", first );
511 void beginTrailer( const iteratorT& first, SAL_UNUSED_PARAMETER iteratorT )
513 if( m_aObjectStack.empty() )
514 m_aObjectStack.push_back( new PDFPart() );
516 PDFTrailer* pTrailer = new PDFTrailer();
517 pTrailer->m_nOffset = first - m_aGlobalBegin;
519 PDFContainer* pContainer = dynamic_cast<PDFContainer*>(m_aObjectStack.back());
520 if( pContainer &&
521 ( dynamic_cast<PDFFile*>(pContainer) ||
522 dynamic_cast<PDFPart*>(pContainer) ) )
524 pContainer->m_aSubElements.emplace_back( pTrailer );
525 m_aObjectStack.push_back( pTrailer );
527 else
528 parseError( "trailer in wrong place", first );
531 void endTrailer(const iteratorT& first, SAL_UNUSED_PARAMETER iteratorT )
533 if( m_aObjectStack.empty() )
534 parseError( "%%EOF without trailer", first );
535 else if( dynamic_cast<PDFTrailer*>(m_aObjectStack.back()) == nullptr )
536 parseError( "spurious %%EOF", first );
537 else
538 m_aObjectStack.pop_back();
544 std::unique_ptr<PDFEntry> PDFReader::read(std::u16string_view aFileName)
546 #ifdef _WIN32
547 file_iterator<> file_start(std::wstring(o3tl::toW(aFileName)));
548 #else
549 file_iterator<> file_start(
550 std::string(OUStringToOString(aFileName, osl_getThreadTextEncoding())));
551 #endif
552 if( ! file_start )
553 return nullptr;
554 file_iterator<> file_end = file_start.make_end();
555 PDFGrammar< file_iterator<> > aGrammar( file_start );
559 #if OSL_DEBUG_LEVEL > 0
560 boost::spirit::classic::parse_info< file_iterator<> > aInfo =
561 #endif
562 boost::spirit::classic::parse( file_start,
563 file_end,
564 aGrammar,
565 boost::spirit::classic::space_p );
566 #if OSL_DEBUG_LEVEL > 0
567 SAL_INFO("sdext.pdfimport.pdfparse", "parseinfo: stop at offset = " << aInfo.stop - file_start << ", hit = " << (aInfo.hit ? "true" : "false") << ", full = " << (aInfo.full ? "true" : "false") << ", length = " << aInfo.length);
568 #endif
570 catch( const parser_error< const char*, file_iterator<> >& rError )
572 SAL_WARN("sdext.pdfimport.pdfparse", "parse error: " << rError.descriptor << " at buffer pos " << rError.where - file_start);
573 #if OSL_DEBUG_LEVEL > 0
574 OUStringBuffer aTmp;
575 unsigned int nElem = aGrammar.m_aObjectStack.size();
576 for( unsigned int i = 0; i < nElem; i++ )
578 aTmp.append(" ");
579 aTmp.appendAscii(typeid( *(aGrammar.m_aObjectStack[i]) ).name());
581 SAL_WARN("sdext.pdfimport.pdfparse", "parse error object stack: " << aTmp.makeStringAndClear());
582 #endif
585 std::unique_ptr<PDFEntry> pRet;
586 unsigned int nEntries = aGrammar.m_aObjectStack.size();
587 if( nEntries == 1 )
589 pRet.reset(aGrammar.m_aObjectStack.back());
590 aGrammar.m_aObjectStack.pop_back();
592 else if( nEntries > 1 )
594 // It is possible that there are multiple trailers, which is OK.
595 // But still keep the warnings, just in case.
596 SAL_WARN("sdext.pdfimport.pdfparse", "error got " << nEntries << " stack objects in parse");
597 for (;;)
599 PDFEntry* pEntry = aGrammar.m_aObjectStack.back();
600 aGrammar.m_aObjectStack.pop_back();
601 SAL_WARN("sdext.pdfimport.pdfparse", typeid(*pEntry).name());
602 PDFObject* pObj = dynamic_cast<PDFObject*>(pEntry);
603 if( pObj )
604 SAL_WARN("sdext.pdfimport.pdfparse", " -> object " << pObj->m_nNumber << " generation " << pObj->m_nGeneration);
605 if (aGrammar.m_aObjectStack.empty())
607 pRet.reset(pEntry); // The first entry references all others - see PDFGrammar dtor
608 break;
612 return pRet;
615 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */