Version 7.6.3.2-android, tag libreoffice-7.6.3.2-android
[LibreOffice.git] / sdext / source / pdfimport / pdfparse / pdfparse.cxx
blobcdd3ac13ff3542646e7774c61e0d42f3946553d5
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
21 #include <pdfparse.hxx>
23 // boost using obsolete stuff
24 #if defined(_MSC_VER)
25 #pragma warning(push)
26 #pragma warning(disable:4996)
27 #pragma warning(disable:4503)
28 #endif
30 // workaround windows compiler: do not include multi_pass.hpp
31 #include <boost/spirit/include/classic_core.hpp>
32 #include <boost/spirit/include/classic_utility.hpp>
33 #include <boost/spirit/include/classic_error_handling.hpp>
34 #include <boost/spirit/include/classic_file_iterator.hpp>
35 #include <boost/bind/bind.hpp>
37 #include <string.h>
39 #include <o3tl/safeint.hxx>
40 #include <rtl/strbuf.hxx>
41 #include <rtl/ustrbuf.hxx>
42 #include <sal/log.hxx>
43 #include <utility>
45 // disable warnings again because someone along the line has enabled them
46 // (we have included boost headers, what did you expect?)
47 #if defined(_MSC_VER)
48 #pragma warning(push)
49 #pragma warning(disable:4996)
50 #pragma warning(disable:4503)
51 #endif
54 using namespace boost::spirit::classic;
55 using namespace pdfparse;
57 namespace {
59 class StringEmitContext : public EmitContext
61 OStringBuffer m_aBuf;
62 public:
63 StringEmitContext() : m_aBuf(256) {}
65 virtual bool write( const void* pBuf, unsigned int nLen ) noexcept override
67 m_aBuf.append( static_cast<const char*>(pBuf), nLen );
68 return true;
70 virtual unsigned int getCurPos() noexcept override { return m_aBuf.getLength(); }
71 virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) noexcept override
72 { return (nOrigOffset+nLen < o3tl::make_unsigned(m_aBuf.getLength()) ) &&
73 write( m_aBuf.getStr() + nOrigOffset, nLen ); }
74 virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) noexcept override
76 if( nOrigOffset+nLen < o3tl::make_unsigned(m_aBuf.getLength()) )
78 memcpy( pBuf, m_aBuf.getStr()+nOrigOffset, nLen );
79 return nLen;
81 return 0;
84 OString getString() { return m_aBuf.makeStringAndClear(); }
87 template< class iteratorT >
88 class PDFGrammar : public grammar< PDFGrammar<iteratorT> >
90 public:
92 explicit PDFGrammar( iteratorT first )
93 : m_fDouble( 0.0 ), m_aGlobalBegin(std::move( first )) {}
94 ~PDFGrammar()
96 if( !m_aObjectStack.empty() )
97 delete m_aObjectStack.front();
    // most recently parsed real number; written by assign_action_double()
    // (or assign_a when USE_ASSIGN_ACTOR is defined), read by pushDouble()
    double m_fDouble;
    // unsigned integers collected while parsing; haveFile(), beginObject()
    // and pushObjectRef() each pop the two topmost values
    std::vector< unsigned int > m_aUIntStack;
    // entries that are currently open, innermost last; the destructor only
    // deletes front()
    std::vector< PDFEntry* > m_aObjectStack;
    // message text built in endDict(); presumably a member so that the
    // character pointer handed to parseError() outlives the throwing function
    OString m_aErrorString;
    // iterator passed to the constructor; offsets are computed relative to it
    iteratorT m_aGlobalBegin;
106 public:
107 struct pdf_string_parser
109 typedef nil_t result_t;
110 template <typename ScannerT>
111 std::ptrdiff_t
112 operator()(ScannerT const& scan, result_t&) const
114 std::ptrdiff_t len = 0;
116 int nBraceLevel = 0;
117 while( ! scan.at_end() )
119 char c = *scan;
120 if( c == ')' )
122 nBraceLevel--;
123 if( nBraceLevel < 0 )
124 break;
126 else if( c == '(' )
127 nBraceLevel++;
128 else if( c == '\\' ) // ignore escaped braces
130 ++len;
131 ++scan.first; // tdf#63054: avoid skipping spaces
132 if( scan.first == scan.last ) // tdf#63054: avoid skipping spaces
133 break;
135 ++len;
136 ++scan;
138 return scan.at_end() ? -1 : len;
142 template< typename ScannerT >
143 struct definition
145 explicit definition( const PDFGrammar<iteratorT>& rSelf )
147 using namespace boost::placeholders;
149 PDFGrammar<iteratorT>* pSelf = const_cast< PDFGrammar<iteratorT>* >( &rSelf );
151 // workaround workshop compiler: comment_p doesn't work
152 // comment = comment_p("%")[boost::bind(&PDFGrammar::pushComment, pSelf, _1, _2 )];
153 comment = lexeme_d[ (ch_p('%') >> *(~ch_p('\r') & ~ch_p('\n')) >> eol_p)[boost::bind(&PDFGrammar::pushComment, pSelf, _1, _2 )] ];
155 boolean = (str_p("true") | str_p("false"))[boost::bind(&PDFGrammar::pushBool, pSelf, _1, _2)];
157 // workaround workshop compiler: confix_p doesn't work
158 //stream = confix_p( "stream", *anychar_p, "endstream" )[boost::bind(&PDFGrammar::emitStream, pSelf, _1, _2 )];
159 stream = (str_p("stream") >> *(anychar_p - str_p("endstream")) >> str_p("endstream"))[boost::bind(&PDFGrammar::emitStream, pSelf, _1, _2 )];
161 name = lexeme_d[
162 ch_p('/')
163 >> (*(anychar_p-chset_p("\t\n\f\r ()<>[]{}/%")-ch_p('\0')))
164 [boost::bind(&PDFGrammar::pushName, pSelf, _1, _2)] ];
166 // workaround workshop compiler: confix_p doesn't work
167 //stringtype = ( confix_p("(",*anychar_p, ")") |
168 // confix_p("<",*xdigit_p, ">") )
169 // [boost::bind(&PDFGrammar::pushString,pSelf, _1, _2)];
171 stringtype = ( ( ch_p('(') >> functor_parser<pdf_string_parser>() >> ch_p(')') ) |
172 ( ch_p('<') >> *xdigit_p >> ch_p('>') ) )
173 [boost::bind(&PDFGrammar::pushString,pSelf, _1, _2)];
175 null_object = str_p( "null" )[boost::bind(&PDFGrammar::pushNull, pSelf, _1, _2)];
177 #ifdef USE_ASSIGN_ACTOR
178 objectref = ( uint_p[push_back_a(pSelf->m_aUIntStack)]
179 >> uint_p[push_back_a(pSelf->m_aUIntStack)]
180 >> ch_p('R')
181 >> eps_p
182 )[boost::bind(&PDFGrammar::pushObjectRef, pSelf, _1, _2)];
183 #else
184 objectref = ( uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
185 >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
186 >> ch_p('R')
187 >> eps_p
188 )[boost::bind(&PDFGrammar::pushObjectRef, pSelf, _1, _2)];
189 #endif
191 #ifdef USE_ASSIGN_ACTOR
192 simple_type = objectref | name |
193 ( real_p[assign_a(pSelf->m_fDouble)] >> eps_p )
194 [boost::bind(&PDFGrammar::pushDouble, pSelf, _1, _2)]
195 | stringtype | boolean | null_object;
196 #else
197 simple_type = objectref | name |
198 ( real_p[boost::bind(&PDFGrammar::assign_action_double, pSelf, _1)] >> eps_p )
199 [boost::bind(&PDFGrammar::pushDouble, pSelf, _1, _2)]
200 | stringtype | boolean | null_object;
201 #endif
203 dict_begin = str_p( "<<" )[boost::bind(&PDFGrammar::beginDict, pSelf, _1, _2)];
204 dict_end = str_p( ">>" )[boost::bind(&PDFGrammar::endDict, pSelf, _1, _2)];
206 array_begin = str_p("[")[boost::bind(&PDFGrammar::beginArray,pSelf, _1, _2)];
207 array_end = str_p("]")[boost::bind(&PDFGrammar::endArray,pSelf, _1, _2)];
209 #ifdef USE_ASSIGN_ACTOR
210 object_begin= uint_p[push_back_a(pSelf->m_aUIntStack)]
211 >> uint_p[push_back_a(pSelf->m_aUIntStack)]
212 >> str_p("obj" )[boost::bind(&PDFGrammar::beginObject, pSelf, _1, _2)];
213 #else
214 object_begin= uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
215 >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
216 >> str_p("obj" )[boost::bind(&PDFGrammar::beginObject, pSelf, _1, _2)];
217 #endif
218 object_end = str_p( "endobj" )[boost::bind(&PDFGrammar::endObject, pSelf, _1, _2)];
220 xref = str_p( "xref" ) >> uint_p >> uint_p
221 >> lexeme_d[
222 +( repeat_p(10)[digit_p]
223 >> blank_p
224 >> repeat_p(5)[digit_p]
225 >> blank_p
226 >> ( ch_p('n') | ch_p('f') )
227 >> repeat_p(2)[space_p]
228 ) ];
230 dict_element= dict_begin | comment | simple_type
231 | array_begin | array_end | dict_end;
233 object = object_begin
234 >> *dict_element
235 >> !stream
236 >> object_end;
238 trailer = str_p( "trailer" )[boost::bind(&PDFGrammar::beginTrailer,pSelf,_1,_2)]
239 >> *dict_element
240 >> str_p("startxref")
241 >> uint_p
242 >> str_p("%%EOF")[boost::bind(&PDFGrammar::endTrailer,pSelf,_1,_2)];
244 #ifdef USE_ASSIGN_ACTOR
245 pdfrule = ! (lexeme_d[
246 str_p( "%PDF-" )
247 >> uint_p[push_back_a(pSelf->m_aUIntStack)]
248 >> ch_p('.')
249 >> uint_p[push_back_a(pSelf->m_aUIntStack)]
250 >> *((~ch_p('\r') & ~ch_p('\n')))
251 >> eol_p
252 ])[boost::bind(&PDFGrammar::haveFile,pSelf, _1, _2)]
253 >> *( comment | object | ( xref >> trailer ) );
254 #else
255 pdfrule = ! (lexeme_d[
256 str_p( "%PDF-" )
257 >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
258 >> ch_p('.')
259 >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
260 >> *(~ch_p('\r') & ~ch_p('\n'))
261 >> eol_p
262 ])[boost::bind(&PDFGrammar::haveFile,pSelf, _1, _2)]
263 >> *( comment | object | ( xref >> trailer ) );
264 #endif
266 rule< ScannerT > comment, stream, boolean, name, stringtype, null_object, simple_type,
267 objectref, array, value, dict_element, dict_begin, dict_end,
268 array_begin, array_end, object, object_begin, object_end,
269 xref, trailer, pdfrule;
271 const rule< ScannerT >& start() const { return pdfrule; }
274 #ifndef USE_ASSIGN_ACTOR
275 void push_back_action_uint( unsigned int i )
277 m_aUIntStack.push_back( i );
279 void assign_action_double( double d )
281 m_fDouble = d;
283 #endif
285 static void parseError( const char* pMessage, iteratorT pLocation )
287 throw_( pLocation, pMessage );
290 OString iteratorToString( iteratorT first, iteratorT last ) const
292 OStringBuffer aStr( 32 );
293 while( first != last )
295 aStr.append( *first );
296 ++first;
298 return aStr.makeStringAndClear();
301 void haveFile( iteratorT pBegin, SAL_UNUSED_PARAMETER iteratorT /*pEnd*/ )
303 if( m_aObjectStack.empty() )
305 PDFFile* pFile = new PDFFile();
306 pFile->m_nMinor = m_aUIntStack.back();
307 m_aUIntStack.pop_back();
308 pFile->m_nMajor = m_aUIntStack.back();
309 m_aUIntStack.pop_back();
310 m_aObjectStack.push_back( pFile );
312 else
313 parseError( "found file header in unusual place", pBegin );
316 void pushComment( iteratorT first, iteratorT last )
318 // add a comment to the current stack element
319 PDFComment* pComment =
320 new PDFComment(iteratorToString(first,last));
321 if( m_aObjectStack.empty() )
322 m_aObjectStack.push_back( new PDFPart() );
323 PDFContainer* pContainer = dynamic_cast<PDFContainer*>(m_aObjectStack.back());
324 if( pContainer == nullptr )
325 parseError( "comment without container", first );
326 pContainer->m_aSubElements.emplace_back( pComment );
329 void insertNewValue( std::unique_ptr<PDFEntry> pNewValue, iteratorT pPos )
331 PDFContainer* pContainer = nullptr;
332 const char* pMsg = nullptr;
333 if( ! m_aObjectStack.empty() )
335 pContainer = dynamic_cast<PDFContainer*>(m_aObjectStack.back());
336 if (pContainer)
338 if( dynamic_cast<PDFDict*>(pContainer) == nullptr &&
339 dynamic_cast<PDFArray*>(pContainer) == nullptr )
341 PDFObject* pObj = dynamic_cast<PDFObject*>(pContainer);
342 if( pObj )
344 if( pObj->m_pObject == nullptr )
345 pObj->m_pObject = pNewValue.get();
346 else
348 pMsg = "second value for object";
349 pContainer = nullptr;
352 else if( dynamic_cast<PDFDict*>(pNewValue.get()) )
354 PDFTrailer* pTrailer = dynamic_cast<PDFTrailer*>(pContainer);
355 if( pTrailer )
357 if( pTrailer->m_pDict == nullptr )
358 pTrailer->m_pDict = dynamic_cast<PDFDict*>(pNewValue.get());
359 else
360 pContainer = nullptr;
362 else
363 pContainer = nullptr;
365 else
366 pContainer = nullptr;
370 if( pContainer )
371 pContainer->m_aSubElements.emplace_back( std::move(pNewValue) );
372 else
374 if( ! pMsg )
376 if( dynamic_cast<PDFContainer*>(pNewValue.get()) )
377 pMsg = "array without container";
378 else
379 pMsg = "value without container";
381 parseError( pMsg, pPos );
385 void pushName( iteratorT first, iteratorT last )
387 insertNewValue( std::make_unique<PDFName>(iteratorToString(first,last)), first );
390 void pushDouble( iteratorT first, SAL_UNUSED_PARAMETER iteratorT /*last*/ )
392 insertNewValue( std::make_unique<PDFNumber>(m_fDouble), first );
395 void pushString( iteratorT first, iteratorT last )
397 insertNewValue( std::make_unique<PDFString>(iteratorToString(first,last)), first );
400 void pushBool( iteratorT first, iteratorT last )
402 insertNewValue( std::make_unique<PDFBool>( last-first == 4 ), first );
405 void pushNull( iteratorT first, SAL_UNUSED_PARAMETER iteratorT )
407 insertNewValue( std::make_unique<PDFNull>(), first );
411 void beginObject( iteratorT first, SAL_UNUSED_PARAMETER iteratorT /*last*/ )
413 if( m_aObjectStack.empty() )
414 m_aObjectStack.push_back( new PDFPart() );
416 unsigned int nGeneration = m_aUIntStack.back();
417 m_aUIntStack.pop_back();
418 unsigned int nObject = m_aUIntStack.back();
419 m_aUIntStack.pop_back();
421 PDFObject* pObj = new PDFObject( nObject, nGeneration );
422 pObj->m_nOffset = first - m_aGlobalBegin;
424 PDFContainer* pContainer = dynamic_cast<PDFContainer*>(m_aObjectStack.back());
425 if( pContainer &&
426 ( dynamic_cast<PDFFile*>(pContainer) ||
427 dynamic_cast<PDFPart*>(pContainer) ) )
429 pContainer->m_aSubElements.emplace_back( pObj );
430 m_aObjectStack.push_back( pObj );
432 else
433 parseError( "object in wrong place", first );
436 void endObject( iteratorT first, SAL_UNUSED_PARAMETER iteratorT )
438 if( m_aObjectStack.empty() )
439 parseError( "endobj without obj", first );
440 else if( dynamic_cast<PDFObject*>(m_aObjectStack.back()) == nullptr )
441 parseError( "spurious endobj", first );
442 else
443 m_aObjectStack.pop_back();
446 void pushObjectRef( iteratorT first, SAL_UNUSED_PARAMETER iteratorT )
448 unsigned int nGeneration = m_aUIntStack.back();
449 m_aUIntStack.pop_back();
450 unsigned int nObject = m_aUIntStack.back();
451 m_aUIntStack.pop_back();
452 insertNewValue( std::make_unique<PDFObjectRef>(nObject,nGeneration), first );
455 void beginDict( iteratorT first, SAL_UNUSED_PARAMETER iteratorT )
457 PDFDict* pDict = new PDFDict();
458 pDict->m_nOffset = first - m_aGlobalBegin;
460 insertNewValue( std::unique_ptr<PDFEntry>(pDict), first );
461 // will not come here if insertion fails (exception)
462 m_aObjectStack.push_back( pDict );
464 void endDict( iteratorT first, SAL_UNUSED_PARAMETER iteratorT )
466 PDFDict* pDict = nullptr;
467 if( m_aObjectStack.empty() )
468 parseError( "dictionary end without begin", first );
469 else if( (pDict = dynamic_cast<PDFDict*>(m_aObjectStack.back())) == nullptr )
470 parseError( "spurious dictionary end", first );
471 else
472 m_aObjectStack.pop_back();
474 PDFEntry* pOffender = pDict->buildMap();
475 if( pOffender )
477 StringEmitContext aCtx;
478 aCtx.write( "offending dictionary element: ", 30 );
479 pOffender->emit( aCtx );
480 m_aErrorString = aCtx.getString();
481 parseError( m_aErrorString.getStr(), first );
485 void beginArray( iteratorT first, SAL_UNUSED_PARAMETER iteratorT )
487 PDFArray* pArray = new PDFArray();
488 pArray->m_nOffset = first - m_aGlobalBegin;
490 insertNewValue( std::unique_ptr<PDFEntry>(pArray), first );
491 // will not come here if insertion fails (exception)
492 m_aObjectStack.push_back( pArray );
495 void endArray( iteratorT first, SAL_UNUSED_PARAMETER iteratorT )
497 if( m_aObjectStack.empty() )
498 parseError( "array end without begin", first );
499 else if( dynamic_cast<PDFArray*>(m_aObjectStack.back()) == nullptr )
500 parseError( "spurious array end", first );
501 else
502 m_aObjectStack.pop_back();
505 void emitStream( iteratorT first, iteratorT last )
507 if( m_aObjectStack.empty() )
508 parseError( "stream without object", first );
509 PDFObject* pObj = dynamic_cast<PDFObject*>(m_aObjectStack.back());
510 if( pObj && pObj->m_pObject )
512 if( pObj->m_pStream )
513 parseError( "multiple streams in object", first );
515 PDFDict* pDict = dynamic_cast<PDFDict*>(pObj->m_pObject);
516 if( pDict )
518 PDFStream* pStream = new PDFStream( first - m_aGlobalBegin, last - m_aGlobalBegin, pDict );
520 pObj->m_pStream = pStream;
521 pObj->m_aSubElements.emplace_back( pStream );
524 else
525 parseError( "stream without object", first );
528 void beginTrailer( iteratorT first, SAL_UNUSED_PARAMETER iteratorT )
530 if( m_aObjectStack.empty() )
531 m_aObjectStack.push_back( new PDFPart() );
533 PDFTrailer* pTrailer = new PDFTrailer();
534 pTrailer->m_nOffset = first - m_aGlobalBegin;
536 PDFContainer* pContainer = dynamic_cast<PDFContainer*>(m_aObjectStack.back());
537 if( pContainer &&
538 ( dynamic_cast<PDFFile*>(pContainer) ||
539 dynamic_cast<PDFPart*>(pContainer) ) )
541 pContainer->m_aSubElements.emplace_back( pTrailer );
542 m_aObjectStack.push_back( pTrailer );
544 else
545 parseError( "trailer in wrong place", first );
548 void endTrailer( iteratorT first, SAL_UNUSED_PARAMETER iteratorT )
550 if( m_aObjectStack.empty() )
551 parseError( "%%EOF without trailer", first );
552 else if( dynamic_cast<PDFTrailer*>(m_aObjectStack.back()) == nullptr )
553 parseError( "spurious %%EOF", first );
554 else
555 m_aObjectStack.pop_back();
561 std::unique_ptr<PDFEntry> PDFReader::read( const char* pFileName )
563 file_iterator<> file_start( pFileName );
564 if( ! file_start )
565 return nullptr;
566 file_iterator<> file_end = file_start.make_end();
567 PDFGrammar< file_iterator<> > aGrammar( file_start );
571 #if OSL_DEBUG_LEVEL > 0
572 boost::spirit::classic::parse_info< file_iterator<> > aInfo =
573 #endif
574 boost::spirit::classic::parse( file_start,
575 file_end,
576 aGrammar,
577 boost::spirit::classic::space_p );
578 #if OSL_DEBUG_LEVEL > 0
579 SAL_INFO("sdext.pdfimport.pdfparse", "parseinfo: stop at offset = " << aInfo.stop - file_start << ", hit = " << (aInfo.hit ? "true" : "false") << ", full = " << (aInfo.full ? "true" : "false") << ", length = " << aInfo.length);
580 #endif
582 catch( const parser_error< const char*, file_iterator<> >& rError )
584 SAL_WARN("sdext.pdfimport.pdfparse", "parse error: " << rError.descriptor << " at buffer pos " << rError.where - file_start);
585 #if OSL_DEBUG_LEVEL > 0
586 OUStringBuffer aTmp;
587 unsigned int nElem = aGrammar.m_aObjectStack.size();
588 for( unsigned int i = 0; i < nElem; i++ )
590 aTmp.append(" ");
591 aTmp.appendAscii(typeid( *(aGrammar.m_aObjectStack[i]) ).name());
593 SAL_WARN("sdext.pdfimport.pdfparse", "parse error object stack: " << aTmp.makeStringAndClear());
594 #endif
597 std::unique_ptr<PDFEntry> pRet;
598 unsigned int nEntries = aGrammar.m_aObjectStack.size();
599 if( nEntries == 1 )
601 pRet.reset(aGrammar.m_aObjectStack.back());
602 aGrammar.m_aObjectStack.pop_back();
604 else if( nEntries > 1 )
606 // It is possible that there are multiple trailers, which is OK.
607 // But still keep the warnings, just in case.
608 SAL_WARN("sdext.pdfimport.pdfparse", "error got " << nEntries << " stack objects in parse");
609 for (;;)
611 PDFEntry* pEntry = aGrammar.m_aObjectStack.back();
612 aGrammar.m_aObjectStack.pop_back();
613 SAL_WARN("sdext.pdfimport.pdfparse", typeid(*pEntry).name());
614 PDFObject* pObj = dynamic_cast<PDFObject*>(pEntry);
615 if( pObj )
616 SAL_WARN("sdext.pdfimport.pdfparse", " -> object " << pObj->m_nNumber << " generation " << pObj->m_nGeneration);
617 if (aGrammar.m_aObjectStack.empty())
619 pRet.reset(pEntry); // The first entry references all others - see PDFGrammar dtor
620 break;
624 return pRet;
627 #if defined(_MSC_VER)
628 #pragma warning(pop)
629 #endif
631 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */