Update ooo320-m1
[ooovba.git] / sdext / source / pdfimport / pdfparse / pdfparse.cxx
blobabfc647382d8ddd27338f2f1ff92e3d9d879e0e3
1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: pdfparse.cxx,v $
11 * $Revision: 1.2 $
13 * This file is part of OpenOffice.org.
15 * OpenOffice.org is free software: you can redistribute it and/or modify
16 * it under the terms of the GNU Lesser General Public License version 3
17 * only, as published by the Free Software Foundation.
19 * OpenOffice.org is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU Lesser General Public License version 3 for more details
23 * (a copy is included in the LICENSE file that accompanied this code).
25 * You should have received a copy of the GNU Lesser General Public License
26 * version 3 along with OpenOffice.org. If not, see
27 * <http://www.openoffice.org/license.html>
28 * for a copy of the LGPLv3 License.
30 ************************************************************************/
32 // MARKER(update_precomp.py): autogen include statement, do not remove
33 #include "precompiled_sdext.hxx"
35 #if defined __SUNPRO_CC
36 #pragma disable_warn
37 #elif defined _MSC_VER
38 #pragma warning(push, 1)
39 #endif
41 #include "pdfparse.hxx"
43 // workaround windows compiler: do not include multi_pass.hpp
44 //#include <boost/spirit.hpp>
45 #include <boost/spirit/include/classic_core.hpp>
46 #include <boost/spirit/include/classic_utility.hpp>
47 #include <boost/spirit/include/classic_error_handling.hpp>
48 #include <boost/spirit/include/classic_file_iterator.hpp>
49 #include <boost/bind.hpp>
50 #include <string>
52 #include <rtl/strbuf.hxx>
53 #include <rtl/memory.h>
55 // disable warnings again because someone along the line has enabled them
56 #if defined __SUNPRO_CC
57 #pragma disable_warn
58 #elif defined _MSC_VER
59 #pragma warning(push, 1)
60 #endif
62 using namespace boost::spirit;
63 using namespace rtl;
64 using namespace pdfparse;
66 class StringEmitContext : public EmitContext
68 OStringBuffer m_aBuf;
69 public:
70 StringEmitContext() : EmitContext(), m_aBuf(256) {}
71 virtual ~StringEmitContext() {}
72 virtual bool write( const void* pBuf, unsigned int nLen ) throw()
74 m_aBuf.append( (const sal_Char*)pBuf, nLen );
75 return true;
77 virtual unsigned int getCurPos() throw() { return m_aBuf.getLength(); }
78 virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) throw()
79 { return (nOrigOffset+nLen < static_cast<unsigned int>(m_aBuf.getLength()) ) ?
80 write( m_aBuf.getStr() + nOrigOffset, nLen ) : false; }
81 virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) throw()
83 if( nOrigOffset+nLen < static_cast<unsigned int>(m_aBuf.getLength()) )
85 rtl_copyMemory( pBuf, m_aBuf.getStr()+nOrigOffset, nLen );
86 return nLen;
88 return 0;
91 OString getString() { return m_aBuf.makeStringAndClear(); }
94 template< class iteratorT >
95 class PDFGrammar : public grammar< PDFGrammar<iteratorT> >
97 public:
99 PDFGrammar( const iteratorT& first )
100 : m_fDouble( 0.0 ), m_aGlobalBegin( first ) {}
101 ~PDFGrammar()
103 if( !m_aObjectStack.empty() )
104 delete m_aObjectStack.front();
107 double m_fDouble;
108 std::vector< unsigned int > m_aUIntStack;
109 std::vector< PDFEntry* > m_aObjectStack;
110 rtl::OString m_aErrorString;
111 iteratorT m_aGlobalBegin;
113 public:
115 template< typename ScannerT >
116 struct definition
118 definition( const PDFGrammar<iteratorT>& rSelf )
120 PDFGrammar<iteratorT>* pSelf = const_cast< PDFGrammar<iteratorT>* >( &rSelf );
122 // workaround workshop compiler: comment_p doesn't work
123 // comment = comment_p("%")[boost::bind(&PDFGrammar::pushComment, pSelf, _1, _2 )];
124 comment = lexeme_d[ (ch_p('%') >> *(~ch_p('\r') & ~ch_p('\n')) >> eol_p)[boost::bind(&PDFGrammar::pushComment, pSelf, _1, _2 )] ];
126 boolean = (str_p("true") | str_p("false"))[boost::bind(&PDFGrammar::pushBool, pSelf, _1, _2)];
128 // workaround workshop compiler: confix_p doesn't work
129 //stream = confix_p( "stream", *anychar_p, "endstream" )[boost::bind(&PDFGrammar::emitStream, pSelf, _1, _2 )];
130 stream = (str_p("stream") >> *(anychar_p - str_p("endstream")) >> str_p("endstream"))[boost::bind(&PDFGrammar::emitStream, pSelf, _1, _2 )];
132 name = lexeme_d[
133 ch_p('/')
134 >> (*(anychar_p-chset_p("\t\n\f\r ()<>[]{}/%")-ch_p('\0')))
135 [boost::bind(&PDFGrammar::pushName, pSelf, _1, _2)] ];
137 // workaround workshop compiler: confix_p doesn't work
138 //stringtype = ( confix_p("(",*anychar_p, ")") |
139 // confix_p("<",*xdigit_p, ">") )
140 // [boost::bind(&PDFGrammar::pushString,pSelf, _1, _2)];
141 stringtype = ( ( ch_p('(') >> *(str_p("\\)")|(anychar_p - ch_p(')'))) >> ch_p(')') ) |
142 ( ch_p('<') >> *xdigit_p >> ch_p('>') ) )
143 [boost::bind(&PDFGrammar::pushString,pSelf, _1, _2)];
145 null_object = str_p( "null" )[boost::bind(&PDFGrammar::pushNull, pSelf, _1, _2)];
147 #ifdef USE_ASSIGN_ACTOR
148 objectref = ( uint_p[push_back_a(pSelf->m_aUIntStack)]
149 >> uint_p[push_back_a(pSelf->m_aUIntStack)]
150 >> ch_p('R')
151 >> eps_p
152 )[boost::bind(&PDFGrammar::pushObjectRef, pSelf, _1, _2)];
153 #else
154 objectref = ( uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
155 >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
156 >> ch_p('R')
157 >> eps_p
158 )[boost::bind(&PDFGrammar::pushObjectRef, pSelf, _1, _2)];
159 #endif
161 #ifdef USE_ASSIGN_ACTOR
162 simple_type = objectref | name |
163 ( real_p[assign_a(pSelf->m_fDouble)] >> eps_p )
164 [boost::bind(&PDFGrammar::pushDouble, pSelf, _1, _2)]
165 | stringtype | boolean | null_object;
166 #else
167 simple_type = objectref | name |
168 ( real_p[boost::bind(&PDFGrammar::assign_action_double, pSelf, _1)] >> eps_p )
169 [boost::bind(&PDFGrammar::pushDouble, pSelf, _1, _2)]
170 | stringtype | boolean | null_object;
171 #endif
173 dict_begin = str_p( "<<" )[boost::bind(&PDFGrammar::beginDict, pSelf, _1, _2)];
174 dict_end = str_p( ">>" )[boost::bind(&PDFGrammar::endDict, pSelf, _1, _2)];
176 array_begin = str_p("[")[boost::bind(&PDFGrammar::beginArray,pSelf, _1, _2)];
177 array_end = str_p("]")[boost::bind(&PDFGrammar::endArray,pSelf, _1, _2)];
179 #ifdef USE_ASSIGN_ACTOR
180 object_begin= uint_p[push_back_a(pSelf->m_aUIntStack)]
181 >> uint_p[push_back_a(pSelf->m_aUIntStack)]
182 >> str_p("obj" )[boost::bind(&PDFGrammar::beginObject, pSelf, _1, _2)];
183 #else
184 object_begin= uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
185 >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
186 >> str_p("obj" )[boost::bind(&PDFGrammar::beginObject, pSelf, _1, _2)];
187 #endif
188 object_end = str_p( "endobj" )[boost::bind(&PDFGrammar::endObject, pSelf, _1, _2)];
190 xref = str_p( "xref" ) >> uint_p >> uint_p
191 >> lexeme_d[
192 +( repeat_p(10)[digit_p]
193 >> blank_p
194 >> repeat_p(5)[digit_p]
195 >> blank_p
196 >> ( ch_p('n') | ch_p('f') )
197 >> repeat_p(2)[space_p]
198 ) ];
200 dict_element= dict_begin | comment | simple_type
201 | array_begin | array_end | dict_end;
203 object = object_begin
204 >> *dict_element
205 >> !stream
206 >> object_end;
208 trailer = str_p( "trailer" )[boost::bind(&PDFGrammar::beginTrailer,pSelf,_1,_2)]
209 >> *dict_element
210 >> str_p("startxref")
211 >> uint_p
212 >> str_p("%%EOF")[boost::bind(&PDFGrammar::endTrailer,pSelf,_1,_2)];
214 #ifdef USE_ASSIGN_ACTOR
215 pdfrule = ! (lexeme_d[
216 str_p( "%PDF-" )
217 >> uint_p[push_back_a(pSelf->m_aUIntStack)]
218 >> ch_p('.')
219 >> uint_p[push_back_a(pSelf->m_aUIntStack)]
220 >> *((~ch_p('\r') & ~ch_p('\n')))
221 >> eol_p
222 ])[boost::bind(&PDFGrammar::haveFile,pSelf, _1, _2)]
223 >> *( comment | object | ( xref >> trailer ) );
224 #else
225 pdfrule = ! (lexeme_d[
226 str_p( "%PDF-" )
227 >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
228 >> ch_p('.')
229 >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
230 >> *((~ch_p('\r') & ~ch_p('\n')))
231 >> eol_p
232 ])[boost::bind(&PDFGrammar::haveFile,pSelf, _1, _2)]
233 >> *( comment | object | ( xref >> trailer ) );
234 #endif
236 rule< ScannerT > comment, stream, boolean, name, stringtype, null_object, simple_type,
237 objectref, array, value, dict_element, dict_begin, dict_end,
238 array_begin, array_end, object, object_begin, object_end,
239 xref, trailer, pdfrule;
241 const rule< ScannerT >& start() const { return pdfrule; }
244 #ifndef USE_ASSIGN_ACTOR
245 void push_back_action_uint( unsigned int i )
247 m_aUIntStack.push_back( i );
249 void assign_action_double( double d )
251 m_fDouble = d;
253 #endif
255 void parseError( const char* pMessage, iteratorT pLocation )
257 throw_( pLocation, pMessage );
260 rtl::OString iteratorToString( iteratorT first, iteratorT last ) const
262 rtl::OStringBuffer aStr( 32 );
263 while( first != last )
265 aStr.append( *first );
266 ++first;
268 return aStr.makeStringAndClear();
271 void haveFile( iteratorT pBegin, iteratorT /*pEnd*/ )
273 if( m_aObjectStack.empty() )
275 PDFFile* pFile = new PDFFile();
276 pFile->m_nMinor = m_aUIntStack.back();
277 m_aUIntStack.pop_back();
278 pFile->m_nMajor = m_aUIntStack.back();
279 m_aUIntStack.pop_back();
280 m_aObjectStack.push_back( pFile );
282 else
283 parseError( "found file header in unusual place", pBegin );
286 void pushComment( iteratorT first, iteratorT last )
288 // add a comment to the current stack element
289 PDFComment* pComment =
290 new PDFComment(iteratorToString(first,last));
291 if( m_aObjectStack.empty() )
292 m_aObjectStack.push_back( new PDFPart() );
293 PDFContainer* pContainer = dynamic_cast<PDFContainer*>(m_aObjectStack.back());
294 if( pContainer == NULL )
295 parseError( "comment without container", first );
296 pContainer->m_aSubElements.push_back( pComment );
299 void insertNewValue( PDFEntry* pNewValue, iteratorT pPos )
301 PDFContainer* pContainer = NULL;
302 const char* pMsg = NULL;
303 if( ! m_aObjectStack.empty() &&
304 (pContainer = dynamic_cast<PDFContainer*>(m_aObjectStack.back())) != NULL )
306 if( dynamic_cast<PDFDict*>(pContainer) == NULL &&
307 dynamic_cast<PDFArray*>(pContainer) == NULL )
309 PDFObject* pObj = dynamic_cast<PDFObject*>(pContainer);
310 if( pObj )
312 if( pObj->m_pObject == NULL )
313 pObj->m_pObject = pNewValue;
314 else
316 pMsg = "second value for object";
317 pContainer = NULL;
320 else if( dynamic_cast<PDFDict*>(pNewValue) )
322 PDFTrailer* pTrailer = dynamic_cast<PDFTrailer*>(pContainer);
323 if( pTrailer )
325 if( pTrailer->m_pDict == NULL )
326 pTrailer->m_pDict = dynamic_cast<PDFDict*>(pNewValue);
327 else
328 pContainer = NULL;
330 else
331 pContainer = NULL;
333 else
334 pContainer = NULL;
337 if( pContainer )
338 pContainer->m_aSubElements.push_back( pNewValue );
339 else
341 if( ! pMsg )
343 if( dynamic_cast<PDFContainer*>(pNewValue) )
344 pMsg = "array without container";
345 else
346 pMsg = "value without container";
348 delete pNewValue;
349 parseError( pMsg, pPos );
353 void pushName( iteratorT first, iteratorT last )
355 insertNewValue( new PDFName(iteratorToString(first,last)), first );
358 void pushDouble( iteratorT first, iteratorT /*last*/ )
360 insertNewValue( new PDFNumber(m_fDouble), first );
363 void pushString( iteratorT first, iteratorT last )
365 insertNewValue( new PDFString(iteratorToString(first,last)), first );
368 void pushBool( iteratorT first, iteratorT last )
370 insertNewValue( new PDFBool( (last-first == 4) ), first );
373 void pushNull( iteratorT first, iteratorT )
375 insertNewValue( new PDFNull(), first );
379 void beginObject( iteratorT first, iteratorT /*last*/ )
381 if( m_aObjectStack.empty() )
382 m_aObjectStack.push_back( new PDFPart() );
384 unsigned int nGeneration = m_aUIntStack.back();
385 m_aUIntStack.pop_back();
386 unsigned int nObject = m_aUIntStack.back();
387 m_aUIntStack.pop_back();
389 PDFObject* pObj = new PDFObject( nObject, nGeneration );
390 pObj->m_nOffset = first - m_aGlobalBegin;
392 PDFContainer* pContainer = dynamic_cast<PDFContainer*>(m_aObjectStack.back());
393 if( pContainer &&
394 ( dynamic_cast<PDFFile*>(pContainer) ||
395 dynamic_cast<PDFPart*>(pContainer) ) )
397 pContainer->m_aSubElements.push_back( pObj );
398 m_aObjectStack.push_back( pObj );
400 else
401 parseError( "object in wrong place", first );
404 void endObject( iteratorT first, iteratorT )
406 if( m_aObjectStack.empty() )
407 parseError( "endobj without obj", first );
408 else if( dynamic_cast<PDFObject*>(m_aObjectStack.back()) == NULL )
409 parseError( "spurious endobj", first );
410 else
411 m_aObjectStack.pop_back();
414 void pushObjectRef( iteratorT first, iteratorT )
416 unsigned int nGeneration = m_aUIntStack.back();
417 m_aUIntStack.pop_back();
418 unsigned int nObject = m_aUIntStack.back();
419 m_aUIntStack.pop_back();
420 insertNewValue( new PDFObjectRef(nObject,nGeneration), first );
423 void beginDict( iteratorT first, iteratorT )
425 PDFDict* pDict = new PDFDict();
426 pDict->m_nOffset = first - m_aGlobalBegin;
428 insertNewValue( pDict, first );
429 // will not come here if insertion fails (exception)
430 m_aObjectStack.push_back( pDict );
432 void endDict( iteratorT first, iteratorT )
434 PDFDict* pDict = NULL;
435 if( m_aObjectStack.empty() )
436 parseError( "dictionary end without begin", first );
437 else if( (pDict = dynamic_cast<PDFDict*>(m_aObjectStack.back())) == NULL )
438 parseError( "spurious dictionary end", first );
439 else
440 m_aObjectStack.pop_back();
442 PDFEntry* pOffender = pDict->buildMap();
443 if( pOffender )
445 StringEmitContext aCtx;
446 aCtx.write( "offending dictionary element: ", 30 );
447 pOffender->emit( aCtx );
448 m_aErrorString = aCtx.getString();
449 parseError( m_aErrorString.getStr(), first );
453 void beginArray( iteratorT first, iteratorT )
455 PDFArray* pArray = new PDFArray();
456 pArray->m_nOffset = first - m_aGlobalBegin;
458 insertNewValue( pArray, first );
459 // will not come here if insertion fails (exception)
460 m_aObjectStack.push_back( pArray );
463 void endArray( iteratorT first, iteratorT )
465 if( m_aObjectStack.empty() )
466 parseError( "array end without begin", first );
467 else if( dynamic_cast<PDFArray*>(m_aObjectStack.back()) == NULL )
468 parseError( "spurious array end", first );
469 else
470 m_aObjectStack.pop_back();
473 void emitStream( iteratorT first, iteratorT last )
475 if( m_aObjectStack.empty() )
476 parseError( "stream without object", first );
477 PDFObject* pObj = dynamic_cast<PDFObject*>(m_aObjectStack.back());
478 if( pObj && pObj->m_pObject )
480 if( pObj->m_pStream )
481 parseError( "multiple streams in object", first );
483 PDFDict* pDict = dynamic_cast<PDFDict*>(pObj->m_pObject);
484 if( pDict )
486 PDFStream* pStream = new PDFStream( first - m_aGlobalBegin, last - m_aGlobalBegin, pDict );
488 pObj->m_pStream = pStream;
489 pObj->m_aSubElements.push_back( pStream );
492 else
493 parseError( "stream without object", first );
496 void beginTrailer( iteratorT first, iteratorT )
498 if( m_aObjectStack.empty() )
499 m_aObjectStack.push_back( new PDFPart() );
501 PDFTrailer* pTrailer = new PDFTrailer();
502 pTrailer->m_nOffset = first - m_aGlobalBegin;
504 PDFContainer* pContainer = dynamic_cast<PDFContainer*>(m_aObjectStack.back());
505 if( pContainer &&
506 ( dynamic_cast<PDFFile*>(pContainer) ||
507 dynamic_cast<PDFPart*>(pContainer) ) )
509 pContainer->m_aSubElements.push_back( pTrailer );
510 m_aObjectStack.push_back( pTrailer );
512 else
513 parseError( "trailer in wrong place", first );
516 void endTrailer( iteratorT first, iteratorT )
518 if( m_aObjectStack.empty() )
519 parseError( "%%EOF without trailer", first );
520 else if( dynamic_cast<PDFTrailer*>(m_aObjectStack.back()) == NULL )
521 parseError( "spurious %%EOF", first );
522 else
523 m_aObjectStack.pop_back();
527 PDFEntry* PDFReader::read( const char* pBuffer, unsigned int nLen )
529 PDFGrammar<const char*> aGrammar( pBuffer );
533 boost::spirit::parse_info<const char*> aInfo =
534 boost::spirit::parse( pBuffer,
535 pBuffer+nLen,
536 aGrammar,
537 boost::spirit::space_p );
538 #if OSL_DEBUG_LEVEL > 1
539 fprintf( stderr, "parseinfo: stop = %p (buff=%p, offset = %d), hit = %s, full = %s, length = %d\n",
540 aInfo.stop, pBuffer, aInfo.stop - pBuffer,
541 aInfo.hit ? "true" : "false",
542 aInfo.full ? "true" : "false",
543 aInfo.length );
544 #endif
546 catch( parser_error<const char*, const char*>& rError )
548 #if OSL_DEBUG_LEVEL > 1
549 fprintf( stderr, "parse error: %s at buffer pos %u\nobject stack:\n",
550 rError.descriptor, rError.where - pBuffer );
551 unsigned int nElem = aGrammar.m_aObjectStack.size();
552 for( unsigned int i = 0; i < nElem; i++ )
554 fprintf( stderr, " %s\n", typeid( *(aGrammar.m_aObjectStack[i]) ).name() );
556 #endif
559 PDFEntry* pRet = NULL;
560 unsigned int nEntries = aGrammar.m_aObjectStack.size();
561 if( nEntries == 1 )
563 pRet = aGrammar.m_aObjectStack.back();
564 aGrammar.m_aObjectStack.pop_back();
566 #if OSL_DEBUG_LEVEL > 1
567 else if( nEntries > 1 )
568 fprintf( stderr, "error got %u stack objects in parse\n", nEntries );
569 #endif
571 return pRet;
574 PDFEntry* PDFReader::read( const char* pFileName )
576 file_iterator<> file_start( pFileName );
577 if( ! file_start )
578 return NULL;
579 file_iterator<> file_end = file_start.make_end();
580 PDFGrammar< file_iterator<> > aGrammar( file_start );
584 boost::spirit::parse_info< file_iterator<> > aInfo =
585 boost::spirit::parse( file_start,
586 file_end,
587 aGrammar,
588 boost::spirit::space_p );
589 #if OSL_DEBUG_LEVEL > 1
590 fprintf( stderr, "parseinfo: stop at offset = %d, hit = %s, full = %s, length = %d\n",
591 aInfo.stop - file_start,
592 aInfo.hit ? "true" : "false",
593 aInfo.full ? "true" : "false",
594 aInfo.length );
595 #endif
597 catch( parser_error< const char*, file_iterator<> >& rError )
599 #if OSL_DEBUG_LEVEL > 1
600 fprintf( stderr, "parse error: %s at buffer pos %u\nobject stack:\n",
601 rError.descriptor, rError.where - file_start );
602 unsigned int nElem = aGrammar.m_aObjectStack.size();
603 for( unsigned int i = 0; i < nElem; i++ )
605 fprintf( stderr, " %s\n", typeid( *(aGrammar.m_aObjectStack[i]) ).name() );
607 #endif
610 PDFEntry* pRet = NULL;
611 unsigned int nEntries = aGrammar.m_aObjectStack.size();
612 if( nEntries == 1 )
614 pRet = aGrammar.m_aObjectStack.back();
615 aGrammar.m_aObjectStack.pop_back();
617 #if OSL_DEBUG_LEVEL > 1
618 else if( nEntries > 1 )
620 fprintf( stderr, "error got %u stack objects in parse\n", nEntries );
621 for( unsigned int i = 0; i < nEntries; i++ )
623 fprintf( stderr, "%s\n", typeid(*aGrammar.m_aObjectStack[i]).name() );
624 PDFObject* pObj = dynamic_cast<PDFObject*>(aGrammar.m_aObjectStack[i]);
625 if( pObj )
626 fprintf( stderr, " -> object %d generation %d\n", pObj->m_nNumber, pObj->m_nGeneration );
627 else
628 fprintf( stderr, "(type %s)\n", typeid(*aGrammar.m_aObjectStack[i]).name() );
631 #endif
633 return pRet;
636 #if defined __SUNPRO_CC
637 #pragma enable_warn
638 #elif defined _MSC_VER
639 #pragma warning(pop)
640 #endif