1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: pdfparse.cxx,v $
13 * This file is part of OpenOffice.org.
15 * OpenOffice.org is free software: you can redistribute it and/or modify
16 * it under the terms of the GNU Lesser General Public License version 3
17 * only, as published by the Free Software Foundation.
19 * OpenOffice.org is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU Lesser General Public License version 3 for more details
23 * (a copy is included in the LICENSE file that accompanied this code).
25 * You should have received a copy of the GNU Lesser General Public License
26 * version 3 along with OpenOffice.org. If not, see
27 * <http://www.openoffice.org/license.html>
28 * for a copy of the LGPLv3 License.
30 ************************************************************************/
32 // MARKER(update_precomp.py): autogen include statement, do not remove
33 #include "precompiled_sdext.hxx"
35 #if defined __SUNPRO_CC
37 #elif defined _MSC_VER
38 #pragma warning(push, 1)
41 #include "pdfparse.hxx"
43 // workaround windows compiler: do not include multi_pass.hpp
44 //#include <boost/spirit.hpp>
45 #include <boost/spirit/include/classic_core.hpp>
46 #include <boost/spirit/include/classic_utility.hpp>
47 #include <boost/spirit/include/classic_error_handling.hpp>
48 #include <boost/spirit/include/classic_file_iterator.hpp>
49 #include <boost/bind.hpp>
52 #include <rtl/strbuf.hxx>
53 #include <rtl/memory.h>
55 // disable warnings again because someone along the line has enabled them
56 #if defined __SUNPRO_CC
58 #elif defined _MSC_VER
59 #pragma warning(push, 1)
62 using namespace boost::spirit
;
64 using namespace pdfparse
;
66 class StringEmitContext
: public EmitContext
70 StringEmitContext() : EmitContext(), m_aBuf(256) {}
71 virtual ~StringEmitContext() {}
72 virtual bool write( const void* pBuf
, unsigned int nLen
) throw()
74 m_aBuf
.append( (const sal_Char
*)pBuf
, nLen
);
77 virtual unsigned int getCurPos() throw() { return m_aBuf
.getLength(); }
78 virtual bool copyOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
) throw()
79 { return (nOrigOffset
+nLen
< static_cast<unsigned int>(m_aBuf
.getLength()) ) ?
80 write( m_aBuf
.getStr() + nOrigOffset
, nLen
) : false; }
81 virtual unsigned int readOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
, void* pBuf
) throw()
83 if( nOrigOffset
+nLen
< static_cast<unsigned int>(m_aBuf
.getLength()) )
85 rtl_copyMemory( pBuf
, m_aBuf
.getStr()+nOrigOffset
, nLen
);
91 OString
getString() { return m_aBuf
.makeStringAndClear(); }
94 template< class iteratorT
>
95 class PDFGrammar
: public grammar
< PDFGrammar
<iteratorT
> >
99 PDFGrammar( const iteratorT
& first
)
100 : m_fDouble( 0.0 ), m_aGlobalBegin( first
) {}
103 if( !m_aObjectStack
.empty() )
104 delete m_aObjectStack
.front();
108 std::vector
< unsigned int > m_aUIntStack
;
109 std::vector
< PDFEntry
* > m_aObjectStack
;
110 rtl::OString m_aErrorString
;
111 iteratorT m_aGlobalBegin
;
115 template< typename ScannerT
>
118 definition( const PDFGrammar
<iteratorT
>& rSelf
)
120 PDFGrammar
<iteratorT
>* pSelf
= const_cast< PDFGrammar
<iteratorT
>* >( &rSelf
);
122 // workaround workshop compiler: comment_p doesn't work
123 // comment = comment_p("%")[boost::bind(&PDFGrammar::pushComment, pSelf, _1, _2 )];
124 comment
= lexeme_d
[ (ch_p('%') >> *(~ch_p('\r') & ~ch_p('\n')) >> eol_p
)[boost::bind(&PDFGrammar::pushComment
, pSelf
, _1
, _2
)] ];
126 boolean
= (str_p("true") | str_p("false"))[boost::bind(&PDFGrammar::pushBool
, pSelf
, _1
, _2
)];
128 // workaround workshop compiler: confix_p doesn't work
129 //stream = confix_p( "stream", *anychar_p, "endstream" )[boost::bind(&PDFGrammar::emitStream, pSelf, _1, _2 )];
130 stream
= (str_p("stream") >> *(anychar_p
- str_p("endstream")) >> str_p("endstream"))[boost::bind(&PDFGrammar::emitStream
, pSelf
, _1
, _2
)];
134 >> (*(anychar_p
-chset_p("\t\n\f\r ()<>[]{}/%")-ch_p('\0')))
135 [boost::bind(&PDFGrammar::pushName
, pSelf
, _1
, _2
)] ];
137 // workaround workshop compiler: confix_p doesn't work
138 //stringtype = ( confix_p("(",*anychar_p, ")") |
139 // confix_p("<",*xdigit_p, ">") )
140 // [boost::bind(&PDFGrammar::pushString,pSelf, _1, _2)];
141 stringtype
= ( ( ch_p('(') >> *(str_p("\\)")|(anychar_p
- ch_p(')'))) >> ch_p(')') ) |
142 ( ch_p('<') >> *xdigit_p
>> ch_p('>') ) )
143 [boost::bind(&PDFGrammar::pushString
,pSelf
, _1
, _2
)];
145 null_object
= str_p( "null" )[boost::bind(&PDFGrammar::pushNull
, pSelf
, _1
, _2
)];
147 #ifdef USE_ASSIGN_ACTOR
148 objectref
= ( uint_p
[push_back_a(pSelf
->m_aUIntStack
)]
149 >> uint_p
[push_back_a(pSelf
->m_aUIntStack
)]
152 )[boost::bind(&PDFGrammar::pushObjectRef
, pSelf
, _1
, _2
)];
154 objectref
= ( uint_p
[boost::bind(&PDFGrammar::push_back_action_uint
, pSelf
, _1
)]
155 >> uint_p
[boost::bind(&PDFGrammar::push_back_action_uint
, pSelf
, _1
)]
158 )[boost::bind(&PDFGrammar::pushObjectRef
, pSelf
, _1
, _2
)];
161 #ifdef USE_ASSIGN_ACTOR
162 simple_type
= objectref
| name
|
163 ( real_p
[assign_a(pSelf
->m_fDouble
)] >> eps_p
)
164 [boost::bind(&PDFGrammar::pushDouble
, pSelf
, _1
, _2
)]
165 | stringtype
| boolean
| null_object
;
167 simple_type
= objectref
| name
|
168 ( real_p
[boost::bind(&PDFGrammar::assign_action_double
, pSelf
, _1
)] >> eps_p
)
169 [boost::bind(&PDFGrammar::pushDouble
, pSelf
, _1
, _2
)]
170 | stringtype
| boolean
| null_object
;
173 dict_begin
= str_p( "<<" )[boost::bind(&PDFGrammar::beginDict
, pSelf
, _1
, _2
)];
174 dict_end
= str_p( ">>" )[boost::bind(&PDFGrammar::endDict
, pSelf
, _1
, _2
)];
176 array_begin
= str_p("[")[boost::bind(&PDFGrammar::beginArray
,pSelf
, _1
, _2
)];
177 array_end
= str_p("]")[boost::bind(&PDFGrammar::endArray
,pSelf
, _1
, _2
)];
179 #ifdef USE_ASSIGN_ACTOR
180 object_begin
= uint_p
[push_back_a(pSelf
->m_aUIntStack
)]
181 >> uint_p
[push_back_a(pSelf
->m_aUIntStack
)]
182 >> str_p("obj" )[boost::bind(&PDFGrammar::beginObject
, pSelf
, _1
, _2
)];
184 object_begin
= uint_p
[boost::bind(&PDFGrammar::push_back_action_uint
, pSelf
, _1
)]
185 >> uint_p
[boost::bind(&PDFGrammar::push_back_action_uint
, pSelf
, _1
)]
186 >> str_p("obj" )[boost::bind(&PDFGrammar::beginObject
, pSelf
, _1
, _2
)];
188 object_end
= str_p( "endobj" )[boost::bind(&PDFGrammar::endObject
, pSelf
, _1
, _2
)];
190 xref
= str_p( "xref" ) >> uint_p
>> uint_p
192 +( repeat_p(10)[digit_p
]
194 >> repeat_p(5)[digit_p
]
196 >> ( ch_p('n') | ch_p('f') )
197 >> repeat_p(2)[space_p
]
200 dict_element
= dict_begin
| comment
| simple_type
201 | array_begin
| array_end
| dict_end
;
203 object
= object_begin
208 trailer
= str_p( "trailer" )[boost::bind(&PDFGrammar::beginTrailer
,pSelf
,_1
,_2
)]
210 >> str_p("startxref")
212 >> str_p("%%EOF")[boost::bind(&PDFGrammar::endTrailer
,pSelf
,_1
,_2
)];
214 #ifdef USE_ASSIGN_ACTOR
215 pdfrule
= ! (lexeme_d
[
217 >> uint_p
[push_back_a(pSelf
->m_aUIntStack
)]
219 >> uint_p
[push_back_a(pSelf
->m_aUIntStack
)]
220 >> *((~ch_p('\r') & ~ch_p('\n')))
222 ])[boost::bind(&PDFGrammar::haveFile
,pSelf
, _1
, _2
)]
223 >> *( comment
| object
| ( xref
>> trailer
) );
225 pdfrule
= ! (lexeme_d
[
227 >> uint_p
[boost::bind(&PDFGrammar::push_back_action_uint
, pSelf
, _1
)]
229 >> uint_p
[boost::bind(&PDFGrammar::push_back_action_uint
, pSelf
, _1
)]
230 >> *((~ch_p('\r') & ~ch_p('\n')))
232 ])[boost::bind(&PDFGrammar::haveFile
,pSelf
, _1
, _2
)]
233 >> *( comment
| object
| ( xref
>> trailer
) );
236 rule
< ScannerT
> comment
, stream
, boolean
, name
, stringtype
, null_object
, simple_type
,
237 objectref
, array
, value
, dict_element
, dict_begin
, dict_end
,
238 array_begin
, array_end
, object
, object_begin
, object_end
,
239 xref
, trailer
, pdfrule
;
241 const rule
< ScannerT
>& start() const { return pdfrule
; }
244 #ifndef USE_ASSIGN_ACTOR
245 void push_back_action_uint( unsigned int i
)
247 m_aUIntStack
.push_back( i
);
249 void assign_action_double( double d
)
255 void parseError( const char* pMessage
, iteratorT pLocation
)
257 throw_( pLocation
, pMessage
);
260 rtl::OString
iteratorToString( iteratorT first
, iteratorT last
) const
262 rtl::OStringBuffer
aStr( 32 );
263 while( first
!= last
)
265 aStr
.append( *first
);
268 return aStr
.makeStringAndClear();
271 void haveFile( iteratorT pBegin
, iteratorT
/*pEnd*/ )
273 if( m_aObjectStack
.empty() )
275 PDFFile
* pFile
= new PDFFile();
276 pFile
->m_nMinor
= m_aUIntStack
.back();
277 m_aUIntStack
.pop_back();
278 pFile
->m_nMajor
= m_aUIntStack
.back();
279 m_aUIntStack
.pop_back();
280 m_aObjectStack
.push_back( pFile
);
283 parseError( "found file header in unusual place", pBegin
);
286 void pushComment( iteratorT first
, iteratorT last
)
288 // add a comment to the current stack element
289 PDFComment
* pComment
=
290 new PDFComment(iteratorToString(first
,last
));
291 if( m_aObjectStack
.empty() )
292 m_aObjectStack
.push_back( new PDFPart() );
293 PDFContainer
* pContainer
= dynamic_cast<PDFContainer
*>(m_aObjectStack
.back());
294 if( pContainer
== NULL
)
295 parseError( "comment without container", first
);
296 pContainer
->m_aSubElements
.push_back( pComment
);
299 void insertNewValue( PDFEntry
* pNewValue
, iteratorT pPos
)
301 PDFContainer
* pContainer
= NULL
;
302 const char* pMsg
= NULL
;
303 if( ! m_aObjectStack
.empty() &&
304 (pContainer
= dynamic_cast<PDFContainer
*>(m_aObjectStack
.back())) != NULL
)
306 if( dynamic_cast<PDFDict
*>(pContainer
) == NULL
&&
307 dynamic_cast<PDFArray
*>(pContainer
) == NULL
)
309 PDFObject
* pObj
= dynamic_cast<PDFObject
*>(pContainer
);
312 if( pObj
->m_pObject
== NULL
)
313 pObj
->m_pObject
= pNewValue
;
316 pMsg
= "second value for object";
320 else if( dynamic_cast<PDFDict
*>(pNewValue
) )
322 PDFTrailer
* pTrailer
= dynamic_cast<PDFTrailer
*>(pContainer
);
325 if( pTrailer
->m_pDict
== NULL
)
326 pTrailer
->m_pDict
= dynamic_cast<PDFDict
*>(pNewValue
);
338 pContainer
->m_aSubElements
.push_back( pNewValue
);
343 if( dynamic_cast<PDFContainer
*>(pNewValue
) )
344 pMsg
= "array without container";
346 pMsg
= "value without container";
349 parseError( pMsg
, pPos
);
353 void pushName( iteratorT first
, iteratorT last
)
355 insertNewValue( new PDFName(iteratorToString(first
,last
)), first
);
358 void pushDouble( iteratorT first
, iteratorT
/*last*/ )
360 insertNewValue( new PDFNumber(m_fDouble
), first
);
363 void pushString( iteratorT first
, iteratorT last
)
365 insertNewValue( new PDFString(iteratorToString(first
,last
)), first
);
368 void pushBool( iteratorT first
, iteratorT last
)
370 insertNewValue( new PDFBool( (last
-first
== 4) ), first
);
373 void pushNull( iteratorT first
, iteratorT
)
375 insertNewValue( new PDFNull(), first
);
379 void beginObject( iteratorT first
, iteratorT
/*last*/ )
381 if( m_aObjectStack
.empty() )
382 m_aObjectStack
.push_back( new PDFPart() );
384 unsigned int nGeneration
= m_aUIntStack
.back();
385 m_aUIntStack
.pop_back();
386 unsigned int nObject
= m_aUIntStack
.back();
387 m_aUIntStack
.pop_back();
389 PDFObject
* pObj
= new PDFObject( nObject
, nGeneration
);
390 pObj
->m_nOffset
= first
- m_aGlobalBegin
;
392 PDFContainer
* pContainer
= dynamic_cast<PDFContainer
*>(m_aObjectStack
.back());
394 ( dynamic_cast<PDFFile
*>(pContainer
) ||
395 dynamic_cast<PDFPart
*>(pContainer
) ) )
397 pContainer
->m_aSubElements
.push_back( pObj
);
398 m_aObjectStack
.push_back( pObj
);
401 parseError( "object in wrong place", first
);
404 void endObject( iteratorT first
, iteratorT
)
406 if( m_aObjectStack
.empty() )
407 parseError( "endobj without obj", first
);
408 else if( dynamic_cast<PDFObject
*>(m_aObjectStack
.back()) == NULL
)
409 parseError( "spurious endobj", first
);
411 m_aObjectStack
.pop_back();
414 void pushObjectRef( iteratorT first
, iteratorT
)
416 unsigned int nGeneration
= m_aUIntStack
.back();
417 m_aUIntStack
.pop_back();
418 unsigned int nObject
= m_aUIntStack
.back();
419 m_aUIntStack
.pop_back();
420 insertNewValue( new PDFObjectRef(nObject
,nGeneration
), first
);
423 void beginDict( iteratorT first
, iteratorT
)
425 PDFDict
* pDict
= new PDFDict();
426 pDict
->m_nOffset
= first
- m_aGlobalBegin
;
428 insertNewValue( pDict
, first
);
429 // will not come here if insertion fails (exception)
430 m_aObjectStack
.push_back( pDict
);
432 void endDict( iteratorT first
, iteratorT
)
434 PDFDict
* pDict
= NULL
;
435 if( m_aObjectStack
.empty() )
436 parseError( "dictionary end without begin", first
);
437 else if( (pDict
= dynamic_cast<PDFDict
*>(m_aObjectStack
.back())) == NULL
)
438 parseError( "spurious dictionary end", first
);
440 m_aObjectStack
.pop_back();
442 PDFEntry
* pOffender
= pDict
->buildMap();
445 StringEmitContext aCtx
;
446 aCtx
.write( "offending dictionary element: ", 30 );
447 pOffender
->emit( aCtx
);
448 m_aErrorString
= aCtx
.getString();
449 parseError( m_aErrorString
.getStr(), first
);
453 void beginArray( iteratorT first
, iteratorT
)
455 PDFArray
* pArray
= new PDFArray();
456 pArray
->m_nOffset
= first
- m_aGlobalBegin
;
458 insertNewValue( pArray
, first
);
459 // will not come here if insertion fails (exception)
460 m_aObjectStack
.push_back( pArray
);
463 void endArray( iteratorT first
, iteratorT
)
465 if( m_aObjectStack
.empty() )
466 parseError( "array end without begin", first
);
467 else if( dynamic_cast<PDFArray
*>(m_aObjectStack
.back()) == NULL
)
468 parseError( "spurious array end", first
);
470 m_aObjectStack
.pop_back();
473 void emitStream( iteratorT first
, iteratorT last
)
475 if( m_aObjectStack
.empty() )
476 parseError( "stream without object", first
);
477 PDFObject
* pObj
= dynamic_cast<PDFObject
*>(m_aObjectStack
.back());
478 if( pObj
&& pObj
->m_pObject
)
480 if( pObj
->m_pStream
)
481 parseError( "multiple streams in object", first
);
483 PDFDict
* pDict
= dynamic_cast<PDFDict
*>(pObj
->m_pObject
);
486 PDFStream
* pStream
= new PDFStream( first
- m_aGlobalBegin
, last
- m_aGlobalBegin
, pDict
);
488 pObj
->m_pStream
= pStream
;
489 pObj
->m_aSubElements
.push_back( pStream
);
493 parseError( "stream without object", first
);
496 void beginTrailer( iteratorT first
, iteratorT
)
498 if( m_aObjectStack
.empty() )
499 m_aObjectStack
.push_back( new PDFPart() );
501 PDFTrailer
* pTrailer
= new PDFTrailer();
502 pTrailer
->m_nOffset
= first
- m_aGlobalBegin
;
504 PDFContainer
* pContainer
= dynamic_cast<PDFContainer
*>(m_aObjectStack
.back());
506 ( dynamic_cast<PDFFile
*>(pContainer
) ||
507 dynamic_cast<PDFPart
*>(pContainer
) ) )
509 pContainer
->m_aSubElements
.push_back( pTrailer
);
510 m_aObjectStack
.push_back( pTrailer
);
513 parseError( "trailer in wrong place", first
);
516 void endTrailer( iteratorT first
, iteratorT
)
518 if( m_aObjectStack
.empty() )
519 parseError( "%%EOF without trailer", first
);
520 else if( dynamic_cast<PDFTrailer
*>(m_aObjectStack
.back()) == NULL
)
521 parseError( "spurious %%EOF", first
);
523 m_aObjectStack
.pop_back();
527 PDFEntry
* PDFReader::read( const char* pBuffer
, unsigned int nLen
)
529 PDFGrammar
<const char*> aGrammar( pBuffer
);
533 boost::spirit::parse_info
<const char*> aInfo
=
534 boost::spirit::parse( pBuffer
,
537 boost::spirit::space_p
);
538 #if OSL_DEBUG_LEVEL > 1
539 fprintf( stderr
, "parseinfo: stop = %p (buff=%p, offset = %d), hit = %s, full = %s, length = %d\n",
540 aInfo
.stop
, pBuffer
, aInfo
.stop
- pBuffer
,
541 aInfo
.hit
? "true" : "false",
542 aInfo
.full
? "true" : "false",
546 catch( parser_error
<const char*, const char*>& rError
)
548 #if OSL_DEBUG_LEVEL > 1
549 fprintf( stderr
, "parse error: %s at buffer pos %u\nobject stack:\n",
550 rError
.descriptor
, rError
.where
- pBuffer
);
551 unsigned int nElem
= aGrammar
.m_aObjectStack
.size();
552 for( unsigned int i
= 0; i
< nElem
; i
++ )
554 fprintf( stderr
, " %s\n", typeid( *(aGrammar
.m_aObjectStack
[i
]) ).name() );
559 PDFEntry
* pRet
= NULL
;
560 unsigned int nEntries
= aGrammar
.m_aObjectStack
.size();
563 pRet
= aGrammar
.m_aObjectStack
.back();
564 aGrammar
.m_aObjectStack
.pop_back();
566 #if OSL_DEBUG_LEVEL > 1
567 else if( nEntries
> 1 )
568 fprintf( stderr
, "error got %u stack objects in parse\n", nEntries
);
574 PDFEntry
* PDFReader::read( const char* pFileName
)
576 file_iterator
<> file_start( pFileName
);
579 file_iterator
<> file_end
= file_start
.make_end();
580 PDFGrammar
< file_iterator
<> > aGrammar( file_start
);
584 boost::spirit::parse_info
< file_iterator
<> > aInfo
=
585 boost::spirit::parse( file_start
,
588 boost::spirit::space_p
);
589 #if OSL_DEBUG_LEVEL > 1
590 fprintf( stderr
, "parseinfo: stop at offset = %d, hit = %s, full = %s, length = %d\n",
591 aInfo
.stop
- file_start
,
592 aInfo
.hit
? "true" : "false",
593 aInfo
.full
? "true" : "false",
597 catch( parser_error
< const char*, file_iterator
<> >& rError
)
599 #if OSL_DEBUG_LEVEL > 1
600 fprintf( stderr
, "parse error: %s at buffer pos %u\nobject stack:\n",
601 rError
.descriptor
, rError
.where
- file_start
);
602 unsigned int nElem
= aGrammar
.m_aObjectStack
.size();
603 for( unsigned int i
= 0; i
< nElem
; i
++ )
605 fprintf( stderr
, " %s\n", typeid( *(aGrammar
.m_aObjectStack
[i
]) ).name() );
610 PDFEntry
* pRet
= NULL
;
611 unsigned int nEntries
= aGrammar
.m_aObjectStack
.size();
614 pRet
= aGrammar
.m_aObjectStack
.back();
615 aGrammar
.m_aObjectStack
.pop_back();
617 #if OSL_DEBUG_LEVEL > 1
618 else if( nEntries
> 1 )
620 fprintf( stderr
, "error got %u stack objects in parse\n", nEntries
);
621 for( unsigned int i
= 0; i
< nEntries
; i
++ )
623 fprintf( stderr
, "%s\n", typeid(*aGrammar
.m_aObjectStack
[i
]).name() );
624 PDFObject
* pObj
= dynamic_cast<PDFObject
*>(aGrammar
.m_aObjectStack
[i
]);
626 fprintf( stderr
, " -> object %d generation %d\n", pObj
->m_nNumber
, pObj
->m_nGeneration
);
628 fprintf( stderr
, "(type %s)\n", typeid(*aGrammar
.m_aObjectStack
[i
]).name() );
636 #if defined __SUNPRO_CC
638 #elif defined _MSC_VER