1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: pdfunzip.cxx,v $
13 * This file is part of OpenOffice.org.
15 * OpenOffice.org is free software: you can redistribute it and/or modify
16 * it under the terms of the GNU Lesser General Public License version 3
17 * only, as published by the Free Software Foundation.
19 * OpenOffice.org is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU Lesser General Public License version 3 for more details
23 * (a copy is included in the LICENSE file that accompanied this code).
25 * You should have received a copy of the GNU Lesser General Public License
26 * version 3 along with OpenOffice.org. If not, see
27 * <http://www.openoffice.org/license.html>
28 * for a copy of the LGPLv3 License.
30 ************************************************************************/
32 // MARKER(update_precomp.py): autogen include statement, do not remove
33 #include "precompiled_sdext.hxx"
38 #include <osl/thread.h>
39 #include <rtl/alloc.h>
40 #include <rtl/ustring.hxx>
41 #include <rtl/strbuf.hxx>
43 #include "pdfparse.hxx"
46 using namespace pdfparse
;
// Usage text of the tool.
// pExe: name of the executable; it is passed once for each of the five "%s"
//       placeholders in the usage lines of the format string below.
// NOTE(review): the output call that consumes this format string and the
// function braces are not visible in this listing - confirm against the
// original file.
48 void printHelp( const char* pExe
)
51 "USAGE: %s [-h,--help]\n"
52 " %s [-pw, --password <password>] <inputfile> [<outputfile>]\n"
53 " %s <-a, --extract-add-streams> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
54 " %s <-f, --extract-fonts> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
55 " %s <-o, --extract-objects> <o0>[:<g0>][,<o1>[:g1][,...]] [-pw, --password <password>] <inputfile> [<outputfile>]\n"
56 " -h, --help: show help\n"
57 " -a, --extract-add-streams: extracts additional streams to outputfile_object\n"
58 " and prints the mimetype found to stdout\n"
// NOTE(review): the parenthesis opened before "currently" in the next line is
// never closed in the help text.
59 " -f, --extract-fonts: extracts fonts (currently only type1 and truetype are supported\n"
60 " -o, --extract-objects: extracts object streams, the syntax of the argument is comma separated\n"
61 " object numbers, where object number and generation number are separated by \':\'\n"
62 " an omitted generation number defaults to 0\n"
63 " -pw, --password: use password for decryption\n"
65 "note: -f, -a, -o and normal unzip operation are mutually exclusive\n"
// one pExe argument per "%s" in the USAGE lines above
66 , pExe
, pExe
, pExe
, pExe
, pExe
);
// EmitContext implementation that works on osl file handles: data is written
// to an output file and original bytes are fetched from the input file.
// NOTE(review): access specifiers and braces of the class are not visible in
// this listing.
69 class FileEmitContext
: public EmitContext
// handle of the output file; used by write() and getCurPos()
71 oslFileHandle m_aHandle
;
// handle of the original (input) file; used by copyOrigBytes()/readOrigBytes()
72 oslFileHandle m_aReadHandle
;
// size of the original file as determined in openReadFile()
73 unsigned int m_nReadLen
;
// opens the original file for reading and determines its length
75 void openReadFile( const char* pOrigName
);
// pFileName: output file, pOrigName: original file to read from,
// pTop: container handed on to the EmitContext base class
78 FileEmitContext( const char* pFileName
, const char* pOrigName
, const PDFContainer
* pTop
);
79 virtual ~FileEmitContext();
// writes nLen bytes from pBuf to the output file; true if all bytes were written
81 virtual bool write( const void* pBuf
, unsigned int nLen
) throw();
// queries the file position of the output handle
82 virtual unsigned int getCurPos() throw();
// reads nLen bytes at nOrigOffset from the original file and passes them to write()
83 virtual bool copyOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
) throw();
// reads nLen bytes at nOrigOffset from the original file into pBuf;
// the visible return value is the number of bytes read
84 virtual unsigned int readOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
, void* pBuf
) throw();
// Sets up the output file pFileName and then opens the original file
// pOrigName for reading (via openReadFile).
// pTop is passed on to the EmitContext base class.
// NOTE(review): parts of the initializer list, the declaration of aURL, the
// braces and any early returns are not visible in this listing.
87 FileEmitContext::FileEmitContext( const char* pFileName
, const char* pOrigName
, const PDFContainer
* pTop
)
88 : EmitContext( pTop
),
90 m_aReadHandle( NULL
),
// convert the system path (in the thread's text encoding) to a file URL
93 OUString
aSysFile( OStringToOUString( OString( pFileName
), osl_getThreadTextEncoding() ) );
95 if( osl_getFileURLFromSystemPath( aSysFile
.pData
, &aURL
.pData
) != osl_File_E_None
)
97 fprintf( stderr
, "filename conversion \"%s\" failed\n", pFileName
);
// first try to open an already existing file for writing and truncate it
101 if( osl_openFile( aURL
.pData
, &m_aHandle
, osl_File_OpenFlag_Write
) == osl_File_E_None
)
103 if( osl_setFileSize( m_aHandle
, 0 ) != osl_File_E_None
)
105 fprintf( stderr
, "could not truncate %s\n", pFileName
);
// NOTE(review): the handle is closed here; whether m_aHandle is reset
// afterwards is not visible in this listing (the destructor closes it too).
106 osl_closeFile( m_aHandle
);
// opening for plain write failed: try again with the create flag added
110 else if( osl_openFile( aURL
.pData
, &m_aHandle
,
111 osl_File_OpenFlag_Write
|osl_File_OpenFlag_Create
) != osl_File_E_None
)
113 fprintf( stderr
, "could not open %s\n", pFileName
);
// finally set up read access to the original file
118 openReadFile( pOrigName
);
// Closes the output handle and the read handle.
// NOTE(review): any conditions guarding the two calls are not visible in
// this listing.
121 FileEmitContext::~FileEmitContext()
124 osl_closeFile( m_aHandle
);
126 osl_closeFile( m_aReadHandle
);
// Opens pInFile for reading into m_aReadHandle and stores its size in
// m_nReadLen. The size is found by seeking to the end of the file and
// querying the resulting file position. Each failing step prints a message
// to stderr.
// NOTE(review): the declaration of aURL, the braces and the statements that
// follow the error messages (besides the visible osl_closeFile calls) are not
// visible in this listing.
129 void FileEmitContext::openReadFile( const char* pInFile
)
// convert the system path (in the thread's text encoding) to a file URL
131 OUString
aSysFile( OStringToOUString( OString( pInFile
), osl_getThreadTextEncoding() ) );
133 if( osl_getFileURLFromSystemPath( aSysFile
.pData
, &aURL
.pData
) != osl_File_E_None
)
135 fprintf( stderr
, "filename conversion \"%s\" failed\n", pInFile
);
139 if( osl_openFile( aURL
.pData
, &m_aReadHandle
, osl_File_OpenFlag_Read
) != osl_File_E_None
)
141 fprintf( stderr
, "could not open %s\n", pInFile
);
// seek to the end of the file ...
145 if( osl_setFilePos( m_aReadHandle
, osl_Pos_End
, 0 ) != osl_File_E_None
)
147 fprintf( stderr
, "could not seek to end of %s\n", pInFile
);
148 osl_closeFile( m_aReadHandle
);
// ... and take the position reached there as the file size
152 sal_uInt64 nFileSize
= 0;
153 if( osl_getFilePos( m_aReadHandle
, &nFileSize
) != osl_File_E_None
)
155 fprintf( stderr
, "could not get end pos of %s\n", pInFile
);
156 osl_closeFile( m_aReadHandle
);
// NOTE(review): narrowing from sal_uInt64 to unsigned int; presumably a file
// larger than the range of unsigned int gets a truncated length - confirm.
160 m_nReadLen
= static_cast<unsigned int>(nFileSize
);
// Writes nLen bytes from pBuf to the output handle m_aHandle.
// The visible return expression is true only if osl_writeFile reports
// success and the number of bytes written equals the number requested.
// NOTE(review): statements between the signature and the first visible line
// (e.g. a handle check) are not visible in this listing.
163 bool FileEmitContext::write( const void* pBuf
, unsigned int nLen
) throw()
// osl_writeFile is passed 64 bit counts, so widen the requested length
168 sal_uInt64 nWrite
= static_cast<sal_uInt64
>(nLen
);
169 sal_uInt64 nWritten
= 0;
170 return (osl_writeFile( m_aHandle
, pBuf
, nWrite
, &nWritten
) == osl_File_E_None
)
171 && nWrite
== nWritten
;
// Queries the current file position of the output handle m_aHandle and
// returns it narrowed to unsigned int.
// NOTE(review): what happens when osl_getFilePos fails is not visible in this
// listing; nFileSize starts out as 0.
174 unsigned int FileEmitContext::getCurPos() throw()
176 sal_uInt64 nFileSize
= 0;
179 if( osl_getFilePos( m_aHandle
, &nFileSize
) != osl_File_E_None
)
182 return static_cast<unsigned int>(nFileSize
);
// Copies nLen bytes starting at nOrigOffset from the original file to the
// output: the bytes are read into a temporary buffer and handed to write().
// NOTE(review): the return statements and braces are not visible in this
// listing.
185 bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
) throw()
// range check against the length of the original file
// NOTE(review): the sum is computed in unsigned int and can wrap around for
// large values, which would let an out-of-range request pass this check.
187 if( nOrigOffset
+ nLen
> m_nReadLen
)
190 if( osl_setFilePos( m_aReadHandle
, osl_Pos_Absolut
, nOrigOffset
) != osl_File_E_None
)
192 fprintf( stderr
, "could not seek to offset %u\n", nOrigOffset
);
// temporary buffer for the bytes to copy; freed on both visible paths below
// NOTE(review): no check of the allocation result is visible in this listing.
195 void* pBuf
= rtl_allocateMemory( nLen
);
198 sal_uInt64 nBytesRead
= 0;
// a short read is treated like a read error
199 if( osl_readFile( m_aReadHandle
, pBuf
, nLen
, &nBytesRead
) != osl_File_E_None
200 || nBytesRead
!= static_cast<sal_uInt64
>(nLen
) )
202 fprintf( stderr
, "could not read %u bytes\n", nLen
);
203 rtl_freeMemory( pBuf
);
206 bool bRet
= write( pBuf
, nLen
);
207 rtl_freeMemory( pBuf
);
// Reads nLen bytes starting at nOrigOffset from the original file into the
// caller supplied buffer pBuf. The visible return statement yields the
// number of bytes read, narrowed to unsigned int.
// NOTE(review): the return values of the error paths are not visible in this
// listing.
211 unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
, void* pBuf
) throw()
// range check against the length of the original file
// NOTE(review): the sum is computed in unsigned int and can wrap around for
// large values, which would let an out-of-range request pass this check.
213 if( nOrigOffset
+ nLen
> m_nReadLen
)
216 if( osl_setFilePos( m_aReadHandle
, osl_Pos_Absolut
, nOrigOffset
) != osl_File_E_None
)
218 fprintf( stderr
, "could not seek to offset %u\n", nOrigOffset
);
221 sal_uInt64 nBytesRead
= 0;
222 if( osl_readFile( m_aReadHandle
, pBuf
, nLen
, &nBytesRead
) != osl_File_E_None
)
224 return static_cast<unsigned int>(nBytesRead
);
// Signature of the per-mode worker functions. handleFile() invokes the
// handler as pHdl( pInFile, pOutFile, pPDFFile ), i.e. input file name,
// output file name and the parsed PDF file; the int result is stored as the
// return code there.
227 typedef int(*PDFFileHdl
)(const char*, const char*, PDFFile
*);
// Parses pInFile and hands the resulting PDFFile to the handler pHdl.
// pInFile:   file to parse
// pOutFile:  output file name passed on to the handler
// pPassword: password used to set up decryption data
// pHdl:      worker function that does the actual extraction
// NOTE(review): the declarations of aParser and nRet, the conditions around
// the visible statements, the cleanup and the return are not visible in this
// listing.
229 int handleFile( const char* pInFile
, const char* pOutFile
, const char* pPassword
, PDFFileHdl pHdl
)
234 PDFEntry
* pEntry
= aParser
.read( pInFile
);
// the parse result is only usable here if it actually is a PDFFile
237 PDFFile
* pPDFFile
= dynamic_cast<PDFFile
*>(pEntry
);
240 fprintf( stdout
, "have a %s PDF file\n", pPDFFile
->isEncrypted() ? "encrypted" : "unencrypted" );
// report whether the decryption data could be set up with the given password
242 fprintf( stdout
, "password %s\n",
243 pPDFFile
->setupDecryptionData( pPassword
) ? "matches" : "does not match" );
244 nRet
= pHdl( pInFile
, pOutFile
, pPDFFile
);
// PDFFileHdl worker that emits the complete PDF file: a FileEmitContext
// writing to pOutFile (with pInFile as the original to read from) is
// created, decryption is switched on if the file is encrypted, and the
// whole file is emitted into that context.
// NOTE(review): the return statement is not visible in this listing.
253 int write_unzipFile( const char* pInFile
, const char* pOutFile
, PDFFile
* pPDFFile
)
255 FileEmitContext
aContext( pOutFile
, pInFile
, pPDFFile
);
256 aContext
.m_bDecrypt
= pPDFFile
->isEncrypted();
257 pPDFFile
->emit(aContext
);
// Walks the array pStreams, treating element i as a mimetype name (PDFName)
// and element i+1 as a reference to the stream object (PDFObjectRef).
// Each referenced stream is written to a file named
// "<pOutFile>_stream_<object number>_<generation>".
// NOTE(review): the declaration of nRet, the conditions in front of the
// error messages, any additional index increment and the return are not
// visible in this listing.
261 int write_addStreamArray( const char* pOutFile
, PDFArray
* pStreams
, PDFFile
* pPDFFile
, const char* pInFile
)
264 unsigned int nArrayElements
= pStreams
->m_aSubElements
.size();
// NOTE(review): nArrayElements is unsigned, so nArrayElements-1 wraps around
// for an empty array and the element accesses below would run out of range.
265 for( unsigned int i
= 0; i
< nArrayElements
-1 && nRet
== 0; i
++ )
267 PDFName
* pMimeType
= dynamic_cast<PDFName
*>(pStreams
->m_aSubElements
[i
]);
268 PDFObjectRef
* pStreamRef
= dynamic_cast<PDFObjectRef
*>(pStreams
->m_aSubElements
[i
+1]);
270 fprintf( stderr
, "error: no mimetype element\n" );
272 fprintf( stderr
, "error: no stream ref element\n" );
273 if( pMimeType
&& pStreamRef
)
275 fprintf( stdout
, "found stream %d %d with mimetype %s\n",
276 pStreamRef
->m_nNumber
, pStreamRef
->m_nGeneration
,
277 pMimeType
->m_aName
.getStr() );
// resolve the reference to the actual object
278 PDFObject
* pObject
= pPDFFile
->findObject( pStreamRef
->m_nNumber
, pStreamRef
->m_nGeneration
);
// build the output file name from the base name and the object id
281 rtl::OStringBuffer
aOutStream( pOutFile
);
282 aOutStream
.append( "_stream_" );
283 aOutStream
.append( sal_Int32(pStreamRef
->m_nNumber
) );
284 aOutStream
.append( "_" );
285 aOutStream
.append( sal_Int32(pStreamRef
->m_nGeneration
) );
286 FileEmitContext
aContext( aOutStream
.getStr(), pInFile
, pPDFFile
);
287 aContext
.m_bDecrypt
= pPDFFile
->isEncrypted();
288 pObject
->writeStream( aContext
, pPDFFile
);
292 fprintf( stderr
, "object not found\n" );
// PDFFileHdl worker for the additional streams: searches the sub elements of
// the PDF file for trailers that have a dictionary, looks up the
// "AdditionalStreams" entry there and passes it, cast to a PDFArray, to
// write_addStreamArray().
// NOTE(review): the declaration of nRet, the check of pStreams and the
// return are not visible in this listing.
302 int write_addStreams( const char* pInFile
, const char* pOutFile
, PDFFile
* pPDFFile
)
306 unsigned int nElements
= pPDFFile
->m_aSubElements
.size();
// the loop ends early as soon as nRet becomes non-zero
307 for( unsigned i
= 0; i
< nElements
&& nRet
== 0; i
++ )
309 PDFTrailer
* pTrailer
= dynamic_cast<PDFTrailer
*>(pPDFFile
->m_aSubElements
[i
]);
310 if( pTrailer
&& pTrailer
->m_pDict
)
312 // search for AdditionalStreams entry
313 std::hash_map
<rtl::OString
,PDFEntry
*,rtl::OStringHash
>::iterator add_stream
;
314 add_stream
= pTrailer
->m_pDict
->m_aMap
.find( "AdditionalStreams" );
315 if( add_stream
!= pTrailer
->m_pDict
->m_aMap
.end() )
317 PDFArray
* pStreams
= dynamic_cast<PDFArray
*>(add_stream
->second
);
319 nRet
= write_addStreamArray( pOutFile
, pStreams
, pPDFFile
, pInFile
);
// PDFFileHdl worker for font extraction: searches the sub elements of the PDF
// file for objects holding a dictionary whose "Type" entry is the name
// "FontDescriptor". For such a descriptor the "FontName" entry is read and
// the font stream is looked up via the "FontFile" entry or, alternatively,
// the "FontFile2" entry. The stream is written to a file whose name is built
// as "<i_pOutFile>_font_<object number>_<generation>_<font name>" followed
// by "." and pFileType.
// NOTE(review): the declaration of nRet, the null checks and "continue"-like
// statements after the failed lookups, the assignments to pFileType and the
// return are not visible in this listing.
326 int write_fonts( const char* i_pInFile
, const char* i_pOutFile
, PDFFile
* i_pPDFFile
)
329 unsigned int nElements
= i_pPDFFile
->m_aSubElements
.size();
330 for( unsigned i
= 0; i
< nElements
&& nRet
== 0; i
++ )
332 // search FontDescriptors
333 PDFObject
* pObj
= dynamic_cast<PDFObject
*>(i_pPDFFile
->m_aSubElements
[i
]);
336 PDFDict
* pDict
= dynamic_cast<PDFDict
*>(pObj
->m_pObject
);
// the dictionary needs a "Type" entry ...
340 std::hash_map
<rtl::OString
,PDFEntry
*,rtl::OStringHash
>::iterator map_it
=
341 pDict
->m_aMap
.find( "Type" );
342 if( map_it
== pDict
->m_aMap
.end() )
345 PDFName
* pName
= dynamic_cast<PDFName
*>(map_it
->second
);
// ... whose value is the name "FontDescriptor"
348 if( ! pName
->m_aName
.equals( "FontDescriptor" ) )
351 // the font name will be helpful, also there must be one in
353 map_it
= pDict
->m_aMap
.find( "FontName" );
354 if( map_it
== pDict
->m_aMap
.end() )
356 pName
= dynamic_cast<PDFName
*>(map_it
->second
);
359 rtl::OString
aFontName( pName
->m_aName
);
361 PDFObjectRef
* pStreamRef
= 0;
362 const char* pFileType
= NULL
;
363 // we have a font descriptor, try for a type 1 font
364 map_it
= pDict
->m_aMap
.find( "FontFile" );
365 if( map_it
!= pDict
->m_aMap
.end() )
367 pStreamRef
= dynamic_cast<PDFObjectRef
*>(map_it
->second
);
372 // perhaps it's a truetype file ?
375 map_it
= pDict
->m_aMap
.find( "FontFile2" );
376 if( map_it
!= pDict
->m_aMap
.end() )
378 pStreamRef
= dynamic_cast<PDFObjectRef
*>(map_it
->second
);
// resolve the reference to the object holding the font stream
387 PDFObject
* pStream
= i_pPDFFile
->findObject( pStreamRef
);
// build the output file name from base name, object id and font name
391 rtl::OStringBuffer
aOutStream( i_pOutFile
);
392 aOutStream
.append( "_font_" );
393 aOutStream
.append( sal_Int32(pStreamRef
->m_nNumber
) );
394 aOutStream
.append( "_" );
395 aOutStream
.append( sal_Int32(pStreamRef
->m_nGeneration
) );
396 aOutStream
.append( "_" );
397 aOutStream
.append( aFontName
);
// file type suffix
400 aOutStream
.append( "." );
401 aOutStream
.append( pFileType
);
403 FileEmitContext
aContext( aOutStream
.getStr(), i_pInFile
, i_pPDFFile
);
404 aContext
.m_bDecrypt
= i_pPDFFile
->isEncrypted();
405 pStream
->writeStream( aContext
, i_pPDFFile
);
// List of (object number, generation number) pairs to extract. It is filled
// while the argument of the -o / --extract-objects option is parsed in main
// and read by write_objects().
410 std::vector
< std::pair
< sal_Int32
, sal_Int32
> > s_aEmitObjects
;
// PDFFileHdl worker for object extraction: for every (object, generation)
// pair in s_aEmitObjects the object is looked up in the PDF file and its
// stream is written to a file named
// "<i_pOutFile>_stream_<object number>_<generation>".
// NOTE(review): the declaration of nRet, the condition in front of the
// "not found" message and the return are not visible in this listing.
412 int write_objects( const char* i_pInFile
, const char* i_pOutFile
, PDFFile
* i_pPDFFile
)
415 unsigned int nElements
= s_aEmitObjects
.size();
416 for( unsigned i
= 0; i
< nElements
&& nRet
== 0; i
++ )
418 sal_Int32 nObject
= s_aEmitObjects
[i
].first
;
419 sal_Int32 nGeneration
= s_aEmitObjects
[i
].second
;
420 PDFObject
* pStream
= i_pPDFFile
->findObject( nObject
, nGeneration
);
423 fprintf( stderr
, "object %d %d not found !\n", (int)nObject
, (int)nGeneration
);
// build the output file name from the base name and the object id
427 rtl::OStringBuffer
aOutStream( i_pOutFile
);
428 aOutStream
.append( "_stream_" );
429 aOutStream
.append( nObject
);
430 aOutStream
.append( "_" );
431 aOutStream
.append( nGeneration
);
432 FileEmitContext
aContext( aOutStream
.getStr(), i_pInFile
, i_pPDFFile
);
433 aContext
.m_bDecrypt
= i_pPDFFile
->isEncrypted();
434 pStream
->writeStream( aContext
, i_pPDFFile
);
// Program entry point: parses the command line, selects the worker function
// and calls handleFile().
// Arguments starting with '-' are options; the first other argument is taken
// as the input file, the second one as the output file. Without an explicit
// output file, a name is derived from the input file name.
// NOTE(review): braces, argument index increments/checks, return statements
// of the error paths and some conditions are not visible in this listing.
439 SAL_IMPLEMENT_MAIN_WITH_ARGS( argc
, argv
)
441 const char* pInFile
= NULL
;
442 const char* pOutFile
= NULL
;
443 const char* pPassword
= NULL
;
// buffer for an output file name derived from the input file name
444 OStringBuffer
aOutFile( 256 );
// default mode: emit the whole file
445 PDFFileHdl aHdl
= write_unzipFile
;
447 for( int nArg
= 1; nArg
< argc
; nArg
++ )
449 if( argv
[nArg
][0] == '-' )
// the negated rtl_str_compare result serves as the equality test for options
451 if( ! rtl_str_compare( "-pw", argv
[nArg
] ) ||
452 ! rtl_str_compare( "--password" , argv
[nArg
] ) )
456 fprintf( stderr
, "no password given\n" );
460 pPassword
= argv
[nArg
];
462 else if( ! rtl_str_compare( "-h", argv
[nArg
] ) ||
463 ! rtl_str_compare( "--help", argv
[nArg
] ) )
465 printHelp( argv
[0] );
468 else if( ! rtl_str_compare( "-a", argv
[nArg
] ) ||
469 ! rtl_str_compare( "--extract-add-streams", argv
[nArg
] ) )
471 aHdl
= write_addStreams
;
// NOTE(review): the statement of the font extraction branch is not visible
// in this listing.
473 else if( ! rtl_str_compare( "-f", argv
[nArg
] ) ||
474 ! rtl_str_compare( "--extract-fonts", argv
[nArg
] ) )
478 else if( ! rtl_str_compare( "-o", argv
[nArg
] ) ||
479 ! rtl_str_compare( "--extract-objects", argv
[nArg
] ) )
481 aHdl
= write_objects
;
// parse the comma separated list of <object>[:<generation>] tokens
485 rtl::OString
aObjs( argv
[nArg
] );
486 sal_Int32 nIndex
= 0;
// getToken updates nIndex; the loop runs until it becomes -1
487 while( nIndex
!= -1 )
489 rtl::OString
aToken( aObjs
.getToken( 0, ',', nIndex
) );
490 sal_Int32 nObject
= 0;
491 sal_Int32 nGeneration
= 0;
492 sal_Int32 nGenIndex
= 0;
493 nObject
= aToken
.getToken( 0, ':', nGenIndex
).toInt32();
// the generation number stays 0 when the token has no ':' part
494 if( nGenIndex
!= -1 )
495 nGeneration
= aToken
.getToken( 0, ':', nGenIndex
).toInt32();
496 s_aEmitObjects
.push_back( std::pair
<sal_Int32
,sal_Int32
>(nObject
,nGeneration
) );
502 fprintf( stderr
, "unrecognized option \"%s\"\n",
504 printHelp( argv
[0] );
// positional arguments: input file first, then output file
508 else if( pInFile
== NULL
)
509 pInFile
= argv
[nArg
];
510 else if( pOutFile
== NULL
)
511 pOutFile
= argv
[nArg
];
515 fprintf( stderr
, "no input file given\n" );
// derive an output file name from the input file name: a trailing ".pdf"
// (compared ignoring ASCII case) is left out, then "_unzip.pdf" is appended
520 OString
aFile( pInFile
);
521 if( aFile
.getLength() > 0 )
523 if( aFile
.getLength() > 4 )
525 if( aFile
.matchIgnoreAsciiCase( OString( ".pdf" ), aFile
.getLength()-4 ) )
526 aOutFile
.append( pInFile
, aFile
.getLength() - 4 );
528 aOutFile
.append( aFile
);
530 aOutFile
.append( "_unzip.pdf" );
531 pOutFile
= aOutFile
.getStr();
535 fprintf( stderr
, "no output file given\n" );
540 return handleFile( pInFile
, pOutFile
, pPassword
, aHdl
);