/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
#include <stdio.h>

#include <sal/main.h>
#include <osl/file.h>
#include <osl/thread.h>
#include <rtl/alloc.h>
#include <rtl/ustring.hxx>
#include <rtl/strbuf.hxx>

#include "pdfparse.hxx"

using namespace pdfparse;
// Print the command line usage summary.
//
// pExe: name of the executable; it is substituted into each of the five
//       usage lines of the text.
void printHelp( const char* pExe )
{
    // NOTE(review): the line carrying the output stream is missing from the
    // listing this was restored from; stdout is assumed - confirm.
    fprintf( stdout,
    "USAGE: %s [-h,--help]\n"
    " %s [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    " %s <-a, --extract-add-streams> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    " %s <-f, --extract-fonts> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    " %s <-o, --extract-objects> <o0>[:<g0>][,<o1>[:g1][,...]] [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    " -h, --help: show help\n"
    " -a, --extract-add-streams: extracts additional streams to outputfile_object\n"
    " and prints the mimetype found to stdout\n"
    // closing parenthesis added: the original text left it unbalanced
    " -f, --extract-fonts: extracts fonts (currently only type1 and truetype are supported)\n"
    " -o, --extract-objects: extracts object streams, the syntax of the argument is comma separated\n"
    " object numbers, where object number and generation number are separated by \':\'\n"
    " an omitted generation number defaults to 0\n"
    " -pw, --password: use password for decryption\n"
    "note: -f, -a, -o and normal unzip operation are mutually exclusive\n"
    , pExe, pExe, pExe, pExe, pExe );
}
55 class FileEmitContext
: public EmitContext
57 oslFileHandle m_aHandle
;
58 oslFileHandle m_aReadHandle
;
59 unsigned int m_nReadLen
;
61 void openReadFile( const char* pOrigName
);
64 FileEmitContext( const char* pFileName
, const char* pOrigName
, const PDFContainer
* pTop
);
65 virtual ~FileEmitContext();
67 virtual bool write( const void* pBuf
, unsigned int nLen
) throw() SAL_OVERRIDE
;
68 virtual unsigned int getCurPos() throw() SAL_OVERRIDE
;
69 virtual bool copyOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
) throw() SAL_OVERRIDE
;
70 virtual unsigned int readOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
, void* pBuf
) throw() SAL_OVERRIDE
;
73 FileEmitContext::FileEmitContext( const char* pFileName
, const char* pOrigName
, const PDFContainer
* pTop
)
74 : EmitContext( pTop
),
76 m_aReadHandle( NULL
),
79 OUString
aSysFile( OStringToOUString( OString( pFileName
), osl_getThreadTextEncoding() ) );
81 if( osl_getFileURLFromSystemPath( aSysFile
.pData
, &aURL
.pData
) != osl_File_E_None
)
83 fprintf( stderr
, "filename conversion \"%s\" failed\n", pFileName
);
87 if( osl_openFile( aURL
.pData
, &m_aHandle
, osl_File_OpenFlag_Write
) == osl_File_E_None
)
89 if( osl_setFileSize( m_aHandle
, 0 ) != osl_File_E_None
)
91 fprintf( stderr
, "could not truncate %s\n", pFileName
);
92 osl_closeFile( m_aHandle
);
96 else if( osl_openFile( aURL
.pData
, &m_aHandle
,
97 osl_File_OpenFlag_Write
|osl_File_OpenFlag_Create
) != osl_File_E_None
)
99 fprintf( stderr
, "could not open %s\n", pFileName
);
104 openReadFile( pOrigName
);
107 FileEmitContext::~FileEmitContext()
110 osl_closeFile( m_aHandle
);
112 osl_closeFile( m_aReadHandle
);
115 void FileEmitContext::openReadFile( const char* pInFile
)
117 OUString
aSysFile( OStringToOUString( OString( pInFile
), osl_getThreadTextEncoding() ) );
119 if( osl_getFileURLFromSystemPath( aSysFile
.pData
, &aURL
.pData
) != osl_File_E_None
)
121 fprintf( stderr
, "filename conversion \"%s\" failed\n", pInFile
);
125 if( osl_openFile( aURL
.pData
, &m_aReadHandle
, osl_File_OpenFlag_Read
) != osl_File_E_None
)
127 fprintf( stderr
, "could not open %s\n", pInFile
);
131 if( osl_setFilePos( m_aReadHandle
, osl_Pos_End
, 0 ) != osl_File_E_None
)
133 fprintf( stderr
, "could not seek to end of %s\n", pInFile
);
134 osl_closeFile( m_aReadHandle
);
138 sal_uInt64 nFileSize
= 0;
139 if( osl_getFilePos( m_aReadHandle
, &nFileSize
) != osl_File_E_None
)
141 fprintf( stderr
, "could not get end pos of %s\n", pInFile
);
142 osl_closeFile( m_aReadHandle
);
146 m_nReadLen
= static_cast<unsigned int>(nFileSize
);
149 bool FileEmitContext::write( const void* pBuf
, unsigned int nLen
) throw()
154 sal_uInt64 nWrite
= static_cast<sal_uInt64
>(nLen
);
155 sal_uInt64 nWritten
= 0;
156 return (osl_writeFile( m_aHandle
, pBuf
, nWrite
, &nWritten
) == osl_File_E_None
)
157 && nWrite
== nWritten
;
160 unsigned int FileEmitContext::getCurPos() throw()
162 sal_uInt64 nFileSize
= 0;
165 if( osl_getFilePos( m_aHandle
, &nFileSize
) != osl_File_E_None
)
168 return static_cast<unsigned int>(nFileSize
);
171 bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
) throw()
173 if( nOrigOffset
+ nLen
> m_nReadLen
)
176 if( osl_setFilePos( m_aReadHandle
, osl_Pos_Absolut
, nOrigOffset
) != osl_File_E_None
)
178 fprintf( stderr
, "could not seek to offset %u\n", nOrigOffset
);
181 void* pBuf
= rtl_allocateMemory( nLen
);
184 sal_uInt64 nBytesRead
= 0;
185 if( osl_readFile( m_aReadHandle
, pBuf
, nLen
, &nBytesRead
) != osl_File_E_None
186 || nBytesRead
!= static_cast<sal_uInt64
>(nLen
) )
188 fprintf( stderr
, "could not read %u bytes\n", nLen
);
189 rtl_freeMemory( pBuf
);
192 bool bRet
= write( pBuf
, nLen
);
193 rtl_freeMemory( pBuf
);
197 unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
, void* pBuf
) throw()
199 if( nOrigOffset
+ nLen
> m_nReadLen
)
202 if( osl_setFilePos( m_aReadHandle
, osl_Pos_Absolut
, nOrigOffset
) != osl_File_E_None
)
204 fprintf( stderr
, "could not seek to offset %u\n", nOrigOffset
);
207 sal_uInt64 nBytesRead
= 0;
208 if( osl_readFile( m_aReadHandle
, pBuf
, nLen
, &nBytesRead
) != osl_File_E_None
)
210 return static_cast<unsigned int>(nBytesRead
);
213 typedef int(*PDFFileHdl
)(const char*, const char*, PDFFile
*);
215 int handleFile( const char* pInFile
, const char* pOutFile
, const char* pPassword
, PDFFileHdl pHdl
)
220 PDFEntry
* pEntry
= pdfparse::PDFReader::read( pInFile
);
223 PDFFile
* pPDFFile
= dynamic_cast<PDFFile
*>(pEntry
);
226 fprintf( stdout
, "have a %s PDF file\n", pPDFFile
->isEncrypted() ? "encrypted" : "unencrypted" );
228 fprintf( stdout
, "password %s\n",
229 pPDFFile
->setupDecryptionData( pPassword
) ? "matches" : "does not match" );
230 nRet
= pHdl( pInFile
, pOutFile
, pPDFFile
);
239 int write_unzipFile( const char* pInFile
, const char* pOutFile
, PDFFile
* pPDFFile
)
241 FileEmitContext
aContext( pOutFile
, pInFile
, pPDFFile
);
242 aContext
.m_bDecrypt
= pPDFFile
->isEncrypted();
243 pPDFFile
->emit(aContext
);
247 int write_addStreamArray( const char* pOutFile
, PDFArray
* pStreams
, PDFFile
* pPDFFile
, const char* pInFile
)
250 unsigned int nArrayElements
= pStreams
->m_aSubElements
.size();
251 for( unsigned int i
= 0; i
< nArrayElements
-1 && nRet
== 0; i
++ )
253 PDFName
* pMimeType
= dynamic_cast<PDFName
*>(pStreams
->m_aSubElements
[i
]);
254 PDFObjectRef
* pStreamRef
= dynamic_cast<PDFObjectRef
*>(pStreams
->m_aSubElements
[i
+1]);
256 fprintf( stderr
, "error: no mimetype element\n" );
258 fprintf( stderr
, "error: no stream ref element\n" );
259 if( pMimeType
&& pStreamRef
)
261 fprintf( stdout
, "found stream %d %d with mimetype %s\n",
262 pStreamRef
->m_nNumber
, pStreamRef
->m_nGeneration
,
263 pMimeType
->m_aName
.getStr() );
264 PDFObject
* pObject
= pPDFFile
->findObject( pStreamRef
->m_nNumber
, pStreamRef
->m_nGeneration
);
267 OStringBuffer
aOutStream( pOutFile
);
268 aOutStream
.append( "_stream_" );
269 aOutStream
.append( sal_Int32(pStreamRef
->m_nNumber
) );
270 aOutStream
.append( "_" );
271 aOutStream
.append( sal_Int32(pStreamRef
->m_nGeneration
) );
272 FileEmitContext
aContext( aOutStream
.getStr(), pInFile
, pPDFFile
);
273 aContext
.m_bDecrypt
= pPDFFile
->isEncrypted();
274 pObject
->writeStream( aContext
, pPDFFile
);
278 fprintf( stderr
, "object not found\n" );
288 int write_addStreams( const char* pInFile
, const char* pOutFile
, PDFFile
* pPDFFile
)
292 unsigned int nElements
= pPDFFile
->m_aSubElements
.size();
293 for( unsigned i
= 0; i
< nElements
&& nRet
== 0; i
++ )
295 PDFTrailer
* pTrailer
= dynamic_cast<PDFTrailer
*>(pPDFFile
->m_aSubElements
[i
]);
296 if( pTrailer
&& pTrailer
->m_pDict
)
298 // search for AdditionalStreams entry
299 std::unordered_map
<OString
,PDFEntry
*,OStringHash
>::iterator add_stream
;
300 add_stream
= pTrailer
->m_pDict
->m_aMap
.find( "AdditionalStreams" );
301 if( add_stream
!= pTrailer
->m_pDict
->m_aMap
.end() )
303 PDFArray
* pStreams
= dynamic_cast<PDFArray
*>(add_stream
->second
);
305 nRet
= write_addStreamArray( pOutFile
, pStreams
, pPDFFile
, pInFile
);
312 int write_fonts( const char* i_pInFile
, const char* i_pOutFile
, PDFFile
* i_pPDFFile
)
315 unsigned int nElements
= i_pPDFFile
->m_aSubElements
.size();
316 for( unsigned i
= 0; i
< nElements
&& nRet
== 0; i
++ )
318 // search FontDescriptors
319 PDFObject
* pObj
= dynamic_cast<PDFObject
*>(i_pPDFFile
->m_aSubElements
[i
]);
322 PDFDict
* pDict
= dynamic_cast<PDFDict
*>(pObj
->m_pObject
);
326 std::unordered_map
<OString
,PDFEntry
*,OStringHash
>::iterator map_it
=
327 pDict
->m_aMap
.find( "Type" );
328 if( map_it
== pDict
->m_aMap
.end() )
331 PDFName
* pName
= dynamic_cast<PDFName
*>(map_it
->second
);
334 if( ! pName
->m_aName
.equals( "FontDescriptor" ) )
337 // the font name will be helpful, also there must be one in
339 map_it
= pDict
->m_aMap
.find( "FontName" );
340 if( map_it
== pDict
->m_aMap
.end() )
342 pName
= dynamic_cast<PDFName
*>(map_it
->second
);
345 OString
aFontName( pName
->m_aName
);
347 PDFObjectRef
* pStreamRef
= 0;
348 const char* pFileType
= NULL
;
349 // we have a font descriptor, try for a type 1 font
350 map_it
= pDict
->m_aMap
.find( "FontFile" );
351 if( map_it
!= pDict
->m_aMap
.end() )
353 pStreamRef
= dynamic_cast<PDFObjectRef
*>(map_it
->second
);
358 // perhaps it's a truetype file ?
361 map_it
= pDict
->m_aMap
.find( "FontFile2" );
362 if( map_it
!= pDict
->m_aMap
.end() )
364 pStreamRef
= dynamic_cast<PDFObjectRef
*>(map_it
->second
);
373 PDFObject
* pStream
= i_pPDFFile
->findObject( pStreamRef
);
377 OStringBuffer
aOutStream( i_pOutFile
);
378 aOutStream
.append( "_font_" );
379 aOutStream
.append( sal_Int32(pStreamRef
->m_nNumber
) );
380 aOutStream
.append( "_" );
381 aOutStream
.append( sal_Int32(pStreamRef
->m_nGeneration
) );
382 aOutStream
.append( "_" );
383 aOutStream
.append( aFontName
);
386 aOutStream
.append( "." );
387 aOutStream
.append( pFileType
);
389 FileEmitContext
aContext( aOutStream
.getStr(), i_pInFile
, i_pPDFFile
);
390 aContext
.m_bDecrypt
= i_pPDFFile
->isEncrypted();
391 pStream
->writeStream( aContext
, i_pPDFFile
);
396 std::vector
< std::pair
< sal_Int32
, sal_Int32
> > s_aEmitObjects
;
398 int write_objects( const char* i_pInFile
, const char* i_pOutFile
, PDFFile
* i_pPDFFile
)
401 unsigned int nElements
= s_aEmitObjects
.size();
402 for( unsigned i
= 0; i
< nElements
&& nRet
== 0; i
++ )
404 sal_Int32 nObject
= s_aEmitObjects
[i
].first
;
405 sal_Int32 nGeneration
= s_aEmitObjects
[i
].second
;
406 PDFObject
* pStream
= i_pPDFFile
->findObject( nObject
, nGeneration
);
409 fprintf( stderr
, "object %d %d not found !\n", (int)nObject
, (int)nGeneration
);
413 OStringBuffer
aOutStream( i_pOutFile
);
414 aOutStream
.append( "_stream_" );
415 aOutStream
.append( nObject
);
416 aOutStream
.append( "_" );
417 aOutStream
.append( nGeneration
);
418 FileEmitContext
aContext( aOutStream
.getStr(), i_pInFile
, i_pPDFFile
);
419 aContext
.m_bDecrypt
= i_pPDFFile
->isEncrypted();
420 pStream
->writeStream( aContext
, i_pPDFFile
);
425 SAL_IMPLEMENT_MAIN_WITH_ARGS( argc
, argv
)
427 const char* pInFile
= NULL
;
428 const char* pOutFile
= NULL
;
429 const char* pPassword
= NULL
;
430 OStringBuffer
aOutFile( 256 );
431 PDFFileHdl aHdl
= write_unzipFile
;
433 for( int nArg
= 1; nArg
< argc
; nArg
++ )
435 if( argv
[nArg
][0] == '-' )
437 if( ! rtl_str_compare( "-pw", argv
[nArg
] ) ||
438 ! rtl_str_compare( "--password" , argv
[nArg
] ) )
442 fprintf( stderr
, "no password given\n" );
446 pPassword
= argv
[nArg
];
448 else if( ! rtl_str_compare( "-h", argv
[nArg
] ) ||
449 ! rtl_str_compare( "--help", argv
[nArg
] ) )
451 printHelp( argv
[0] );
454 else if( ! rtl_str_compare( "-a", argv
[nArg
] ) ||
455 ! rtl_str_compare( "--extract-add-streams", argv
[nArg
] ) )
457 aHdl
= write_addStreams
;
459 else if( ! rtl_str_compare( "-f", argv
[nArg
] ) ||
460 ! rtl_str_compare( "--extract-fonts", argv
[nArg
] ) )
464 else if( ! rtl_str_compare( "-o", argv
[nArg
] ) ||
465 ! rtl_str_compare( "--extract-objects", argv
[nArg
] ) )
467 aHdl
= write_objects
;
471 OString
aObjs( argv
[nArg
] );
472 sal_Int32 nIndex
= 0;
473 while( nIndex
!= -1 )
475 OString
aToken( aObjs
.getToken( 0, ',', nIndex
) );
476 sal_Int32 nObject
= 0;
477 sal_Int32 nGeneration
= 0;
478 sal_Int32 nGenIndex
= 0;
479 nObject
= aToken
.getToken( 0, ':', nGenIndex
).toInt32();
480 if( nGenIndex
!= -1 )
481 nGeneration
= aToken
.getToken( 0, ':', nGenIndex
).toInt32();
482 s_aEmitObjects
.push_back( std::pair
<sal_Int32
,sal_Int32
>(nObject
,nGeneration
) );
488 fprintf( stderr
, "unrecognized option \"%s\"\n",
490 printHelp( argv
[0] );
494 else if( pInFile
== NULL
)
495 pInFile
= argv
[nArg
];
496 else if( pOutFile
== NULL
)
497 pOutFile
= argv
[nArg
];
501 fprintf( stderr
, "no input file given\n" );
506 OString
aFile( pInFile
);
507 if( aFile
.getLength() > 0 )
509 if( aFile
.getLength() > 4 )
511 if( aFile
.matchIgnoreAsciiCase( OString( ".pdf" ), aFile
.getLength()-4 ) )
512 aOutFile
.append( pInFile
, aFile
.getLength() - 4 );
514 aOutFile
.append( aFile
);
516 aOutFile
.append( "_unzip.pdf" );
517 pOutFile
= aOutFile
.getStr();
521 fprintf( stderr
, "no output file given\n" );
526 return handleFile( pInFile
, pOutFile
, pPassword
, aHdl
);
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */