1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
24 #include <osl/thread.h>
25 #include <rtl/alloc.h>
26 #include <rtl/ustring.hxx>
27 #include <rtl/strbuf.hxx>
29 #include <pdfparse.hxx>
31 using namespace pdfparse
;
// Prints the command-line usage text for this tool.
// pExe is the executable name; it is passed five times, once for each of the
// five "%s" placeholders in the usage lines below.
// NOTE(review): the start of the output call (function and target stream) is
// not visible in this view.
34 static void printHelp( const char* pExe
)
37 "USAGE: %s [-h,--help]\n"
38 " %s [-pw, --password <password>] <inputfile> [<outputfile>]\n"
39 " %s <-a, --extract-add-streams> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
40 " %s <-f, --extract-fonts> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
41 " %s <-o, --extract-objects> <o0>[:<g0>][,<o1>[:g1][,...]] [-pw, --password <password>] <inputfile> [<outputfile>]\n"
42 " -h, --help: show help\n"
43 " -a, --extract-add-streams: extracts additional streams to outputfile_object\n"
44 " and prints the mimetype found to stdout\n"
45 " -f, --extract-fonts: extracts fonts (currently only type1 and truetype are supported\n"
46 " -o, --extract-objects: extracts object streams, the syntax of the argument is comma separated\n"
47 " object numbers, where object number and generation number are separated by \':\'\n"
48 " an omitted generation number defaults to 0\n"
49 " -pw, --password: use password for decryption\n"
51 "note: -f, -a, -o and normal unzip operation are mutually exclusive\n"
52 , pExe
, pExe
, pExe
, pExe
, pExe
);
// EmitContext implementation backed by two osl file handles: one for the
// output file being written and one for the original input file, from which
// byte ranges can be copied or read back.
55 class FileEmitContext
: public EmitContext
// Handle of the output file (opened for writing in the constructor).
57 oslFileHandle m_aHandle
;
// Handle of the original input file (opened for reading by openReadFile).
58 oslFileHandle m_aReadHandle
;
// Length of the original input file as determined by openReadFile.
59 unsigned int m_nReadLen
;
// Opens the original file for reading and records its length.
61 void openReadFile( const char* pOrigName
);
// pFileName: output file (system path); pOrigName: original input file
// (system path); pTop: forwarded to the EmitContext base class.
64 FileEmitContext( const char* pFileName
, const char* pOrigName
, const PDFContainer
* pTop
);
65 virtual ~FileEmitContext() override
;
// Writes nLen bytes from pBuf to the output file.
67 virtual bool write( const void* pBuf
, unsigned int nLen
) throw() override
;
// Returns the current position in the output file.
68 virtual unsigned int getCurPos() throw() override
;
// Copies nLen bytes starting at nOrigOffset from the original file to the output.
69 virtual bool copyOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
) throw() override
;
// Reads nLen bytes starting at nOrigOffset from the original file into pBuf.
70 virtual unsigned int readOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
, void* pBuf
) throw() override
;
// Opens the output file pFileName for writing and then the original file
// pOrigName for reading (via openReadFile).
// The output path is converted from a system path (thread text encoding) to a
// file URL. An existing file is opened and truncated to size 0; if that open
// fails, the file is opened again with the create flag.
// NOTE(review): several lines are not visible in this view (the declaration
// of aURL, part of the member initializer list, and whatever follows each
// error message). Confirm that failed paths leave m_aHandle in a defined
// state and that m_nReadLen is initialized.
73 FileEmitContext::FileEmitContext( const char* pFileName
, const char* pOrigName
, const PDFContainer
* pTop
)
74 : EmitContext( pTop
),
76 m_aReadHandle( nullptr ),
79 OUString
aSysFile( OStringToOUString( OString( pFileName
), osl_getThreadTextEncoding() ) );
81 if( osl_getFileURLFromSystemPath( aSysFile
.pData
, &aURL
.pData
) != osl_File_E_None
)
83 fprintf( stderr
, "filename conversion \"%s\" failed\n", pFileName
);
// First attempt: open an existing file for writing and truncate it.
87 if( osl_openFile( aURL
.pData
, &m_aHandle
, osl_File_OpenFlag_Write
) == osl_File_E_None
)
89 if( osl_setFileSize( m_aHandle
, 0 ) != osl_File_E_None
)
91 fprintf( stderr
, "could not truncate %s\n", pFileName
);
92 osl_closeFile( m_aHandle
);
// Second attempt: the plain open failed, so retry with the create flag.
96 else if( osl_openFile( aURL
.pData
, &m_aHandle
,
97 osl_File_OpenFlag_Write
|osl_File_OpenFlag_Create
) != osl_File_E_None
)
99 fprintf( stderr
, "could not open %s\n", pFileName
);
104 openReadFile( pOrigName
);
// Closes the output handle and the read handle.
// NOTE(review): any null checks guarding these calls are not visible in this
// view; confirm a handle that was never opened is not passed to osl_closeFile.
107 FileEmitContext::~FileEmitContext()
110 osl_closeFile( m_aHandle
);
112 osl_closeFile( m_aReadHandle
);
// Opens pInFile (a system path in the thread text encoding) for reading into
// m_aReadHandle and stores its size in m_nReadLen. The size is obtained by
// seeking to the end of the file and querying the resulting file position.
// On seek or position failure the read handle is closed again.
// NOTE(review): the declaration of aURL and the statements following each
// error message are not visible in this view. Also confirm whether the handle
// is reset after osl_closeFile so later calls do not use a closed handle.
115 void FileEmitContext::openReadFile( const char* pInFile
)
117 OUString
aSysFile( OStringToOUString( OString( pInFile
), osl_getThreadTextEncoding() ) );
119 if( osl_getFileURLFromSystemPath( aSysFile
.pData
, &aURL
.pData
) != osl_File_E_None
)
121 fprintf( stderr
, "filename conversion \"%s\" failed\n", pInFile
);
125 if( osl_openFile( aURL
.pData
, &m_aReadHandle
, osl_File_OpenFlag_Read
) != osl_File_E_None
)
127 fprintf( stderr
, "could not open %s\n", pInFile
);
// Seek to the end so the file position equals the file size.
131 if( osl_setFilePos( m_aReadHandle
, osl_Pos_End
, 0 ) != osl_File_E_None
)
133 fprintf( stderr
, "could not seek to end of %s\n", pInFile
);
134 osl_closeFile( m_aReadHandle
);
138 sal_uInt64 nFileSize
= 0;
139 if( osl_getFilePos( m_aReadHandle
, &nFileSize
) != osl_File_E_None
)
141 fprintf( stderr
, "could not get end pos of %s\n", pInFile
);
142 osl_closeFile( m_aReadHandle
);
// NOTE(review): nFileSize is narrowed to unsigned int here; a file larger than
// the unsigned int range would be recorded with a truncated length.
146 m_nReadLen
= static_cast<unsigned int>(nFileSize
);
// Writes nLen bytes from pBuf to the output file.
// Returns true only when osl_writeFile reports no error and the number of
// bytes actually written equals the number requested.
// NOTE(review): lines before the first statement are not visible in this
// view; confirm whether m_aHandle is checked before use.
149 bool FileEmitContext::write( const void* pBuf
, unsigned int nLen
) throw()
154 sal_uInt64 nWrite
= static_cast<sal_uInt64
>(nLen
);
155 sal_uInt64 nWritten
= 0;
156 return (osl_writeFile( m_aHandle
, pBuf
, nWrite
, &nWritten
) == osl_File_E_None
)
157 && nWrite
== nWritten
;
// Returns the current file position of the output handle, narrowed from
// sal_uInt64 to unsigned int.
// NOTE(review): the statement executed when osl_getFilePos fails is not
// visible in this view; nFileSize starts at 0.
160 unsigned int FileEmitContext::getCurPos() throw()
162 sal_uInt64 nFileSize
= 0;
165 if( osl_getFilePos( m_aHandle
, &nFileSize
) != osl_File_E_None
)
168 return static_cast<unsigned int>(nFileSize
);
// Copies nLen bytes starting at absolute offset nOrigOffset from the original
// file to the output file: range check against m_nReadLen, seek, read into a
// temporary heap buffer, then write() that buffer.
// NOTE(review): the return statements and the release of pBuf are not visible
// in this view. Confirm that pBuf is freed on every path (including a failed
// read) and that a failed std::malloc is handled.
171 bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
) throw()
// NOTE(review): the sum is computed in unsigned int and can wrap around, in
// which case an out-of-range request would pass this check.
173 if( nOrigOffset
+ nLen
> m_nReadLen
)
176 if( osl_setFilePos( m_aReadHandle
, osl_Pos_Absolut
, nOrigOffset
) != osl_File_E_None
)
178 fprintf( stderr
, "could not seek to offset %u\n", nOrigOffset
);
181 void* pBuf
= std::malloc( nLen
);
184 sal_uInt64 nBytesRead
= 0;
// A short read (fewer bytes than requested) is treated as a failure.
185 if( osl_readFile( m_aReadHandle
, pBuf
, nLen
, &nBytesRead
) != osl_File_E_None
186 || nBytesRead
!= static_cast<sal_uInt64
>(nLen
) )
188 fprintf( stderr
, "could not read %u bytes\n", nLen
);
192 bool bRet
= write( pBuf
, nLen
);
// Reads nLen bytes starting at absolute offset nOrigOffset from the original
// file into the caller-supplied buffer pBuf and returns the number of bytes
// read (narrowed to unsigned int).
// NOTE(review): the return values on the failure paths are not visible in
// this view.
197 unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
, void* pBuf
) throw()
// NOTE(review): the sum is computed in unsigned int and can wrap around, in
// which case an out-of-range request would pass this check.
199 if( nOrigOffset
+ nLen
> m_nReadLen
)
202 if( osl_setFilePos( m_aReadHandle
, osl_Pos_Absolut
, nOrigOffset
) != osl_File_E_None
)
204 fprintf( stderr
, "could not seek to offset %u\n", nOrigOffset
);
207 sal_uInt64 nBytesRead
= 0;
208 if( osl_readFile( m_aReadHandle
, pBuf
, nLen
, &nBytesRead
) != osl_File_E_None
)
210 return static_cast<unsigned int>(nBytesRead
);
// Signature of the per-mode handlers selected in main (write_unzipFile,
// write_addStreams, write_objects): handleFile calls it with the input file
// name, the output file name and the parsed PDF file, and uses the int result.
213 typedef int(*PDFFileHdl
)(const char*, const char*, PDFFile
*);
// Parses pInFile with PDFReader::read, reports on stdout whether the file is
// encrypted and whether pPassword matches, then runs the handler pHdl on the
// parsed file with (pInFile, pOutFile, pPDFFile) and stores its result in nRet.
// NOTE(review): the declaration of nRet, the conditions guarding each step
// (parse failure, entry not being a PDFFile, missing password) and the return
// statement are not visible in this view.
215 static int handleFile( const char* pInFile
, const char* pOutFile
, const char* pPassword
, PDFFileHdl pHdl
)
218 std::unique_ptr
<PDFEntry
> pEntry
= pdfparse::PDFReader::read( pInFile
);
// The parsed top-level entry is only usable here if it is a PDFFile.
221 PDFFile
* pPDFFile
= dynamic_cast<PDFFile
*>(pEntry
.get());
224 fprintf( stdout
, "have a %s PDF file\n", pPDFFile
->isEncrypted() ? "encrypted" : "unencrypted" );
226 fprintf( stdout
, "password %s\n",
227 pPDFFile
->setupDecryptionData( pPassword
) ? "matches" : "does not match" );
228 nRet
= pHdl( pInFile
, pOutFile
, pPDFFile
);
// Default handler: emits the whole parsed PDF file to pOutFile through a
// FileEmitContext that reads original bytes from pInFile. Decryption is
// switched on in the context exactly when the file reports being encrypted.
// NOTE(review): the return statement is not visible in this view.
236 static int write_unzipFile( const char* pInFile
, const char* pOutFile
, PDFFile
* pPDFFile
)
238 FileEmitContext
aContext( pOutFile
, pInFile
, pPDFFile
);
239 aContext
.m_bDecrypt
= pPDFFile
->isEncrypted();
240 pPDFFile
->emit(aContext
);
// Walks the array pStreams, reading element i as a mimetype name and element
// i+1 as a stream reference. For each such pair the referenced object is
// looked up in pPDFFile, the pair is reported on stdout, and the object's
// stream is written to a file whose name starts with
// "<pOutFile>_stream_<object number>" and ends with the generation number.
// The loop stops as soon as nRet becomes non-zero.
// NOTE(review): the declaration of nRet, the assignments of error codes, the
// conditions around the error messages, the separator placed between object
// number and generation number, and the return statement are not visible in
// this view.
244 static int write_addStreamArray( const char* pOutFile
, PDFArray
* pStreams
, PDFFile
* pPDFFile
, const char* pInFile
)
247 unsigned int nArrayElements
= pStreams
->m_aSubElements
.size();
// NOTE(review): nArrayElements is unsigned, so nArrayElements-1 wraps to a
// huge value for an empty array and the loop body would index out of range.
// NOTE(review): the visible increment advances by one, so consecutive
// (mimetype, reference) pairs overlap unless i is also advanced elsewhere.
248 for( unsigned int i
= 0; i
< nArrayElements
-1 && nRet
== 0; i
++ )
250 PDFName
* pMimeType
= dynamic_cast<PDFName
*>(pStreams
->m_aSubElements
[i
].get());
251 PDFObjectRef
* pStreamRef
= dynamic_cast<PDFObjectRef
*>(pStreams
->m_aSubElements
[i
+1].get());
253 fprintf( stderr
, "error: no mimetype element\n" );
255 fprintf( stderr
, "error: no stream ref element\n" );
// Both casts succeeded: this is a usable (mimetype, stream reference) pair.
256 if( pMimeType
&& pStreamRef
)
258 fprintf( stdout
, "found stream %d %d with mimetype %s\n",
259 pStreamRef
->m_nNumber
, pStreamRef
->m_nGeneration
,
260 pMimeType
->m_aName
.getStr() );
261 PDFObject
* pObject
= pPDFFile
->findObject( pStreamRef
->m_nNumber
, pStreamRef
->m_nGeneration
);
// Build the per-stream output file name from the base output name.
264 OString aOutStream
= pOutFile
+
265 OStringLiteral("_stream_") +
266 OString::number( sal_Int32(pStreamRef
->m_nNumber
) ) +
268 OString::number( sal_Int32(pStreamRef
->m_nGeneration
) );
269 FileEmitContext
aContext( aOutStream
.getStr(), pInFile
, pPDFFile
);
270 aContext
.m_bDecrypt
= pPDFFile
->isEncrypted();
271 pObject
->writeStream( aContext
, pPDFFile
);
275 fprintf( stderr
, "object not found\n" );
// Handler for -a / --extract-add-streams: scans the top-level sub-elements of
// pPDFFile for trailers that have a dictionary containing an
// "AdditionalStreams" entry and passes that entry, cast to PDFArray, to
// write_addStreamArray. Scanning stops once nRet becomes non-zero.
// NOTE(review): the declaration of nRet, the check that the cast to PDFArray
// succeeded, and the return statement are not visible in this view.
285 static int write_addStreams( const char* pInFile
, const char* pOutFile
, PDFFile
* pPDFFile
)
289 unsigned int nElements
= pPDFFile
->m_aSubElements
.size();
290 for( unsigned i
= 0; i
< nElements
&& nRet
== 0; i
++ )
// Only trailer elements that carry a dictionary are of interest.
292 PDFTrailer
* pTrailer
= dynamic_cast<PDFTrailer
*>(pPDFFile
->m_aSubElements
[i
].get());
293 if( pTrailer
&& pTrailer
->m_pDict
)
295 // search for AdditionalStreams entry
296 auto add_stream
= pTrailer
->m_pDict
->m_aMap
.find( "AdditionalStreams" );
297 if( add_stream
!= pTrailer
->m_pDict
->m_aMap
.end() )
299 PDFArray
* pStreams
= dynamic_cast<PDFArray
*>(add_stream
->second
);
301 nRet
= write_addStreamArray( pOutFile
, pStreams
, pPDFFile
, pInFile
);
// Handler for font extraction: scans the top-level sub-elements of i_pPDFFile
// for objects whose dictionary has a "Type" entry compared against
// "FontDescriptor". For such a dictionary the "FontName" entry is read, then
// a stream reference is taken from "FontFile" or from "FontFile2", the
// referenced object is looked up, and its stream is written to a file named
// from i_pOutFile + "_font_" + object number + "_" + generation + "_" + font
// name, followed by "." and pFileType.
// NOTE(review): the null checks after each dynamic_cast, the statements taken
// when a lookup fails, the assignments to pFileType, the condition guarding
// the "." + pFileType suffix, and the return statement are not visible here.
308 static int write_fonts( const char* i_pInFile
, const char* i_pOutFile
, PDFFile
* i_pPDFFile
)
310 unsigned int nElements
= i_pPDFFile
->m_aSubElements
.size();
311 for (unsigned i
= 0; i
< nElements
; i
++)
313 // search FontDescriptors
314 PDFObject
* pObj
= dynamic_cast<PDFObject
*>(i_pPDFFile
->m_aSubElements
[i
].get());
317 PDFDict
* pDict
= dynamic_cast<PDFDict
*>(pObj
->m_pObject
);
321 std::unordered_map
<OString
,PDFEntry
*>::iterator map_it
=
322 pDict
->m_aMap
.find( "Type" );
323 if( map_it
== pDict
->m_aMap
.end() )
326 PDFName
* pName
= dynamic_cast<PDFName
*>(map_it
->second
);
329 if( pName
->m_aName
!= "FontDescriptor" )
332 // the font name will be helpful, also there must be one in
334 map_it
= pDict
->m_aMap
.find( "FontName" );
335 if( map_it
== pDict
->m_aMap
.end() )
337 pName
= dynamic_cast<PDFName
*>(map_it
->second
);
340 OString
aFontName( pName
->m_aName
);
342 PDFObjectRef
* pStreamRef
= nullptr;
343 const char* pFileType
= nullptr;
344 // we have a font descriptor, try for a type 1 font
345 map_it
= pDict
->m_aMap
.find( "FontFile" );
346 if( map_it
!= pDict
->m_aMap
.end() )
348 pStreamRef
= dynamic_cast<PDFObjectRef
*>(map_it
->second
);
353 // perhaps it's a truetype file ?
356 map_it
= pDict
->m_aMap
.find( "FontFile2" );
357 if( map_it
!= pDict
->m_aMap
.end() )
359 pStreamRef
= dynamic_cast<PDFObjectRef
*>(map_it
->second
);
368 PDFObject
* pStream
= i_pPDFFile
->findObject( pStreamRef
);
// Assemble the output file name for this font.
372 OStringBuffer
aOutStream( i_pOutFile
);
373 aOutStream
.append( "_font_" );
374 aOutStream
.append( sal_Int32(pStreamRef
->m_nNumber
) );
375 aOutStream
.append( "_" );
376 aOutStream
.append( sal_Int32(pStreamRef
->m_nGeneration
) );
377 aOutStream
.append( "_" );
378 aOutStream
.append( aFontName
);
381 aOutStream
.append( "." );
382 aOutStream
.append( pFileType
);
384 FileEmitContext
aContext( aOutStream
.getStr(), i_pInFile
, i_pPDFFile
);
385 aContext
.m_bDecrypt
= i_pPDFFile
->isEncrypted();
386 pStream
->writeStream( aContext
, i_pPDFFile
);
// List of (object number, generation number) pairs to extract. It is filled
// in main while parsing the -o / --extract-objects argument and consumed by
// write_objects.
391 static std::vector
< std::pair
< sal_Int32
, sal_Int32
> > s_aEmitObjects
;
// Handler for -o / --extract-objects: for every (object, generation) pair in
// s_aEmitObjects, looks the object up in i_pPDFFile and writes its stream to
// a file whose name starts with "<i_pOutFile>_stream_<object number>" and
// ends with the generation number.
// NOTE(review): the null check around the "not found" message, the separator
// placed between object number and generation number, and the return
// statement are not visible in this view.
393 static int write_objects( const char* i_pInFile
, const char* i_pOutFile
, PDFFile
* i_pPDFFile
)
395 unsigned int nElements
= s_aEmitObjects
.size();
396 for (unsigned i
= 0; i
< nElements
; i
++)
398 sal_Int32 nObject
= s_aEmitObjects
[i
].first
;
399 sal_Int32 nGeneration
= s_aEmitObjects
[i
].second
;
400 PDFObject
* pStream
= i_pPDFFile
->findObject( nObject
, nGeneration
);
403 fprintf( stderr
, "object %d %d not found !\n", static_cast<int>(nObject
), static_cast<int>(nGeneration
) );
// Build the per-object output file name from the base output name.
407 OString aOutStream
= i_pOutFile
+
408 OStringLiteral("_stream_") +
409 OString::number( nObject
) +
411 OString::number( nGeneration
);
412 FileEmitContext
aContext( aOutStream
.getStr(), i_pInFile
, i_pPDFFile
);
413 aContext
.m_bDecrypt
= i_pPDFFile
->isEncrypted();
414 pStream
->writeStream( aContext
, i_pPDFFile
);
// Program entry point: parses the command line, picks a handler, derives a
// default output file name when none was given, and runs handleFile.
// Arguments starting with '-' are options; an option matches when
// rtl_str_compare against its short or long spelling yields 0. The first
// non-option argument is the input file and the second is the output file.
// The handler defaults to write_unzipFile.
// NOTE(review): many lines are not visible in this view (the step that moves
// to the argument following -pw and -o, the handler assignment for -f, the
// early returns after help and error messages, and the conditions guarding
// the "no input file" / "no output file" messages).
419 SAL_IMPLEMENT_MAIN_WITH_ARGS( argc
, argv
)
421 const char* pInFile
= nullptr;
422 const char* pOutFile
= nullptr;
423 const char* pPassword
= nullptr;
424 OStringBuffer
aOutFile( 256 );
425 PDFFileHdl aHdl
= write_unzipFile
;
427 for( int nArg
= 1; nArg
< argc
; nArg
++ )
429 if( argv
[nArg
][0] == '-' )
// -pw / --password: the password is taken from argv[nArg] below.
431 if( ! rtl_str_compare( "-pw", argv
[nArg
] ) ||
432 ! rtl_str_compare( "--password" , argv
[nArg
] ) )
436 fprintf( stderr
, "no password given\n" );
440 pPassword
= argv
[nArg
];
// -h / --help: print the usage text.
442 else if( ! rtl_str_compare( "-h", argv
[nArg
] ) ||
443 ! rtl_str_compare( "--help", argv
[nArg
] ) )
445 printHelp( argv
[0] );
// -a / --extract-add-streams: select the additional-streams handler.
448 else if( ! rtl_str_compare( "-a", argv
[nArg
] ) ||
449 ! rtl_str_compare( "--extract-add-streams", argv
[nArg
] ) )
451 aHdl
= write_addStreams
;
// -f / --extract-fonts.
453 else if( ! rtl_str_compare( "-f", argv
[nArg
] ) ||
454 ! rtl_str_compare( "--extract-fonts", argv
[nArg
] ) )
// -o / --extract-objects: select the object handler and parse the object list.
458 else if( ! rtl_str_compare( "-o", argv
[nArg
] ) ||
459 ! rtl_str_compare( "--extract-objects", argv
[nArg
] ) )
461 aHdl
= write_objects
;
// The list is comma separated; each token is "<object>[:<generation>]" and
// the generation stays 0 when the token has no ':' part.
465 OString
aObjs( argv
[nArg
] );
466 sal_Int32 nIndex
= 0;
467 while( nIndex
!= -1 )
469 OString
aToken( aObjs
.getToken( 0, ',', nIndex
) );
470 sal_Int32 nObject
= 0;
471 sal_Int32 nGeneration
= 0;
472 sal_Int32 nGenIndex
= 0;
473 nObject
= aToken
.getToken( 0, ':', nGenIndex
).toInt32();
474 if( nGenIndex
!= -1 )
475 nGeneration
= aToken
.getToken( 0, ':', nGenIndex
).toInt32();
476 s_aEmitObjects
.push_back( std::pair
<sal_Int32
,sal_Int32
>(nObject
,nGeneration
) );
482 fprintf( stderr
, "unrecognized option \"%s\"\n",
484 printHelp( argv
[0] );
// Positional arguments: first the input file, then the output file.
488 else if( pInFile
== nullptr )
489 pInFile
= argv
[nArg
];
490 else if( pOutFile
== nullptr )
491 pOutFile
= argv
[nArg
];
495 fprintf( stderr
, "no input file given\n" );
// Derive an output name from the input name: a trailing ".pdf" (compared
// ignoring ASCII case) is dropped when the name is longer than four
// characters, then "_unzip.pdf" is appended.
// NOTE(review): the else branches selecting between the two append calls are
// not visible in this view.
500 OString
aFile( pInFile
);
501 if( aFile
.getLength() > 0 )
503 if( aFile
.getLength() > 4 )
505 if( aFile
.matchIgnoreAsciiCase( ".pdf", aFile
.getLength()-4 ) )
506 aOutFile
.append( pInFile
, aFile
.getLength() - 4 );
508 aOutFile
.append( aFile
);
510 aOutFile
.append( "_unzip.pdf" );
511 pOutFile
= aOutFile
.getStr();
515 fprintf( stderr
, "no output file given\n" );
520 return handleFile( pInFile
, pOutFile
, pPassword
, aHdl
);
523 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */