1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
22 #include <string_view>
26 #include <osl/thread.h>
27 #include <rtl/alloc.h>
28 #include <rtl/ustring.hxx>
29 #include <rtl/strbuf.hxx>
30 #include <o3tl/string_view.hxx>
32 #include <pdfparse.hxx>
34 using namespace pdfparse
;
/** Print the command line usage summary to stdout.

    @param pExe
    program name substituted into each of the five usage lines
*/
static void printHelp( const char* pExe )
{
    // One format string with five %s placeholders, one per usage line.
    fprintf( stdout,
    "USAGE: %s [-h,--help]\n"
    " %s [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    " %s <-a, --extract-add-streams> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    " %s <-f, --extract-fonts> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    " %s <-o, --extract-objects> <o0>[:<g0>][,<o1>[:<g1>][,...]] [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    " -h, --help: show help\n"
    " -a, --extract-add-streams: extracts additional streams to outputfile_object\n"
    " and prints the mimetype found to stdout\n"
    " -f, --extract-fonts: extracts fonts (currently only type1 and truetype are supported)\n"
    " -o, --extract-objects: extracts object streams, the syntax of the argument is comma separated\n"
    " object numbers, where object number and generation number are separated by \':\'\n"
    " an omitted generation number defaults to 0\n"
    " -pw, --password: use password for decryption\n"
    "\n"
    "note: -f, -a, -o and normal unzip operation are mutually exclusive\n"
    , pExe, pExe, pExe, pExe, pExe
    );
}
60 class FileEmitContext
: public EmitContext
62 oslFileHandle m_aHandle
;
63 oslFileHandle m_aReadHandle
;
64 unsigned int m_nReadLen
;
66 void openReadFile( const char* pOrigName
);
69 FileEmitContext( const char* pFileName
, const char* pOrigName
, const PDFContainer
* pTop
);
70 virtual ~FileEmitContext() override
;
72 virtual bool write( const void* pBuf
, unsigned int nLen
) noexcept override
;
73 virtual unsigned int getCurPos() noexcept override
;
74 virtual bool copyOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
) noexcept override
;
75 virtual unsigned int readOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
, void* pBuf
) noexcept override
;
80 FileEmitContext::FileEmitContext( const char* pFileName
, const char* pOrigName
, const PDFContainer
* pTop
)
81 : EmitContext( pTop
),
83 m_aReadHandle( nullptr ),
87 OStringToOUString( std::string_view( pFileName
), osl_getThreadTextEncoding() ) );
89 if( osl_getFileURLFromSystemPath( aSysFile
.pData
, &aURL
.pData
) != osl_File_E_None
)
91 fprintf( stderr
, "filename conversion \"%s\" failed\n", pFileName
);
95 if( osl_openFile( aURL
.pData
, &m_aHandle
, osl_File_OpenFlag_Write
) == osl_File_E_None
)
97 if( osl_setFileSize( m_aHandle
, 0 ) != osl_File_E_None
)
99 fprintf( stderr
, "could not truncate %s\n", pFileName
);
100 osl_closeFile( m_aHandle
);
104 else if( osl_openFile( aURL
.pData
, &m_aHandle
,
105 osl_File_OpenFlag_Write
|osl_File_OpenFlag_Create
) != osl_File_E_None
)
107 fprintf( stderr
, "could not open %s\n", pFileName
);
112 openReadFile( pOrigName
);
115 FileEmitContext::~FileEmitContext()
118 osl_closeFile( m_aHandle
);
120 osl_closeFile( m_aReadHandle
);
123 void FileEmitContext::openReadFile( const char* pInFile
)
126 OStringToOUString( std::string_view( pInFile
), osl_getThreadTextEncoding() ) );
128 if( osl_getFileURLFromSystemPath( aSysFile
.pData
, &aURL
.pData
) != osl_File_E_None
)
130 fprintf( stderr
, "filename conversion \"%s\" failed\n", pInFile
);
134 if( osl_openFile( aURL
.pData
, &m_aReadHandle
, osl_File_OpenFlag_Read
) != osl_File_E_None
)
136 fprintf( stderr
, "could not open %s\n", pInFile
);
140 if( osl_setFilePos( m_aReadHandle
, osl_Pos_End
, 0 ) != osl_File_E_None
)
142 fprintf( stderr
, "could not seek to end of %s\n", pInFile
);
143 osl_closeFile( m_aReadHandle
);
147 sal_uInt64 nFileSize
= 0;
148 if( osl_getFilePos( m_aReadHandle
, &nFileSize
) != osl_File_E_None
)
150 fprintf( stderr
, "could not get end pos of %s\n", pInFile
);
151 osl_closeFile( m_aReadHandle
);
155 m_nReadLen
= static_cast<unsigned int>(nFileSize
);
158 bool FileEmitContext::write( const void* pBuf
, unsigned int nLen
) noexcept
163 sal_uInt64 nWrite
= static_cast<sal_uInt64
>(nLen
);
164 sal_uInt64 nWritten
= 0;
165 return (osl_writeFile( m_aHandle
, pBuf
, nWrite
, &nWritten
) == osl_File_E_None
)
166 && nWrite
== nWritten
;
169 unsigned int FileEmitContext::getCurPos() noexcept
171 sal_uInt64 nFileSize
= 0;
174 if( osl_getFilePos( m_aHandle
, &nFileSize
) != osl_File_E_None
)
177 return static_cast<unsigned int>(nFileSize
);
180 bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
) noexcept
182 if( nOrigOffset
+ nLen
> m_nReadLen
)
185 if( osl_setFilePos( m_aReadHandle
, osl_Pos_Absolut
, nOrigOffset
) != osl_File_E_None
)
187 fprintf( stderr
, "could not seek to offset %u\n", nOrigOffset
);
190 void* pBuf
= std::malloc( nLen
);
193 sal_uInt64 nBytesRead
= 0;
194 if( osl_readFile( m_aReadHandle
, pBuf
, nLen
, &nBytesRead
) != osl_File_E_None
195 || nBytesRead
!= static_cast<sal_uInt64
>(nLen
) )
197 fprintf( stderr
, "could not read %u bytes\n", nLen
);
201 bool bRet
= write( pBuf
, nLen
);
206 unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
, void* pBuf
) noexcept
208 if( nOrigOffset
+ nLen
> m_nReadLen
)
211 if( osl_setFilePos( m_aReadHandle
, osl_Pos_Absolut
, nOrigOffset
) != osl_File_E_None
)
213 fprintf( stderr
, "could not seek to offset %u\n", nOrigOffset
);
216 sal_uInt64 nBytesRead
= 0;
217 if( osl_readFile( m_aReadHandle
, pBuf
, nLen
, &nBytesRead
) != osl_File_E_None
)
219 return static_cast<unsigned int>(nBytesRead
);
222 typedef int(*PDFFileHdl
)(const char*, const char*, PDFFile
*);
224 static int handleFile( const char* pInFile
, const char* pOutFile
, const char* pPassword
, PDFFileHdl pHdl
)
227 std::unique_ptr
<PDFEntry
> pEntry
= pdfparse::PDFReader::read( pInFile
);
230 PDFFile
* pPDFFile
= dynamic_cast<PDFFile
*>(pEntry
.get());
233 fprintf( stdout
, "have a %s PDF file\n", pPDFFile
->isEncrypted() ? "encrypted" : "unencrypted" );
235 fprintf( stdout
, "password %s\n",
236 pPDFFile
->setupDecryptionData( pPassword
) ? "matches" : "does not match" );
237 nRet
= pHdl( pInFile
, pOutFile
, pPDFFile
);
245 static int write_unzipFile( const char* pInFile
, const char* pOutFile
, PDFFile
* pPDFFile
)
247 FileEmitContext
aContext( pOutFile
, pInFile
, pPDFFile
);
248 aContext
.m_bDecrypt
= pPDFFile
->isEncrypted();
249 pPDFFile
->emit(aContext
);
253 static int write_addStreamArray( const char* pOutFile
, PDFArray
* pStreams
, PDFFile
* pPDFFile
, const char* pInFile
)
256 unsigned int nArrayElements
= pStreams
->m_aSubElements
.size();
257 for( unsigned int i
= 0; i
< nArrayElements
-1 && nRet
== 0; i
++ )
259 PDFName
* pMimeType
= dynamic_cast<PDFName
*>(pStreams
->m_aSubElements
[i
].get());
260 PDFObjectRef
* pStreamRef
= dynamic_cast<PDFObjectRef
*>(pStreams
->m_aSubElements
[i
+1].get());
262 fprintf( stderr
, "error: no mimetype element\n" );
264 fprintf( stderr
, "error: no stream ref element\n" );
265 if( pMimeType
&& pStreamRef
)
267 fprintf( stdout
, "found stream %d %d with mimetype %s\n",
268 pStreamRef
->m_nNumber
, pStreamRef
->m_nGeneration
,
269 pMimeType
->m_aName
.getStr() );
270 PDFObject
* pObject
= pPDFFile
->findObject( pStreamRef
->m_nNumber
, pStreamRef
->m_nGeneration
);
273 OString aOutStream
= pOutFile
+
274 OString::Concat("_stream_") +
275 OString::number( sal_Int32(pStreamRef
->m_nNumber
) ) +
277 OString::number( sal_Int32(pStreamRef
->m_nGeneration
) );
278 FileEmitContext
aContext( aOutStream
.getStr(), pInFile
, pPDFFile
);
279 aContext
.m_bDecrypt
= pPDFFile
->isEncrypted();
280 pObject
->writeStream( aContext
, pPDFFile
);
284 fprintf( stderr
, "object not found\n" );
294 static int write_addStreams( const char* pInFile
, const char* pOutFile
, PDFFile
* pPDFFile
)
298 unsigned int nElements
= pPDFFile
->m_aSubElements
.size();
299 for( unsigned i
= 0; i
< nElements
&& nRet
== 0; i
++ )
301 PDFTrailer
* pTrailer
= dynamic_cast<PDFTrailer
*>(pPDFFile
->m_aSubElements
[i
].get());
302 if( pTrailer
&& pTrailer
->m_pDict
)
304 // search for AdditionalStreams entry
305 auto add_stream
= pTrailer
->m_pDict
->m_aMap
.find( "AdditionalStreams" );
306 if( add_stream
!= pTrailer
->m_pDict
->m_aMap
.end() )
308 PDFArray
* pStreams
= dynamic_cast<PDFArray
*>(add_stream
->second
);
310 nRet
= write_addStreamArray( pOutFile
, pStreams
, pPDFFile
, pInFile
);
317 static int write_fonts( const char* i_pInFile
, const char* i_pOutFile
, PDFFile
* i_pPDFFile
)
319 unsigned int nElements
= i_pPDFFile
->m_aSubElements
.size();
320 for (unsigned i
= 0; i
< nElements
; i
++)
322 // search FontDescriptors
323 PDFObject
* pObj
= dynamic_cast<PDFObject
*>(i_pPDFFile
->m_aSubElements
[i
].get());
326 PDFDict
* pDict
= dynamic_cast<PDFDict
*>(pObj
->m_pObject
);
330 std::unordered_map
<OString
,PDFEntry
*>::iterator map_it
=
331 pDict
->m_aMap
.find( "Type" );
332 if( map_it
== pDict
->m_aMap
.end() )
335 PDFName
* pName
= dynamic_cast<PDFName
*>(map_it
->second
);
338 if( pName
->m_aName
!= "FontDescriptor" )
341 // the font name will be helpful, also there must be one in
343 map_it
= pDict
->m_aMap
.find( "FontName" );
344 if( map_it
== pDict
->m_aMap
.end() )
346 pName
= dynamic_cast<PDFName
*>(map_it
->second
);
349 OString
aFontName( pName
->m_aName
);
351 PDFObjectRef
* pStreamRef
= nullptr;
352 const char* pFileType
= nullptr;
353 // we have a font descriptor, try for a type 1 font
354 map_it
= pDict
->m_aMap
.find( "FontFile" );
355 if( map_it
!= pDict
->m_aMap
.end() )
357 pStreamRef
= dynamic_cast<PDFObjectRef
*>(map_it
->second
);
362 // perhaps it's a truetype file ?
365 map_it
= pDict
->m_aMap
.find( "FontFile2" );
366 if( map_it
!= pDict
->m_aMap
.end() )
368 pStreamRef
= dynamic_cast<PDFObjectRef
*>(map_it
->second
);
377 PDFObject
* pStream
= i_pPDFFile
->findObject( pStreamRef
);
381 OStringBuffer
aOutStream( OString::Concat(i_pOutFile
)
383 + OString::number( sal_Int32(pStreamRef
->m_nNumber
) )
385 + OString::number( sal_Int32(pStreamRef
->m_nGeneration
) )
390 aOutStream
.append( OString::Concat(".") + pFileType
);
392 FileEmitContext
aContext( aOutStream
.getStr(), i_pInFile
, i_pPDFFile
);
393 aContext
.m_bDecrypt
= i_pPDFFile
->isEncrypted();
394 pStream
->writeStream( aContext
, i_pPDFFile
);
399 static std::vector
< std::pair
< sal_Int32
, sal_Int32
> > s_aEmitObjects
;
401 static int write_objects( const char* i_pInFile
, const char* i_pOutFile
, PDFFile
* i_pPDFFile
)
403 unsigned int nElements
= s_aEmitObjects
.size();
404 for (unsigned i
= 0; i
< nElements
; i
++)
406 sal_Int32 nObject
= s_aEmitObjects
[i
].first
;
407 sal_Int32 nGeneration
= s_aEmitObjects
[i
].second
;
408 PDFObject
* pStream
= i_pPDFFile
->findObject( nObject
, nGeneration
);
411 fprintf( stderr
, "object %d %d not found !\n", static_cast<int>(nObject
), static_cast<int>(nGeneration
) );
415 OString aOutStream
= i_pOutFile
+
416 OString::Concat("_stream_") +
417 OString::number( nObject
) +
419 OString::number( nGeneration
);
420 FileEmitContext
aContext( aOutStream
.getStr(), i_pInFile
, i_pPDFFile
);
421 aContext
.m_bDecrypt
= i_pPDFFile
->isEncrypted();
422 pStream
->writeStream( aContext
, i_pPDFFile
);
427 SAL_IMPLEMENT_MAIN_WITH_ARGS( argc
, argv
)
429 const char* pInFile
= nullptr;
430 const char* pOutFile
= nullptr;
431 const char* pPassword
= nullptr;
432 OStringBuffer
aOutFile( 256 );
433 PDFFileHdl aHdl
= write_unzipFile
;
435 for( int nArg
= 1; nArg
< argc
; nArg
++ )
437 if( argv
[nArg
][0] == '-' )
439 if( ! rtl_str_compare( "-pw", argv
[nArg
] ) ||
440 ! rtl_str_compare( "--password" , argv
[nArg
] ) )
444 fprintf( stderr
, "no password given\n" );
448 pPassword
= argv
[nArg
];
450 else if( ! rtl_str_compare( "-h", argv
[nArg
] ) ||
451 ! rtl_str_compare( "--help", argv
[nArg
] ) )
453 printHelp( argv
[0] );
456 else if( ! rtl_str_compare( "-a", argv
[nArg
] ) ||
457 ! rtl_str_compare( "--extract-add-streams", argv
[nArg
] ) )
459 aHdl
= write_addStreams
;
461 else if( ! rtl_str_compare( "-f", argv
[nArg
] ) ||
462 ! rtl_str_compare( "--extract-fonts", argv
[nArg
] ) )
466 else if( ! rtl_str_compare( "-o", argv
[nArg
] ) ||
467 ! rtl_str_compare( "--extract-objects", argv
[nArg
] ) )
469 aHdl
= write_objects
;
473 OString
aObjs( argv
[nArg
] );
474 sal_Int32 nIndex
= 0;
475 while( nIndex
!= -1 )
477 OString
aToken( aObjs
.getToken( 0, ',', nIndex
) );
478 sal_Int32 nObject
= 0;
479 sal_Int32 nGeneration
= 0;
480 sal_Int32 nGenIndex
= 0;
481 nObject
= o3tl::toInt32( o3tl::getToken( aToken
, 0, ':', nGenIndex
) );
482 if( nGenIndex
!= -1 )
483 nGeneration
= o3tl::toInt32( o3tl::getToken(aToken
, 0, ':', nGenIndex
));
484 s_aEmitObjects
.push_back( std::pair
<sal_Int32
,sal_Int32
>(nObject
,nGeneration
) );
490 fprintf( stderr
, "unrecognized option \"%s\"\n",
492 printHelp( argv
[0] );
496 else if( pInFile
== nullptr )
497 pInFile
= argv
[nArg
];
498 else if( pOutFile
== nullptr )
499 pOutFile
= argv
[nArg
];
503 fprintf( stderr
, "no input file given\n" );
508 OString
aFile( pInFile
);
509 if( aFile
.getLength() > 0 )
511 if( aFile
.getLength() > 4 )
513 if( aFile
.matchIgnoreAsciiCase( ".pdf", aFile
.getLength()-4 ) )
514 aOutFile
.append( pInFile
, aFile
.getLength() - 4 );
516 aOutFile
.append( aFile
);
518 aOutFile
.append( "_unzip.pdf" );
519 pOutFile
= aOutFile
.getStr();
523 fprintf( stderr
, "no output file given\n" );
528 return handleFile( pInFile
, pOutFile
, pPassword
, aHdl
);
531 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */