1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
21 #include "filterdet.hxx"
22 #include "inc/pdfihelper.hxx"
23 #include "inc/pdfparse.hxx"
25 #include <osl/diagnose.h>
27 #include <osl/thread.h>
28 #include <rtl/digest.h>
29 #include <rtl/ref.hxx>
30 #include <com/sun/star/uno/RuntimeException.hpp>
31 #include <com/sun/star/io/XInputStream.hpp>
32 #include <com/sun/star/io/XStream.hpp>
33 #include <com/sun/star/io/XSeekable.hpp>
34 #include <com/sun/star/io/TempFile.hpp>
35 #include <cppuhelper/supportsservice.hxx>
36 #include <boost/scoped_ptr.hpp>
39 using namespace com::sun::star
;
44 // TODO(T3): locking/thread safety
// EmitContext implementation that writes emitted bytes to a temporary
// stream (created via io::TempFile) and can read bytes back from the
// original file identified by rOrigFile.
46 class FileEmitContext
: public pdfparse::EmitContext
// handle used to read from the original file
49 oslFileHandle m_aReadHandle
;
// length of the original file as determined in the constructor
50 unsigned int m_nReadLen
;
// stream that receives the emitted data
51 uno::Reference
< io::XStream
> m_xContextStream
;
// seek interface queried from m_xOut; used to report the current position
52 uno::Reference
< io::XSeekable
> m_xSeek
;
// output side of m_xContextStream
53 uno::Reference
< io::XOutputStream
> m_xOut
;
// rOrigFile is passed to osl_openFile, xContext is used to create the
// temporary file, pTop is forwarded to pdfparse::EmitContext
56 FileEmitContext( const OUString
& rOrigFile
,
57 const uno::Reference
< uno::XComponentContext
>& xContext
,
58 const pdfparse::PDFContainer
* pTop
);
59 virtual ~FileEmitContext();
// pdfparse::EmitContext overrides
61 virtual bool write( const void* pBuf
, unsigned int nLen
) SAL_OVERRIDE
;
62 virtual unsigned int getCurPos() SAL_OVERRIDE
;
63 virtual bool copyOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
) SAL_OVERRIDE
;
64 virtual unsigned int readOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
, void* pBuf
) SAL_OVERRIDE
;
// accessor for the stream holding everything written so far
66 const uno::Reference
< io::XStream
>& getContextStream() const { return m_xContextStream
; }
// Sets up the temporary output stream and opens the original file for
// reading, recording its size in m_nReadLen.
69 FileEmitContext::FileEmitContext( const OUString
& rOrigFile
,
70 const uno::Reference
< uno::XComponentContext
>& xContext
,
71 const pdfparse::PDFContainer
* pTop
) :
72 pdfparse::EmitContext( pTop
),
// create the temp file; UNO_QUERY_THROW raises if the interface is missing
79 m_xContextStream
= uno::Reference
< io::XStream
>(
80 io::TempFile::create(xContext
), uno::UNO_QUERY_THROW
);
81 m_xOut
= m_xContextStream
->getOutputStream();
82 m_xSeek
= uno::Reference
<io::XSeekable
>(m_xOut
, uno::UNO_QUERY_THROW
);
// open the original file read-only
84 oslFileError aErr
= osl_File_E_None
;
85 if( (aErr
=osl_openFile( rOrigFile
.pData
,
87 osl_File_OpenFlag_Read
)) == osl_File_E_None
)
// NOTE(review): presumably seeks to the end of the file so that the
// following osl_getFilePos yields the file size - confirm the whence argument
89 if( (aErr
=osl_setFilePos( m_aReadHandle
,
91 0 )) == osl_File_E_None
)
93 sal_uInt64 nFileSize
= 0;
94 if( (aErr
=osl_getFilePos( m_aReadHandle
,
95 &nFileSize
)) == osl_File_E_None
)
// NOTE(review): the 64 bit position is narrowed to unsigned int here
97 m_nReadLen
= static_cast<unsigned int>(nFileSize
);
// on any error give up the read handle again
100 if( aErr
!= osl_File_E_None
)
102 osl_closeFile( m_aReadHandle
);
103 m_aReadHandle
= NULL
;
// Releases the read handle on the original file.
109 FileEmitContext::~FileEmitContext()
112 osl_closeFile( m_aReadHandle
);
// Appends nLen bytes starting at pBuf to the output stream.
// NOTE(review): the returned value is presumably a success flag - confirm
115 bool FileEmitContext::write( const void* pBuf
, unsigned int nLen
)
// XOutputStream::writeBytes takes a byte sequence, so copy the raw buffer first
120 uno::Sequence
< sal_Int8
> aSeq( nLen
);
121 memcpy( aSeq
.getArray(), pBuf
, nLen
);
122 m_xOut
->writeBytes( aSeq
);
// Reports the current position of the output stream via m_xSeek.
126 unsigned int FileEmitContext::getCurPos()
128 unsigned int nPos
= 0;
// getPosition() result is narrowed to unsigned int
131 nPos
= static_cast<unsigned int>( m_xSeek
->getPosition() );
136 bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
)
138 if( nOrigOffset
+ nLen
> m_nReadLen
)
141 if( osl_setFilePos( m_aReadHandle
, osl_Pos_Absolut
, nOrigOffset
) != osl_File_E_None
)
144 uno::Sequence
< sal_Int8
> aSeq( nLen
);
146 sal_uInt64 nBytesRead
= 0;
147 if( osl_readFile( m_aReadHandle
,
150 &nBytesRead
) != osl_File_E_None
151 || nBytesRead
!= static_cast<sal_uInt64
>(nLen
) )
156 m_xOut
->writeBytes( aSeq
);
160 unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
, void* pBuf
)
162 if( nOrigOffset
+ nLen
> m_nReadLen
)
165 if( osl_setFilePos( m_aReadHandle
,
167 nOrigOffset
) != osl_File_E_None
)
172 sal_uInt64 nBytesRead
= 0;
173 if( osl_readFile( m_aReadHandle
,
176 &nBytesRead
) != osl_File_E_None
)
180 return static_cast<unsigned int>(nBytesRead
);
// Hands the mutex to the PDFDetectorBase base and keeps the component
// context for later use in detect().
187 PDFDetector::PDFDetector( const uno::Reference
< uno::XComponentContext
>& xContext
) :
188 PDFDetectorBase( m_aMutex
),
189 m_xContext( xContext
)
192 // XExtendedFilterDetection
// Inspects the media descriptor in rFilterData, looks for a PDF signature
// in the input stream, checks for an embedded ("hybrid") document stream
// and stores the chosen filter name (plus embedded stream and password,
// where available) back into rFilterData.
// NOTE(review): the returned string is presumably aOutTypeName - confirm
193 OUString SAL_CALL
PDFDetector::detect( uno::Sequence
< beans::PropertyValue
>& rFilterData
) throw( uno::RuntimeException
, std::exception
)
195 osl::MutexGuard
const guard( m_aMutex
);
196 bool bSuccess
= false;
198 // get the InputStream carrying the PDF content
199 uno::Reference
< io::XInputStream
> xInput
;
200 uno::Reference
< io::XStream
> xEmbedStream
;
201 OUString aOutFilterName
, aOutTypeName
;
// walk the descriptor once and pick out the properties of interest
204 const beans::PropertyValue
* pAttribs
= rFilterData
.getConstArray();
205 sal_Int32 nAttribs
= rFilterData
.getLength();
// index of an already present entry, -1 if none was found
206 sal_Int32 nFilterNamePos
= -1;
207 sal_Int32 nPwdPos
= -1;
208 for( sal_Int32 i
= 0; i
< nAttribs
; i
++ )
210 #if OSL_DEBUG_LEVEL > 1
211 OUString
aVal( "<no string>" );
212 pAttribs
[i
].Value
>>= aVal
;
213 OSL_TRACE( "doDetection: Attrib: %s = %s\n",
214 OUStringToOString( pAttribs
[i
].Name
, RTL_TEXTENCODING_UTF8
).getStr(),
215 OUStringToOString( aVal
, RTL_TEXTENCODING_UTF8
).getStr() );
217 if ( pAttribs
[i
].Name
== "InputStream" )
218 pAttribs
[i
].Value
>>= xInput
;
219 else if ( pAttribs
[i
].Name
== "URL" )
220 pAttribs
[i
].Value
>>= aURL
;
221 else if ( pAttribs
[i
].Name
== "FilterName" )
223 else if ( pAttribs
[i
].Name
== "Password" )
226 pAttribs
[i
].Value
>>= aPwd
;
231 uno::Reference
< io::XSeekable
> xSeek( xInput
, uno::UNO_QUERY
);
234 // read the first 1024 byte (see PDF reference implementation note 12)
235 const sal_Int32 nHeaderSize
= 1024;
236 uno::Sequence
< sal_Int8
> aBuf( nHeaderSize
);
237 sal_uInt64 nBytes
= 0;
238 nBytes
= xInput
->readBytes( aBuf
, nHeaderSize
);
// search the header bytes for the PDF signature
// NOTE(review): nBytes-5 is unsigned; this relies on nBytes being larger
// than 5 when the loop is reached - confirm a guard exists
241 const sal_Int8
* pBytes
= aBuf
.getConstArray();
242 for( unsigned int i
= 0; i
< nBytes
-5; i
++ )
244 if( pBytes
[i
] == '%' &&
245 pBytes
[i
+1] == 'P' &&
246 pBytes
[i
+2] == 'D' &&
247 pBytes
[i
+3] == 'F' &&
256 // check for hybrid PDF
257 oslFileHandle aFile
= NULL
;
// no usable file: URL, so the stream content goes to a temp file first
259 ( aURL
.isEmpty() || !aURL
.startsWith( "file:" ) )
262 sal_uInt64 nWritten
= 0;
263 if( osl_createTempFile( NULL
, &aFile
, &aURL
.pData
) != osl_File_E_None
)
269 #if OSL_DEBUG_LEVEL > 1
270 OSL_TRACE( "created temp file %s\n",
271 OUStringToOString( aURL
, RTL_TEXTENCODING_UTF8
).getStr() );
// the header bytes were already consumed from xInput, write them out first
273 osl_writeFile( aFile
, aBuf
.getConstArray(), nBytes
, &nWritten
);
275 OSL_ENSURE( nWritten
== nBytes
, "writing of header bytes failed" );
277 if( nWritten
== nBytes
)
// copy the remainder in 4096 byte chunks until a short read occurs
279 const sal_uInt32 nBufSize
= 4096;
280 aBuf
= uno::Sequence
<sal_Int8
>(nBufSize
);
284 nBytes
= xInput
->readBytes( aBuf
, nBufSize
);
287 osl_writeFile( aFile
, aBuf
.getConstArray(), nBytes
, &nWritten
);
288 if( nWritten
!= nBytes
)
294 } while( nBytes
== nBufSize
);
297 osl_closeFile( aFile
);
// look for an embedded document stream; bMayUseUI is passed as false
299 OUString aEmbedMimetype
;
300 xEmbedStream
= getAdditionalStream( aURL
, aEmbedMimetype
, aPwd
, m_xContext
, rFilterData
, false );
// NOTE(review): presumably only executed for the temp file created above - confirm
302 osl_removeFile( aURL
.pData
);
// map the mimetype of the embedded document to the matching import filter
303 if( !aEmbedMimetype
.isEmpty() )
305 if( aEmbedMimetype
== "application/vnd.oasis.opendocument.text"
306 || aEmbedMimetype
== "application/vnd.oasis.opendocument.text-master" )
307 aOutFilterName
= "writer_pdf_addstream_import";
308 else if ( aEmbedMimetype
== "application/vnd.oasis.opendocument.presentation" )
309 aOutFilterName
= "impress_pdf_addstream_import";
310 else if( aEmbedMimetype
== "application/vnd.oasis.opendocument.graphics"
311 || aEmbedMimetype
== "application/vnd.oasis.opendocument.drawing" )
312 aOutFilterName
= "draw_pdf_addstream_import";
313 else if ( aEmbedMimetype
== "application/vnd.oasis.opendocument.spreadsheet" )
314 aOutFilterName
= "calc_pdf_addstream_import";
// a filter for the embedded document was chosen: publish it in rFilterData
320 if( !aOutFilterName
.isEmpty() )
// append a FilterName entry if the descriptor had none
322 if( nFilterNamePos
== -1 )
324 nFilterNamePos
= nAttribs
;
325 rFilterData
.realloc( ++nAttribs
);
326 rFilterData
[ nFilterNamePos
].Name
= "FilterName";
328 aOutTypeName
= "pdf_Portable_Document_Format";
330 OSL_TRACE( "setting filter name %s, input stream %s\n",
331 OUStringToOString( aOutFilterName
, RTL_TEXTENCODING_UTF8
).getStr(),
332 xEmbedStream
.is() ? "present" : "not present" );
334 rFilterData
[nFilterNamePos
].Value
<<= aOutFilterName
;
// hand the extracted stream on as an additional property
335 if( xEmbedStream
.is() )
337 rFilterData
.realloc( ++nAttribs
);
338 rFilterData
[nAttribs
-1].Name
= "EmbeddedSubstream";
339 rFilterData
[nAttribs
-1].Value
<<= xEmbedStream
;
// store the password in the descriptor
341 if( !aPwd
.isEmpty() )
346 rFilterData
.realloc( ++nAttribs
);
347 rFilterData
[ nPwdPos
].Name
= "Password";
349 rFilterData
[ nPwdPos
].Value
<<= aPwd
;
// NOTE(review): the following presumably belongs to the branch taken when
// no embedded document filter was chosen - confirm
354 if( nFilterNamePos
== -1 )
356 nFilterNamePos
= nAttribs
;
357 rFilterData
.realloc( ++nAttribs
);
358 rFilterData
[ nFilterNamePos
].Name
= "FilterName";
// the document type is fixed to 0; the dialog query is disabled
361 const sal_Int32 nDocumentType
= 0; //const sal_Int32 nDocumentType = queryDocumentTypeDialog(m_xContext,aURL);
362 if( nDocumentType
< 0 )
366 else switch( nDocumentType
)
369 rFilterData
[nFilterNamePos
].Value
<<= OUString( "draw_pdf_import" );
373 rFilterData
[nFilterNamePos
].Value
<<= OUString( "impress_pdf_import" );
377 rFilterData
[nFilterNamePos
].Value
<<= OUString( "writer_pdf_import" );
381 OSL_FAIL("Unexpected case");
384 aOutTypeName
= "pdf_Portable_Document_Format";
391 OUString
PDFDetector::getImplementationName()
392 throw (css::uno::RuntimeException
, std::exception
)
394 return OUString("org.libreoffice.comp.documents.PDFDetector");
397 sal_Bool
PDFDetector::supportsService(OUString
const & ServiceName
)
398 throw (css::uno::RuntimeException
, std::exception
)
400 return cppu::supportsService(this, ServiceName
);
403 css::uno::Sequence
<OUString
> PDFDetector::getSupportedServiceNames()
404 throw (css::uno::RuntimeException
, std::exception
)
406 return css::uno::Sequence
<OUString
>{"com.sun.star.document.ImportFilter"};
// Compares the MD5 checksum given as hex string in rChkSum with the MD5
// digest computed over the leading nBytes bytes of the file rInPDFFileURL.
// NOTE(review): the result is presumably bRet, i.e. whether both match - confirm
409 bool checkDocChecksum( const OUString
& rInPDFFileURL
,
411 const OUString
& rChkSum
)
// two hex digits per digest byte are expected
414 if( rChkSum
.getLength() != 2* RTL_DIGEST_LENGTH_MD5
)
418 "checksum of length " << rChkSum
.getLength() << ", expected "
419 << 2*RTL_DIGEST_LENGTH_MD5
);
423 // prepare checksum to test
424 sal_uInt8 nTestChecksum
[ RTL_DIGEST_LENGTH_MD5
];
425 const sal_Unicode
* pChar
= rChkSum
.getStr();
426 for( unsigned int i
= 0; i
< RTL_DIGEST_LENGTH_MD5
; i
++ )
// first hex digit of the byte; accepts 0-9, A-F and a-f
428 sal_uInt8 nByte
= sal_uInt8( ( (*pChar
>= '0' && *pChar
<= '9') ? *pChar
- '0' :
429 ( (*pChar
>= 'A' && *pChar
<= 'F') ? *pChar
- 'A' + 10 :
430 ( (*pChar
>= 'a' && *pChar
<= 'f') ? *pChar
- 'a' + 10 :
// second hex digit is or-ed into the same byte
434 nByte
|= ( (*pChar
>= '0' && *pChar
<= '9') ? *pChar
- '0' :
435 ( (*pChar
>= 'A' && *pChar
<= 'F') ? *pChar
- 'A' + 10 :
436 ( (*pChar
>= 'a' && *pChar
<= 'f') ? *pChar
- 'a' + 10 :
439 nTestChecksum
[i
] = nByte
;
442 // open file and calculate actual checksum up to index nBytes
443 sal_uInt8 nActualChecksum
[ RTL_DIGEST_LENGTH_MD5
];
444 memset( nActualChecksum
, 0, sizeof(nActualChecksum
) );
445 rtlDigest aActualDigest
= rtl_digest_createMD5();
446 oslFileHandle aRead
= NULL
;
447 oslFileError aErr
= osl_File_E_None
;
448 if( (aErr
= osl_openFile(rInPDFFileURL
.pData
,
450 osl_File_OpenFlag_Read
)) == osl_File_E_None
)
454 sal_uInt64 nBytesRead
= 0;
// feed the file into the digest in pieces of at most sizeof(aBuf) bytes
455 while( nCur
< nBytes
)
457 sal_uInt32 nPass
= (nBytes
- nCur
) > sizeof( aBuf
) ? sizeof( aBuf
) : nBytes
- nCur
;
458 if( (aErr
= osl_readFile( aRead
, aBuf
, nPass
, &nBytesRead
)) != osl_File_E_None
// only the bytes actually read go into the digest
463 nPass
= static_cast<sal_uInt32
>(nBytesRead
);
465 rtl_digest_updateMD5( aActualDigest
, aBuf
, nPass
);
467 rtl_digest_getMD5( aActualDigest
, nActualChecksum
, sizeof(nActualChecksum
) );
468 osl_closeFile( aRead
);
470 rtl_digest_destroyMD5( aActualDigest
);
472 // compare the contents
473 bRet
= (0 == memcmp( nActualChecksum
, nTestChecksum
, sizeof( nActualChecksum
) ));
474 #if OSL_DEBUG_LEVEL > 1
475 OSL_TRACE( "test checksum: " );
476 for( unsigned int i
= 0; i
< sizeof(nTestChecksum
); i
++ )
477 OSL_TRACE( "%.2X", int(nTestChecksum
[i
]) );
479 OSL_TRACE( "file checksum: " );
480 for( unsigned int i
= 0; i
< sizeof(nActualChecksum
); i
++ )
481 OSL_TRACE( "%.2X", int(nActualChecksum
[i
]) );
// Parses the PDF file at rInPDFFileURL and looks in its trailer
// dictionaries for an "AdditionalStreams" entry guarded by a
// "DocChecksum". On success the mimetype of the embedded document is
// stored in rOutMimetype and the stream content is written to a
// FileEmitContext whose stream is assigned to xEmbed.
// NOTE(review): xEmbed is presumably the returned value - confirm
487 uno::Reference
< io::XStream
> getAdditionalStream( const OUString
& rInPDFFileURL
,
488 OUString
& rOutMimetype
,
490 const uno::Reference
<uno::XComponentContext
>& xContext
,
491 const uno::Sequence
<beans::PropertyValue
>& rFilterData
,
494 uno::Reference
< io::XStream
> xEmbed
;
// the parser takes a system path in the thread's text encoding
497 if( osl_getSystemPathFromFileURL( rInPDFFileURL
.pData
, &aSysUPath
.pData
) != osl_File_E_None
)
499 aPDFFile
= OUStringToOString( aSysUPath
, osl_getThreadTextEncoding() );
501 pdfparse::PDFReader aParser
;
502 boost::scoped_ptr
<pdfparse::PDFEntry
> pEntry( pdfparse::PDFReader::read( aPDFFile
.getStr() ));
505 pdfparse::PDFFile
* pPDFFile
= dynamic_cast<pdfparse::PDFFile
*>(pEntry
.get());
// visit the sub elements from last to first, looking for trailers
508 unsigned int nElements
= pPDFFile
->m_aSubElements
.size();
509 while( nElements
-- > 0 )
511 pdfparse::PDFTrailer
* pTrailer
= dynamic_cast<pdfparse::PDFTrailer
*>(pPDFFile
->m_aSubElements
[nElements
]);
512 if( pTrailer
&& pTrailer
->m_pDict
)
514 // search document checksum entry
515 std::unordered_map
< OString
,
517 OStringHash
>::iterator chk
;
518 chk
= pTrailer
->m_pDict
->m_aMap
.find( "DocChecksum" );
519 if( chk
== pTrailer
->m_pDict
->m_aMap
.end() )
521 OSL_TRACE( "no DocChecksum entry" );
// the checksum is expected to be stored as a PDF name
524 pdfparse::PDFName
* pChkSumName
= dynamic_cast<pdfparse::PDFName
*>(chk
->second
);
525 if( pChkSumName
== NULL
)
527 OSL_TRACE( "no name for DocChecksum entry" );
531 // search for AdditionalStreams entry
532 std::unordered_map
< OString
,
534 OStringHash
>::iterator add_stream
;
535 add_stream
= pTrailer
->m_pDict
->m_aMap
.find( "AdditionalStreams" );
536 if( add_stream
== pTrailer
->m_pDict
->m_aMap
.end() )
538 OSL_TRACE( "no AdditionalStreams entry" );
// the array must hold at least a mimetype and a stream reference
541 pdfparse::PDFArray
* pStreams
= dynamic_cast<pdfparse::PDFArray
*>(add_stream
->second
);
542 if( ! pStreams
|| pStreams
->m_aSubElements
.size() < 2 )
544 OSL_TRACE( "AdditionalStreams array too small" );
// verify the checksum over the file content up to the trailer offset
549 OUString aChkSum
= pChkSumName
->getFilteredName();
550 if( ! checkDocChecksum( rInPDFFileURL
, pTrailer
->m_nOffset
, aChkSum
) )
553 // extract addstream and mimetype
554 pdfparse::PDFName
* pMimeType
= dynamic_cast<pdfparse::PDFName
*>(pStreams
->m_aSubElements
[0]);
555 pdfparse::PDFObjectRef
* pStreamRef
= dynamic_cast<pdfparse::PDFObjectRef
*>(pStreams
->m_aSubElements
[1]);
557 OSL_ENSURE( pMimeType
, "error: no mimetype element\n" );
558 OSL_ENSURE( pStreamRef
, "error: no stream ref element\n" );
560 if( pMimeType
&& pStreamRef
)
// resolve the reference to the object carrying the stream
562 pdfparse::PDFObject
* pObject
= pPDFFile
->findObject( pStreamRef
->m_nNumber
, pStreamRef
->m_nGeneration
);
563 OSL_ENSURE( pObject
, "object not found\n" );
// encrypted files need a matching password before the stream can be read
566 if( pPDFFile
->isEncrypted() )
568 bool bAuthenticated
= false;
// try the password handed in by the caller first
569 if( !io_rPwd
.isEmpty() )
571 OString aIsoPwd
= OUStringToOString( io_rPwd
,
572 RTL_TEXTENCODING_ISO_8859_1
);
573 bAuthenticated
= pPDFFile
->setupDecryptionData( aIsoPwd
.getStr() );
575 if( ! bAuthenticated
)
// look for an interaction handler in the filter data
577 const beans::PropertyValue
* pAttribs
= rFilterData
.getConstArray();
578 sal_Int32 nAttribs
= rFilterData
.getLength();
579 uno::Reference
< task::XInteractionHandler
> xIntHdl
;
580 for( sal_Int32 i
= 0; i
< nAttribs
; i
++ )
582 if ( pAttribs
[i
].Name
== "InteractionHandler" )
583 pAttribs
[i
].Value
>>= xIntHdl
;
// no UI allowed or no handler available
585 if( ! bMayUseUI
|| ! xIntHdl
.is() )
587 rOutMimetype
= pMimeType
->getFilteredName();
// document name shown in the password request: part after the last '/'
592 OUString
aDocName( rInPDFFileURL
.copy( rInPDFFileURL
.lastIndexOf( '/' )+1 ) );
// keep asking until a password matches or none was entered
594 bool bEntered
= false;
597 bEntered
= getPassword( xIntHdl
, io_rPwd
, ! bEntered
, aDocName
);
598 OString aIsoPwd
= OUStringToOString( io_rPwd
,
599 RTL_TEXTENCODING_ISO_8859_1
);
600 bAuthenticated
= pPDFFile
->setupDecryptionData( aIsoPwd
.getStr() );
601 } while( bEntered
&& ! bAuthenticated
);
604 OSL_TRACE( "password: %s", bAuthenticated
? "matches" : "does not match" );
605 if( ! bAuthenticated
)
// emit the stream into a temporary stream, decrypting if the file is encrypted
608 rOutMimetype
= pMimeType
->getFilteredName();
609 FileEmitContext
aContext( rInPDFFileURL
,
612 aContext
.m_bDecrypt
= pPDFFile
->isEncrypted();
613 pObject
->writeStream( aContext
, pPDFFile
);
614 xEmbed
= aContext
.getContextStream();
623 OSL_TRACE( "extracted add stream: mimetype %s\n",
624 OUStringToOString( rOutMimetype
,
625 RTL_TEXTENCODING_UTF8
).getStr());
631 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */