1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
21 #include "filterdet.hxx"
22 #include "inc/pdfihelper.hxx"
23 #include "inc/pdfparse.hxx"
26 #include <osl/thread.h>
27 #include <rtl/digest.h>
28 #include <sal/log.hxx>
29 #include <com/sun/star/io/IOException.hpp>
30 #include <com/sun/star/io/XInputStream.hpp>
31 #include <com/sun/star/io/XStream.hpp>
32 #include <com/sun/star/io/XSeekable.hpp>
33 #include <com/sun/star/io/TempFile.hpp>
34 #include <com/sun/star/task/XInteractionHandler.hpp>
35 #include <comphelper/fileurl.hxx>
36 #include <comphelper/hash.hxx>
37 #include <cppuhelper/supportsservice.hxx>
38 #include <tools/diagnose_ex.h>
42 using namespace com::sun::star
;
47 // TODO(T3): locking/thread safety
// EmitContext implementation used when a stream is extracted from a PDF:
// it reads from the original file through an OSL file handle and collects
// everything that is emitted in a stream created via io::TempFile.
49 class FileEmitContext
: public pdfparse::EmitContext
// read handle on the original file; the constructor resets it to nullptr
// when opening or measuring the file fails
52 oslFileHandle m_aReadHandle
;
// readable length of the original file, set by the constructor from the
// file position it queries; used as upper bound by copyOrigBytes() and
// readOrigBytes()
53 unsigned int m_nReadLen
;
// stream obtained from io::TempFile::create(); handed out through
// getContextStream()
54 uno::Reference
< io::XStream
> m_xContextStream
;
// XSeekable view of m_xOut, used by getCurPos()
55 uno::Reference
< io::XSeekable
> m_xSeek
;
// output side of m_xContextStream; target of write() and copyOrigBytes()
56 uno::Reference
< io::XOutputStream
> m_xOut
;
// rOrigFile: URL of the original file to read from
// xContext:  component context used to create the temporary file
// pTop:      passed through to the pdfparse::EmitContext base class
59 FileEmitContext( const OUString
& rOrigFile
,
60 const uno::Reference
< uno::XComponentContext
>& xContext
,
61 const pdfparse::PDFContainer
* pTop
);
62 virtual ~FileEmitContext() override
;
// pdfparse::EmitContext overrides
64 virtual bool write( const void* pBuf
, unsigned int nLen
) override
;
65 virtual unsigned int getCurPos() override
;
66 virtual bool copyOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
) override
;
67 virtual unsigned int readOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
, void* pBuf
) override
;
// access to the stream that received the emitted data
69 const uno::Reference
< io::XStream
>& getContextStream() const { return m_xContextStream
; }
// Sets up both ends of the context: a temporary file stream for output and
// a read-only handle on rOrigFile for input. A failure on the input side is
// not reported to the caller; it only leaves m_aReadHandle at nullptr.
72 FileEmitContext::FileEmitContext( const OUString
& rOrigFile
,
73 const uno::Reference
< uno::XComponentContext
>& xContext
,
74 const pdfparse::PDFContainer
* pTop
) :
75 pdfparse::EmitContext( pTop
),
76 m_aReadHandle(nullptr),
// output side: UNO_QUERY_THROW raises an exception instead of leaving an
// empty reference when the requested interface is not available
82 m_xContextStream
.set( io::TempFile::create(xContext
), uno::UNO_QUERY_THROW
);
83 m_xOut
= m_xContextStream
->getOutputStream();
84 m_xSeek
.set(m_xOut
, uno::UNO_QUERY_THROW
);
// input side: open the original file read-only
86 oslFileError aErr
= osl_File_E_None
;
87 if( (aErr
=osl_openFile( rOrigFile
.pData
,
89 osl_File_OpenFlag_Read
)) == osl_File_E_None
)
// position the handle, then read the resulting offset back as the file
// size (presumably a seek to the end of the file -- TODO confirm)
91 if( (aErr
=osl_setFilePos( m_aReadHandle
,
93 0 )) == osl_File_E_None
)
95 sal_uInt64 nFileSize
= 0;
96 if( (aErr
=osl_getFilePos( m_aReadHandle
,
97 &nFileSize
)) == osl_File_E_None
)
// the 64-bit size is narrowed here; sizes that do not fit into
// unsigned int are truncated by the cast
99 m_nReadLen
= static_cast<unsigned int>(nFileSize
);
// any failure above: give the handle back and mark the input as unusable
102 if( aErr
!= osl_File_E_None
)
104 osl_closeFile( m_aReadHandle
);
105 m_aReadHandle
= nullptr;
// Releases the read handle on the original file that the constructor opened.
111 FileEmitContext::~FileEmitContext()
114 osl_closeFile( m_aReadHandle
);
// Appends nLen bytes starting at pBuf to the output stream.
117 bool FileEmitContext::write( const void* pBuf
, unsigned int nLen
)
// writeBytes() takes a UNO byte sequence, so the raw buffer is copied into
// one first
122 uno::Sequence
< sal_Int8
> aSeq( nLen
);
123 memcpy( aSeq
.getArray(), pBuf
, nLen
);
124 m_xOut
->writeBytes( aSeq
);
// Reports the current position of the output stream.
128 unsigned int FileEmitContext::getCurPos()
// 0 is the value nPos holds unless the assignment below runs
130 unsigned int nPos
= 0;
// the position reported by the stream is narrowed to unsigned int
133 nPos
= static_cast<unsigned int>( m_xSeek
->getPosition() );
138 bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
)
140 if( nOrigOffset
+ nLen
> m_nReadLen
)
143 if( osl_setFilePos( m_aReadHandle
, osl_Pos_Absolut
, nOrigOffset
) != osl_File_E_None
)
146 uno::Sequence
< sal_Int8
> aSeq( nLen
);
148 sal_uInt64 nBytesRead
= 0;
149 if( osl_readFile( m_aReadHandle
,
152 &nBytesRead
) != osl_File_E_None
153 || nBytesRead
!= static_cast<sal_uInt64
>(nLen
) )
158 m_xOut
->writeBytes( aSeq
);
162 unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
, void* pBuf
)
164 if( nOrigOffset
+ nLen
> m_nReadLen
)
167 if( osl_setFilePos( m_aReadHandle
,
169 nOrigOffset
) != osl_File_E_None
)
174 sal_uInt64 nBytesRead
= 0;
175 if( osl_readFile( m_aReadHandle
,
178 &nBytesRead
) != osl_File_E_None
)
182 return static_cast<unsigned int>(nBytesRead
);
186 PDFDetector::PDFDetector( const uno::Reference
< uno::XComponentContext
>& xContext
) :
187 PDFDetectorBase( m_aMutex
),
188 m_xContext( xContext
)
191 // XExtendedFilterDetection
// Type detection entry point. Reads the media descriptor in rFilterData,
// looks for a "%PDF" signature in the first bytes of the input stream and
// checks whether the file is a hybrid PDF that carries an embedded document
// stream. The selected filter name, and where applicable the entries
// "EmbeddedSubstream" and "Password", are written back into rFilterData.
// NOTE(review): the value returned is presumably aOutTypeName -- confirm.
192 OUString SAL_CALL
PDFDetector::detect( uno::Sequence
< beans::PropertyValue
>& rFilterData
)
194 osl::MutexGuard
const guard( m_aMutex
);
195 bool bSuccess
= false;
197 // get the InputStream carrying the PDF content
198 uno::Reference
< io::XInputStream
> xInput
;
199 uno::Reference
< io::XStream
> xEmbedStream
;
200 OUString aOutFilterName
, aOutTypeName
;
// walk the descriptor once and pick up the entries this detector uses;
// the two position variables stay at -1 as long as no entry was found
203 const beans::PropertyValue
* pAttribs
= rFilterData
.getConstArray();
204 sal_Int32 nAttribs
= rFilterData
.getLength();
205 sal_Int32 nFilterNamePos
= -1;
206 sal_Int32 nPwdPos
= -1;
207 for( sal_Int32 i
= 0; i
< nAttribs
; i
++ )
// aVal only feeds the log line below
209 OUString
aVal( "<no string>" );
210 pAttribs
[i
].Value
>>= aVal
;
211 SAL_INFO( "sdext.pdfimport", "doDetection: Attrib: " + pAttribs
[i
].Name
+ " = " + aVal
);
213 if ( pAttribs
[i
].Name
== "InputStream" )
214 pAttribs
[i
].Value
>>= xInput
;
215 else if ( pAttribs
[i
].Name
== "URL" )
216 pAttribs
[i
].Value
>>= aURL
;
217 else if ( pAttribs
[i
].Name
== "FilterName" )
219 else if ( pAttribs
[i
].Name
== "Password" )
222 pAttribs
[i
].Value
>>= aPwd
;
// handle of the temporary copy that is created further down when the
// document is not available under a file URL
227 oslFileHandle aFile
= nullptr;
// UNO_QUERY (unlike UNO_QUERY_THROW) leaves xSeek empty when the stream
// is not seekable
229 uno::Reference
< io::XSeekable
> xSeek( xInput
, uno::UNO_QUERY
);
232 // read the first 1024 bytes (see PDF reference implementation note 12)
233 const sal_Int32 nHeaderSize
= 1024;
234 uno::Sequence
< sal_Int8
> aBuf( nHeaderSize
);
235 sal_uInt64 nBytes
= xInput
->readBytes( aBuf
, nHeaderSize
);
// search the bytes just read for the "%PDF" signature
// NOTE(review): nBytes is unsigned, so "nBytes-5" relies on nBytes being at
// least 5 -- confirm that this loop is guarded accordingly
238 const sal_Int8
* pBytes
= aBuf
.getConstArray();
239 for( sal_uInt64 i
= 0; i
< nBytes
-5; i
++ )
241 if( pBytes
[i
] == '%' &&
242 pBytes
[i
+1] == 'P' &&
243 pBytes
[i
+2] == 'D' &&
244 pBytes
[i
+3] == 'F' &&
253 // check for hybrid PDF
255 ( aURL
.isEmpty() || !comphelper::isFileUrl(aURL
) )
// getAdditionalStream() works on a file URL, so a document without one is
// spooled into a temporary file: first the header bytes already read, then
// the rest of the stream in chunks of nBufSize bytes
258 sal_uInt64 nWritten
= 0;
259 if( osl_createTempFile( nullptr, &aFile
, &aURL
.pData
) != osl_File_E_None
)
265 SAL_INFO( "sdext.pdfimport", "created temp file " + aURL
);
267 osl_writeFile( aFile
, aBuf
.getConstArray(), nBytes
, &nWritten
);
269 SAL_WARN_IF( nWritten
!= nBytes
, "sdext.pdfimport", "writing of header bytes failed" );
271 if( nWritten
== nBytes
)
273 const sal_uInt32 nBufSize
= 4096;
274 aBuf
= uno::Sequence
<sal_Int8
>(nBufSize
);
278 nBytes
= xInput
->readBytes( aBuf
, nBufSize
);
281 osl_writeFile( aFile
, aBuf
.getConstArray(), nBytes
, &nWritten
);
282 if( nWritten
!= nBytes
)
// a read that returns fewer than nBufSize bytes ends the copy loop
288 } while( nBytes
== nBufSize
);
291 osl_closeFile( aFile
);
// an IOException raised while reading the input is logged, not propagated
293 } catch (const css::io::IOException
&) {
294 TOOLS_WARN_EXCEPTION("sdext.pdfimport", "caught");
// try to extract the embedded document stream; aEmbedMimetype receives its
// mimetype (the trailing false is presumably bMayUseUI -- TODO confirm)
297 OUString aEmbedMimetype
;
298 xEmbedStream
= getAdditionalStream( aURL
, aEmbedMimetype
, aPwd
, m_xContext
, rFilterData
, false );
// NOTE(review): aURL may also name the user's original file; confirm that
// this removal only runs for the temporary copy created above
300 osl_removeFile( aURL
.pData
);
// map the mimetype of the embedded document to the matching add-stream
// import filter
301 if( !aEmbedMimetype
.isEmpty() )
303 if( aEmbedMimetype
== "application/vnd.oasis.opendocument.text"
304 || aEmbedMimetype
== "application/vnd.oasis.opendocument.text-master" )
305 aOutFilterName
= "writer_pdf_addstream_import";
306 else if ( aEmbedMimetype
== "application/vnd.oasis.opendocument.presentation" )
307 aOutFilterName
= "impress_pdf_addstream_import";
308 else if( aEmbedMimetype
== "application/vnd.oasis.opendocument.graphics"
309 || aEmbedMimetype
== "application/vnd.oasis.opendocument.drawing" )
310 aOutFilterName
= "draw_pdf_addstream_import";
311 else if ( aEmbedMimetype
== "application/vnd.oasis.opendocument.spreadsheet" )
312 aOutFilterName
= "calc_pdf_addstream_import";
// an add-stream filter was selected: publish it in rFilterData
318 if( !aOutFilterName
.isEmpty() )
// no FilterName slot known yet: append one at the end of the sequence
320 if( nFilterNamePos
== -1 )
322 nFilterNamePos
= nAttribs
;
323 rFilterData
.realloc( ++nAttribs
);
324 rFilterData
[ nFilterNamePos
].Name
= "FilterName";
326 aOutTypeName
= "pdf_Portable_Document_Format";
328 rFilterData
[nFilterNamePos
].Value
<<= aOutFilterName
;
// pass the extracted stream on as an additional descriptor entry
329 if( xEmbedStream
.is() )
331 rFilterData
.realloc( ++nAttribs
);
332 rFilterData
[nAttribs
-1].Name
= "EmbeddedSubstream";
333 rFilterData
[nAttribs
-1].Value
<<= xEmbedStream
;
// store the password in the descriptor as well
335 if( !aPwd
.isEmpty() )
340 rFilterData
.realloc( ++nAttribs
);
341 rFilterData
[ nPwdPos
].Name
= "Password";
343 rFilterData
[ nPwdPos
].Value
<<= aPwd
;
// presumably the path taken when no add-stream filter was selected:
// one of the plain PDF import filters is chosen instead
348 if( nFilterNamePos
== -1 )
350 nFilterNamePos
= nAttribs
;
351 rFilterData
.realloc( ++nAttribs
);
352 rFilterData
[ nFilterNamePos
].Name
= "FilterName";
// the document type is fixed to 0; the dialog query it replaces is kept in
// the trailing comment
355 const sal_Int32 nDocumentType
= 0; //const sal_Int32 nDocumentType = queryDocumentTypeDialog(m_xContext,aURL);
356 if( nDocumentType
< 0 )
360 else switch( nDocumentType
)
363 rFilterData
[nFilterNamePos
].Value
<<= OUString( "draw_pdf_import" );
367 rFilterData
[nFilterNamePos
].Value
<<= OUString( "impress_pdf_import" );
371 rFilterData
[nFilterNamePos
].Value
<<= OUString( "writer_pdf_import" );
375 assert(!"Unexpected case");
378 aOutTypeName
= "pdf_Portable_Document_Format";
385 OUString
PDFDetector::getImplementationName()
387 return "org.libreoffice.comp.documents.PDFDetector";
390 sal_Bool
PDFDetector::supportsService(OUString
const & ServiceName
)
392 return cppu::supportsService(this, ServiceName
);
395 css::uno::Sequence
<OUString
> PDFDetector::getSupportedServiceNames()
397 return css::uno::Sequence
<OUString
>{"com.sun.star.document.ImportFilter"};
// Verifies the document checksum of a hybrid PDF.
// rInPDFFileURL: URL of the PDF file whose leading bytes are hashed
// rChkSum:       expected MD5 digest as 2*RTL_DIGEST_LENGTH_MD5 hex digits
// Returns true only if the digest computed from the file has MD5 length and
// equals the decoded rChkSum.
400 bool checkDocChecksum( const OUString
& rInPDFFileURL
,
402 const OUString
& rChkSum
)
// a checksum string of any other length cannot be an MD5 digest in hex
404 if( rChkSum
.getLength() != 2* RTL_DIGEST_LENGTH_MD5
)
408 "checksum of length " << rChkSum
.getLength() << ", expected "
409 << 2*RTL_DIGEST_LENGTH_MD5
);
413 // prepare checksum to test
414 sal_uInt8 nTestChecksum
[ RTL_DIGEST_LENGTH_MD5
];
415 const sal_Unicode
* pChar
= rChkSum
.getStr();
// each digest byte is assembled from two hex-digit decodes; digits 0-9 and
// the letters A-F in either case are recognised
416 for(sal_uInt8
& rn
: nTestChecksum
)
418 sal_uInt8 nByte
= sal_uInt8( ( (*pChar
>= '0' && *pChar
<= '9') ? *pChar
- '0' :
419 ( (*pChar
>= 'A' && *pChar
<= 'F') ? *pChar
- 'A' + 10 :
420 ( (*pChar
>= 'a' && *pChar
<= 'f') ? *pChar
- 'a' + 10 :
424 nByte
|= ( (*pChar
>= '0' && *pChar
<= '9') ? *pChar
- '0' :
425 ( (*pChar
>= 'A' && *pChar
<= 'F') ? *pChar
- 'A' + 10 :
426 ( (*pChar
>= 'a' && *pChar
<= 'f') ? *pChar
- 'a' + 10 :
432 // open file and calculate actual checksum up to index nBytes
433 ::std::vector
<unsigned char> nChecksum
;
434 ::comphelper::Hash
aDigest(::comphelper::HashType::MD5
);
435 oslFileHandle aRead
= nullptr;
436 oslFileError aErr
= osl_File_E_None
;
437 if( (aErr
= osl_openFile(rInPDFFileURL
.pData
,
439 osl_File_OpenFlag_Read
)) == osl_File_E_None
)
441 sal_uInt8 aBuf
[4096];
443 sal_uInt64 nBytesRead
= 0;
// hash the file in chunks of at most sizeof(aBuf) bytes without requesting
// anything beyond nBytes
444 while( nCur
< nBytes
)
446 sal_uInt32 nPass
= std::min
<sal_uInt32
>(nBytes
- nCur
, sizeof( aBuf
));
447 if( (aErr
= osl_readFile( aRead
, aBuf
, nPass
, &nBytesRead
)) != osl_File_E_None
// only the bytes that were actually read go into the digest
452 nPass
= static_cast<sal_uInt32
>(nBytesRead
);
454 aDigest
.update(aBuf
, nPass
);
457 nChecksum
= aDigest
.finalize();
458 osl_closeFile( aRead
);
461 // compare the contents
// the size test comes first, so memcmp never reads past nTestChecksum
462 return nChecksum
.size() == RTL_DIGEST_LENGTH_MD5
463 && (0 == memcmp(nChecksum
.data(), nTestChecksum
, nChecksum
.size()));
// Looks for the document stream that a hybrid PDF references from its
// trailer dictionary ("AdditionalStreams") and extracts it.
// rInPDFFileURL: file URL of the PDF to parse
// rOutMimetype:  receives the mimetype named in the AdditionalStreams array
// xContext:      component context, needed for the temporary output stream
// rFilterData:   media descriptor; searched for an "InteractionHandler"
// The extracted data ends up in xEmbed.
// NOTE(review): xEmbed is presumably the return value and stays empty when
// nothing could be extracted -- confirm.
466 uno::Reference
< io::XStream
> getAdditionalStream( const OUString
& rInPDFFileURL
,
467 OUString
& rOutMimetype
,
469 const uno::Reference
<uno::XComponentContext
>& xContext
,
470 const uno::Sequence
<beans::PropertyValue
>& rFilterData
,
473 uno::Reference
< io::XStream
> xEmbed
;
// the PDF parser takes a system path in the thread's text encoding, not a
// file URL
476 if( osl_getSystemPathFromFileURL( rInPDFFileURL
.pData
, &aSysUPath
.pData
) != osl_File_E_None
)
478 aPDFFile
= OUStringToOString( aSysUPath
, osl_getThreadTextEncoding() );
480 std::unique_ptr
<pdfparse::PDFEntry
> pEntry( pdfparse::PDFReader::read( aPDFFile
.getStr() ));
// pEntry keeps ownership; pPDFFile is only a typed view of the same object
483 pdfparse::PDFFile
* pPDFFile
= dynamic_cast<pdfparse::PDFFile
*>(pEntry
.get());
// walk the top-level elements from last to first, looking for trailers
486 unsigned int nElements
= pPDFFile
->m_aSubElements
.size();
487 while( nElements
-- > 0 )
489 pdfparse::PDFTrailer
* pTrailer
= dynamic_cast<pdfparse::PDFTrailer
*>(pPDFFile
->m_aSubElements
[nElements
].get());
490 if( pTrailer
&& pTrailer
->m_pDict
)
492 // search document checksum entry
493 auto chk
= pTrailer
->m_pDict
->m_aMap
.find( "DocChecksum" );
494 if( chk
== pTrailer
->m_pDict
->m_aMap
.end() )
496 SAL_INFO( "sdext.pdfimport", "no DocChecksum entry" );
499 pdfparse::PDFName
* pChkSumName
= dynamic_cast<pdfparse::PDFName
*>(chk
->second
);
500 if( pChkSumName
== nullptr )
502 SAL_INFO( "sdext.pdfimport", "no name for DocChecksum entry" );
506 // search for AdditionalStreams entry
507 auto add_stream
= pTrailer
->m_pDict
->m_aMap
.find( "AdditionalStreams" );
508 if( add_stream
== pTrailer
->m_pDict
->m_aMap
.end() )
510 SAL_INFO( "sdext.pdfimport", "no AdditionalStreams entry" );
// the array must provide at least two elements: [0] and [1] are read below
513 pdfparse::PDFArray
* pStreams
= dynamic_cast<pdfparse::PDFArray
*>(add_stream
->second
);
514 if( ! pStreams
|| pStreams
->m_aSubElements
.size() < 2 )
516 SAL_INFO( "sdext.pdfimport", "AdditionalStreams array too small" );
// check the stored checksum against the file content, passing the trailer
// offset as the length argument
521 OUString aChkSum
= pChkSumName
->getFilteredName();
522 if( ! checkDocChecksum( rInPDFFileURL
, pTrailer
->m_nOffset
, aChkSum
) )
525 // extract addstream and mimetype
526 pdfparse::PDFName
* pMimeType
= dynamic_cast<pdfparse::PDFName
*>(pStreams
->m_aSubElements
[0].get());
527 pdfparse::PDFObjectRef
* pStreamRef
= dynamic_cast<pdfparse::PDFObjectRef
*>(pStreams
->m_aSubElements
[1].get());
529 SAL_WARN_IF( !pMimeType
, "sdext.pdfimport", "error: no mimetype element" );
530 SAL_WARN_IF( !pStreamRef
, "sdext.pdfimport", "error: no stream ref element" );
532 if( pMimeType
&& pStreamRef
)
// resolve the reference to the object that holds the embedded stream
534 pdfparse::PDFObject
* pObject
= pPDFFile
->findObject( pStreamRef
->m_nNumber
, pStreamRef
->m_nGeneration
);
535 SAL_WARN_IF( !pObject
, "sdext.pdfimport", "object not found" );
// encrypted file: decryption has to be set up with a password first
538 if( pPDFFile
->isEncrypted() )
540 bool bAuthenticated
= false;
// first attempt: the password passed in by the caller, converted to
// ISO 8859-1
541 if( !io_rPwd
.isEmpty() )
543 OString aIsoPwd
= OUStringToOString( io_rPwd
,
544 RTL_TEXTENCODING_ISO_8859_1
);
545 bAuthenticated
= pPDFFile
->setupDecryptionData( aIsoPwd
.getStr() );
547 if( ! bAuthenticated
)
// fall back to asking for a password through the interaction handler found
// in the media descriptor
549 uno::Reference
< task::XInteractionHandler
> xIntHdl
;
550 for( const beans::PropertyValue
& rAttrib
: rFilterData
)
552 if ( rAttrib
.Name
== "InteractionHandler" )
553 rAttrib
.Value
>>= xIntHdl
;
// UI not allowed or no handler available: only the mimetype is reported
555 if( ! bMayUseUI
|| ! xIntHdl
.is() )
557 rOutMimetype
= pMimeType
->getFilteredName();
// document name for the password request: everything after the last '/'
// of the URL
562 OUString
aDocName( rInPDFFileURL
.copy( rInPDFFileURL
.lastIndexOf( '/' )+1 ) );
564 bool bEntered
= false;
// keep asking while a password was entered but does not unlock the file;
// "! bEntered" is true for the first request only
567 bEntered
= getPassword( xIntHdl
, io_rPwd
, ! bEntered
, aDocName
);
568 OString aIsoPwd
= OUStringToOString( io_rPwd
,
569 RTL_TEXTENCODING_ISO_8859_1
);
570 bAuthenticated
= pPDFFile
->setupDecryptionData( aIsoPwd
.getStr() );
571 } while( bEntered
&& ! bAuthenticated
);
574 if( ! bAuthenticated
)
// write the (decrypted, where applicable) stream object into a
// FileEmitContext and hand out the stream it collected the data in
577 rOutMimetype
= pMimeType
->getFilteredName();
578 FileEmitContext
aContext( rInPDFFileURL
,
581 aContext
.m_bDecrypt
= pPDFFile
->isEncrypted();
582 pObject
->writeStream( aContext
, pPDFFile
);
583 xEmbed
= aContext
.getContextStream();
597 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */