1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
21 #include "filterdet.hxx"
22 #include "inc/pdfihelper.hxx"
23 #include "inc/pdfparse.hxx"
26 #include <osl/thread.h>
27 #include <rtl/digest.h>
28 #include <sal/log.hxx>
29 #include <com/sun/star/io/IOException.hpp>
30 #include <com/sun/star/io/XInputStream.hpp>
31 #include <com/sun/star/io/XStream.hpp>
32 #include <com/sun/star/io/XSeekable.hpp>
33 #include <com/sun/star/io/TempFile.hpp>
34 #include <com/sun/star/task/XInteractionHandler.hpp>
35 #include <comphelper/fileurl.hxx>
36 #include <comphelper/hash.hxx>
37 #include <cppuhelper/supportsservice.hxx>
38 #include <comphelper/diagnose_ex.hxx>
43 using namespace com::sun::star
;
48 // TODO(T3): locking/thread safety
// pdfparse::EmitContext implementation that takes "original" bytes from a
// file opened through the osl file API and sends emitted bytes to the output
// stream of a stream object created with io::TempFile::create (see the
// constructor definition).
52 class FileEmitContext
: public pdfparse::EmitContext
// read handle of the original file; the constructor leaves it at nullptr
// when determining the file size fails
55 oslFileHandle m_aReadHandle
;
// length of the original file as measured in the constructor; used as the
// upper bound for copyOrigBytes()/readOrigBytes()
56 unsigned int m_nReadLen
;
// stream created from io::TempFile; handed out via getContextStream()
57 uno::Reference
< io::XStream
> m_xContextStream
;
// seek interface queried from m_xOut; used by getCurPos()
58 uno::Reference
< io::XSeekable
> m_xSeek
;
// output side of m_xContextStream; target of write()/copyOrigBytes()
59 uno::Reference
< io::XOutputStream
> m_xOut
;
// rOrigFile: URL of the original file (passed to osl_openFile)
// xContext:  component context used to create the temp-file stream
// pTop:      forwarded unchanged to the pdfparse::EmitContext base
62 FileEmitContext( const OUString
& rOrigFile
,
63 const uno::Reference
< uno::XComponentContext
>& xContext
,
64 const pdfparse::PDFContainer
* pTop
);
65 virtual ~FileEmitContext() override
;
// pdfparse::EmitContext overrides
67 virtual bool write( const void* pBuf
, unsigned int nLen
) override
;
68 virtual unsigned int getCurPos() override
;
69 virtual bool copyOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
) override
;
70 virtual unsigned int readOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
, void* pBuf
) override
;
// accessor for the stream that receives everything written through this context
72 const uno::Reference
< io::XStream
>& getContextStream() const { return m_xContextStream
; }
// Sets up the output side (a stream created by io::TempFile::create, its
// output stream and the seek interface of that output stream) and then opens
// rOrigFile for reading and measures its size.
// UNO_QUERY_THROW is used for both interface queries, so a failed query is
// reported by exception rather than by an empty reference.
77 FileEmitContext::FileEmitContext( const OUString
& rOrigFile
,
78 const uno::Reference
< uno::XComponentContext
>& xContext
,
79 const pdfparse::PDFContainer
* pTop
) :
80 pdfparse::EmitContext( pTop
),
81 m_aReadHandle(nullptr),
// output: temp-file backed stream, its output stream, and a seekable view of it
84 m_xContextStream
.set( io::TempFile::create(xContext
), uno::UNO_QUERY_THROW
);
85 m_xOut
= m_xContextStream
->getOutputStream();
86 m_xSeek
.set(m_xOut
, uno::UNO_QUERY_THROW
);
// input: open the original file read-only
88 if( osl_openFile( rOrigFile
.pData
,
90 osl_File_OpenFlag_Read
) == osl_File_E_None
)
// determine the file size by seeking to the end and asking for the position
92 oslFileError aErr
= osl_setFilePos( m_aReadHandle
, osl_Pos_End
, 0 );
93 if( aErr
== osl_File_E_None
)
95 sal_uInt64 nFileSize
= 0;
96 if( (aErr
=osl_getFilePos( m_aReadHandle
,
97 &nFileSize
)) == osl_File_E_None
)
// NOTE(review): the 64-bit size is narrowed to unsigned int here; files
// larger than the unsigned int range would be truncated - confirm acceptable
99 m_nReadLen
= static_cast<unsigned int>(nFileSize
);
// any failure while measuring the size: give up on the original file
102 if( aErr
!= osl_File_E_None
)
104 osl_closeFile( m_aReadHandle
);
105 m_aReadHandle
= nullptr;
// Releases the read handle of the original file that was opened in the
// constructor.
111 FileEmitContext::~FileEmitContext()
114 osl_closeFile( m_aReadHandle
);
// Writes nLen bytes starting at pBuf to the output stream m_xOut.
// pBuf: source buffer, nLen: number of bytes to take from it.
117 bool FileEmitContext::write( const void* pBuf
, unsigned int nLen
)
// XOutputStream::writeBytes takes a byte sequence, so the raw buffer is
// copied into a temporary sequence of the same length first
122 uno::Sequence
< sal_Int8
> aSeq( nLen
);
123 memcpy( aSeq
.getArray(), pBuf
, nLen
);
124 m_xOut
->writeBytes( aSeq
);
// Reports the current position of the output stream as seen through m_xSeek.
128 unsigned int FileEmitContext::getCurPos()
// 0 is the value nPos holds unless the position query below overwrites it
130 unsigned int nPos
= 0;
// XSeekable reports a wider integer; it is narrowed to unsigned int here
133 nPos
= static_cast<unsigned int>( m_xSeek
->getPosition() );
138 bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
)
140 if( nOrigOffset
+ nLen
> m_nReadLen
)
143 if( osl_setFilePos( m_aReadHandle
, osl_Pos_Absolut
, nOrigOffset
) != osl_File_E_None
)
146 uno::Sequence
< sal_Int8
> aSeq( nLen
);
148 sal_uInt64 nBytesRead
= 0;
149 if( osl_readFile( m_aReadHandle
,
152 &nBytesRead
) != osl_File_E_None
153 || nBytesRead
!= static_cast<sal_uInt64
>(nLen
) )
158 m_xOut
->writeBytes( aSeq
);
162 unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset
, unsigned int nLen
, void* pBuf
)
164 if( nOrigOffset
+ nLen
> m_nReadLen
)
167 if( osl_setFilePos( m_aReadHandle
,
169 nOrigOffset
) != osl_File_E_None
)
174 sal_uInt64 nBytesRead
= 0;
175 if( osl_readFile( m_aReadHandle
,
178 &nBytesRead
) != osl_File_E_None
)
182 return static_cast<unsigned int>(nBytesRead
);
// Stores the component context; it is taken by value and moved into
// m_xContext.
186 PDFDetector::PDFDetector( uno::Reference
< uno::XComponentContext
> xContext
) :
187 m_xContext(std::move( xContext
))
// Walks the media-descriptor properties in rFilterData, logs each one and
// picks out the entries the detector is interested in.
//   rFilterData     in:  properties to scan
//   xInput          out: value of the "InputStream" property
//   aURL            out: value of the "URL" property
//   nFilterNamePos  out: related to the "FilterName" property
//   nPasswordPos    out: related to the "Password" property
//                        (presumably both receive the index of the
//                        respective entry - TODO confirm)
//   aPassword       out: value of the "Password" property
// Returns a sal_Int32; the caller in PDFDetector::detect uses it as the
// number of attributes (presumably rFilterData.getLength() - TODO confirm).
193 sal_Int32
fillAttributes(uno::Sequence
<beans::PropertyValue
> const& rFilterData
, uno::Reference
<io::XInputStream
>& xInput
, OUString
& aURL
, sal_Int32
& nFilterNamePos
, sal_Int32
& nPasswordPos
, OUString
& aPassword
)
195 const beans::PropertyValue
* pAttribs
= rFilterData
.getConstArray();
196 sal_Int32 nAttribs
= rFilterData
.getLength();
197 for (sal_Int32 i
= 0; i
< nAttribs
; i
++)
// aVal is only used for the log line below; it starts out as a placeholder
// text before extraction of the value as a string is attempted
199 OUString
aVal( "<no string>" );
200 pAttribs
[i
].Value
>>= aVal
;
201 SAL_INFO("sdext.pdfimport", "doDetection: Attrib: " + pAttribs
[i
].Name
+ " = " + aVal
);
// dispatch on the property name
203 if (pAttribs
[i
].Name
== "InputStream")
204 pAttribs
[i
].Value
>>= xInput
;
205 else if (pAttribs
[i
].Name
== "URL")
206 pAttribs
[i
].Value
>>= aURL
;
207 else if (pAttribs
[i
].Name
== "FilterName")
209 else if (pAttribs
[i
].Name
== "Password")
212 pAttribs
[i
].Value
>>= aPassword
;
// Number of bytes requested from the start of the input stream when looking
// for the PDF signature (see detectPDF).
218 // read the first 1024 bytes (see PDF reference implementation note 12)
219 constexpr const sal_Int32 constHeaderSize
= 1024;
// Reads up to constHeaderSize bytes from xInput into aHeader and scans them
// for the PDF signature ('%', 'P', 'D', 'F', ...).
//   xInput           in:  stream to inspect
//   aHeader          out: receives the bytes that were read
//   nHeaderReadSize  out: number of bytes readBytes() reported
// An io::IOException raised while reading is caught and logged.
221 bool detectPDF(uno::Reference
<io::XInputStream
> const& xInput
, uno::Sequence
<sal_Int8
>& aHeader
, sal_uInt64
& nHeaderReadSize
)
// UNO_QUERY (non-throwing): xSeek may be empty if the stream is not seekable
225 uno::Reference
<io::XSeekable
> xSeek(xInput
, uno::UNO_QUERY
);
229 nHeaderReadSize
= xInput
->readBytes(aHeader
, constHeaderSize
);
// too little data to hold a signature; this guard also keeps the unsigned
// subtraction "nHeaderReadSize - 5" in the loop bound from wrapping around
230 if (nHeaderReadSize
<= 5)
233 const sal_Int8
* pBytes
= aHeader
.getConstArray();
// NOTE(review): the last start index examined is nHeaderReadSize - 6, so a
// signature sitting in the very last bytes of the data read would not be
// checked - looks like an off-by-one in the bound; confirm.
234 for (sal_uInt64 i
= 0; i
< nHeaderReadSize
- 5; i
++)
236 if (pBytes
[i
+0] == '%' &&
237 pBytes
[i
+1] == 'P' &&
238 pBytes
[i
+2] == 'D' &&
239 pBytes
[i
+3] == 'F' &&
246 catch (const css::io::IOException
&)
248 TOOLS_WARN_EXCEPTION("sdext.pdfimport", "caught");
// Writes the complete content of xInput to the already opened file
// rFileHandle: first the nHeaderReadSize bytes that were previously read
// into aHeader, then the remainder of the stream in 4096-byte chunks.
// An io::IOException raised while reading is caught and logged.
253 bool copyToTemp(uno::Reference
<io::XInputStream
> const& xInput
, oslFileHandle
& rFileHandle
, uno::Sequence
<sal_Int8
> const& aHeader
, sal_uInt64 nHeaderReadSize
)
// NOTE(review): neither the result of this first write nor nWritten is
// compared against nHeaderReadSize, unlike the chunk write in the loop
// below - confirm whether a short/failed header write should be detected.
257 sal_uInt64 nWritten
= 0;
258 osl_writeFile(rFileHandle
, aHeader
.getConstArray(), nHeaderReadSize
, &nWritten
);
260 const sal_uInt64 nBufferSize
= 4096;
261 uno::Sequence
<sal_Int8
> aBuffer(nBufferSize
);
264 sal_uInt64 nRead
= 0;
267 nRead
= xInput
->readBytes(aBuffer
, nBufferSize
);
270 osl_writeFile(rFileHandle
, aBuffer
.getConstArray(), nRead
, &nWritten
);
// a short write is detected by comparing the written and the read count
271 if (nWritten
!= nRead
)
// a chunk shorter than the buffer ends the copy loop
275 while (nRead
== nBufferSize
);
277 catch (const css::io::IOException
&)
279 TOOLS_WARN_EXCEPTION("sdext.pdfimport", "caught");
284 } // end anonymous namespace
286 // XExtendedFilterDetection
// Inspects the media descriptor rFilterData, checks whether its input stream
// carries a PDF and, if so, looks for an embedded ("hybrid") document stream.
// Depending on the outcome, the "FilterName", "EmbeddedSubstream" and
// "Password" entries of rFilterData are added or overwritten and the type
// name "pdf_Portable_Document_Format" is assigned to aOutTypeName
// (presumably the value returned - TODO confirm).
287 OUString SAL_CALL
PDFDetector::detect( uno::Sequence
< beans::PropertyValue
>& rFilterData
)
// m_aMutex is locked while the detection runs
289 std::unique_lock
guard( m_aMutex
);
290 bool bSuccess
= false;
292 // get the InputStream carrying the PDF content
293 uno::Reference
<io::XInputStream
> xInput
;
294 uno::Reference
<io::XStream
> xEmbedStream
;
295 OUString aOutFilterName
;
296 OUString aOutTypeName
;
// -1 marks "no such entry in rFilterData yet"; see the "== -1" checks below
300 sal_Int32 nFilterNamePos
= -1;
301 sal_Int32 nPasswordPos
= -1;
302 sal_Int32 nAttribs
= fillAttributes(rFilterData
, xInput
, aURL
, nFilterNamePos
, nPasswordPos
, aPassword
);
// sniff the first constHeaderSize bytes for the PDF signature
308 uno::Sequence
<sal_Int8
> aHeader(constHeaderSize
);
309 sal_uInt64 nHeaderReadSize
= 0;
310 bSuccess
= detectPDF(xInput
, aHeader
, nHeaderReadSize
);
315 oslFileHandle aFileHandle
= nullptr;
317 // check for hybrid PDF
// without a file URL the content is first copied into a temp file, whose
// URL is stored in aURL
318 if (bSuccess
&& (aURL
.isEmpty() || !comphelper::isFileUrl(aURL
)))
320 if (osl_createTempFile(nullptr, &aFileHandle
, &aURL
.pData
) != osl_File_E_None
)
326 SAL_INFO( "sdext.pdfimport", "created temp file " + aURL
);
327 bSuccess
= copyToTemp(xInput
, aFileHandle
, aHeader
, nHeaderReadSize
);
329 osl_closeFile(aFileHandle
);
335 osl_removeFile(aURL
.pData
);
// look for an additional (embedded) stream and its mimetype in the file
339 OUString aEmbedMimetype
;
340 xEmbedStream
= getAdditionalStream(aURL
, aEmbedMimetype
, aPassword
, m_xContext
, rFilterData
, false);
343 osl_removeFile(aURL
.pData
);
// map the embedded document's mimetype to the matching *_addstream_import
// filter; an unknown mimetype leaves aOutFilterName empty
345 if (!aEmbedMimetype
.isEmpty())
347 if( aEmbedMimetype
== "application/vnd.oasis.opendocument.text"
348 || aEmbedMimetype
== "application/vnd.oasis.opendocument.text-master" )
349 aOutFilterName
= "writer_pdf_addstream_import";
350 else if ( aEmbedMimetype
== "application/vnd.oasis.opendocument.presentation" )
351 aOutFilterName
= "impress_pdf_addstream_import";
352 else if( aEmbedMimetype
== "application/vnd.oasis.opendocument.graphics"
353 || aEmbedMimetype
== "application/vnd.oasis.opendocument.drawing" )
354 aOutFilterName
= "draw_pdf_addstream_import";
355 else if ( aEmbedMimetype
== "application/vnd.oasis.opendocument.spreadsheet" )
356 aOutFilterName
= "calc_pdf_addstream_import";
// hybrid case: a filter for the embedded document was chosen
359 if (!aOutFilterName
.isEmpty())
// append a "FilterName" entry if the descriptor had none
361 if( nFilterNamePos
== -1 )
363 nFilterNamePos
= nAttribs
;
364 rFilterData
.realloc( ++nAttribs
);
365 rFilterData
.getArray()[ nFilterNamePos
].Name
= "FilterName";
367 auto pFilterData
= rFilterData
.getArray();
368 aOutTypeName
= "pdf_Portable_Document_Format";
370 pFilterData
[nFilterNamePos
].Value
<<= aOutFilterName
;
// pass the embedded stream on as an extra "EmbeddedSubstream" property;
// pFilterData is re-fetched after every realloc
371 if( xEmbedStream
.is() )
373 rFilterData
.realloc( ++nAttribs
);
374 pFilterData
= rFilterData
.getArray();
375 pFilterData
[nAttribs
-1].Name
= "EmbeddedSubstream";
376 pFilterData
[nAttribs
-1].Value
<<= xEmbedStream
;
// store the password (possibly updated by getAdditionalStream) in the descriptor
378 if (!aPassword
.isEmpty())
380 if (nPasswordPos
== -1)
382 nPasswordPos
= nAttribs
;
383 rFilterData
.realloc(++nAttribs
);
384 pFilterData
= rFilterData
.getArray();
385 pFilterData
[nPasswordPos
].Name
= "Password";
387 pFilterData
[nPasswordPos
].Value
<<= aPassword
;
// no addstream filter was chosen: select one of the plain *_pdf_import filters
392 css::beans::PropertyValue
* pFilterData
;
393 if( nFilterNamePos
== -1 )
395 nFilterNamePos
= nAttribs
;
396 rFilterData
.realloc( ++nAttribs
);
397 pFilterData
= rFilterData
.getArray();
398 pFilterData
[ nFilterNamePos
].Name
= "FilterName";
401 pFilterData
= rFilterData
.getArray();
// the document type is fixed at 0; the dialog query that used to supply it
// is commented out on the same line
403 const sal_Int32 nDocumentType
= 0; //const sal_Int32 nDocumentType = queryDocumentTypeDialog(m_xContext,aURL);
404 if( nDocumentType
< 0 )
410 switch (nDocumentType
)
413 pFilterData
[nFilterNamePos
].Value
<<= OUString( "draw_pdf_import" );
417 pFilterData
[nFilterNamePos
].Value
<<= OUString( "impress_pdf_import" );
421 pFilterData
[nFilterNamePos
].Value
<<= OUString( "writer_pdf_import" );
425 assert(!"Unexpected case");
429 aOutTypeName
= "pdf_Portable_Document_Format";
// Returns the fixed implementation name of this component.
435 OUString
PDFDetector::getImplementationName()
437 return "org.libreoffice.comp.documents.PDFDetector";
// Delegates the service-name check to the cppu::supportsService helper.
440 sal_Bool
PDFDetector::supportsService(OUString
const & ServiceName
)
442 return cppu::supportsService(this, ServiceName
);
// Returns the list of supported service names; it has a single entry.
445 css::uno::Sequence
<OUString
> PDFDetector::getSupportedServiceNames()
447 return {"com.sun.star.document.ImportFilter"};
// Compares an expected MD5 checksum, given as a hexadecimal string, with the
// MD5 digest computed over the leading part of a file.
//   rInPDFFileURL  URL of the file to hash (passed to osl_openFile)
//   rChkSum        expected digest as 2*RTL_DIGEST_LENGTH_MD5 hex characters
// The number of bytes to hash is the nBytes used in the read loop below.
// Returns true only if the computed digest has MD5 length and equals the
// decoded expected digest.
450 bool checkDocChecksum( const OUString
& rInPDFFileURL
,
452 const OUString
& rChkSum
)
// two hex characters per digest byte are required
454 if( rChkSum
.getLength() != 2* RTL_DIGEST_LENGTH_MD5
)
458 "checksum of length " << rChkSum
.getLength() << ", expected "
459 << 2*RTL_DIGEST_LENGTH_MD5
);
463 // prepare checksum to test
464 sal_uInt8 nTestChecksum
[ RTL_DIGEST_LENGTH_MD5
];
465 const sal_Unicode
* pChar
= rChkSum
.getStr();
// decode one digest byte per iteration from hex characters; digits and both
// upper- and lower-case letters a-f are accepted
466 for(sal_uInt8
& rn
: nTestChecksum
)
// first hex digit of the byte (presumably the high nibble - TODO confirm)
468 sal_uInt8 nByte
= sal_uInt8( ( (*pChar
>= '0' && *pChar
<= '9') ? *pChar
- '0' :
469 ( (*pChar
>= 'A' && *pChar
<= 'F') ? *pChar
- 'A' + 10 :
470 ( (*pChar
>= 'a' && *pChar
<= 'f') ? *pChar
- 'a' + 10 :
// second hex digit of the byte, OR-ed into nByte
474 nByte
|= ( (*pChar
>= '0' && *pChar
<= '9') ? *pChar
- '0' :
475 ( (*pChar
>= 'A' && *pChar
<= 'F') ? *pChar
- 'A' + 10 :
476 ( (*pChar
>= 'a' && *pChar
<= 'f') ? *pChar
- 'a' + 10 :
482 // open file and calculate actual checksum up to index nBytes
483 ::std::vector
<unsigned char> nChecksum
;
484 ::comphelper::Hash
aDigest(::comphelper::HashType::MD5
);
485 oslFileHandle aRead
= nullptr;
486 if( osl_openFile(rInPDFFileURL
.pData
,
488 osl_File_OpenFlag_Read
) == osl_File_E_None
)
490 sal_uInt8 aBuf
[4096];
492 sal_uInt64 nBytesRead
= 0;
// feed the file into the digest in chunks of at most sizeof(aBuf) bytes
493 while( nCur
< nBytes
)
495 sal_uInt32 nPass
= std::min
<sal_uInt32
>(nBytes
- nCur
, sizeof( aBuf
));
496 if( osl_readFile( aRead
, aBuf
, nPass
, &nBytesRead
) != osl_File_E_None
// only the bytes actually read are hashed
501 nPass
= static_cast<sal_uInt32
>(nBytesRead
);
503 aDigest
.update(aBuf
, nPass
);
506 nChecksum
= aDigest
.finalize();
507 osl_closeFile( aRead
);
510 // compare the contents
// if the file could not be opened nChecksum stays empty and the size
// comparison below fails
511 return nChecksum
.size() == RTL_DIGEST_LENGTH_MD5
512 && (0 == memcmp(nChecksum
.data(), nTestChecksum
, nChecksum
.size()));
// Parses the PDF file at rInPDFFileURL and looks for an additional stream
// announced in a trailer dictionary ("AdditionalStreams" plus a matching
// "DocChecksum"). When found, the stream object is written through a
// FileEmitContext and that context's stream is assigned to xEmbed
// (presumably the value returned - TODO confirm).
//   rInPDFFileURL  file URL of the PDF to inspect
//   rOutMimetype   out: receives the mimetype name stored with the stream
//   xContext       component context
//   rFilterData    media descriptor; searched for "InteractionHandler"
// The body additionally uses io_rPwd (password, read and updated) and
// bMayUseUI (whether asking via the interaction handler is allowed).
515 uno::Reference
< io::XStream
> getAdditionalStream( const OUString
& rInPDFFileURL
,
516 OUString
& rOutMimetype
,
518 const uno::Reference
<uno::XComponentContext
>& xContext
,
519 const uno::Sequence
<beans::PropertyValue
>& rFilterData
,
522 uno::Reference
< io::XStream
> xEmbed
;
// the parser takes a system path in the thread's text encoding, not a URL
525 if( osl_getSystemPathFromFileURL( rInPDFFileURL
.pData
, &aSysUPath
.pData
) != osl_File_E_None
)
527 aPDFFile
= OUStringToOString( aSysUPath
, osl_getThreadTextEncoding() );
529 std::unique_ptr
<pdfparse::PDFEntry
> pEntry( pdfparse::PDFReader::read( aPDFFile
.getStr() ));
532 pdfparse::PDFFile
* pPDFFile
= dynamic_cast<pdfparse::PDFFile
*>(pEntry
.get());
// walk the top-level elements from last to first
535 unsigned int nElements
= pPDFFile
->m_aSubElements
.size();
536 while( nElements
-- > 0 )
538 pdfparse::PDFTrailer
* pTrailer
= dynamic_cast<pdfparse::PDFTrailer
*>(pPDFFile
->m_aSubElements
[nElements
].get());
// only trailers that carry a dictionary are of interest
539 if( pTrailer
&& pTrailer
->m_pDict
)
541 // search document checksum entry
542 auto chk
= pTrailer
->m_pDict
->m_aMap
.find( "DocChecksum" );
543 if( chk
== pTrailer
->m_pDict
->m_aMap
.end() )
545 SAL_INFO( "sdext.pdfimport", "no DocChecksum entry" );
548 pdfparse::PDFName
* pChkSumName
= dynamic_cast<pdfparse::PDFName
*>(chk
->second
);
549 if( pChkSumName
== nullptr )
551 SAL_INFO( "sdext.pdfimport", "no name for DocChecksum entry" );
555 // search for AdditionalStreams entry
556 auto add_stream
= pTrailer
->m_pDict
->m_aMap
.find( "AdditionalStreams" );
557 if( add_stream
== pTrailer
->m_pDict
->m_aMap
.end() )
559 SAL_INFO( "sdext.pdfimport", "no AdditionalStreams entry" );
// the array must hold at least two elements: [0] mimetype name, [1] stream reference
562 pdfparse::PDFArray
* pStreams
= dynamic_cast<pdfparse::PDFArray
*>(add_stream
->second
);
563 if( ! pStreams
|| pStreams
->m_aSubElements
.size() < 2 )
565 SAL_INFO( "sdext.pdfimport", "AdditionalStreams array too small" );
// verify the checksum; the trailer's offset is passed as the byte count
570 OUString aChkSum
= pChkSumName
->getFilteredName();
571 if( ! checkDocChecksum( rInPDFFileURL
, pTrailer
->m_nOffset
, aChkSum
) )
574 // extract addstream and mimetype
575 pdfparse::PDFName
* pMimeType
= dynamic_cast<pdfparse::PDFName
*>(pStreams
->m_aSubElements
[0].get());
576 pdfparse::PDFObjectRef
* pStreamRef
= dynamic_cast<pdfparse::PDFObjectRef
*>(pStreams
->m_aSubElements
[1].get());
578 SAL_WARN_IF( !pMimeType
, "sdext.pdfimport", "error: no mimetype element" );
579 SAL_WARN_IF( !pStreamRef
, "sdext.pdfimport", "error: no stream ref element" );
581 if( pMimeType
&& pStreamRef
)
// resolve the reference to the actual stream object
583 pdfparse::PDFObject
* pObject
= pPDFFile
->findObject( pStreamRef
->m_nNumber
, pStreamRef
->m_nGeneration
);
584 SAL_WARN_IF( !pObject
, "sdext.pdfimport", "object not found" );
// encrypted file: decryption data has to be set up with a valid password
587 if( pPDFFile
->isEncrypted() )
589 bool bAuthenticated
= false;
// first try the password that was passed in
590 if( !io_rPwd
.isEmpty() )
592 OString aIsoPwd
= OUStringToOString( io_rPwd
,
593 RTL_TEXTENCODING_ISO_8859_1
);
594 bAuthenticated
= pPDFFile
->setupDecryptionData( aIsoPwd
);
596 if( ! bAuthenticated
)
// fetch the interaction handler from the media descriptor, if present
598 uno::Reference
< task::XInteractionHandler
> xIntHdl
;
599 for( const beans::PropertyValue
& rAttrib
: rFilterData
)
601 if ( rAttrib
.Name
== "InteractionHandler" )
602 rAttrib
.Value
>>= xIntHdl
;
// asking is not possible/allowed: only the mimetype is reported
604 if( ! bMayUseUI
|| ! xIntHdl
.is() )
606 rOutMimetype
= pMimeType
->getFilteredName();
// the part of the URL after the last '/' is handed to getPassword()
611 OUString
aDocName( rInPDFFileURL
.copy( rInPDFFileURL
.lastIndexOf( '/' )+1 ) );
613 bool bEntered
= false;
// ask repeatedly until a password authenticates or getPassword() reports
// false; the third argument is true on the first round only
616 bEntered
= getPassword( xIntHdl
, io_rPwd
, ! bEntered
, aDocName
);
617 OString aIsoPwd
= OUStringToOString( io_rPwd
,
618 RTL_TEXTENCODING_ISO_8859_1
);
619 bAuthenticated
= pPDFFile
->setupDecryptionData( aIsoPwd
);
620 } while( bEntered
&& ! bAuthenticated
);
623 if( ! bAuthenticated
)
// write the stream object through a FileEmitContext and keep its stream
626 rOutMimetype
= pMimeType
->getFilteredName();
627 FileEmitContext
aContext( rInPDFFileURL
,
630 aContext
.m_bDecrypt
= pPDFFile
->isEncrypted();
631 pObject
->writeStream( aContext
, pPDFFile
);
632 xEmbed
= aContext
.getContextStream();
// Exported C entry point that creates a new PDFDetector for the given
// component context and returns it via cppu::acquire. The second argument
// (a sequence of Any) is unnamed and not used.
645 extern "C" SAL_DLLPUBLIC_EXPORT
css::uno::XInterface
*
646 sdext_PDFDetector_get_implementation(
647 css::uno::XComponentContext
* context
, css::uno::Sequence
<css::uno::Any
> const&)
649 return cppu::acquire(new PDFDetector(context
));
654 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */