Version 7.6.3.2-android, tag libreoffice-7.6.3.2-android
[LibreOffice.git] / sdext / source / pdfimport / filterdet.cxx
blobef29e8a2c022c13f89361d5314642601f111723c
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
21 #include "filterdet.hxx"
22 #include "inc/pdfihelper.hxx"
23 #include "inc/pdfparse.hxx"
25 #include <osl/file.h>
26 #include <osl/thread.h>
27 #include <rtl/digest.h>
28 #include <sal/log.hxx>
29 #include <com/sun/star/io/IOException.hpp>
30 #include <com/sun/star/io/XInputStream.hpp>
31 #include <com/sun/star/io/XStream.hpp>
32 #include <com/sun/star/io/XSeekable.hpp>
33 #include <com/sun/star/io/TempFile.hpp>
34 #include <com/sun/star/task/XInteractionHandler.hpp>
35 #include <comphelper/fileurl.hxx>
36 #include <comphelper/hash.hxx>
37 #include <cppuhelper/supportsservice.hxx>
38 #include <comphelper/diagnose_ex.hxx>
39 #include <memory>
40 #include <utility>
41 #include <string.h>
43 using namespace com::sun::star;
45 namespace pdfi
48 // TODO(T3): locking/thread safety
50 namespace {
52 class FileEmitContext : public pdfparse::EmitContext
54 private:
55 oslFileHandle m_aReadHandle;
56 unsigned int m_nReadLen;
57 uno::Reference< io::XStream > m_xContextStream;
58 uno::Reference< io::XSeekable > m_xSeek;
59 uno::Reference< io::XOutputStream > m_xOut;
61 public:
62 FileEmitContext( const OUString& rOrigFile,
63 const uno::Reference< uno::XComponentContext >& xContext,
64 const pdfparse::PDFContainer* pTop );
65 virtual ~FileEmitContext() override;
67 virtual bool write( const void* pBuf, unsigned int nLen ) override;
68 virtual unsigned int getCurPos() override;
69 virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) override;
70 virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) override;
72 const uno::Reference< io::XStream >& getContextStream() const { return m_xContextStream; }
77 FileEmitContext::FileEmitContext( const OUString& rOrigFile,
78 const uno::Reference< uno::XComponentContext >& xContext,
79 const pdfparse::PDFContainer* pTop ) :
80 pdfparse::EmitContext( pTop ),
81 m_aReadHandle(nullptr),
82 m_nReadLen(0)
84 m_xContextStream.set( io::TempFile::create(xContext), uno::UNO_QUERY_THROW );
85 m_xOut = m_xContextStream->getOutputStream();
86 m_xSeek.set(m_xOut, uno::UNO_QUERY_THROW );
88 if( osl_openFile( rOrigFile.pData,
89 &m_aReadHandle,
90 osl_File_OpenFlag_Read ) == osl_File_E_None )
92 oslFileError aErr = osl_setFilePos( m_aReadHandle, osl_Pos_End, 0 );
93 if( aErr == osl_File_E_None )
95 sal_uInt64 nFileSize = 0;
96 if( (aErr=osl_getFilePos( m_aReadHandle,
97 &nFileSize )) == osl_File_E_None )
99 m_nReadLen = static_cast<unsigned int>(nFileSize);
102 if( aErr != osl_File_E_None )
104 osl_closeFile( m_aReadHandle );
105 m_aReadHandle = nullptr;
108 m_bDeflate = true;
111 FileEmitContext::~FileEmitContext()
113 if( m_aReadHandle )
114 osl_closeFile( m_aReadHandle );
117 bool FileEmitContext::write( const void* pBuf, unsigned int nLen )
119 if( ! m_xOut.is() )
120 return false;
122 uno::Sequence< sal_Int8 > aSeq( nLen );
123 memcpy( aSeq.getArray(), pBuf, nLen );
124 m_xOut->writeBytes( aSeq );
125 return true;
128 unsigned int FileEmitContext::getCurPos()
130 unsigned int nPos = 0;
131 if( m_xSeek.is() )
133 nPos = static_cast<unsigned int>( m_xSeek->getPosition() );
135 return nPos;
138 bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen )
140 if( nOrigOffset + nLen > m_nReadLen )
141 return false;
143 if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
144 return false;
146 uno::Sequence< sal_Int8 > aSeq( nLen );
148 sal_uInt64 nBytesRead = 0;
149 if( osl_readFile( m_aReadHandle,
150 aSeq.getArray(),
151 nLen,
152 &nBytesRead ) != osl_File_E_None
153 || nBytesRead != static_cast<sal_uInt64>(nLen) )
155 return false;
158 m_xOut->writeBytes( aSeq );
159 return true;
162 unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf )
164 if( nOrigOffset + nLen > m_nReadLen )
165 return 0;
167 if( osl_setFilePos( m_aReadHandle,
168 osl_Pos_Absolut,
169 nOrigOffset ) != osl_File_E_None )
171 return 0;
174 sal_uInt64 nBytesRead = 0;
175 if( osl_readFile( m_aReadHandle,
176 pBuf,
177 nLen,
178 &nBytesRead ) != osl_File_E_None )
180 return 0;
182 return static_cast<unsigned int>(nBytesRead);
186 PDFDetector::PDFDetector( uno::Reference< uno::XComponentContext > xContext) :
187 m_xContext(std::move( xContext ))
190 namespace
193 sal_Int32 fillAttributes(uno::Sequence<beans::PropertyValue> const& rFilterData, uno::Reference<io::XInputStream>& xInput, OUString& aURL, sal_Int32& nFilterNamePos, sal_Int32& nPasswordPos, OUString& aPassword)
195 const beans::PropertyValue* pAttribs = rFilterData.getConstArray();
196 sal_Int32 nAttribs = rFilterData.getLength();
197 for (sal_Int32 i = 0; i < nAttribs; i++)
199 OUString aVal( "<no string>" );
200 pAttribs[i].Value >>= aVal;
201 SAL_INFO("sdext.pdfimport", "doDetection: Attrib: " + pAttribs[i].Name + " = " + aVal);
203 if (pAttribs[i].Name == "InputStream")
204 pAttribs[i].Value >>= xInput;
205 else if (pAttribs[i].Name == "URL")
206 pAttribs[i].Value >>= aURL;
207 else if (pAttribs[i].Name == "FilterName")
208 nFilterNamePos = i;
209 else if (pAttribs[i].Name == "Password")
211 nPasswordPos = i;
212 pAttribs[i].Value >>= aPassword;
215 return nAttribs;
218 // read the first 1024 bytes (see PDF reference implementation note 12)
219 constexpr const sal_Int32 constHeaderSize = 1024;
221 bool detectPDF(uno::Reference<io::XInputStream> const& xInput, uno::Sequence<sal_Int8>& aHeader, sal_uInt64& nHeaderReadSize)
225 uno::Reference<io::XSeekable> xSeek(xInput, uno::UNO_QUERY);
226 if (xSeek.is())
227 xSeek->seek(0);
229 nHeaderReadSize = xInput->readBytes(aHeader, constHeaderSize);
230 if (nHeaderReadSize <= 5)
231 return false;
233 const sal_Int8* pBytes = aHeader.getConstArray();
234 for (sal_uInt64 i = 0; i < nHeaderReadSize - 5; i++)
236 if (pBytes[i+0] == '%' &&
237 pBytes[i+1] == 'P' &&
238 pBytes[i+2] == 'D' &&
239 pBytes[i+3] == 'F' &&
240 pBytes[i+4] == '-')
242 return true;
246 catch (const css::io::IOException &)
248 TOOLS_WARN_EXCEPTION("sdext.pdfimport", "caught");
250 return false;
253 bool copyToTemp(uno::Reference<io::XInputStream> const& xInput, oslFileHandle& rFileHandle, uno::Sequence<sal_Int8> const& aHeader, sal_uInt64 nHeaderReadSize)
257 sal_uInt64 nWritten = 0;
258 osl_writeFile(rFileHandle, aHeader.getConstArray(), nHeaderReadSize, &nWritten);
260 const sal_uInt64 nBufferSize = 4096;
261 uno::Sequence<sal_Int8> aBuffer(nBufferSize);
263 // copy the bytes
264 sal_uInt64 nRead = 0;
267 nRead = xInput->readBytes(aBuffer, nBufferSize);
268 if (nRead > 0)
270 osl_writeFile(rFileHandle, aBuffer.getConstArray(), nRead, &nWritten);
271 if (nWritten != nRead)
272 return false;
275 while (nRead == nBufferSize);
277 catch (const css::io::IOException &)
279 TOOLS_WARN_EXCEPTION("sdext.pdfimport", "caught");
281 return false;
284 } // end anonymous namespace
286 // XExtendedFilterDetection
287 OUString SAL_CALL PDFDetector::detect( uno::Sequence< beans::PropertyValue >& rFilterData )
289 std::unique_lock guard( m_aMutex );
290 bool bSuccess = false;
292 // get the InputStream carrying the PDF content
293 uno::Reference<io::XInputStream> xInput;
294 uno::Reference<io::XStream> xEmbedStream;
295 OUString aOutFilterName;
296 OUString aOutTypeName;
297 OUString aURL;
298 OUString aPassword;
300 sal_Int32 nFilterNamePos = -1;
301 sal_Int32 nPasswordPos = -1;
302 sal_Int32 nAttribs = fillAttributes(rFilterData, xInput, aURL, nFilterNamePos, nPasswordPos, aPassword);
304 if (!xInput.is())
305 return OUString();
308 uno::Sequence<sal_Int8> aHeader(constHeaderSize);
309 sal_uInt64 nHeaderReadSize = 0;
310 bSuccess = detectPDF(xInput, aHeader, nHeaderReadSize);
312 if (!bSuccess)
313 return OUString();
315 oslFileHandle aFileHandle = nullptr;
317 // check for hybrid PDF
318 if (bSuccess && (aURL.isEmpty() || !comphelper::isFileUrl(aURL)))
320 if (osl_createTempFile(nullptr, &aFileHandle, &aURL.pData) != osl_File_E_None)
322 bSuccess = false;
324 else
326 SAL_INFO( "sdext.pdfimport", "created temp file " + aURL);
327 bSuccess = copyToTemp(xInput, aFileHandle, aHeader, nHeaderReadSize);
329 osl_closeFile(aFileHandle);
332 if (!bSuccess)
334 if (aFileHandle)
335 osl_removeFile(aURL.pData);
336 return OUString();
339 OUString aEmbedMimetype;
340 xEmbedStream = getAdditionalStream(aURL, aEmbedMimetype, aPassword, m_xContext, rFilterData, false);
342 if (aFileHandle)
343 osl_removeFile(aURL.pData);
345 if (!aEmbedMimetype.isEmpty())
347 if( aEmbedMimetype == "application/vnd.oasis.opendocument.text"
348 || aEmbedMimetype == "application/vnd.oasis.opendocument.text-master" )
349 aOutFilterName = "writer_pdf_addstream_import";
350 else if ( aEmbedMimetype == "application/vnd.oasis.opendocument.presentation" )
351 aOutFilterName = "impress_pdf_addstream_import";
352 else if( aEmbedMimetype == "application/vnd.oasis.opendocument.graphics"
353 || aEmbedMimetype == "application/vnd.oasis.opendocument.drawing" )
354 aOutFilterName = "draw_pdf_addstream_import";
355 else if ( aEmbedMimetype == "application/vnd.oasis.opendocument.spreadsheet" )
356 aOutFilterName = "calc_pdf_addstream_import";
359 if (!aOutFilterName.isEmpty())
361 if( nFilterNamePos == -1 )
363 nFilterNamePos = nAttribs;
364 rFilterData.realloc( ++nAttribs );
365 rFilterData.getArray()[ nFilterNamePos ].Name = "FilterName";
367 auto pFilterData = rFilterData.getArray();
368 aOutTypeName = "pdf_Portable_Document_Format";
370 pFilterData[nFilterNamePos].Value <<= aOutFilterName;
371 if( xEmbedStream.is() )
373 rFilterData.realloc( ++nAttribs );
374 pFilterData = rFilterData.getArray();
375 pFilterData[nAttribs-1].Name = "EmbeddedSubstream";
376 pFilterData[nAttribs-1].Value <<= xEmbedStream;
378 if (!aPassword.isEmpty())
380 if (nPasswordPos == -1)
382 nPasswordPos = nAttribs;
383 rFilterData.realloc(++nAttribs);
384 pFilterData = rFilterData.getArray();
385 pFilterData[nPasswordPos].Name = "Password";
387 pFilterData[nPasswordPos].Value <<= aPassword;
390 else
392 css::beans::PropertyValue* pFilterData;
393 if( nFilterNamePos == -1 )
395 nFilterNamePos = nAttribs;
396 rFilterData.realloc( ++nAttribs );
397 pFilterData = rFilterData.getArray();
398 pFilterData[ nFilterNamePos ].Name = "FilterName";
400 else
401 pFilterData = rFilterData.getArray();
403 const sal_Int32 nDocumentType = 0; //const sal_Int32 nDocumentType = queryDocumentTypeDialog(m_xContext,aURL);
404 if( nDocumentType < 0 )
406 return OUString();
408 else
410 switch (nDocumentType)
412 case 0:
413 pFilterData[nFilterNamePos].Value <<= OUString( "draw_pdf_import" );
414 break;
416 case 1:
417 pFilterData[nFilterNamePos].Value <<= OUString( "impress_pdf_import" );
418 break;
420 case 2:
421 pFilterData[nFilterNamePos].Value <<= OUString( "writer_pdf_import" );
422 break;
424 default:
425 assert(!"Unexpected case");
429 aOutTypeName = "pdf_Portable_Document_Format";
432 return aOutTypeName;
435 OUString PDFDetector::getImplementationName()
437 return "org.libreoffice.comp.documents.PDFDetector";
440 sal_Bool PDFDetector::supportsService(OUString const & ServiceName)
442 return cppu::supportsService(this, ServiceName);
445 css::uno::Sequence<OUString> PDFDetector::getSupportedServiceNames()
447 return {"com.sun.star.document.ImportFilter"};
450 bool checkDocChecksum( const OUString& rInPDFFileURL,
451 sal_uInt32 nBytes,
452 const OUString& rChkSum )
454 if( rChkSum.getLength() != 2* RTL_DIGEST_LENGTH_MD5 )
456 SAL_INFO(
457 "sdext.pdfimport",
458 "checksum of length " << rChkSum.getLength() << ", expected "
459 << 2*RTL_DIGEST_LENGTH_MD5);
460 return false;
463 // prepare checksum to test
464 sal_uInt8 nTestChecksum[ RTL_DIGEST_LENGTH_MD5 ];
465 const sal_Unicode* pChar = rChkSum.getStr();
466 for(sal_uInt8 & rn : nTestChecksum)
468 sal_uInt8 nByte = sal_uInt8( ( (*pChar >= '0' && *pChar <= '9') ? *pChar - '0' :
469 ( (*pChar >= 'A' && *pChar <= 'F') ? *pChar - 'A' + 10 :
470 ( (*pChar >= 'a' && *pChar <= 'f') ? *pChar - 'a' + 10 :
471 0 ) ) ) );
472 nByte <<= 4;
473 pChar++;
474 nByte |= ( (*pChar >= '0' && *pChar <= '9') ? *pChar - '0' :
475 ( (*pChar >= 'A' && *pChar <= 'F') ? *pChar - 'A' + 10 :
476 ( (*pChar >= 'a' && *pChar <= 'f') ? *pChar - 'a' + 10 :
477 0 ) ) );
478 pChar++;
479 rn = nByte;
482 // open file and calculate actual checksum up to index nBytes
483 ::std::vector<unsigned char> nChecksum;
484 ::comphelper::Hash aDigest(::comphelper::HashType::MD5);
485 oslFileHandle aRead = nullptr;
486 if( osl_openFile(rInPDFFileURL.pData,
487 &aRead,
488 osl_File_OpenFlag_Read ) == osl_File_E_None )
490 sal_uInt8 aBuf[4096];
491 sal_uInt32 nCur = 0;
492 sal_uInt64 nBytesRead = 0;
493 while( nCur < nBytes )
495 sal_uInt32 nPass = std::min<sal_uInt32>(nBytes - nCur, sizeof( aBuf ));
496 if( osl_readFile( aRead, aBuf, nPass, &nBytesRead) != osl_File_E_None
497 || nBytesRead == 0 )
499 break;
501 nPass = static_cast<sal_uInt32>(nBytesRead);
502 nCur += nPass;
503 aDigest.update(aBuf, nPass);
506 nChecksum = aDigest.finalize();
507 osl_closeFile( aRead );
510 // compare the contents
511 return nChecksum.size() == RTL_DIGEST_LENGTH_MD5
512 && (0 == memcmp(nChecksum.data(), nTestChecksum, nChecksum.size()));
515 uno::Reference< io::XStream > getAdditionalStream( const OUString& rInPDFFileURL,
516 OUString& rOutMimetype,
517 OUString& io_rPwd,
518 const uno::Reference<uno::XComponentContext>& xContext,
519 const uno::Sequence<beans::PropertyValue>& rFilterData,
520 bool bMayUseUI )
522 uno::Reference< io::XStream > xEmbed;
523 OString aPDFFile;
524 OUString aSysUPath;
525 if( osl_getSystemPathFromFileURL( rInPDFFileURL.pData, &aSysUPath.pData ) != osl_File_E_None )
526 return xEmbed;
527 aPDFFile = OUStringToOString( aSysUPath, osl_getThreadTextEncoding() );
529 std::unique_ptr<pdfparse::PDFEntry> pEntry( pdfparse::PDFReader::read( aPDFFile.getStr() ));
530 if( pEntry )
532 pdfparse::PDFFile* pPDFFile = dynamic_cast<pdfparse::PDFFile*>(pEntry.get());
533 if( pPDFFile )
535 unsigned int nElements = pPDFFile->m_aSubElements.size();
536 while( nElements-- > 0 )
538 pdfparse::PDFTrailer* pTrailer = dynamic_cast<pdfparse::PDFTrailer*>(pPDFFile->m_aSubElements[nElements].get());
539 if( pTrailer && pTrailer->m_pDict )
541 // search document checksum entry
542 auto chk = pTrailer->m_pDict->m_aMap.find( "DocChecksum" );
543 if( chk == pTrailer->m_pDict->m_aMap.end() )
545 SAL_INFO( "sdext.pdfimport", "no DocChecksum entry" );
546 continue;
548 pdfparse::PDFName* pChkSumName = dynamic_cast<pdfparse::PDFName*>(chk->second);
549 if( pChkSumName == nullptr )
551 SAL_INFO( "sdext.pdfimport", "no name for DocChecksum entry" );
552 continue;
555 // search for AdditionalStreams entry
556 auto add_stream = pTrailer->m_pDict->m_aMap.find( "AdditionalStreams" );
557 if( add_stream == pTrailer->m_pDict->m_aMap.end() )
559 SAL_INFO( "sdext.pdfimport", "no AdditionalStreams entry" );
560 continue;
562 pdfparse::PDFArray* pStreams = dynamic_cast<pdfparse::PDFArray*>(add_stream->second);
563 if( ! pStreams || pStreams->m_aSubElements.size() < 2 )
565 SAL_INFO( "sdext.pdfimport", "AdditionalStreams array too small" );
566 continue;
569 // check checksum
570 OUString aChkSum = pChkSumName->getFilteredName();
571 if( ! checkDocChecksum( rInPDFFileURL, pTrailer->m_nOffset, aChkSum ) )
572 continue;
574 // extract addstream and mimetype
575 pdfparse::PDFName* pMimeType = dynamic_cast<pdfparse::PDFName*>(pStreams->m_aSubElements[0].get());
576 pdfparse::PDFObjectRef* pStreamRef = dynamic_cast<pdfparse::PDFObjectRef*>(pStreams->m_aSubElements[1].get());
578 SAL_WARN_IF( !pMimeType, "sdext.pdfimport", "error: no mimetype element" );
579 SAL_WARN_IF( !pStreamRef, "sdext.pdfimport", "error: no stream ref element" );
581 if( pMimeType && pStreamRef )
583 pdfparse::PDFObject* pObject = pPDFFile->findObject( pStreamRef->m_nNumber, pStreamRef->m_nGeneration );
584 SAL_WARN_IF( !pObject, "sdext.pdfimport", "object not found" );
585 if( pObject )
587 if( pPDFFile->isEncrypted() )
589 bool bAuthenticated = false;
590 if( !io_rPwd.isEmpty() )
592 OString aIsoPwd = OUStringToOString( io_rPwd,
593 RTL_TEXTENCODING_ISO_8859_1 );
594 bAuthenticated = pPDFFile->setupDecryptionData( aIsoPwd );
596 if( ! bAuthenticated )
598 uno::Reference< task::XInteractionHandler > xIntHdl;
599 for( const beans::PropertyValue& rAttrib : rFilterData )
601 if ( rAttrib.Name == "InteractionHandler" )
602 rAttrib.Value >>= xIntHdl;
604 if( ! bMayUseUI || ! xIntHdl.is() )
606 rOutMimetype = pMimeType->getFilteredName();
607 xEmbed.clear();
608 break;
611 OUString aDocName( rInPDFFileURL.copy( rInPDFFileURL.lastIndexOf( '/' )+1 ) );
613 bool bEntered = false;
616 bEntered = getPassword( xIntHdl, io_rPwd, ! bEntered, aDocName );
617 OString aIsoPwd = OUStringToOString( io_rPwd,
618 RTL_TEXTENCODING_ISO_8859_1 );
619 bAuthenticated = pPDFFile->setupDecryptionData( aIsoPwd );
620 } while( bEntered && ! bAuthenticated );
623 if( ! bAuthenticated )
624 continue;
626 rOutMimetype = pMimeType->getFilteredName();
627 FileEmitContext aContext( rInPDFFileURL,
628 xContext,
629 pPDFFile );
630 aContext.m_bDecrypt = pPDFFile->isEncrypted();
631 pObject->writeStream( aContext, pPDFFile );
632 xEmbed = aContext.getContextStream();
633 break; // success
641 return xEmbed;
645 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
646 sdext_PDFDetector_get_implementation(
647 css::uno::XComponentContext* context , css::uno::Sequence<css::uno::Any> const&)
649 return cppu::acquire(new PDFDetector(context));
654 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */