vcl: allow for overriding the default PDF rendering resolution
[LibreOffice.git] / sdext / source / pdfimport / filterdet.cxx
blob03931f47edb0142442204346ac527f72b910f7ef
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
21 #include "filterdet.hxx"
22 #include "inc/pdfihelper.hxx"
23 #include "inc/pdfparse.hxx"
25 #include <osl/file.h>
26 #include <osl/thread.h>
27 #include <rtl/digest.h>
28 #include <sal/log.hxx>
29 #include <com/sun/star/io/IOException.hpp>
30 #include <com/sun/star/io/XInputStream.hpp>
31 #include <com/sun/star/io/XStream.hpp>
32 #include <com/sun/star/io/XSeekable.hpp>
33 #include <com/sun/star/io/TempFile.hpp>
34 #include <com/sun/star/task/XInteractionHandler.hpp>
35 #include <comphelper/fileurl.hxx>
36 #include <comphelper/hash.hxx>
37 #include <cppuhelper/supportsservice.hxx>
38 #include <tools/diagnose_ex.h>
39 #include <memory>
40 #include <string.h>
42 using namespace com::sun::star;
44 namespace pdfi
47 // TODO(T3): locking/thread safety
49 class FileEmitContext : public pdfparse::EmitContext
51 private:
52 oslFileHandle m_aReadHandle;
53 unsigned int m_nReadLen;
54 uno::Reference< io::XStream > m_xContextStream;
55 uno::Reference< io::XSeekable > m_xSeek;
56 uno::Reference< io::XOutputStream > m_xOut;
58 public:
59 FileEmitContext( const OUString& rOrigFile,
60 const uno::Reference< uno::XComponentContext >& xContext,
61 const pdfparse::PDFContainer* pTop );
62 virtual ~FileEmitContext() override;
64 virtual bool write( const void* pBuf, unsigned int nLen ) override;
65 virtual unsigned int getCurPos() override;
66 virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) override;
67 virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) override;
69 const uno::Reference< io::XStream >& getContextStream() const { return m_xContextStream; }
72 FileEmitContext::FileEmitContext( const OUString& rOrigFile,
73 const uno::Reference< uno::XComponentContext >& xContext,
74 const pdfparse::PDFContainer* pTop ) :
75 pdfparse::EmitContext( pTop ),
76 m_aReadHandle(nullptr),
77 m_nReadLen(0),
78 m_xContextStream(),
79 m_xSeek(),
80 m_xOut()
82 m_xContextStream.set( io::TempFile::create(xContext), uno::UNO_QUERY_THROW );
83 m_xOut = m_xContextStream->getOutputStream();
84 m_xSeek.set(m_xOut, uno::UNO_QUERY_THROW );
86 oslFileError aErr = osl_File_E_None;
87 if( (aErr=osl_openFile( rOrigFile.pData,
88 &m_aReadHandle,
89 osl_File_OpenFlag_Read )) == osl_File_E_None )
91 if( (aErr=osl_setFilePos( m_aReadHandle,
92 osl_Pos_End,
93 0 )) == osl_File_E_None )
95 sal_uInt64 nFileSize = 0;
96 if( (aErr=osl_getFilePos( m_aReadHandle,
97 &nFileSize )) == osl_File_E_None )
99 m_nReadLen = static_cast<unsigned int>(nFileSize);
102 if( aErr != osl_File_E_None )
104 osl_closeFile( m_aReadHandle );
105 m_aReadHandle = nullptr;
108 m_bDeflate = true;
111 FileEmitContext::~FileEmitContext()
113 if( m_aReadHandle )
114 osl_closeFile( m_aReadHandle );
117 bool FileEmitContext::write( const void* pBuf, unsigned int nLen )
119 if( ! m_xOut.is() )
120 return false;
122 uno::Sequence< sal_Int8 > aSeq( nLen );
123 memcpy( aSeq.getArray(), pBuf, nLen );
124 m_xOut->writeBytes( aSeq );
125 return true;
128 unsigned int FileEmitContext::getCurPos()
130 unsigned int nPos = 0;
131 if( m_xSeek.is() )
133 nPos = static_cast<unsigned int>( m_xSeek->getPosition() );
135 return nPos;
138 bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen )
140 if( nOrigOffset + nLen > m_nReadLen )
141 return false;
143 if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
144 return false;
146 uno::Sequence< sal_Int8 > aSeq( nLen );
148 sal_uInt64 nBytesRead = 0;
149 if( osl_readFile( m_aReadHandle,
150 aSeq.getArray(),
151 nLen,
152 &nBytesRead ) != osl_File_E_None
153 || nBytesRead != static_cast<sal_uInt64>(nLen) )
155 return false;
158 m_xOut->writeBytes( aSeq );
159 return true;
162 unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf )
164 if( nOrigOffset + nLen > m_nReadLen )
165 return 0;
167 if( osl_setFilePos( m_aReadHandle,
168 osl_Pos_Absolut,
169 nOrigOffset ) != osl_File_E_None )
171 return 0;
174 sal_uInt64 nBytesRead = 0;
175 if( osl_readFile( m_aReadHandle,
176 pBuf,
177 nLen,
178 &nBytesRead ) != osl_File_E_None )
180 return 0;
182 return static_cast<unsigned int>(nBytesRead);
186 PDFDetector::PDFDetector( const uno::Reference< uno::XComponentContext >& xContext) :
187 PDFDetectorBase( m_aMutex ),
188 m_xContext( xContext )
191 // XExtendedFilterDetection
192 OUString SAL_CALL PDFDetector::detect( uno::Sequence< beans::PropertyValue >& rFilterData )
194 osl::MutexGuard const guard( m_aMutex );
195 bool bSuccess = false;
197 // get the InputStream carrying the PDF content
198 uno::Reference< io::XInputStream > xInput;
199 uno::Reference< io::XStream > xEmbedStream;
200 OUString aOutFilterName, aOutTypeName;
201 OUString aURL;
202 OUString aPwd;
203 const beans::PropertyValue* pAttribs = rFilterData.getConstArray();
204 sal_Int32 nAttribs = rFilterData.getLength();
205 sal_Int32 nFilterNamePos = -1;
206 sal_Int32 nPwdPos = -1;
207 for( sal_Int32 i = 0; i < nAttribs; i++ )
209 OUString aVal( "<no string>" );
210 pAttribs[i].Value >>= aVal;
211 SAL_INFO( "sdext.pdfimport", "doDetection: Attrib: " + pAttribs[i].Name + " = " + aVal);
213 if ( pAttribs[i].Name == "InputStream" )
214 pAttribs[i].Value >>= xInput;
215 else if ( pAttribs[i].Name == "URL" )
216 pAttribs[i].Value >>= aURL;
217 else if ( pAttribs[i].Name == "FilterName" )
218 nFilterNamePos = i;
219 else if ( pAttribs[i].Name == "Password" )
221 nPwdPos = i;
222 pAttribs[i].Value >>= aPwd;
225 if( xInput.is() )
227 oslFileHandle aFile = nullptr;
228 try {
229 uno::Reference< io::XSeekable > xSeek( xInput, uno::UNO_QUERY );
230 if( xSeek.is() )
231 xSeek->seek( 0 );
232 // read the first 1024 byte (see PDF reference implementation note 12)
233 const sal_Int32 nHeaderSize = 1024;
234 uno::Sequence< sal_Int8 > aBuf( nHeaderSize );
235 sal_uInt64 nBytes = xInput->readBytes( aBuf, nHeaderSize );
236 if( nBytes > 5 )
238 const sal_Int8* pBytes = aBuf.getConstArray();
239 for( sal_uInt64 i = 0; i < nBytes-5; i++ )
241 if( pBytes[i] == '%' &&
242 pBytes[i+1] == 'P' &&
243 pBytes[i+2] == 'D' &&
244 pBytes[i+3] == 'F' &&
245 pBytes[i+4] == '-' )
247 bSuccess = true;
248 break;
253 // check for hybrid PDF
254 if( bSuccess &&
255 ( aURL.isEmpty() || !comphelper::isFileUrl(aURL) )
258 sal_uInt64 nWritten = 0;
259 if( osl_createTempFile( nullptr, &aFile, &aURL.pData ) != osl_File_E_None )
261 bSuccess = false;
263 else
265 SAL_INFO( "sdext.pdfimport", "created temp file " + aURL );
267 osl_writeFile( aFile, aBuf.getConstArray(), nBytes, &nWritten );
269 SAL_WARN_IF( nWritten != nBytes, "sdext.pdfimport", "writing of header bytes failed" );
271 if( nWritten == nBytes )
273 const sal_uInt32 nBufSize = 4096;
274 aBuf = uno::Sequence<sal_Int8>(nBufSize);
275 // copy the bytes
278 nBytes = xInput->readBytes( aBuf, nBufSize );
279 if( nBytes > 0 )
281 osl_writeFile( aFile, aBuf.getConstArray(), nBytes, &nWritten );
282 if( nWritten != nBytes )
284 bSuccess = false;
285 break;
288 } while( nBytes == nBufSize );
291 osl_closeFile( aFile );
293 } catch (const css::io::IOException &) {
294 TOOLS_WARN_EXCEPTION("sdext.pdfimport", "caught");
295 return OUString();
297 OUString aEmbedMimetype;
298 xEmbedStream = getAdditionalStream( aURL, aEmbedMimetype, aPwd, m_xContext, rFilterData, false );
299 if( aFile )
300 osl_removeFile( aURL.pData );
301 if( !aEmbedMimetype.isEmpty() )
303 if( aEmbedMimetype == "application/vnd.oasis.opendocument.text"
304 || aEmbedMimetype == "application/vnd.oasis.opendocument.text-master" )
305 aOutFilterName = "writer_pdf_addstream_import";
306 else if ( aEmbedMimetype == "application/vnd.oasis.opendocument.presentation" )
307 aOutFilterName = "impress_pdf_addstream_import";
308 else if( aEmbedMimetype == "application/vnd.oasis.opendocument.graphics"
309 || aEmbedMimetype == "application/vnd.oasis.opendocument.drawing" )
310 aOutFilterName = "draw_pdf_addstream_import";
311 else if ( aEmbedMimetype == "application/vnd.oasis.opendocument.spreadsheet" )
312 aOutFilterName = "calc_pdf_addstream_import";
316 if( bSuccess )
318 if( !aOutFilterName.isEmpty() )
320 if( nFilterNamePos == -1 )
322 nFilterNamePos = nAttribs;
323 rFilterData.realloc( ++nAttribs );
324 rFilterData[ nFilterNamePos ].Name = "FilterName";
326 aOutTypeName = "pdf_Portable_Document_Format";
328 rFilterData[nFilterNamePos].Value <<= aOutFilterName;
329 if( xEmbedStream.is() )
331 rFilterData.realloc( ++nAttribs );
332 rFilterData[nAttribs-1].Name = "EmbeddedSubstream";
333 rFilterData[nAttribs-1].Value <<= xEmbedStream;
335 if( !aPwd.isEmpty() )
337 if( nPwdPos == -1 )
339 nPwdPos = nAttribs;
340 rFilterData.realloc( ++nAttribs );
341 rFilterData[ nPwdPos ].Name = "Password";
343 rFilterData[ nPwdPos ].Value <<= aPwd;
346 else
348 if( nFilterNamePos == -1 )
350 nFilterNamePos = nAttribs;
351 rFilterData.realloc( ++nAttribs );
352 rFilterData[ nFilterNamePos ].Name = "FilterName";
355 const sal_Int32 nDocumentType = 0; //const sal_Int32 nDocumentType = queryDocumentTypeDialog(m_xContext,aURL);
356 if( nDocumentType < 0 )
358 return OUString();
360 else switch( nDocumentType )
362 case 0:
363 rFilterData[nFilterNamePos].Value <<= OUString( "draw_pdf_import" );
364 break;
366 case 1:
367 rFilterData[nFilterNamePos].Value <<= OUString( "impress_pdf_import" );
368 break;
370 case 2:
371 rFilterData[nFilterNamePos].Value <<= OUString( "writer_pdf_import" );
372 break;
374 default:
375 assert(!"Unexpected case");
378 aOutTypeName = "pdf_Portable_Document_Format";
382 return aOutTypeName;
385 OUString PDFDetector::getImplementationName()
387 return "org.libreoffice.comp.documents.PDFDetector";
390 sal_Bool PDFDetector::supportsService(OUString const & ServiceName)
392 return cppu::supportsService(this, ServiceName);
395 css::uno::Sequence<OUString> PDFDetector::getSupportedServiceNames()
397 return css::uno::Sequence<OUString>{"com.sun.star.document.ImportFilter"};
400 bool checkDocChecksum( const OUString& rInPDFFileURL,
401 sal_uInt32 nBytes,
402 const OUString& rChkSum )
404 if( rChkSum.getLength() != 2* RTL_DIGEST_LENGTH_MD5 )
406 SAL_INFO(
407 "sdext.pdfimport",
408 "checksum of length " << rChkSum.getLength() << ", expected "
409 << 2*RTL_DIGEST_LENGTH_MD5);
410 return false;
413 // prepare checksum to test
414 sal_uInt8 nTestChecksum[ RTL_DIGEST_LENGTH_MD5 ];
415 const sal_Unicode* pChar = rChkSum.getStr();
416 for(sal_uInt8 & rn : nTestChecksum)
418 sal_uInt8 nByte = sal_uInt8( ( (*pChar >= '0' && *pChar <= '9') ? *pChar - '0' :
419 ( (*pChar >= 'A' && *pChar <= 'F') ? *pChar - 'A' + 10 :
420 ( (*pChar >= 'a' && *pChar <= 'f') ? *pChar - 'a' + 10 :
421 0 ) ) ) );
422 nByte <<= 4;
423 pChar++;
424 nByte |= ( (*pChar >= '0' && *pChar <= '9') ? *pChar - '0' :
425 ( (*pChar >= 'A' && *pChar <= 'F') ? *pChar - 'A' + 10 :
426 ( (*pChar >= 'a' && *pChar <= 'f') ? *pChar - 'a' + 10 :
427 0 ) ) );
428 pChar++;
429 rn = nByte;
432 // open file and calculate actual checksum up to index nBytes
433 ::std::vector<unsigned char> nChecksum;
434 ::comphelper::Hash aDigest(::comphelper::HashType::MD5);
435 oslFileHandle aRead = nullptr;
436 oslFileError aErr = osl_File_E_None;
437 if( (aErr = osl_openFile(rInPDFFileURL.pData,
438 &aRead,
439 osl_File_OpenFlag_Read )) == osl_File_E_None )
441 sal_uInt8 aBuf[4096];
442 sal_uInt32 nCur = 0;
443 sal_uInt64 nBytesRead = 0;
444 while( nCur < nBytes )
446 sal_uInt32 nPass = std::min<sal_uInt32>(nBytes - nCur, sizeof( aBuf ));
447 if( (aErr = osl_readFile( aRead, aBuf, nPass, &nBytesRead)) != osl_File_E_None
448 || nBytesRead == 0 )
450 break;
452 nPass = static_cast<sal_uInt32>(nBytesRead);
453 nCur += nPass;
454 aDigest.update(aBuf, nPass);
457 nChecksum = aDigest.finalize();
458 osl_closeFile( aRead );
461 // compare the contents
462 return nChecksum.size() == RTL_DIGEST_LENGTH_MD5
463 && (0 == memcmp(nChecksum.data(), nTestChecksum, nChecksum.size()));
466 uno::Reference< io::XStream > getAdditionalStream( const OUString& rInPDFFileURL,
467 OUString& rOutMimetype,
468 OUString& io_rPwd,
469 const uno::Reference<uno::XComponentContext>& xContext,
470 const uno::Sequence<beans::PropertyValue>& rFilterData,
471 bool bMayUseUI )
473 uno::Reference< io::XStream > xEmbed;
474 OString aPDFFile;
475 OUString aSysUPath;
476 if( osl_getSystemPathFromFileURL( rInPDFFileURL.pData, &aSysUPath.pData ) != osl_File_E_None )
477 return xEmbed;
478 aPDFFile = OUStringToOString( aSysUPath, osl_getThreadTextEncoding() );
480 std::unique_ptr<pdfparse::PDFEntry> pEntry( pdfparse::PDFReader::read( aPDFFile.getStr() ));
481 if( pEntry )
483 pdfparse::PDFFile* pPDFFile = dynamic_cast<pdfparse::PDFFile*>(pEntry.get());
484 if( pPDFFile )
486 unsigned int nElements = pPDFFile->m_aSubElements.size();
487 while( nElements-- > 0 )
489 pdfparse::PDFTrailer* pTrailer = dynamic_cast<pdfparse::PDFTrailer*>(pPDFFile->m_aSubElements[nElements].get());
490 if( pTrailer && pTrailer->m_pDict )
492 // search document checksum entry
493 auto chk = pTrailer->m_pDict->m_aMap.find( "DocChecksum" );
494 if( chk == pTrailer->m_pDict->m_aMap.end() )
496 SAL_INFO( "sdext.pdfimport", "no DocChecksum entry" );
497 continue;
499 pdfparse::PDFName* pChkSumName = dynamic_cast<pdfparse::PDFName*>(chk->second);
500 if( pChkSumName == nullptr )
502 SAL_INFO( "sdext.pdfimport", "no name for DocChecksum entry" );
503 continue;
506 // search for AdditionalStreams entry
507 auto add_stream = pTrailer->m_pDict->m_aMap.find( "AdditionalStreams" );
508 if( add_stream == pTrailer->m_pDict->m_aMap.end() )
510 SAL_INFO( "sdext.pdfimport", "no AdditionalStreams entry" );
511 continue;
513 pdfparse::PDFArray* pStreams = dynamic_cast<pdfparse::PDFArray*>(add_stream->second);
514 if( ! pStreams || pStreams->m_aSubElements.size() < 2 )
516 SAL_INFO( "sdext.pdfimport", "AdditionalStreams array too small" );
517 continue;
520 // check checksum
521 OUString aChkSum = pChkSumName->getFilteredName();
522 if( ! checkDocChecksum( rInPDFFileURL, pTrailer->m_nOffset, aChkSum ) )
523 continue;
525 // extract addstream and mimetype
526 pdfparse::PDFName* pMimeType = dynamic_cast<pdfparse::PDFName*>(pStreams->m_aSubElements[0].get());
527 pdfparse::PDFObjectRef* pStreamRef = dynamic_cast<pdfparse::PDFObjectRef*>(pStreams->m_aSubElements[1].get());
529 SAL_WARN_IF( !pMimeType, "sdext.pdfimport", "error: no mimetype element" );
530 SAL_WARN_IF( !pStreamRef, "sdext.pdfimport", "error: no stream ref element" );
532 if( pMimeType && pStreamRef )
534 pdfparse::PDFObject* pObject = pPDFFile->findObject( pStreamRef->m_nNumber, pStreamRef->m_nGeneration );
535 SAL_WARN_IF( !pObject, "sdext.pdfimport", "object not found" );
536 if( pObject )
538 if( pPDFFile->isEncrypted() )
540 bool bAuthenticated = false;
541 if( !io_rPwd.isEmpty() )
543 OString aIsoPwd = OUStringToOString( io_rPwd,
544 RTL_TEXTENCODING_ISO_8859_1 );
545 bAuthenticated = pPDFFile->setupDecryptionData( aIsoPwd.getStr() );
547 if( ! bAuthenticated )
549 uno::Reference< task::XInteractionHandler > xIntHdl;
550 for( const beans::PropertyValue& rAttrib : rFilterData )
552 if ( rAttrib.Name == "InteractionHandler" )
553 rAttrib.Value >>= xIntHdl;
555 if( ! bMayUseUI || ! xIntHdl.is() )
557 rOutMimetype = pMimeType->getFilteredName();
558 xEmbed.clear();
559 break;
562 OUString aDocName( rInPDFFileURL.copy( rInPDFFileURL.lastIndexOf( '/' )+1 ) );
564 bool bEntered = false;
567 bEntered = getPassword( xIntHdl, io_rPwd, ! bEntered, aDocName );
568 OString aIsoPwd = OUStringToOString( io_rPwd,
569 RTL_TEXTENCODING_ISO_8859_1 );
570 bAuthenticated = pPDFFile->setupDecryptionData( aIsoPwd.getStr() );
571 } while( bEntered && ! bAuthenticated );
574 if( ! bAuthenticated )
575 continue;
577 rOutMimetype = pMimeType->getFilteredName();
578 FileEmitContext aContext( rInPDFFileURL,
579 xContext,
580 pPDFFile );
581 aContext.m_bDecrypt = pPDFFile->isEncrypted();
582 pObject->writeStream( aContext, pPDFFile );
583 xEmbed = aContext.getContextStream();
584 break; // success
592 return xEmbed;
597 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */