bump product version to 5.0.4.1
[LibreOffice.git] / sdext / source / pdfimport / filterdet.cxx
blob23bb0c91ab24f1accc29b93e4bf73068f4534ed2
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
21 #include "filterdet.hxx"
22 #include "inc/pdfihelper.hxx"
23 #include "inc/pdfparse.hxx"
25 #include <osl/diagnose.h>
26 #include <osl/file.h>
27 #include <osl/thread.h>
28 #include <rtl/digest.h>
29 #include <rtl/ref.hxx>
30 #include <com/sun/star/uno/RuntimeException.hpp>
31 #include <com/sun/star/io/XInputStream.hpp>
32 #include <com/sun/star/io/XStream.hpp>
33 #include <com/sun/star/io/XSeekable.hpp>
34 #include <com/sun/star/io/TempFile.hpp>
35 #include <cppuhelper/supportsservice.hxx>
36 #include <boost/scoped_ptr.hpp>
37 #include <string.h>
39 using namespace com::sun::star;
41 namespace pdfi
44 // TODO(T3): locking/thread safety
46 class FileEmitContext : public pdfparse::EmitContext
48 private:
49 oslFileHandle m_aReadHandle;
50 unsigned int m_nReadLen;
51 uno::Reference< io::XStream > m_xContextStream;
52 uno::Reference< io::XSeekable > m_xSeek;
53 uno::Reference< io::XOutputStream > m_xOut;
55 public:
56 FileEmitContext( const OUString& rOrigFile,
57 const uno::Reference< uno::XComponentContext >& xContext,
58 const pdfparse::PDFContainer* pTop );
59 virtual ~FileEmitContext();
61 virtual bool write( const void* pBuf, unsigned int nLen ) SAL_OVERRIDE;
62 virtual unsigned int getCurPos() SAL_OVERRIDE;
63 virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) SAL_OVERRIDE;
64 virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) SAL_OVERRIDE;
66 const uno::Reference< io::XStream >& getContextStream() const { return m_xContextStream; }
69 FileEmitContext::FileEmitContext( const OUString& rOrigFile,
70 const uno::Reference< uno::XComponentContext >& xContext,
71 const pdfparse::PDFContainer* pTop ) :
72 pdfparse::EmitContext( pTop ),
73 m_aReadHandle(NULL),
74 m_nReadLen(0),
75 m_xContextStream(),
76 m_xSeek(),
77 m_xOut()
79 m_xContextStream = uno::Reference< io::XStream >(
80 io::TempFile::create(xContext), uno::UNO_QUERY_THROW );
81 m_xOut = m_xContextStream->getOutputStream();
82 m_xSeek = uno::Reference<io::XSeekable>(m_xOut, uno::UNO_QUERY_THROW );
84 oslFileError aErr = osl_File_E_None;
85 if( (aErr=osl_openFile( rOrigFile.pData,
86 &m_aReadHandle,
87 osl_File_OpenFlag_Read )) == osl_File_E_None )
89 if( (aErr=osl_setFilePos( m_aReadHandle,
90 osl_Pos_End,
91 0 )) == osl_File_E_None )
93 sal_uInt64 nFileSize = 0;
94 if( (aErr=osl_getFilePos( m_aReadHandle,
95 &nFileSize )) == osl_File_E_None )
97 m_nReadLen = static_cast<unsigned int>(nFileSize);
100 if( aErr != osl_File_E_None )
102 osl_closeFile( m_aReadHandle );
103 m_aReadHandle = NULL;
106 m_bDeflate = true;
109 FileEmitContext::~FileEmitContext()
111 if( m_aReadHandle )
112 osl_closeFile( m_aReadHandle );
115 bool FileEmitContext::write( const void* pBuf, unsigned int nLen )
117 if( ! m_xOut.is() )
118 return false;
120 uno::Sequence< sal_Int8 > aSeq( nLen );
121 memcpy( aSeq.getArray(), pBuf, nLen );
122 m_xOut->writeBytes( aSeq );
123 return true;
126 unsigned int FileEmitContext::getCurPos()
128 unsigned int nPos = 0;
129 if( m_xSeek.is() )
131 nPos = static_cast<unsigned int>( m_xSeek->getPosition() );
133 return nPos;
136 bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen )
138 if( nOrigOffset + nLen > m_nReadLen )
139 return false;
141 if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
142 return false;
144 uno::Sequence< sal_Int8 > aSeq( nLen );
146 sal_uInt64 nBytesRead = 0;
147 if( osl_readFile( m_aReadHandle,
148 aSeq.getArray(),
149 nLen,
150 &nBytesRead ) != osl_File_E_None
151 || nBytesRead != static_cast<sal_uInt64>(nLen) )
153 return false;
156 m_xOut->writeBytes( aSeq );
157 return true;
160 unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf )
162 if( nOrigOffset + nLen > m_nReadLen )
163 return 0;
165 if( osl_setFilePos( m_aReadHandle,
166 osl_Pos_Absolut,
167 nOrigOffset ) != osl_File_E_None )
169 return 0;
172 sal_uInt64 nBytesRead = 0;
173 if( osl_readFile( m_aReadHandle,
174 pBuf,
175 nLen,
176 &nBytesRead ) != osl_File_E_None )
178 return 0;
180 return static_cast<unsigned int>(nBytesRead);
187 PDFDetector::PDFDetector( const uno::Reference< uno::XComponentContext >& xContext) :
188 PDFDetectorBase( m_aMutex ),
189 m_xContext( xContext )
192 // XExtendedFilterDetection
193 OUString SAL_CALL PDFDetector::detect( uno::Sequence< beans::PropertyValue >& rFilterData ) throw( uno::RuntimeException, std::exception )
195 osl::MutexGuard const guard( m_aMutex );
196 bool bSuccess = false;
198 // get the InputStream carrying the PDF content
199 uno::Reference< io::XInputStream > xInput;
200 uno::Reference< io::XStream > xEmbedStream;
201 OUString aOutFilterName, aOutTypeName;
202 OUString aURL;
203 OUString aPwd;
204 const beans::PropertyValue* pAttribs = rFilterData.getConstArray();
205 sal_Int32 nAttribs = rFilterData.getLength();
206 sal_Int32 nFilterNamePos = -1;
207 sal_Int32 nPwdPos = -1;
208 for( sal_Int32 i = 0; i < nAttribs; i++ )
210 #if OSL_DEBUG_LEVEL > 1
211 OUString aVal( "<no string>" );
212 pAttribs[i].Value >>= aVal;
213 OSL_TRACE( "doDetection: Attrib: %s = %s\n",
214 OUStringToOString( pAttribs[i].Name, RTL_TEXTENCODING_UTF8 ).getStr(),
215 OUStringToOString( aVal, RTL_TEXTENCODING_UTF8 ).getStr() );
216 #endif
217 if ( pAttribs[i].Name == "InputStream" )
218 pAttribs[i].Value >>= xInput;
219 else if ( pAttribs[i].Name == "URL" )
220 pAttribs[i].Value >>= aURL;
221 else if ( pAttribs[i].Name == "FilterName" )
222 nFilterNamePos = i;
223 else if ( pAttribs[i].Name == "Password" )
225 nPwdPos = i;
226 pAttribs[i].Value >>= aPwd;
229 if( xInput.is() )
231 uno::Reference< io::XSeekable > xSeek( xInput, uno::UNO_QUERY );
232 if( xSeek.is() )
233 xSeek->seek( 0 );
234 // read the first 1024 byte (see PDF reference implementation note 12)
235 const sal_Int32 nHeaderSize = 1024;
236 uno::Sequence< sal_Int8 > aBuf( nHeaderSize );
237 sal_uInt64 nBytes = 0;
238 nBytes = xInput->readBytes( aBuf, nHeaderSize );
239 if( nBytes > 5 )
241 const sal_Int8* pBytes = aBuf.getConstArray();
242 for( unsigned int i = 0; i < nBytes-5; i++ )
244 if( pBytes[i] == '%' &&
245 pBytes[i+1] == 'P' &&
246 pBytes[i+2] == 'D' &&
247 pBytes[i+3] == 'F' &&
248 pBytes[i+4] == '-' )
250 bSuccess = true;
251 break;
256 // check for hybrid PDF
257 oslFileHandle aFile = NULL;
258 if( bSuccess &&
259 ( aURL.isEmpty() || !aURL.startsWith( "file:" ) )
262 sal_uInt64 nWritten = 0;
263 if( osl_createTempFile( NULL, &aFile, &aURL.pData ) != osl_File_E_None )
265 bSuccess = false;
267 else
269 #if OSL_DEBUG_LEVEL > 1
270 OSL_TRACE( "created temp file %s\n",
271 OUStringToOString( aURL, RTL_TEXTENCODING_UTF8 ).getStr() );
272 #endif
273 osl_writeFile( aFile, aBuf.getConstArray(), nBytes, &nWritten );
275 OSL_ENSURE( nWritten == nBytes, "writing of header bytes failed" );
277 if( nWritten == nBytes )
279 const sal_uInt32 nBufSize = 4096;
280 aBuf = uno::Sequence<sal_Int8>(nBufSize);
281 // copy the bytes
284 nBytes = xInput->readBytes( aBuf, nBufSize );
285 if( nBytes > 0 )
287 osl_writeFile( aFile, aBuf.getConstArray(), nBytes, &nWritten );
288 if( nWritten != nBytes )
290 bSuccess = false;
291 break;
294 } while( nBytes == nBufSize );
297 osl_closeFile( aFile );
299 OUString aEmbedMimetype;
300 xEmbedStream = getAdditionalStream( aURL, aEmbedMimetype, aPwd, m_xContext, rFilterData, false );
301 if( aFile )
302 osl_removeFile( aURL.pData );
303 if( !aEmbedMimetype.isEmpty() )
305 if( aEmbedMimetype == "application/vnd.oasis.opendocument.text"
306 || aEmbedMimetype == "application/vnd.oasis.opendocument.text-master" )
307 aOutFilterName = "writer_pdf_addstream_import";
308 else if ( aEmbedMimetype == "application/vnd.oasis.opendocument.presentation" )
309 aOutFilterName = "impress_pdf_addstream_import";
310 else if( aEmbedMimetype == "application/vnd.oasis.opendocument.graphics"
311 || aEmbedMimetype == "application/vnd.oasis.opendocument.drawing" )
312 aOutFilterName = "draw_pdf_addstream_import";
313 else if ( aEmbedMimetype == "application/vnd.oasis.opendocument.spreadsheet" )
314 aOutFilterName = "calc_pdf_addstream_import";
318 if( bSuccess )
320 if( !aOutFilterName.isEmpty() )
322 if( nFilterNamePos == -1 )
324 nFilterNamePos = nAttribs;
325 rFilterData.realloc( ++nAttribs );
326 rFilterData[ nFilterNamePos ].Name = "FilterName";
328 aOutTypeName = "pdf_Portable_Document_Format";
330 OSL_TRACE( "setting filter name %s, input stream %s\n",
331 OUStringToOString( aOutFilterName, RTL_TEXTENCODING_UTF8 ).getStr(),
332 xEmbedStream.is() ? "present" : "not present" );
334 rFilterData[nFilterNamePos].Value <<= aOutFilterName;
335 if( xEmbedStream.is() )
337 rFilterData.realloc( ++nAttribs );
338 rFilterData[nAttribs-1].Name = "EmbeddedSubstream";
339 rFilterData[nAttribs-1].Value <<= xEmbedStream;
341 if( !aPwd.isEmpty() )
343 if( nPwdPos == -1 )
345 nPwdPos = nAttribs;
346 rFilterData.realloc( ++nAttribs );
347 rFilterData[ nPwdPos ].Name = "Password";
349 rFilterData[ nPwdPos ].Value <<= aPwd;
352 else
354 if( nFilterNamePos == -1 )
356 nFilterNamePos = nAttribs;
357 rFilterData.realloc( ++nAttribs );
358 rFilterData[ nFilterNamePos ].Name = "FilterName";
361 const sal_Int32 nDocumentType = 0; //const sal_Int32 nDocumentType = queryDocumentTypeDialog(m_xContext,aURL);
362 if( nDocumentType < 0 )
364 return OUString();
366 else switch( nDocumentType )
368 case 0:
369 rFilterData[nFilterNamePos].Value <<= OUString( "draw_pdf_import" );
370 break;
372 case 1:
373 rFilterData[nFilterNamePos].Value <<= OUString( "impress_pdf_import" );
374 break;
376 case 2:
377 rFilterData[nFilterNamePos].Value <<= OUString( "writer_pdf_import" );
378 break;
380 default:
381 OSL_FAIL("Unexpected case");
384 aOutTypeName = "pdf_Portable_Document_Format";
388 return aOutTypeName;
391 OUString PDFDetector::getImplementationName()
392 throw (css::uno::RuntimeException, std::exception)
394 return OUString("org.libreoffice.comp.documents.PDFDetector");
397 sal_Bool PDFDetector::supportsService(OUString const & ServiceName)
398 throw (css::uno::RuntimeException, std::exception)
400 return cppu::supportsService(this, ServiceName);
403 css::uno::Sequence<OUString> PDFDetector::getSupportedServiceNames()
404 throw (css::uno::RuntimeException, std::exception)
406 return css::uno::Sequence<OUString>{"com.sun.star.document.ImportFilter"};
409 bool checkDocChecksum( const OUString& rInPDFFileURL,
410 sal_uInt32 nBytes,
411 const OUString& rChkSum )
413 bool bRet = false;
414 if( rChkSum.getLength() != 2* RTL_DIGEST_LENGTH_MD5 )
416 SAL_INFO(
417 "sdext.pdfimport",
418 "checksum of length " << rChkSum.getLength() << ", expected "
419 << 2*RTL_DIGEST_LENGTH_MD5);
420 return false;
423 // prepare checksum to test
424 sal_uInt8 nTestChecksum[ RTL_DIGEST_LENGTH_MD5 ];
425 const sal_Unicode* pChar = rChkSum.getStr();
426 for( unsigned int i = 0; i < RTL_DIGEST_LENGTH_MD5; i++ )
428 sal_uInt8 nByte = sal_uInt8( ( (*pChar >= '0' && *pChar <= '9') ? *pChar - '0' :
429 ( (*pChar >= 'A' && *pChar <= 'F') ? *pChar - 'A' + 10 :
430 ( (*pChar >= 'a' && *pChar <= 'f') ? *pChar - 'a' + 10 :
431 0 ) ) ) );
432 nByte <<= 4;
433 pChar++;
434 nByte |= ( (*pChar >= '0' && *pChar <= '9') ? *pChar - '0' :
435 ( (*pChar >= 'A' && *pChar <= 'F') ? *pChar - 'A' + 10 :
436 ( (*pChar >= 'a' && *pChar <= 'f') ? *pChar - 'a' + 10 :
437 0 ) ) );
438 pChar++;
439 nTestChecksum[i] = nByte;
442 // open file and calculate actual checksum up to index nBytes
443 sal_uInt8 nActualChecksum[ RTL_DIGEST_LENGTH_MD5 ];
444 memset( nActualChecksum, 0, sizeof(nActualChecksum) );
445 rtlDigest aActualDigest = rtl_digest_createMD5();
446 oslFileHandle aRead = NULL;
447 oslFileError aErr = osl_File_E_None;
448 if( (aErr = osl_openFile(rInPDFFileURL.pData,
449 &aRead,
450 osl_File_OpenFlag_Read )) == osl_File_E_None )
452 sal_Int8 aBuf[4096];
453 sal_uInt32 nCur = 0;
454 sal_uInt64 nBytesRead = 0;
455 while( nCur < nBytes )
457 sal_uInt32 nPass = (nBytes - nCur) > sizeof( aBuf ) ? sizeof( aBuf ) : nBytes - nCur;
458 if( (aErr = osl_readFile( aRead, aBuf, nPass, &nBytesRead)) != osl_File_E_None
459 || nBytesRead == 0 )
461 break;
463 nPass = static_cast<sal_uInt32>(nBytesRead);
464 nCur += nPass;
465 rtl_digest_updateMD5( aActualDigest, aBuf, nPass );
467 rtl_digest_getMD5( aActualDigest, nActualChecksum, sizeof(nActualChecksum) );
468 osl_closeFile( aRead );
470 rtl_digest_destroyMD5( aActualDigest );
472 // compare the contents
473 bRet = (0 == memcmp( nActualChecksum, nTestChecksum, sizeof( nActualChecksum ) ));
474 #if OSL_DEBUG_LEVEL > 1
475 OSL_TRACE( "test checksum: " );
476 for( unsigned int i = 0; i < sizeof(nTestChecksum); i++ )
477 OSL_TRACE( "%.2X", int(nTestChecksum[i]) );
478 OSL_TRACE( "\n" );
479 OSL_TRACE( "file checksum: " );
480 for( unsigned int i = 0; i < sizeof(nActualChecksum); i++ )
481 OSL_TRACE( "%.2X", int(nActualChecksum[i]) );
482 OSL_TRACE( "\n" );
483 #endif
484 return bRet;
487 uno::Reference< io::XStream > getAdditionalStream( const OUString& rInPDFFileURL,
488 OUString& rOutMimetype,
489 OUString& io_rPwd,
490 const uno::Reference<uno::XComponentContext>& xContext,
491 const uno::Sequence<beans::PropertyValue>& rFilterData,
492 bool bMayUseUI )
494 uno::Reference< io::XStream > xEmbed;
495 OString aPDFFile;
496 OUString aSysUPath;
497 if( osl_getSystemPathFromFileURL( rInPDFFileURL.pData, &aSysUPath.pData ) != osl_File_E_None )
498 return xEmbed;
499 aPDFFile = OUStringToOString( aSysUPath, osl_getThreadTextEncoding() );
501 pdfparse::PDFReader aParser;
502 boost::scoped_ptr<pdfparse::PDFEntry> pEntry( pdfparse::PDFReader::read( aPDFFile.getStr() ));
503 if( pEntry )
505 pdfparse::PDFFile* pPDFFile = dynamic_cast<pdfparse::PDFFile*>(pEntry.get());
506 if( pPDFFile )
508 unsigned int nElements = pPDFFile->m_aSubElements.size();
509 while( nElements-- > 0 )
511 pdfparse::PDFTrailer* pTrailer = dynamic_cast<pdfparse::PDFTrailer*>(pPDFFile->m_aSubElements[nElements]);
512 if( pTrailer && pTrailer->m_pDict )
514 // search document checksum entry
515 std::unordered_map< OString,
516 pdfparse::PDFEntry*,
517 OStringHash >::iterator chk;
518 chk = pTrailer->m_pDict->m_aMap.find( "DocChecksum" );
519 if( chk == pTrailer->m_pDict->m_aMap.end() )
521 OSL_TRACE( "no DocChecksum entry" );
522 continue;
524 pdfparse::PDFName* pChkSumName = dynamic_cast<pdfparse::PDFName*>(chk->second);
525 if( pChkSumName == NULL )
527 OSL_TRACE( "no name for DocChecksum entry" );
528 continue;
531 // search for AdditionalStreams entry
532 std::unordered_map< OString,
533 pdfparse::PDFEntry*,
534 OStringHash >::iterator add_stream;
535 add_stream = pTrailer->m_pDict->m_aMap.find( "AdditionalStreams" );
536 if( add_stream == pTrailer->m_pDict->m_aMap.end() )
538 OSL_TRACE( "no AdditionalStreams entry" );
539 continue;
541 pdfparse::PDFArray* pStreams = dynamic_cast<pdfparse::PDFArray*>(add_stream->second);
542 if( ! pStreams || pStreams->m_aSubElements.size() < 2 )
544 OSL_TRACE( "AdditionalStreams array too small" );
545 continue;
548 // check checksum
549 OUString aChkSum = pChkSumName->getFilteredName();
550 if( ! checkDocChecksum( rInPDFFileURL, pTrailer->m_nOffset, aChkSum ) )
551 continue;
553 // extract addstream and mimetype
554 pdfparse::PDFName* pMimeType = dynamic_cast<pdfparse::PDFName*>(pStreams->m_aSubElements[0]);
555 pdfparse::PDFObjectRef* pStreamRef = dynamic_cast<pdfparse::PDFObjectRef*>(pStreams->m_aSubElements[1]);
557 OSL_ENSURE( pMimeType, "error: no mimetype element\n" );
558 OSL_ENSURE( pStreamRef, "error: no stream ref element\n" );
560 if( pMimeType && pStreamRef )
562 pdfparse::PDFObject* pObject = pPDFFile->findObject( pStreamRef->m_nNumber, pStreamRef->m_nGeneration );
563 OSL_ENSURE( pObject, "object not found\n" );
564 if( pObject )
566 if( pPDFFile->isEncrypted() )
568 bool bAuthenticated = false;
569 if( !io_rPwd.isEmpty() )
571 OString aIsoPwd = OUStringToOString( io_rPwd,
572 RTL_TEXTENCODING_ISO_8859_1 );
573 bAuthenticated = pPDFFile->setupDecryptionData( aIsoPwd.getStr() );
575 if( ! bAuthenticated )
577 const beans::PropertyValue* pAttribs = rFilterData.getConstArray();
578 sal_Int32 nAttribs = rFilterData.getLength();
579 uno::Reference< task::XInteractionHandler > xIntHdl;
580 for( sal_Int32 i = 0; i < nAttribs; i++ )
582 if ( pAttribs[i].Name == "InteractionHandler" )
583 pAttribs[i].Value >>= xIntHdl;
585 if( ! bMayUseUI || ! xIntHdl.is() )
587 rOutMimetype = pMimeType->getFilteredName();
588 xEmbed.clear();
589 break;
592 OUString aDocName( rInPDFFileURL.copy( rInPDFFileURL.lastIndexOf( '/' )+1 ) );
594 bool bEntered = false;
597 bEntered = getPassword( xIntHdl, io_rPwd, ! bEntered, aDocName );
598 OString aIsoPwd = OUStringToOString( io_rPwd,
599 RTL_TEXTENCODING_ISO_8859_1 );
600 bAuthenticated = pPDFFile->setupDecryptionData( aIsoPwd.getStr() );
601 } while( bEntered && ! bAuthenticated );
604 OSL_TRACE( "password: %s", bAuthenticated ? "matches" : "does not match" );
605 if( ! bAuthenticated )
606 continue;
608 rOutMimetype = pMimeType->getFilteredName();
609 FileEmitContext aContext( rInPDFFileURL,
610 xContext,
611 pPDFFile );
612 aContext.m_bDecrypt = pPDFFile->isEncrypted();
613 pObject->writeStream( aContext, pPDFFile );
614 xEmbed = aContext.getContextStream();
615 break; // success
623 OSL_TRACE( "extracted add stream: mimetype %s\n",
624 OUStringToOString( rOutMimetype,
625 RTL_TEXTENCODING_UTF8 ).getStr());
626 return xEmbed;
631 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */