oox/source/vml/vmlinputstream.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include "oox/vml/vmlinputstream.hxx"
  21
  22 #include <com/sun/star/io/XTextInputStream2.hpp>
  23 #include <map>
  24 #include <string.h>
  25 #include <rtl/strbuf.hxx>
  26 #include "oox/helper/helper.hxx"
  27 #include "oox/helper/textinputstream.hxx"
  28
  29 namespace oox {
  30 namespace vml {
  31
  32 // ============================================================================
  33
  34 using namespace ::com::sun::star::io;
  35 using namespace ::com::sun::star::uno;
  36
  37 // ============================================================================
  38
  39 namespace {
  40
  41 inline const sal_Char* lclFindCharacter( const sal_Char* pcBeg, const sal_Char* pcEnd, sal_Char cChar )
  42 {
  43     sal_Int32 nIndex = rtl_str_indexOfChar_WithLength( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ), cChar );
  44     return (nIndex < 0) ? pcEnd : (pcBeg + nIndex);
  45 }
  46
  47 inline bool lclIsWhiteSpace( sal_Char cChar )
  48 {
  49     return cChar < 32;
  50 }
  51
  52 const sal_Char* lclFindWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
  53 {
  54     for( ; pcBeg < pcEnd; ++pcBeg )
  55         if( lclIsWhiteSpace( *pcBeg ) )
  56             return pcBeg;
  57     return pcEnd;
  58 }
  59
  60 const sal_Char* lclFindNonWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
  61 {
  62     for( ; pcBeg < pcEnd; ++pcBeg )
  63         if( !lclIsWhiteSpace( *pcBeg ) )
  64             return pcBeg;
  65     return pcEnd;
  66 }
  67
  68 const sal_Char* lclTrimWhiteSpaceFromEnd( const sal_Char* pcBeg, const sal_Char* pcEnd )
  69 {
  70     while( (pcBeg < pcEnd) && lclIsWhiteSpace( pcEnd[ -1 ] ) )
  71         --pcEnd;
  72     return pcEnd;
  73 }
  74
  75 inline void lclAppendToBuffer( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
  76 {
  77     rBuffer.append( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ) );
  78 }
  79
  80 // ----------------------------------------------------------------------------
  81
  82 void lclProcessAttribs( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
  83 {
  84     /*  Map attribute names to char-pointer of all attributes. This map is used
  85         to find multiple occurrences of attributes with the same name. The
  86         mapped pointers are used as map key in the next map below. */
  87     typedef ::std::map< OString, const sal_Char* > AttributeNameMap;
  88     AttributeNameMap aAttributeNames;
  89
  90     /*  Map the char-pointers of all attributes to the full attribute definition
  91         string. This preserves the original order of the used attributes. */
  92     typedef ::std::map< const sal_Char*, OString > AttributeDataMap;
  93     AttributeDataMap aAttributes;
  94
  95     bool bOk = true;
  96     const sal_Char* pcNameBeg = pcBeg;
  97     while( bOk && (pcNameBeg < pcEnd) )
  98     {
  99         // pcNameBeg points to begin of attribute name, find equality sign
 100         const sal_Char* pcEqualSign = lclFindCharacter( pcNameBeg, pcEnd, '=' );
 101         if( (bOk = pcEqualSign < pcEnd) == true )
 102         {
 103             // find end of attribute name (ignore whitespace between name and equality sign)
 104             const sal_Char* pcNameEnd = lclTrimWhiteSpaceFromEnd( pcNameBeg, pcEqualSign );
 105             if( (bOk = pcNameBeg < pcNameEnd) == true )
 106             {
 107                 // find begin of attribute value (must be single or double quote)
 108                 const sal_Char* pcValueBeg = lclFindNonWhiteSpace( pcEqualSign + 1, pcEnd );
 109                 if( (bOk = (pcValueBeg < pcEnd) && ((*pcValueBeg == '\'') || (*pcValueBeg == '"'))) == true )
 110                 {
 111                     // find end of attribute value (matching quote character)
 112                     const sal_Char* pcValueEnd = lclFindCharacter( pcValueBeg + 1, pcEnd, *pcValueBeg );
 113                     if( (bOk = pcValueEnd < pcEnd) == true )
 114                     {
 115                         ++pcValueEnd;
 116                         OString aAttribName( pcNameBeg, static_cast< sal_Int32 >( pcNameEnd - pcNameBeg ) );
 117                         OString aAttribData( pcNameBeg, static_cast< sal_Int32 >( pcValueEnd - pcNameBeg ) );
 118                         // search for an existing attribute with the same name
 119                         AttributeNameMap::iterator aIt = aAttributeNames.find( aAttribName );
 120                         // remove its definition from the data map
 121                         if( aIt != aAttributeNames.end() )
 122                             aAttributes.erase( aIt->second );
 123                         // insert the attribute into both maps
 124                         aAttributeNames[ aAttribName ] = pcNameBeg;
 125                         aAttributes[ pcNameBeg ] = aAttribData;
 126                         // continue with next attribute (skip whitespace after this attribute)
 127                         pcNameBeg = pcValueEnd;
 128                         if( (pcNameBeg < pcEnd) && ((bOk = lclIsWhiteSpace( *pcNameBeg )) == true) )
 129                             pcNameBeg = lclFindNonWhiteSpace( pcNameBeg + 1, pcEnd );
 130                     }
 131                 }
 132             }
 133         }
 134     }
 135
 136     // if no error has occurred, build the resulting attribute list
 137     if( bOk )
 138         for( AttributeDataMap::iterator aIt = aAttributes.begin(), aEnd = aAttributes.end(); aIt != aEnd; ++aIt )
 139             rBuffer.append( ' ' ).append( aIt->second );
 140     // on error, just append the complete passed string
 141     else
 142         lclAppendToBuffer( rBuffer, pcBeg, pcEnd );
 143 }
 144
 145 void lclProcessElement( OStringBuffer& rBuffer, const OString& rElement )
 146 {
 147     // check that passed string starts and ends with the brackets of an XML element
 148     sal_Int32 nElementLen = rElement.getLength();
 149     if( nElementLen == 0 )
 150         return;
 151
 152     const sal_Char* pcOpen = rElement.getStr();
 153     const sal_Char* pcClose = pcOpen + nElementLen - 1;
 154
 155     // no complete element found
 156     if( (pcOpen >= pcClose) || (*pcOpen != '<') || (*pcClose != '>') )
 157     {
 158         // just append all passed characters
 159         rBuffer.append( rElement );
 160     }
 161
 162     // skip parser instructions: '<![...]>'
 163     else if( (nElementLen >= 5) && (pcOpen[ 1 ] == '!') && (pcOpen[ 2 ] == '[') && (pcClose[ -1 ] == ']') )
 164     {
 165         // do nothing
 166     }
 167
 168     // replace '<br>' element with newline
 169     else if( (nElementLen >= 4) && (pcOpen[ 1 ] == 'b') && (pcOpen[ 2 ] == 'r') && (lclFindNonWhiteSpace( pcOpen + 3, pcClose ) == pcClose) )
 170     {
 171         rBuffer.append( '\n' );
 172     }
 173
 174     // check start elements and simple elements for repeated attributes
 175     else if( pcOpen[ 1 ] != '/' )
 176     {
 177         // find positions of text content inside brackets, exclude '/' in '<simpleelement/>'
 178         const sal_Char* pcContentBeg = pcOpen + 1;
 179         bool bIsEmptyElement = pcClose[ -1 ] == '/';
 180         const sal_Char* pcContentEnd = bIsEmptyElement ? (pcClose - 1) : pcClose;
 181         // append opening bracket and element name to buffer
 182         const sal_Char* pcWhiteSpace = lclFindWhiteSpace( pcContentBeg, pcContentEnd );
 183         lclAppendToBuffer( rBuffer, pcOpen, pcWhiteSpace );
 184         // find begin of attributes, and process all attributes
 185         const sal_Char* pcAttribBeg = lclFindNonWhiteSpace( pcWhiteSpace, pcContentEnd );
 186         if( pcAttribBeg < pcContentEnd )
 187             lclProcessAttribs( rBuffer, pcAttribBeg, pcContentEnd );
 188         // close the element
 189         if( bIsEmptyElement )
 190             rBuffer.append( '/' );
 191         rBuffer.append( '>' );
 192     }
 193
 194     // append end elements without further processing
 195     else
 196     {
 197         rBuffer.append( rElement );
 198     }
 199 }
 200
 201 bool lclProcessCharacters( OStringBuffer& rBuffer, const OString& rChars )
 202 {
 203     /*  MSO has a very weird way to store and handle whitespaces. The stream
 204         may contain lots of spaces, tabs, and newlines which have to be handled
 205         as single space character. This will be done in this function.
 206
 207         If the element text contains a literal line break, it will be stored as
 208         <br> tag (without matching </br> element). This input stream wrapper
 209         will replace this element with a literal LF character (see below).
 210
 211         A single space character for its own is stored as is. Example: The
 212         element
 213             <font> </font>
 214         represents a single space character. The XML parser will ignore this
 215         space character completely without issuing a 'characters' event. The
 216         VML import filter implementation has to react on this case manually.
 217
 218         A single space character following another character is stored
 219         literally and must not be stipped away here. Example: The element
 220             <font>abc </font>
 221         contains the three letters a, b, and c, followed by a space character.
 222
 223         Consecutive space characters, or a leading single space character, are
 224         stored in a <span> element. If there are N space characters (N > 1),
 225         then the <span> element contains exactly (N-1) NBSP (non-breaking
 226         space) characters, followed by a regular space character. Examples:
 227         The element
 228             <font><span style='mso-spacerun:yes'>\xA0\xA0\xA0 </span></font>
 229         represents 4 consecutive space characters. Has to be handled by the
 230         implementation. The element
 231             <font><span style='mso-spacerun:yes'> abc</span></font>
 232         represents a space characters followed by the letters a, b, c. These
 233         strings have to be handled by the VML import filter implementation.
 234      */
 235
 236     // passed string ends with the leading opening bracket of an XML element
 237     const sal_Char* pcBeg = rChars.getStr();
 238     const sal_Char* pcEnd = pcBeg + rChars.getLength();
 239     bool bHasBracket = (pcBeg < pcEnd) && (pcEnd[ -1 ] == '<');
 240     if( bHasBracket ) --pcEnd;
 241
 242     // skip leading whitespace
 243     const sal_Char* pcContentsBeg = lclFindNonWhiteSpace( pcBeg, pcEnd );
 244     while( pcContentsBeg < pcEnd )
 245     {
 246         const sal_Char* pcWhitespaceBeg = lclFindWhiteSpace( pcContentsBeg + 1, pcEnd );
 247         lclAppendToBuffer( rBuffer, pcContentsBeg, pcWhitespaceBeg );
 248         if( pcWhitespaceBeg < pcEnd )
 249             rBuffer.append( ' ' );
 250         pcContentsBeg = lclFindNonWhiteSpace( pcWhitespaceBeg, pcEnd );
 251     }
 252
 253     return bHasBracket;
 254 }
 255
 256 } // namespace
 257
 258 // ============================================================================
 259
 260 InputStream::InputStream( const Reference< XComponentContext >& rxContext, const Reference< XInputStream >& rxInStrm ) :
 261     // use single-byte ISO-8859-1 encoding which maps all byte characters to the first 256 Unicode characters
 262     mxTextStrm( TextInputStream::createXTextInputStream( rxContext, rxInStrm, RTL_TEXTENCODING_ISO_8859_1 ) ),
 263     maOpeningBracket( 1 ),
 264     maClosingBracket( 1 ),
 265     maOpeningCData( CREATE_OSTRING( "<![CDATA[" ) ),
 266     maClosingCData( CREATE_OSTRING( "]]>" ) ),
 267     mnBufferPos( 0 )
 268 {
 269     maOpeningBracket[ 0 ] = '<';
 270     maClosingBracket[ 0 ] = '>';
 271 }
 272
 273 InputStream::~InputStream()
 274 {
 275 }
 276
 277 sal_Int32 SAL_CALL InputStream::readBytes( Sequence< sal_Int8 >& rData, sal_Int32 nBytesToRead )
 278         throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException)
 279 {
 280     if( nBytesToRead < 0 )
 281         throw IOException();
 282
 283     rData.realloc( nBytesToRead );
 284     sal_Int8* pcDest = rData.getArray();
 285     sal_Int32 nRet = 0;
 286     while( (nBytesToRead > 0) && !mxTextStrm->isEOF() )
 287     {
 288         updateBuffer();
 289         sal_Int32 nReadSize = ::std::min( nBytesToRead, maBuffer.getLength() - mnBufferPos );
 290         if( nReadSize > 0 )
 291         {
 292             memcpy( pcDest + nRet, maBuffer.getStr() + mnBufferPos, static_cast< size_t >( nReadSize ) );
 293             mnBufferPos += nReadSize;
 294             nBytesToRead -= nReadSize;
 295             nRet += nReadSize;
 296         }
 297     }
 298     if( nRet < rData.getLength() )
 299         rData.realloc( nRet );
 300     return nRet;
 301 }
 302
 303 sal_Int32 SAL_CALL InputStream::readSomeBytes( Sequence< sal_Int8 >& rData, sal_Int32 nMaxBytesToRead )
 304         throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException)
 305 {
 306     return readBytes( rData, nMaxBytesToRead );
 307 }
 308
 309 void SAL_CALL InputStream::skipBytes( sal_Int32 nBytesToSkip )
 310         throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException)
 311 {
 312     if( nBytesToSkip < 0 )
 313         throw IOException();
 314
 315     while( (nBytesToSkip > 0) && !mxTextStrm->isEOF() )
 316     {
 317         updateBuffer();
 318         sal_Int32 nSkipSize = ::std::min( nBytesToSkip, maBuffer.getLength() - mnBufferPos );
 319         mnBufferPos += nSkipSize;
 320         nBytesToSkip -= nSkipSize;
 321     }
 322 }
 323
 324 sal_Int32 SAL_CALL InputStream::available() throw (NotConnectedException, IOException, RuntimeException)
 325 {
 326     updateBuffer();
 327     return maBuffer.getLength() - mnBufferPos;
 328 }
 329
 330 void SAL_CALL InputStream::closeInput() throw (NotConnectedException, IOException, RuntimeException)
 331 {
 332     mxTextStrm->closeInput();
 333 }
 334
 335 // private --------------------------------------------------------------------
 336
 337 void InputStream::updateBuffer() throw (IOException, RuntimeException)
 338 {
 339     while( (mnBufferPos >= maBuffer.getLength()) && !mxTextStrm->isEOF() )
 340     {
 341         // collect new contents in a string buffer
 342         OStringBuffer aBuffer;
 343
 344         // read and process characters until the opening bracket of the next XML element
 345         OString aChars = readToElementBegin();
 346         bool bHasOpeningBracket = lclProcessCharacters( aBuffer, aChars );
 347
 348         // read and process characters until (and including) closing bracket (an XML element)
 349         OSL_ENSURE( bHasOpeningBracket || mxTextStrm->isEOF(), "InputStream::updateBuffer - missing opening bracket of XML element" );
 350         if( bHasOpeningBracket && !mxTextStrm->isEOF() )
 351         {
 352             // read the element text (add the leading opening bracket manually)
 353             OString aElement = OString( '<' ) + readToElementEnd();
 354             // check for CDATA part, starting with '<![CDATA['
 355             if( aElement.match( maOpeningCData ) )
 356             {
 357                 // search the end tag ']]>'
 358                 while( ((aElement.getLength() < maClosingCData.getLength()) || !aElement.match( maClosingCData, aElement.getLength() - maClosingCData.getLength() )) && !mxTextStrm->isEOF() )
 359                     aElement += readToElementEnd();
 360                 // copy the entire CDATA part
 361                 aBuffer.append( aElement );
 362             }
 363             else
 364             {
 365                 // no CDATA part - process the contents of the element
 366                 lclProcessElement( aBuffer, aElement );
 367             }
 368         }
 369
 370         maBuffer = aBuffer.makeStringAndClear();
 371         mnBufferPos = 0;
 372     }
 373 }
 374
 375 OString InputStream::readToElementBegin() throw (IOException, RuntimeException)
 376 {
 377     return OUStringToOString( mxTextStrm->readString( maOpeningBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 );
 378 }
 379
 380 OString InputStream::readToElementEnd() throw (IOException, RuntimeException)
 381 {
 382     OString aText = OUStringToOString( mxTextStrm->readString( maClosingBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 );
 383     OSL_ENSURE( !aText.isEmpty() && (aText[ aText.getLength() - 1 ] == '>'), "InputStream::readToElementEnd - missing closing bracket of XML element" );
 384     return aText;
 385 }
 386
 387 // ============================================================================
 388
 389 } // namespace vml
  390 } // namespace oox
 391
 392 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */