oox/source/vml/vmlinputstream.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include "oox/vml/vmlinputstream.hxx"
  21
  22 #include <com/sun/star/io/XTextInputStream2.hpp>
  23 #include <map>
  24 #include <string.h>
  25 #include <rtl/strbuf.hxx>
  26 #include <osl/diagnose.h>
  27 #include "oox/helper/helper.hxx"
  28 #include "oox/helper/textinputstream.hxx"
  29
  30 namespace oox {
  31 namespace vml {
  32
  33 using namespace ::com::sun::star::io;
  34 using namespace ::com::sun::star::uno;
  35
  36 namespace {
  37
  38 inline const sal_Char* lclFindCharacter( const sal_Char* pcBeg, const sal_Char* pcEnd, sal_Char cChar )
  39 {
  40     sal_Int32 nIndex = rtl_str_indexOfChar_WithLength( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ), cChar );
  41     return (nIndex < 0) ? pcEnd : (pcBeg + nIndex);
  42 }
  43
  44 inline bool lclIsWhiteSpace( sal_Char cChar )
  45 {
  46     return cChar < 32;
  47 }
  48
  49 const sal_Char* lclFindWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
  50 {
  51     for( ; pcBeg < pcEnd; ++pcBeg )
  52         if( lclIsWhiteSpace( *pcBeg ) )
  53             return pcBeg;
  54     return pcEnd;
  55 }
  56
  57 const sal_Char* lclFindNonWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
  58 {
  59     for( ; pcBeg < pcEnd; ++pcBeg )
  60         if( !lclIsWhiteSpace( *pcBeg ) )
  61             return pcBeg;
  62     return pcEnd;
  63 }
  64
  65 const sal_Char* lclTrimWhiteSpaceFromEnd( const sal_Char* pcBeg, const sal_Char* pcEnd )
  66 {
  67     while( (pcBeg < pcEnd) && lclIsWhiteSpace( pcEnd[ -1 ] ) )
  68         --pcEnd;
  69     return pcEnd;
  70 }
  71
  72 inline void lclAppendToBuffer( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
  73 {
  74     rBuffer.append( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ) );
  75 }
  76
  77 void lclProcessAttribs( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
  78 {
  79     /*  Map attribute names to char-pointer of all attributes. This map is used
  80         to find multiple occurrences of attributes with the same name. The
  81         mapped pointers are used as map key in the next map below. */
  82     typedef ::std::map< OString, const sal_Char* > AttributeNameMap;
  83     AttributeNameMap aAttributeNames;
  84
  85     /*  Map the char-pointers of all attributes to the full attribute definition
  86         string. This preserves the original order of the used attributes. */
  87     typedef ::std::map< const sal_Char*, OString > AttributeDataMap;
  88     AttributeDataMap aAttributes;
  89
  90     bool bOk = true;
  91     const sal_Char* pcNameBeg = pcBeg;
  92     while( bOk && (pcNameBeg < pcEnd) )
  93     {
  94         // pcNameBeg points to begin of attribute name, find equality sign
  95         const sal_Char* pcEqualSign = lclFindCharacter( pcNameBeg, pcEnd, '=' );
  96         if ((bOk = (pcEqualSign < pcEnd)))
  97         {
  98             // find end of attribute name (ignore whitespace between name and equality sign)
  99             const sal_Char* pcNameEnd = lclTrimWhiteSpaceFromEnd( pcNameBeg, pcEqualSign );
 100             if( (bOk = (pcNameBeg < pcNameEnd)) )
 101             {
 102                 // find begin of attribute value (must be single or double quote)
 103                 const sal_Char* pcValueBeg = lclFindNonWhiteSpace( pcEqualSign + 1, pcEnd );
 104                 if( (bOk = (pcValueBeg < pcEnd) && ((*pcValueBeg == '\'') || (*pcValueBeg == '"'))) )
 105                 {
 106                     // find end of attribute value (matching quote character)
 107                     const sal_Char* pcValueEnd = lclFindCharacter( pcValueBeg + 1, pcEnd, *pcValueBeg );
 108                     if( (bOk = (pcValueEnd < pcEnd)) )
 109                     {
 110                         ++pcValueEnd;
 111                         OString aAttribName( pcNameBeg, static_cast< sal_Int32 >( pcNameEnd - pcNameBeg ) );
 112                         OString aAttribData( pcNameBeg, static_cast< sal_Int32 >( pcValueEnd - pcNameBeg ) );
 113                         // search for an existing attribute with the same name
 114                         AttributeNameMap::iterator aIt = aAttributeNames.find( aAttribName );
 115                         // remove its definition from the data map
 116                         if( aIt != aAttributeNames.end() )
 117                             aAttributes.erase( aIt->second );
 118                         // insert the attribute into both maps
 119                         aAttributeNames[ aAttribName ] = pcNameBeg;
 120                         aAttributes[ pcNameBeg ] = aAttribData;
 121                         // continue with next attribute (skip whitespace after this attribute)
 122                         pcNameBeg = pcValueEnd;
 123                         if( (pcNameBeg < pcEnd) && ((bOk = lclIsWhiteSpace( *pcNameBeg ))) )
 124                             pcNameBeg = lclFindNonWhiteSpace( pcNameBeg + 1, pcEnd );
 125                     }
 126                 }
 127             }
 128         }
 129     }
 130
 131     // if no error has occurred, build the resulting attribute list
 132     if( bOk )
 133         for( AttributeDataMap::iterator aIt = aAttributes.begin(), aEnd = aAttributes.end(); aIt != aEnd; ++aIt )
 134             rBuffer.append( ' ' ).append( aIt->second );
 135     // on error, just append the complete passed string
 136     else
 137         lclAppendToBuffer( rBuffer, pcBeg, pcEnd );
 138 }
 139
 140 void lclProcessElement( OStringBuffer& rBuffer, const OString& rElement )
 141 {
 142     // check that passed string starts and ends with the brackets of an XML element
 143     sal_Int32 nElementLen = rElement.getLength();
 144     if( nElementLen == 0 )
 145         return;
 146
 147     const sal_Char* pcOpen = rElement.getStr();
 148     const sal_Char* pcClose = pcOpen + nElementLen - 1;
 149
 150     // no complete element found
 151     if( (pcOpen >= pcClose) || (*pcOpen != '<') || (*pcClose != '>') )
 152     {
 153         // just append all passed characters
 154         rBuffer.append( rElement );
 155     }
 156
 157     // skip parser instructions: '<![...]>'
 158     else if( (nElementLen >= 5) && (pcOpen[ 1 ] == '!') && (pcOpen[ 2 ] == '[') && (pcClose[ -1 ] == ']') )
 159     {
 160         // do nothing
 161     }
 162
 163     // replace '<br>' element with newline
 164     else if( (nElementLen >= 4) && (pcOpen[ 1 ] == 'b') && (pcOpen[ 2 ] == 'r') && (lclFindNonWhiteSpace( pcOpen + 3, pcClose ) == pcClose) )
 165     {
 166         rBuffer.append( '\n' );
 167     }
 168
 169     // check start elements and simple elements for repeated attributes
 170     else if( pcOpen[ 1 ] != '/' )
 171     {
 172         // find positions of text content inside brackets, exclude '/' in '<simpleelement/>'
 173         const sal_Char* pcContentBeg = pcOpen + 1;
 174         bool bIsEmptyElement = pcClose[ -1 ] == '/';
 175         const sal_Char* pcContentEnd = bIsEmptyElement ? (pcClose - 1) : pcClose;
 176         // append opening bracket and element name to buffer
 177         const sal_Char* pcWhiteSpace = lclFindWhiteSpace( pcContentBeg, pcContentEnd );
 178         lclAppendToBuffer( rBuffer, pcOpen, pcWhiteSpace );
 179         // find begin of attributes, and process all attributes
 180         const sal_Char* pcAttribBeg = lclFindNonWhiteSpace( pcWhiteSpace, pcContentEnd );
 181         if( pcAttribBeg < pcContentEnd )
 182             lclProcessAttribs( rBuffer, pcAttribBeg, pcContentEnd );
 183         // close the element
 184         if( bIsEmptyElement )
 185             rBuffer.append( '/' );
 186         rBuffer.append( '>' );
 187     }
 188
 189     // append end elements without further processing
 190     else
 191     {
 192         rBuffer.append( rElement );
 193     }
 194 }
 195
 196 bool lclProcessCharacters( OStringBuffer& rBuffer, const OString& rChars )
 197 {
 198     /*  MSO has a very weird way to store and handle whitespaces. The stream
 199         may contain lots of spaces, tabs, and newlines which have to be handled
 200         as single space character. This will be done in this function.
 201
 202         If the element text contains a literal line break, it will be stored as
 203         <br> tag (without matching </br> element). This input stream wrapper
 204         will replace this element with a literal LF character (see below).
 205
 206         A single space character for its own is stored as is. Example: The
 207         element
 208             <font> </font>
 209         represents a single space character. The XML parser will ignore this
 210         space character completely without issuing a 'characters' event. The
 211         VML import filter implementation has to react on this case manually.
 212
 213         A single space character following another character is stored
 214         literally and must not be stipped away here. Example: The element
 215             <font>abc </font>
 216         contains the three letters a, b, and c, followed by a space character.
 217
 218         Consecutive space characters, or a leading single space character, are
 219         stored in a <span> element. If there are N space characters (N > 1),
 220         then the <span> element contains exactly (N-1) NBSP (non-breaking
 221         space) characters, followed by a regular space character. Examples:
 222         The element
 223             <font><span style='mso-spacerun:yes'>\xA0\xA0\xA0 </span></font>
 224         represents 4 consecutive space characters. Has to be handled by the
 225         implementation. The element
 226             <font><span style='mso-spacerun:yes'> abc</span></font>
 227         represents a space characters followed by the letters a, b, c. These
 228         strings have to be handled by the VML import filter implementation.
 229      */
 230
 231     // passed string ends with the leading opening bracket of an XML element
 232     const sal_Char* pcBeg = rChars.getStr();
 233     const sal_Char* pcEnd = pcBeg + rChars.getLength();
 234     bool bHasBracket = (pcBeg < pcEnd) && (pcEnd[ -1 ] == '<');
 235     if( bHasBracket ) --pcEnd;
 236
 237     // skip leading whitespace
 238     const sal_Char* pcContentsBeg = lclFindNonWhiteSpace( pcBeg, pcEnd );
 239     while( pcContentsBeg < pcEnd )
 240     {
 241         const sal_Char* pcWhitespaceBeg = lclFindWhiteSpace( pcContentsBeg + 1, pcEnd );
 242         lclAppendToBuffer( rBuffer, pcContentsBeg, pcWhitespaceBeg );
 243         if( pcWhitespaceBeg < pcEnd )
 244             rBuffer.append( ' ' );
 245         pcContentsBeg = lclFindNonWhiteSpace( pcWhitespaceBeg, pcEnd );
 246     }
 247
 248     return bHasBracket;
 249 }
 250
 251 } // namespace
 252
 253 InputStream::InputStream( const Reference< XComponentContext >& rxContext, const Reference< XInputStream >& rxInStrm ) :
 254     // use single-byte ISO-8859-1 encoding which maps all byte characters to the first 256 Unicode characters
 255     mxTextStrm( TextInputStream::createXTextInputStream( rxContext, rxInStrm, RTL_TEXTENCODING_ISO_8859_1 ) ),
 256     maOpeningBracket( 1 ),
 257     maClosingBracket( 1 ),
 258     maOpeningCData( CREATE_OSTRING( "<![CDATA[" ) ),
 259     maClosingCData( CREATE_OSTRING( "]]>" ) ),
 260     mnBufferPos( 0 )
 261 {
 262     if (!mxTextStrm.is())
 263         throw IOException();
 264     maOpeningBracket[ 0 ] = '<';
 265     maClosingBracket[ 0 ] = '>';
 266 }
 267
 268 InputStream::~InputStream()
 269 {
 270 }
 271
 272 sal_Int32 SAL_CALL InputStream::readBytes( Sequence< sal_Int8 >& rData, sal_Int32 nBytesToRead )
 273         throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException, std::exception)
 274 {
 275     if( nBytesToRead < 0 )
 276         throw IOException();
 277
 278     rData.realloc( nBytesToRead );
 279     sal_Int8* pcDest = rData.getArray();
 280     sal_Int32 nRet = 0;
 281     while( (nBytesToRead > 0) && !mxTextStrm->isEOF() )
 282     {
 283         updateBuffer();
 284         sal_Int32 nReadSize = ::std::min( nBytesToRead, maBuffer.getLength() - mnBufferPos );
 285         if( nReadSize > 0 )
 286         {
 287             memcpy( pcDest + nRet, maBuffer.getStr() + mnBufferPos, static_cast< size_t >( nReadSize ) );
 288             mnBufferPos += nReadSize;
 289             nBytesToRead -= nReadSize;
 290             nRet += nReadSize;
 291         }
 292     }
 293     if( nRet < rData.getLength() )
 294         rData.realloc( nRet );
 295     return nRet;
 296 }
 297
 298 sal_Int32 SAL_CALL InputStream::readSomeBytes( Sequence< sal_Int8 >& rData, sal_Int32 nMaxBytesToRead )
 299         throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException, std::exception)
 300 {
 301     return readBytes( rData, nMaxBytesToRead );
 302 }
 303
 304 void SAL_CALL InputStream::skipBytes( sal_Int32 nBytesToSkip )
 305         throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException, std::exception)
 306 {
 307     if( nBytesToSkip < 0 )
 308         throw IOException();
 309
 310     while( (nBytesToSkip > 0) && !mxTextStrm->isEOF() )
 311     {
 312         updateBuffer();
 313         sal_Int32 nSkipSize = ::std::min( nBytesToSkip, maBuffer.getLength() - mnBufferPos );
 314         mnBufferPos += nSkipSize;
 315         nBytesToSkip -= nSkipSize;
 316     }
 317 }
 318
 319 sal_Int32 SAL_CALL InputStream::available() throw (NotConnectedException, IOException, RuntimeException, std::exception)
 320 {
 321     updateBuffer();
 322     return maBuffer.getLength() - mnBufferPos;
 323 }
 324
 325 void SAL_CALL InputStream::closeInput() throw (NotConnectedException, IOException, RuntimeException, std::exception)
 326 {
 327     mxTextStrm->closeInput();
 328 }
 329
 330 // private --------------------------------------------------------------------
 331
 332 void InputStream::updateBuffer() throw (IOException, RuntimeException)
 333 {
 334     while( (mnBufferPos >= maBuffer.getLength()) && !mxTextStrm->isEOF() )
 335     {
 336         // collect new contents in a string buffer
 337         OStringBuffer aBuffer;
 338
 339         // read and process characters until the opening bracket of the next XML element
 340         OString aChars = readToElementBegin();
 341         bool bHasOpeningBracket = lclProcessCharacters( aBuffer, aChars );
 342
 343         // read and process characters until (and including) closing bracket (an XML element)
 344         OSL_ENSURE( bHasOpeningBracket || mxTextStrm->isEOF(), "InputStream::updateBuffer - missing opening bracket of XML element" );
 345         if( bHasOpeningBracket && !mxTextStrm->isEOF() )
 346         {
 347             // read the element text (add the leading opening bracket manually)
 348             OString aElement = OString( '<' ) + readToElementEnd();
 349             // check for CDATA part, starting with '<![CDATA['
 350             if( aElement.match( maOpeningCData ) )
 351             {
 352                 // search the end tag ']]>'
 353                 while( ((aElement.getLength() < maClosingCData.getLength()) || !aElement.endsWith( maClosingCData )) && !mxTextStrm->isEOF() )
 354                     aElement += readToElementEnd();
 355                 // copy the entire CDATA part
 356                 aBuffer.append( aElement );
 357             }
 358             else
 359             {
 360                 // no CDATA part - process the contents of the element
 361                 lclProcessElement( aBuffer, aElement );
 362             }
 363         }
 364
 365         maBuffer = aBuffer.makeStringAndClear();
 366         mnBufferPos = 0;
 367     }
 368 }
 369
 370 OString InputStream::readToElementBegin() throw (IOException, RuntimeException)
 371 {
 372     return OUStringToOString( mxTextStrm->readString( maOpeningBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 );
 373 }
 374
 375 OString InputStream::readToElementEnd() throw (IOException, RuntimeException)
 376 {
 377     OString aText = OUStringToOString( mxTextStrm->readString( maClosingBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 );
 378     OSL_ENSURE( aText.endsWith(">"), "InputStream::readToElementEnd - missing closing bracket of XML element" );
 379     return aText;
 380 }
 381
 382 } // namespace vml
  383 } // namespace oox
 384
 385 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */