oox/source/vml/vmlinputstream.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include <oox/vml/vmlinputstream.hxx>
  21
  22 #include <com/sun/star/io/XTextInputStream2.hpp>
  23 #include <map>
  24 #include <string.h>
  25 #include <rtl/strbuf.hxx>
  26 #include <osl/diagnose.h>
  27 #include <oox/helper/helper.hxx>
  28 #include <oox/helper/textinputstream.hxx>
  29
  30 namespace oox {
  31 namespace vml {
  32
  33 using namespace ::com::sun::star::io;
  34 using namespace ::com::sun::star::uno;
  35
  36 namespace {
  37
  38 const sal_Char* lclFindCharacter( const sal_Char* pcBeg, const sal_Char* pcEnd, sal_Char cChar )
  39 {
  40     sal_Int32 nIndex = rtl_str_indexOfChar_WithLength( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ), cChar );
  41     return (nIndex < 0) ? pcEnd : (pcBeg + nIndex);
  42 }
  43
  44 bool lclIsWhiteSpace( sal_Char cChar )
  45 {
  46     return cChar <= 32;
  47 }
  48
  49 const sal_Char* lclFindWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
  50 {
  51     for( ; pcBeg < pcEnd; ++pcBeg )
  52         if( lclIsWhiteSpace( *pcBeg ) )
  53             return pcBeg;
  54     return pcEnd;
  55 }
  56
  57 const sal_Char* lclFindNonWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
  58 {
  59     for( ; pcBeg < pcEnd; ++pcBeg )
  60         if( !lclIsWhiteSpace( *pcBeg ) )
  61             return pcBeg;
  62     return pcEnd;
  63 }
  64
  65 const sal_Char* lclTrimWhiteSpaceFromEnd( const sal_Char* pcBeg, const sal_Char* pcEnd )
  66 {
  67     while( (pcBeg < pcEnd) && lclIsWhiteSpace( pcEnd[ -1 ] ) )
  68         --pcEnd;
  69     return pcEnd;
  70 }
  71
  72 void lclAppendToBuffer( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
  73 {
  74     rBuffer.append( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ) );
  75 }
  76
  77 void lclProcessAttribs( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
  78 {
  79     /*  Map attribute names to char-pointer of all attributes. This map is used
  80         to find multiple occurrences of attributes with the same name. The
  81         mapped pointers are used as map key in the next map below. */
  82     typedef ::std::map< OString, const sal_Char* > AttributeNameMap;
  83     AttributeNameMap aAttributeNames;
  84
  85     /*  Map the char-pointers of all attributes to the full attribute definition
  86         string. This preserves the original order of the used attributes. */
  87     typedef ::std::map< const sal_Char*, OString > AttributeDataMap;
  88     AttributeDataMap aAttributes;
  89
  90     bool bOk = true;
  91     const sal_Char* pcNameBeg = pcBeg;
  92     while( bOk && (pcNameBeg < pcEnd) )
  93     {
  94         // pcNameBeg points to begin of attribute name, find equality sign
  95         const sal_Char* pcEqualSign = lclFindCharacter( pcNameBeg, pcEnd, '=' );
  96         bOk = (pcEqualSign < pcEnd);
  97         if (bOk)
  98         {
  99             // find end of attribute name (ignore whitespace between name and equality sign)
 100             const sal_Char* pcNameEnd = lclTrimWhiteSpaceFromEnd( pcNameBeg, pcEqualSign );
 101             bOk = (pcNameBeg < pcNameEnd);
 102             if( bOk )
 103             {
 104                 // find begin of attribute value (must be single or double quote)
 105                 const sal_Char* pcValueBeg = lclFindNonWhiteSpace( pcEqualSign + 1, pcEnd );
 106                 bOk = (pcValueBeg < pcEnd) && ((*pcValueBeg == '\'') || (*pcValueBeg == '"'));
 107                 if( bOk )
 108                 {
 109                     // find end of attribute value (matching quote character)
 110                     const sal_Char* pcValueEnd = lclFindCharacter( pcValueBeg + 1, pcEnd, *pcValueBeg );
 111                     bOk = (pcValueEnd < pcEnd);
 112                     if( bOk )
 113                     {
 114                         ++pcValueEnd;
 115                         OString aAttribName( pcNameBeg, static_cast< sal_Int32 >( pcNameEnd - pcNameBeg ) );
 116                         OString aAttribData( pcNameBeg, static_cast< sal_Int32 >( pcValueEnd - pcNameBeg ) );
 117                         // search for an existing attribute with the same name
 118                         AttributeNameMap::iterator aIt = aAttributeNames.find( aAttribName );
 119                         // remove its definition from the data map
 120                         if( aIt != aAttributeNames.end() )
 121                             aAttributes.erase( aIt->second );
 122                         // insert the attribute into both maps
 123                         aAttributeNames[ aAttribName ] = pcNameBeg;
 124                         aAttributes[ pcNameBeg ] = aAttribData;
 125                         // continue with next attribute (skip whitespace after this attribute)
 126                         pcNameBeg = pcValueEnd;
 127                         if( pcNameBeg < pcEnd )
 128                         {
 129                             bOk = lclIsWhiteSpace( *pcNameBeg );
 130                             if( bOk )
 131                                 pcNameBeg = lclFindNonWhiteSpace( pcNameBeg + 1, pcEnd );
 132                         }
 133                     }
 134                 }
 135             }
 136         }
 137     }
 138
 139     // if no error has occurred, build the resulting attribute list
 140     if( bOk )
 141         for (auto const& attrib : aAttributes)
 142             rBuffer.append( ' ' ).append( attrib.second );
 143     // on error, just append the complete passed string
 144     else
 145         lclAppendToBuffer( rBuffer, pcBeg, pcEnd );
 146 }
 147
 148 void lclProcessElement( OStringBuffer& rBuffer, const OString& rElement )
 149 {
 150     // check that passed string starts and ends with the brackets of an XML element
 151     sal_Int32 nElementLen = rElement.getLength();
 152     if( nElementLen == 0 )
 153         return;
 154
 155     const sal_Char* pcOpen = rElement.getStr();
 156     const sal_Char* pcClose = pcOpen + nElementLen - 1;
 157
 158     // no complete element found
 159     if( (pcOpen >= pcClose) || (*pcOpen != '<') || (*pcClose != '>') )
 160     {
 161         // just append all passed characters
 162         rBuffer.append( rElement );
 163     }
 164
 165     // skip parser instructions: '<![...]>'
 166     else if( (nElementLen >= 5) && (pcOpen[ 1 ] == '!') && (pcOpen[ 2 ] == '[') && (pcClose[ -1 ] == ']') )
 167     {
 168         // do nothing
 169     }
 170
 171     // replace '<br>' element with newline
 172     else if( (nElementLen >= 4) && (pcOpen[ 1 ] == 'b') && (pcOpen[ 2 ] == 'r') && (lclFindNonWhiteSpace( pcOpen + 3, pcClose ) == pcClose) )
 173     {
 174         rBuffer.append( '\n' );
 175     }
 176
 177     // check start elements and simple elements for repeated attributes
 178     else if( pcOpen[ 1 ] != '/' )
 179     {
 180         // find positions of text content inside brackets, exclude '/' in '<simpleelement/>'
 181         const sal_Char* pcContentBeg = pcOpen + 1;
 182         bool bIsEmptyElement = pcClose[ -1 ] == '/';
 183         const sal_Char* pcContentEnd = bIsEmptyElement ? (pcClose - 1) : pcClose;
 184         // append opening bracket and element name to buffer
 185         const sal_Char* pcWhiteSpace = lclFindWhiteSpace( pcContentBeg, pcContentEnd );
 186         lclAppendToBuffer( rBuffer, pcOpen, pcWhiteSpace );
 187         // find begin of attributes, and process all attributes
 188         const sal_Char* pcAttribBeg = lclFindNonWhiteSpace( pcWhiteSpace, pcContentEnd );
 189         if( pcAttribBeg < pcContentEnd )
 190             lclProcessAttribs( rBuffer, pcAttribBeg, pcContentEnd );
 191         // close the element
 192         if( bIsEmptyElement )
 193             rBuffer.append( '/' );
 194         rBuffer.append( '>' );
 195     }
 196
 197     // append end elements without further processing
 198     else
 199     {
 200         rBuffer.append( rElement );
 201     }
 202 }
 203
 204 bool lclProcessCharacters( OStringBuffer& rBuffer, const OString& rChars )
 205 {
 206     /*  MSO has a very weird way to store and handle whitespaces. The stream
 207         may contain lots of spaces, tabs, and newlines which have to be handled
 208         as single space character. This will be done in this function.
 209
 210         If the element text contains a literal line break, it will be stored as
 211         <br> tag (without matching </br> element). This input stream wrapper
 212         will replace this element with a literal LF character (see below).
 213
 214         A single space character for its own is stored as is. Example: The
 215         element
 216             <font> </font>
 217         represents a single space character. The XML parser will ignore this
 218         space character completely without issuing a 'characters' event. The
 219         VML import filter implementation has to react on this case manually.
 220
 221         A single space character following another character is stored
 222         literally and must not be stripped away here. Example: The element
 223             <font>abc </font>
 224         contains the three letters a, b, and c, followed by a space character.
 225
 226         Consecutive space characters, or a leading single space character, are
 227         stored in a <span> element. If there are N space characters (N > 1),
 228         then the <span> element contains exactly (N-1) NBSP (non-breaking
 229         space) characters, followed by a regular space character. Examples:
 230         The element
 231             <font><span style='mso-spacerun:yes'>\xA0\xA0\xA0 </span></font>
 232         represents 4 consecutive space characters. Has to be handled by the
 233         implementation. The element
 234             <font><span style='mso-spacerun:yes'> abc</span></font>
 235         represents a space characters followed by the letters a, b, c. These
 236         strings have to be handled by the VML import filter implementation.
 237      */
 238
 239     // passed string ends with the leading opening bracket of an XML element
 240     const sal_Char* pcBeg = rChars.getStr();
 241     const sal_Char* pcEnd = pcBeg + rChars.getLength();
 242     bool bHasBracket = (pcBeg < pcEnd) && (pcEnd[ -1 ] == '<');
 243     if( bHasBracket ) --pcEnd;
 244
 245     // skip leading whitespace
 246     const sal_Char* pcContentsBeg = lclFindNonWhiteSpace( pcBeg, pcEnd );
 247     while( pcContentsBeg < pcEnd )
 248     {
 249         const sal_Char* pcWhitespaceBeg = lclFindWhiteSpace( pcContentsBeg + 1, pcEnd );
 250         lclAppendToBuffer( rBuffer, pcContentsBeg, pcWhitespaceBeg );
 251         if( pcWhitespaceBeg < pcEnd )
 252             rBuffer.append( ' ' );
 253         pcContentsBeg = lclFindNonWhiteSpace( pcWhitespaceBeg, pcEnd );
 254     }
 255
 256     return bHasBracket;
 257 }
 258
 259 } // namespace
 260
 261 static const OString gaOpeningCData( "<![CDATA[" );
 262 static const OString gaClosingCData( "]]>" );
 263
 264 InputStream::InputStream( const Reference< XComponentContext >& rxContext, const Reference< XInputStream >& rxInStrm ) :
 265     // use single-byte ISO-8859-1 encoding which maps all byte characters to the first 256 Unicode characters
 266     mxTextStrm( TextInputStream::createXTextInputStream( rxContext, rxInStrm, RTL_TEXTENCODING_ISO_8859_1 ) ),
 267     maOpeningBracket( 1 ),
 268     maClosingBracket( 1 ),
 269     mnBufferPos( 0 )
 270 {
 271     if (!mxTextStrm.is())
 272         throw IOException();
 273     maOpeningBracket[ 0 ] = '<';
 274     maClosingBracket[ 0 ] = '>';
 275 }
 276
 277 InputStream::~InputStream()
 278 {
 279 }
 280
 281 sal_Int32 SAL_CALL InputStream::readBytes( Sequence< sal_Int8 >& rData, sal_Int32 nBytesToRead )
 282 {
 283     if( nBytesToRead < 0 )
 284         throw IOException();
 285
 286     rData.realloc( nBytesToRead );
 287     sal_Int8* pcDest = rData.getArray();
 288     sal_Int32 nRet = 0;
 289     while( (nBytesToRead > 0) && !mxTextStrm->isEOF() )
 290     {
 291         updateBuffer();
 292         sal_Int32 nReadSize = ::std::min( nBytesToRead, maBuffer.getLength() - mnBufferPos );
 293         if( nReadSize > 0 )
 294         {
 295             memcpy( pcDest + nRet, maBuffer.getStr() + mnBufferPos, static_cast< size_t >( nReadSize ) );
 296             mnBufferPos += nReadSize;
 297             nBytesToRead -= nReadSize;
 298             nRet += nReadSize;
 299         }
 300     }
 301     if( nRet < rData.getLength() )
 302         rData.realloc( nRet );
 303     return nRet;
 304 }
 305
 306 sal_Int32 SAL_CALL InputStream::readSomeBytes( Sequence< sal_Int8 >& rData, sal_Int32 nMaxBytesToRead )
 307 {
 308     return readBytes( rData, nMaxBytesToRead );
 309 }
 310
 311 void SAL_CALL InputStream::skipBytes( sal_Int32 nBytesToSkip )
 312 {
 313     if( nBytesToSkip < 0 )
 314         throw IOException();
 315
 316     while( (nBytesToSkip > 0) && !mxTextStrm->isEOF() )
 317     {
 318         updateBuffer();
 319         sal_Int32 nSkipSize = ::std::min( nBytesToSkip, maBuffer.getLength() - mnBufferPos );
 320         mnBufferPos += nSkipSize;
 321         nBytesToSkip -= nSkipSize;
 322     }
 323 }
 324
 325 sal_Int32 SAL_CALL InputStream::available()
 326 {
 327     updateBuffer();
 328     return maBuffer.getLength() - mnBufferPos;
 329 }
 330
/** Closes the wrapped text input stream. */
void SAL_CALL InputStream::closeInput()
{
    // forward the request to the text stream
    mxTextStrm->closeInput();
}
 335
 336 // private --------------------------------------------------------------------
 337
 338 void InputStream::updateBuffer()
 339 {
 340     while( (mnBufferPos >= maBuffer.getLength()) && !mxTextStrm->isEOF() )
 341     {
 342         // collect new contents in a string buffer
 343         OStringBuffer aBuffer;
 344
 345         // read and process characters until the opening bracket of the next XML element
 346         OString aChars = readToElementBegin();
 347         bool bHasOpeningBracket = lclProcessCharacters( aBuffer, aChars );
 348
 349         // read and process characters until (and including) closing bracket (an XML element)
 350         OSL_ENSURE( bHasOpeningBracket || mxTextStrm->isEOF(), "InputStream::updateBuffer - missing opening bracket of XML element" );
 351         if( bHasOpeningBracket && !mxTextStrm->isEOF() )
 352         {
 353             // read the element text (add the leading opening bracket manually)
 354             OString aElement = OString( '<' ) + readToElementEnd();
 355             // check for CDATA part, starting with '<![CDATA['
 356             if( aElement.match( gaOpeningCData ) )
 357             {
 358                 // search the end tag ']]>'
 359                 while( ((aElement.getLength() < gaClosingCData.getLength()) || !aElement.endsWith( gaClosingCData )) && !mxTextStrm->isEOF() )
 360                     aElement += readToElementEnd();
 361                 // copy the entire CDATA part
 362                 aBuffer.append( aElement );
 363             }
 364             else
 365             {
 366                 // no CDATA part - process the contents of the element
 367                 lclProcessElement( aBuffer, aElement );
 368             }
 369         }
 370
 371         maBuffer = aBuffer.makeStringAndClear();
 372         mnBufferPos = 0;
 373     }
 374 }
 375
 376 OString InputStream::readToElementBegin()
 377 {
 378     return OUStringToOString( mxTextStrm->readString( maOpeningBracket, false ), RTL_TEXTENCODING_ISO_8859_1 );
 379 }
 380
 381 OString InputStream::readToElementEnd()
 382 {
 383     OString aText = OUStringToOString( mxTextStrm->readString( maClosingBracket, false ), RTL_TEXTENCODING_ISO_8859_1 );
 384     OSL_ENSURE( aText.endsWith(">"), "InputStream::readToElementEnd - missing closing bracket of XML element" );
 385     return aText;
 386 }
 387
 388 } // namespace vml
} // namespace oox
 390
 391 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */