bump product version to 6.3.0.0.beta1
[LibreOffice.git] / oox / source / vml / vmlinputstream.cxx
blob2442fa9bc489b0e64841828adc990e7bd6bf3971
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <oox/vml/vmlinputstream.hxx>
22 #include <com/sun/star/io/XTextInputStream2.hpp>
23 #include <map>
24 #include <string.h>
25 #include <rtl/strbuf.hxx>
26 #include <osl/diagnose.h>
27 #include <oox/helper/helper.hxx>
28 #include <oox/helper/textinputstream.hxx>
30 namespace oox {
31 namespace vml {
33 using namespace ::com::sun::star::io;
34 using namespace ::com::sun::star::uno;
36 namespace {
38 const sal_Char* lclFindCharacter( const sal_Char* pcBeg, const sal_Char* pcEnd, sal_Char cChar )
40 sal_Int32 nIndex = rtl_str_indexOfChar_WithLength( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ), cChar );
41 return (nIndex < 0) ? pcEnd : (pcBeg + nIndex);
44 bool lclIsWhiteSpace( sal_Char cChar )
46 return cChar <= 32;
49 const sal_Char* lclFindWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
51 for( ; pcBeg < pcEnd; ++pcBeg )
52 if( lclIsWhiteSpace( *pcBeg ) )
53 return pcBeg;
54 return pcEnd;
57 const sal_Char* lclFindNonWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
59 for( ; pcBeg < pcEnd; ++pcBeg )
60 if( !lclIsWhiteSpace( *pcBeg ) )
61 return pcBeg;
62 return pcEnd;
65 const sal_Char* lclTrimWhiteSpaceFromEnd( const sal_Char* pcBeg, const sal_Char* pcEnd )
67 while( (pcBeg < pcEnd) && lclIsWhiteSpace( pcEnd[ -1 ] ) )
68 --pcEnd;
69 return pcEnd;
72 void lclAppendToBuffer( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
74 rBuffer.append( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ) );
77 void lclProcessAttribs( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
79 /* Map attribute names to char-pointer of all attributes. This map is used
80 to find multiple occurrences of attributes with the same name. The
81 mapped pointers are used as map key in the next map below. */
82 typedef ::std::map< OString, const sal_Char* > AttributeNameMap;
83 AttributeNameMap aAttributeNames;
85 /* Map the char-pointers of all attributes to the full attribute definition
86 string. This preserves the original order of the used attributes. */
87 typedef ::std::map< const sal_Char*, OString > AttributeDataMap;
88 AttributeDataMap aAttributes;
90 bool bOk = true;
91 const sal_Char* pcNameBeg = pcBeg;
92 while( bOk && (pcNameBeg < pcEnd) )
94 // pcNameBeg points to begin of attribute name, find equality sign
95 const sal_Char* pcEqualSign = lclFindCharacter( pcNameBeg, pcEnd, '=' );
96 bOk = (pcEqualSign < pcEnd);
97 if (bOk)
99 // find end of attribute name (ignore whitespace between name and equality sign)
100 const sal_Char* pcNameEnd = lclTrimWhiteSpaceFromEnd( pcNameBeg, pcEqualSign );
101 bOk = (pcNameBeg < pcNameEnd);
102 if( bOk )
104 // find begin of attribute value (must be single or double quote)
105 const sal_Char* pcValueBeg = lclFindNonWhiteSpace( pcEqualSign + 1, pcEnd );
106 bOk = (pcValueBeg < pcEnd) && ((*pcValueBeg == '\'') || (*pcValueBeg == '"'));
107 if( bOk )
109 // find end of attribute value (matching quote character)
110 const sal_Char* pcValueEnd = lclFindCharacter( pcValueBeg + 1, pcEnd, *pcValueBeg );
111 bOk = (pcValueEnd < pcEnd);
112 if( bOk )
114 ++pcValueEnd;
115 OString aAttribName( pcNameBeg, static_cast< sal_Int32 >( pcNameEnd - pcNameBeg ) );
116 OString aAttribData( pcNameBeg, static_cast< sal_Int32 >( pcValueEnd - pcNameBeg ) );
117 // search for an existing attribute with the same name
118 AttributeNameMap::iterator aIt = aAttributeNames.find( aAttribName );
119 // remove its definition from the data map
120 if( aIt != aAttributeNames.end() )
121 aAttributes.erase( aIt->second );
122 // insert the attribute into both maps
123 aAttributeNames[ aAttribName ] = pcNameBeg;
124 aAttributes[ pcNameBeg ] = aAttribData;
125 // continue with next attribute (skip whitespace after this attribute)
126 pcNameBeg = pcValueEnd;
127 if( pcNameBeg < pcEnd )
129 bOk = lclIsWhiteSpace( *pcNameBeg );
130 if( bOk )
131 pcNameBeg = lclFindNonWhiteSpace( pcNameBeg + 1, pcEnd );
139 // if no error has occurred, build the resulting attribute list
140 if( bOk )
141 for (auto const& attrib : aAttributes)
142 rBuffer.append( ' ' ).append( attrib.second );
143 // on error, just append the complete passed string
144 else
145 lclAppendToBuffer( rBuffer, pcBeg, pcEnd );
148 void lclProcessElement( OStringBuffer& rBuffer, const OString& rElement )
150 // check that passed string starts and ends with the brackets of an XML element
151 sal_Int32 nElementLen = rElement.getLength();
152 if( nElementLen == 0 )
153 return;
155 const sal_Char* pcOpen = rElement.getStr();
156 const sal_Char* pcClose = pcOpen + nElementLen - 1;
158 // no complete element found
159 if( (pcOpen >= pcClose) || (*pcOpen != '<') || (*pcClose != '>') )
161 // just append all passed characters
162 rBuffer.append( rElement );
165 // skip parser instructions: '<![...]>'
166 else if( (nElementLen >= 5) && (pcOpen[ 1 ] == '!') && (pcOpen[ 2 ] == '[') && (pcClose[ -1 ] == ']') )
168 // do nothing
171 // replace '<br>' element with newline
172 else if( (nElementLen >= 4) && (pcOpen[ 1 ] == 'b') && (pcOpen[ 2 ] == 'r') && (lclFindNonWhiteSpace( pcOpen + 3, pcClose ) == pcClose) )
174 rBuffer.append( '\n' );
177 // check start elements and simple elements for repeated attributes
178 else if( pcOpen[ 1 ] != '/' )
180 // find positions of text content inside brackets, exclude '/' in '<simpleelement/>'
181 const sal_Char* pcContentBeg = pcOpen + 1;
182 bool bIsEmptyElement = pcClose[ -1 ] == '/';
183 const sal_Char* pcContentEnd = bIsEmptyElement ? (pcClose - 1) : pcClose;
184 // append opening bracket and element name to buffer
185 const sal_Char* pcWhiteSpace = lclFindWhiteSpace( pcContentBeg, pcContentEnd );
186 lclAppendToBuffer( rBuffer, pcOpen, pcWhiteSpace );
187 // find begin of attributes, and process all attributes
188 const sal_Char* pcAttribBeg = lclFindNonWhiteSpace( pcWhiteSpace, pcContentEnd );
189 if( pcAttribBeg < pcContentEnd )
190 lclProcessAttribs( rBuffer, pcAttribBeg, pcContentEnd );
191 // close the element
192 if( bIsEmptyElement )
193 rBuffer.append( '/' );
194 rBuffer.append( '>' );
197 // append end elements without further processing
198 else
200 rBuffer.append( rElement );
204 bool lclProcessCharacters( OStringBuffer& rBuffer, const OString& rChars )
206 /* MSO has a very weird way to store and handle whitespaces. The stream
207 may contain lots of spaces, tabs, and newlines which have to be handled
208 as single space character. This will be done in this function.
210 If the element text contains a literal line break, it will be stored as
211 <br> tag (without matching </br> element). This input stream wrapper
212 will replace this element with a literal LF character (see below).
214 A single space character for its own is stored as is. Example: The
215 element
216 <font> </font>
217 represents a single space character. The XML parser will ignore this
218 space character completely without issuing a 'characters' event. The
219 VML import filter implementation has to react on this case manually.
221 A single space character following another character is stored
222 literally and must not be stripped away here. Example: The element
223 <font>abc </font>
224 contains the three letters a, b, and c, followed by a space character.
226 Consecutive space characters, or a leading single space character, are
227 stored in a <span> element. If there are N space characters (N > 1),
228 then the <span> element contains exactly (N-1) NBSP (non-breaking
229 space) characters, followed by a regular space character. Examples:
230 The element
231 <font><span style='mso-spacerun:yes'>\xA0\xA0\xA0 </span></font>
232 represents 4 consecutive space characters. Has to be handled by the
233 implementation. The element
234 <font><span style='mso-spacerun:yes'> abc</span></font>
235 represents a space characters followed by the letters a, b, c. These
236 strings have to be handled by the VML import filter implementation.
239 // passed string ends with the leading opening bracket of an XML element
240 const sal_Char* pcBeg = rChars.getStr();
241 const sal_Char* pcEnd = pcBeg + rChars.getLength();
242 bool bHasBracket = (pcBeg < pcEnd) && (pcEnd[ -1 ] == '<');
243 if( bHasBracket ) --pcEnd;
245 // skip leading whitespace
246 const sal_Char* pcContentsBeg = lclFindNonWhiteSpace( pcBeg, pcEnd );
247 while( pcContentsBeg < pcEnd )
249 const sal_Char* pcWhitespaceBeg = lclFindWhiteSpace( pcContentsBeg + 1, pcEnd );
250 lclAppendToBuffer( rBuffer, pcContentsBeg, pcWhitespaceBeg );
251 if( pcWhitespaceBeg < pcEnd )
252 rBuffer.append( ' ' );
253 pcContentsBeg = lclFindNonWhiteSpace( pcWhitespaceBeg, pcEnd );
256 return bHasBracket;
259 } // namespace
// Start and end markers of a CDATA section. InputStream::updateBuffer() uses
// them to detect CDATA sections, which are copied unchanged.
261 static const OString gaOpeningCData( "<![CDATA[" );
262 static const OString gaClosingCData( "]]>" );
264 InputStream::InputStream( const Reference< XComponentContext >& rxContext, const Reference< XInputStream >& rxInStrm ) :
265 // use single-byte ISO-8859-1 encoding which maps all byte characters to the first 256 Unicode characters
266 mxTextStrm( TextInputStream::createXTextInputStream( rxContext, rxInStrm, RTL_TEXTENCODING_ISO_8859_1 ) ),
267 maOpeningBracket( 1 ),
268 maClosingBracket( 1 ),
269 mnBufferPos( 0 )
271 if (!mxTextStrm.is())
272 throw IOException();
273 maOpeningBracket[ 0 ] = '<';
274 maClosingBracket[ 0 ] = '>';
277 InputStream::~InputStream()
281 sal_Int32 SAL_CALL InputStream::readBytes( Sequence< sal_Int8 >& rData, sal_Int32 nBytesToRead )
283 if( nBytesToRead < 0 )
284 throw IOException();
286 rData.realloc( nBytesToRead );
287 sal_Int8* pcDest = rData.getArray();
288 sal_Int32 nRet = 0;
289 while( (nBytesToRead > 0) && !mxTextStrm->isEOF() )
291 updateBuffer();
292 sal_Int32 nReadSize = ::std::min( nBytesToRead, maBuffer.getLength() - mnBufferPos );
293 if( nReadSize > 0 )
295 memcpy( pcDest + nRet, maBuffer.getStr() + mnBufferPos, static_cast< size_t >( nReadSize ) );
296 mnBufferPos += nReadSize;
297 nBytesToRead -= nReadSize;
298 nRet += nReadSize;
301 if( nRet < rData.getLength() )
302 rData.realloc( nRet );
303 return nRet;
306 sal_Int32 SAL_CALL InputStream::readSomeBytes( Sequence< sal_Int8 >& rData, sal_Int32 nMaxBytesToRead )
308 return readBytes( rData, nMaxBytesToRead );
311 void SAL_CALL InputStream::skipBytes( sal_Int32 nBytesToSkip )
313 if( nBytesToSkip < 0 )
314 throw IOException();
316 while( (nBytesToSkip > 0) && !mxTextStrm->isEOF() )
318 updateBuffer();
319 sal_Int32 nSkipSize = ::std::min( nBytesToSkip, maBuffer.getLength() - mnBufferPos );
320 mnBufferPos += nSkipSize;
321 nBytesToSkip -= nSkipSize;
325 sal_Int32 SAL_CALL InputStream::available()
327 updateBuffer();
328 return maBuffer.getLength() - mnBufferPos;
331 void SAL_CALL InputStream::closeInput()
333 mxTextStrm->closeInput();
336 // private --------------------------------------------------------------------
338 void InputStream::updateBuffer()
340 while( (mnBufferPos >= maBuffer.getLength()) && !mxTextStrm->isEOF() )
342 // collect new contents in a string buffer
343 OStringBuffer aBuffer;
345 // read and process characters until the opening bracket of the next XML element
346 OString aChars = readToElementBegin();
347 bool bHasOpeningBracket = lclProcessCharacters( aBuffer, aChars );
349 // read and process characters until (and including) closing bracket (an XML element)
350 OSL_ENSURE( bHasOpeningBracket || mxTextStrm->isEOF(), "InputStream::updateBuffer - missing opening bracket of XML element" );
351 if( bHasOpeningBracket && !mxTextStrm->isEOF() )
353 // read the element text (add the leading opening bracket manually)
354 OString aElement = OString( '<' ) + readToElementEnd();
355 // check for CDATA part, starting with '<![CDATA['
356 if( aElement.match( gaOpeningCData ) )
358 // search the end tag ']]>'
359 while( ((aElement.getLength() < gaClosingCData.getLength()) || !aElement.endsWith( gaClosingCData )) && !mxTextStrm->isEOF() )
360 aElement += readToElementEnd();
361 // copy the entire CDATA part
362 aBuffer.append( aElement );
364 else
366 // no CDATA part - process the contents of the element
367 lclProcessElement( aBuffer, aElement );
371 maBuffer = aBuffer.makeStringAndClear();
372 mnBufferPos = 0;
376 OString InputStream::readToElementBegin()
378 return OUStringToOString( mxTextStrm->readString( maOpeningBracket, false ), RTL_TEXTENCODING_ISO_8859_1 );
381 OString InputStream::readToElementEnd()
383 OString aText = OUStringToOString( mxTextStrm->readString( maClosingBracket, false ), RTL_TEXTENCODING_ISO_8859_1 );
384 OSL_ENSURE( aText.endsWith(">"), "InputStream::readToElementEnd - missing closing bracket of XML element" );
385 return aText;
388 } // namespace vml
389 } // namespace oox
391 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */