Branch libreoffice-5-0-4
[LibreOffice.git] / oox / source / vml / vmlinputstream.cxx
blob91af9e1cc03119a4253009502f3cf79c18f1568d
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include "oox/vml/vmlinputstream.hxx"
22 #include <com/sun/star/io/XTextInputStream2.hpp>
23 #include <map>
24 #include <string.h>
25 #include <rtl/strbuf.hxx>
26 #include <osl/diagnose.h>
27 #include "oox/helper/helper.hxx"
28 #include "oox/helper/textinputstream.hxx"
30 namespace oox {
31 namespace vml {
33 using namespace ::com::sun::star::io;
34 using namespace ::com::sun::star::uno;
36 namespace {
38 inline const sal_Char* lclFindCharacter( const sal_Char* pcBeg, const sal_Char* pcEnd, sal_Char cChar )
40 sal_Int32 nIndex = rtl_str_indexOfChar_WithLength( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ), cChar );
41 return (nIndex < 0) ? pcEnd : (pcBeg + nIndex);
44 inline bool lclIsWhiteSpace( sal_Char cChar )
46 return cChar < 32;
49 const sal_Char* lclFindWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
51 for( ; pcBeg < pcEnd; ++pcBeg )
52 if( lclIsWhiteSpace( *pcBeg ) )
53 return pcBeg;
54 return pcEnd;
57 const sal_Char* lclFindNonWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
59 for( ; pcBeg < pcEnd; ++pcBeg )
60 if( !lclIsWhiteSpace( *pcBeg ) )
61 return pcBeg;
62 return pcEnd;
65 const sal_Char* lclTrimWhiteSpaceFromEnd( const sal_Char* pcBeg, const sal_Char* pcEnd )
67 while( (pcBeg < pcEnd) && lclIsWhiteSpace( pcEnd[ -1 ] ) )
68 --pcEnd;
69 return pcEnd;
72 inline void lclAppendToBuffer( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
74 rBuffer.append( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ) );
77 void lclProcessAttribs( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
79 /* Map attribute names to char-pointer of all attributes. This map is used
80 to find multiple occurrences of attributes with the same name. The
81 mapped pointers are used as map key in the next map below. */
82 typedef ::std::map< OString, const sal_Char* > AttributeNameMap;
83 AttributeNameMap aAttributeNames;
85 /* Map the char-pointers of all attributes to the full attribute definition
86 string. This preserves the original order of the used attributes. */
87 typedef ::std::map< const sal_Char*, OString > AttributeDataMap;
88 AttributeDataMap aAttributes;
90 bool bOk = true;
91 const sal_Char* pcNameBeg = pcBeg;
92 while( bOk && (pcNameBeg < pcEnd) )
94 // pcNameBeg points to begin of attribute name, find equality sign
95 const sal_Char* pcEqualSign = lclFindCharacter( pcNameBeg, pcEnd, '=' );
96 if ((bOk = (pcEqualSign < pcEnd)))
98 // find end of attribute name (ignore whitespace between name and equality sign)
99 const sal_Char* pcNameEnd = lclTrimWhiteSpaceFromEnd( pcNameBeg, pcEqualSign );
100 if( (bOk = (pcNameBeg < pcNameEnd)) )
102 // find begin of attribute value (must be single or double quote)
103 const sal_Char* pcValueBeg = lclFindNonWhiteSpace( pcEqualSign + 1, pcEnd );
104 if( (bOk = (pcValueBeg < pcEnd) && ((*pcValueBeg == '\'') || (*pcValueBeg == '"'))) )
106 // find end of attribute value (matching quote character)
107 const sal_Char* pcValueEnd = lclFindCharacter( pcValueBeg + 1, pcEnd, *pcValueBeg );
108 if( (bOk = (pcValueEnd < pcEnd)) )
110 ++pcValueEnd;
111 OString aAttribName( pcNameBeg, static_cast< sal_Int32 >( pcNameEnd - pcNameBeg ) );
112 OString aAttribData( pcNameBeg, static_cast< sal_Int32 >( pcValueEnd - pcNameBeg ) );
113 // search for an existing attribute with the same name
114 AttributeNameMap::iterator aIt = aAttributeNames.find( aAttribName );
115 // remove its definition from the data map
116 if( aIt != aAttributeNames.end() )
117 aAttributes.erase( aIt->second );
118 // insert the attribute into both maps
119 aAttributeNames[ aAttribName ] = pcNameBeg;
120 aAttributes[ pcNameBeg ] = aAttribData;
121 // continue with next attribute (skip whitespace after this attribute)
122 pcNameBeg = pcValueEnd;
123 if( (pcNameBeg < pcEnd) && ((bOk = lclIsWhiteSpace( *pcNameBeg ))) )
124 pcNameBeg = lclFindNonWhiteSpace( pcNameBeg + 1, pcEnd );
131 // if no error has occurred, build the resulting attribute list
132 if( bOk )
133 for( AttributeDataMap::iterator aIt = aAttributes.begin(), aEnd = aAttributes.end(); aIt != aEnd; ++aIt )
134 rBuffer.append( ' ' ).append( aIt->second );
135 // on error, just append the complete passed string
136 else
137 lclAppendToBuffer( rBuffer, pcBeg, pcEnd );
140 void lclProcessElement( OStringBuffer& rBuffer, const OString& rElement )
142 // check that passed string starts and ends with the brackets of an XML element
143 sal_Int32 nElementLen = rElement.getLength();
144 if( nElementLen == 0 )
145 return;
147 const sal_Char* pcOpen = rElement.getStr();
148 const sal_Char* pcClose = pcOpen + nElementLen - 1;
150 // no complete element found
151 if( (pcOpen >= pcClose) || (*pcOpen != '<') || (*pcClose != '>') )
153 // just append all passed characters
154 rBuffer.append( rElement );
157 // skip parser instructions: '<![...]>'
158 else if( (nElementLen >= 5) && (pcOpen[ 1 ] == '!') && (pcOpen[ 2 ] == '[') && (pcClose[ -1 ] == ']') )
160 // do nothing
163 // replace '<br>' element with newline
164 else if( (nElementLen >= 4) && (pcOpen[ 1 ] == 'b') && (pcOpen[ 2 ] == 'r') && (lclFindNonWhiteSpace( pcOpen + 3, pcClose ) == pcClose) )
166 rBuffer.append( '\n' );
169 // check start elements and simple elements for repeated attributes
170 else if( pcOpen[ 1 ] != '/' )
172 // find positions of text content inside brackets, exclude '/' in '<simpleelement/>'
173 const sal_Char* pcContentBeg = pcOpen + 1;
174 bool bIsEmptyElement = pcClose[ -1 ] == '/';
175 const sal_Char* pcContentEnd = bIsEmptyElement ? (pcClose - 1) : pcClose;
176 // append opening bracket and element name to buffer
177 const sal_Char* pcWhiteSpace = lclFindWhiteSpace( pcContentBeg, pcContentEnd );
178 lclAppendToBuffer( rBuffer, pcOpen, pcWhiteSpace );
179 // find begin of attributes, and process all attributes
180 const sal_Char* pcAttribBeg = lclFindNonWhiteSpace( pcWhiteSpace, pcContentEnd );
181 if( pcAttribBeg < pcContentEnd )
182 lclProcessAttribs( rBuffer, pcAttribBeg, pcContentEnd );
183 // close the element
184 if( bIsEmptyElement )
185 rBuffer.append( '/' );
186 rBuffer.append( '>' );
189 // append end elements without further processing
190 else
192 rBuffer.append( rElement );
196 bool lclProcessCharacters( OStringBuffer& rBuffer, const OString& rChars )
198 /* MSO has a very weird way to store and handle whitespaces. The stream
199 may contain lots of spaces, tabs, and newlines which have to be handled
200 as single space character. This will be done in this function.
202 If the element text contains a literal line break, it will be stored as
203 <br> tag (without matching </br> element). This input stream wrapper
204 will replace this element with a literal LF character (see below).
206 A single space character for its own is stored as is. Example: The
207 element
208 <font> </font>
209 represents a single space character. The XML parser will ignore this
210 space character completely without issuing a 'characters' event. The
211 VML import filter implementation has to react on this case manually.
213 A single space character following another character is stored
214 literally and must not be stipped away here. Example: The element
215 <font>abc </font>
216 contains the three letters a, b, and c, followed by a space character.
218 Consecutive space characters, or a leading single space character, are
219 stored in a <span> element. If there are N space characters (N > 1),
220 then the <span> element contains exactly (N-1) NBSP (non-breaking
221 space) characters, followed by a regular space character. Examples:
222 The element
223 <font><span style='mso-spacerun:yes'>\xA0\xA0\xA0 </span></font>
224 represents 4 consecutive space characters. Has to be handled by the
225 implementation. The element
226 <font><span style='mso-spacerun:yes'> abc</span></font>
227 represents a space characters followed by the letters a, b, c. These
228 strings have to be handled by the VML import filter implementation.
231 // passed string ends with the leading opening bracket of an XML element
232 const sal_Char* pcBeg = rChars.getStr();
233 const sal_Char* pcEnd = pcBeg + rChars.getLength();
234 bool bHasBracket = (pcBeg < pcEnd) && (pcEnd[ -1 ] == '<');
235 if( bHasBracket ) --pcEnd;
237 // skip leading whitespace
238 const sal_Char* pcContentsBeg = lclFindNonWhiteSpace( pcBeg, pcEnd );
239 while( pcContentsBeg < pcEnd )
241 const sal_Char* pcWhitespaceBeg = lclFindWhiteSpace( pcContentsBeg + 1, pcEnd );
242 lclAppendToBuffer( rBuffer, pcContentsBeg, pcWhitespaceBeg );
243 if( pcWhitespaceBeg < pcEnd )
244 rBuffer.append( ' ' );
245 pcContentsBeg = lclFindNonWhiteSpace( pcWhitespaceBeg, pcEnd );
248 return bHasBracket;
251 } // namespace
253 InputStream::InputStream( const Reference< XComponentContext >& rxContext, const Reference< XInputStream >& rxInStrm ) :
254 // use single-byte ISO-8859-1 encoding which maps all byte characters to the first 256 Unicode characters
255 mxTextStrm( TextInputStream::createXTextInputStream( rxContext, rxInStrm, RTL_TEXTENCODING_ISO_8859_1 ) ),
256 maOpeningBracket( 1 ),
257 maClosingBracket( 1 ),
258 maOpeningCData( CREATE_OSTRING( "<![CDATA[" ) ),
259 maClosingCData( CREATE_OSTRING( "]]>" ) ),
260 mnBufferPos( 0 )
262 if (!mxTextStrm.is())
263 throw IOException();
264 maOpeningBracket[ 0 ] = '<';
265 maClosingBracket[ 0 ] = '>';
268 InputStream::~InputStream()
272 sal_Int32 SAL_CALL InputStream::readBytes( Sequence< sal_Int8 >& rData, sal_Int32 nBytesToRead )
273 throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException, std::exception)
275 if( nBytesToRead < 0 )
276 throw IOException();
278 rData.realloc( nBytesToRead );
279 sal_Int8* pcDest = rData.getArray();
280 sal_Int32 nRet = 0;
281 while( (nBytesToRead > 0) && !mxTextStrm->isEOF() )
283 updateBuffer();
284 sal_Int32 nReadSize = ::std::min( nBytesToRead, maBuffer.getLength() - mnBufferPos );
285 if( nReadSize > 0 )
287 memcpy( pcDest + nRet, maBuffer.getStr() + mnBufferPos, static_cast< size_t >( nReadSize ) );
288 mnBufferPos += nReadSize;
289 nBytesToRead -= nReadSize;
290 nRet += nReadSize;
293 if( nRet < rData.getLength() )
294 rData.realloc( nRet );
295 return nRet;
298 sal_Int32 SAL_CALL InputStream::readSomeBytes( Sequence< sal_Int8 >& rData, sal_Int32 nMaxBytesToRead )
299 throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException, std::exception)
301 return readBytes( rData, nMaxBytesToRead );
304 void SAL_CALL InputStream::skipBytes( sal_Int32 nBytesToSkip )
305 throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException, std::exception)
307 if( nBytesToSkip < 0 )
308 throw IOException();
310 while( (nBytesToSkip > 0) && !mxTextStrm->isEOF() )
312 updateBuffer();
313 sal_Int32 nSkipSize = ::std::min( nBytesToSkip, maBuffer.getLength() - mnBufferPos );
314 mnBufferPos += nSkipSize;
315 nBytesToSkip -= nSkipSize;
319 sal_Int32 SAL_CALL InputStream::available() throw (NotConnectedException, IOException, RuntimeException, std::exception)
321 updateBuffer();
322 return maBuffer.getLength() - mnBufferPos;
325 void SAL_CALL InputStream::closeInput() throw (NotConnectedException, IOException, RuntimeException, std::exception)
327 mxTextStrm->closeInput();
330 // private --------------------------------------------------------------------
332 void InputStream::updateBuffer() throw (IOException, RuntimeException)
334 while( (mnBufferPos >= maBuffer.getLength()) && !mxTextStrm->isEOF() )
336 // collect new contents in a string buffer
337 OStringBuffer aBuffer;
339 // read and process characters until the opening bracket of the next XML element
340 OString aChars = readToElementBegin();
341 bool bHasOpeningBracket = lclProcessCharacters( aBuffer, aChars );
343 // read and process characters until (and including) closing bracket (an XML element)
344 OSL_ENSURE( bHasOpeningBracket || mxTextStrm->isEOF(), "InputStream::updateBuffer - missing opening bracket of XML element" );
345 if( bHasOpeningBracket && !mxTextStrm->isEOF() )
347 // read the element text (add the leading opening bracket manually)
348 OString aElement = OString( '<' ) + readToElementEnd();
349 // check for CDATA part, starting with '<![CDATA['
350 if( aElement.match( maOpeningCData ) )
352 // search the end tag ']]>'
353 while( ((aElement.getLength() < maClosingCData.getLength()) || !aElement.endsWith( maClosingCData )) && !mxTextStrm->isEOF() )
354 aElement += readToElementEnd();
355 // copy the entire CDATA part
356 aBuffer.append( aElement );
358 else
360 // no CDATA part - process the contents of the element
361 lclProcessElement( aBuffer, aElement );
365 maBuffer = aBuffer.makeStringAndClear();
366 mnBufferPos = 0;
370 OString InputStream::readToElementBegin() throw (IOException, RuntimeException)
372 return OUStringToOString( mxTextStrm->readString( maOpeningBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 );
375 OString InputStream::readToElementEnd() throw (IOException, RuntimeException)
377 OString aText = OUStringToOString( mxTextStrm->readString( maClosingBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 );
378 OSL_ENSURE( aText.endsWith(">"), "InputStream::readToElementEnd - missing closing bracket of XML element" );
379 return aText;
382 } // namespace vml
383 } // namespace oox
385 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */