1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include "oox/vml/vmlinputstream.hxx"
22 #include <com/sun/star/io/XTextInputStream2.hpp>
25 #include <rtl/strbuf.hxx>
26 #include "oox/helper/helper.hxx"
27 #include "oox/helper/textinputstream.hxx"
32 // ============================================================================
34 using namespace ::com::sun::star::io
;
35 using namespace ::com::sun::star::uno
;
37 // ============================================================================
41 inline const sal_Char
* lclFindCharacter( const sal_Char
* pcBeg
, const sal_Char
* pcEnd
, sal_Char cChar
)
43 sal_Int32 nIndex
= rtl_str_indexOfChar_WithLength( pcBeg
, static_cast< sal_Int32
>( pcEnd
- pcBeg
), cChar
);
44 return (nIndex
< 0) ? pcEnd
: (pcBeg
+ nIndex
);
// Whitespace predicate for a single character. It is called by
// lclFindWhiteSpace(), lclFindNonWhiteSpace(), lclTrimWhiteSpaceFromEnd() and
// lclProcessAttribs() in this file.
// NOTE(review): the function body is not part of this excerpt, so the exact
// set of characters treated as whitespace must be confirmed against the full
// source.
47 inline bool lclIsWhiteSpace( sal_Char cChar
)
// Scans the character range [pcBeg, pcEnd) from front to back and tests every
// character with lclIsWhiteSpace().
// NOTE(review): the return statements are not visible in this excerpt;
// presumably the position of the first whitespace character is returned, or
// pcEnd if the range contains none - confirm against the full source.
52 const sal_Char
* lclFindWhiteSpace( const sal_Char
* pcBeg
, const sal_Char
* pcEnd
)
// advance pcBeg through the range one character at a time
54 for( ; pcBeg
< pcEnd
; ++pcBeg
)
55 if( lclIsWhiteSpace( *pcBeg
) )
// Scans the character range [pcBeg, pcEnd) from front to back and tests every
// character with the negated lclIsWhiteSpace() predicate.
// NOTE(review): the return statements are not visible in this excerpt;
// presumably the position of the first non-whitespace character is returned,
// or pcEnd if the range contains only whitespace - confirm against the full
// source.
60 const sal_Char
* lclFindNonWhiteSpace( const sal_Char
* pcBeg
, const sal_Char
* pcEnd
)
// advance pcBeg through the range one character at a time
62 for( ; pcBeg
< pcEnd
; ++pcBeg
)
63 if( !lclIsWhiteSpace( *pcBeg
) )
// Works on the end of the character range [pcBeg, pcEnd): the loop runs as
// long as the range is not empty and its last character (pcEnd[ -1 ]) is
// whitespace according to lclIsWhiteSpace().
// NOTE(review): the loop body and the return statement are not visible in this
// excerpt; presumably pcEnd is moved backwards and the new end position is
// returned - confirm against the full source.
68 const sal_Char
* lclTrimWhiteSpaceFromEnd( const sal_Char
* pcBeg
, const sal_Char
* pcEnd
)
70 while( (pcBeg
< pcEnd
) && lclIsWhiteSpace( pcEnd
[ -1 ] ) )
75 inline void lclAppendToBuffer( OStringBuffer
& rBuffer
, const sal_Char
* pcBeg
, const sal_Char
* pcEnd
)
77 rBuffer
.append( pcBeg
, static_cast< sal_Int32
>( pcEnd
- pcBeg
) );
80 // ----------------------------------------------------------------------------
// Processes the attribute list of an XML element, passed as the character
// range [pcBeg, pcEnd), and appends the result to rBuffer.
//
// The range is parsed as a sequence of attributes of the form
//     name = 'value'    or    name = "value"
// Every attribute is registered in two maps (see below). If an attribute name
// occurs again, the definition stored for the earlier occurrence is erased
// from the data map, so only one definition per attribute name survives.
//
// According to the comments at the end of the function, the surviving
// attribute definitions are appended to rBuffer (each preceded by a space
// character) if no error has occurred; on error, the complete passed range is
// appended unchanged.
//
// NOTE(review): several lines of this function are not part of this excerpt
// (among them the declaration of bOk, the handling of the closing quote of an
// attribute value, and the if/else selecting one of the two output paths).
// Confirm the details against the full source.
82 void lclProcessAttribs( OStringBuffer
& rBuffer
, const sal_Char
* pcBeg
, const sal_Char
* pcEnd
)
84 /* Map attribute names to char-pointer of all attributes. This map is used
85 to find multiple occurrences of attributes with the same name. The
86 mapped pointers are used as map key in the next map below. */
87 typedef ::std::map
< OString
, const sal_Char
* > AttributeNameMap
;
88 AttributeNameMap aAttributeNames
;
90 /* Map the char-pointers of all attributes to the full attribute definition
91 string. This preserves the original order of the used attributes. */
92 typedef ::std::map
< const sal_Char
*, OString
> AttributeDataMap
;
93 AttributeDataMap aAttributes
;
// pcNameBeg always points to the start of the attribute currently parsed
96 const sal_Char
* pcNameBeg
= pcBeg
;
// parse one attribute per iteration; any failed check clears bOk and thus
// leaves the loop
97 while( bOk
&& (pcNameBeg
< pcEnd
) )
99 // pcNameBeg points to begin of attribute name, find equality sign
100 const sal_Char
* pcEqualSign
= lclFindCharacter( pcNameBeg
, pcEnd
, '=' );
// lclFindCharacter() returns pcEnd if there is no equality sign
101 if( (bOk
= pcEqualSign
< pcEnd
) == true )
103 // find end of attribute name (ignore whitespace between name and equality sign)
104 const sal_Char
* pcNameEnd
= lclTrimWhiteSpaceFromEnd( pcNameBeg
, pcEqualSign
);
// the attribute name must not be empty
105 if( (bOk
= pcNameBeg
< pcNameEnd
) == true )
107 // find begin of attribute value (must be single or double quote)
108 const sal_Char
* pcValueBeg
= lclFindNonWhiteSpace( pcEqualSign
+ 1, pcEnd
);
109 if( (bOk
= (pcValueBeg
< pcEnd
) && ((*pcValueBeg
== '\'') || (*pcValueBeg
== '"'))) == true )
111 // find end of attribute value (matching quote character)
112 const sal_Char
* pcValueEnd
= lclFindCharacter( pcValueBeg
+ 1, pcEnd
, *pcValueBeg
);
// the closing quote must exist inside the passed range
113 if( (bOk
= pcValueEnd
< pcEnd
) == true )
// aAttribName: the attribute name only; aAttribData: the text from the start
// of the name up to pcValueEnd
116 OString
aAttribName( pcNameBeg
, static_cast< sal_Int32
>( pcNameEnd
- pcNameBeg
) );
117 OString
aAttribData( pcNameBeg
, static_cast< sal_Int32
>( pcValueEnd
- pcNameBeg
) );
118 // search for an existing attribute with the same name
119 AttributeNameMap::iterator aIt
= aAttributeNames
.find( aAttribName
);
120 // remove its definition from the data map
121 if( aIt
!= aAttributeNames
.end() )
122 aAttributes
.erase( aIt
->second
);
123 // insert the attribute into both maps
124 aAttributeNames
[ aAttribName
] = pcNameBeg
;
125 aAttributes
[ pcNameBeg
] = aAttribData
;
126 // continue with next attribute (skip whitespace after this attribute)
127 pcNameBeg
= pcValueEnd
;
// if characters remain, the next one must be whitespace, otherwise bOk is
// cleared
128 if( (pcNameBeg
< pcEnd
) && ((bOk
= lclIsWhiteSpace( *pcNameBeg
)) == true) )
129 pcNameBeg
= lclFindNonWhiteSpace( pcNameBeg
+ 1, pcEnd
);
136 // if no error has occurred, build the resulting attribute list
138 for( AttributeDataMap::iterator aIt
= aAttributes
.begin(), aEnd
= aAttributes
.end(); aIt
!= aEnd
; ++aIt
)
139 rBuffer
.append( ' ' ).append( aIt
->second
);
140 // on error, just append the complete passed string
142 lclAppendToBuffer( rBuffer
, pcBeg
, pcEnd
);
// Processes the text of one XML element (rElement, expected to start with '<'
// and to end with '>') and appends the result to rBuffer.
//
// The visible branches handle the following cases:
//  - text that is not a complete element is appended unchanged,
//  - parser instructions of the form '<![...]>' are skipped (see comment),
//  - a '<br>' element is replaced by a single newline character,
//  - start elements and empty elements ('<name .../>') are rebuilt from the
//    element name and the attribute list as processed by lclProcessAttribs(),
//  - end elements ('</name>') are appended unchanged.
//
// NOTE(review): some lines of this function are not part of this excerpt
// (e.g. the statement executed for an empty rElement, the body of the
// '<![...]>' branch and the final 'else'). Confirm the details against the
// full source.
145 void lclProcessElement( OStringBuffer
& rBuffer
, const OString
& rElement
)
147 // check that passed string starts and ends with the brackets of an XML element
148 sal_Int32 nElementLen
= rElement
.getLength();
149 if( nElementLen
== 0 )
// pcOpen points to the first, pcClose to the last character of the element text
152 const sal_Char
* pcOpen
= rElement
.getStr();
153 const sal_Char
* pcClose
= pcOpen
+ nElementLen
- 1;
155 // no complete element found
156 if( (pcOpen
>= pcClose
) || (*pcOpen
!= '<') || (*pcClose
!= '>') )
158 // just append all passed characters
159 rBuffer
.append( rElement
);
162 // skip parser instructions: '<![...]>'
163 else if( (nElementLen
>= 5) && (pcOpen
[ 1 ] == '!') && (pcOpen
[ 2 ] == '[') && (pcClose
[ -1 ] == ']') )
168 // replace '<br>' element with newline
// only whitespace is accepted between 'br' and the closing bracket
169 else if( (nElementLen
>= 4) && (pcOpen
[ 1 ] == 'b') && (pcOpen
[ 2 ] == 'r') && (lclFindNonWhiteSpace( pcOpen
+ 3, pcClose
) == pcClose
) )
171 rBuffer
.append( '\n' );
174 // check start elements and simple elements for repeated attributes
175 else if( pcOpen
[ 1 ] != '/' )
177 // find positions of text content inside brackets, exclude '/' in '<simpleelement/>'
178 const sal_Char
* pcContentBeg
= pcOpen
+ 1;
179 bool bIsEmptyElement
= pcClose
[ -1 ] == '/';
180 const sal_Char
* pcContentEnd
= bIsEmptyElement
? (pcClose
- 1) : pcClose
;
181 // append opening bracket and element name to buffer
182 const sal_Char
* pcWhiteSpace
= lclFindWhiteSpace( pcContentBeg
, pcContentEnd
);
183 lclAppendToBuffer( rBuffer
, pcOpen
, pcWhiteSpace
);
184 // find begin of attributes, and process all attributes
185 const sal_Char
* pcAttribBeg
= lclFindNonWhiteSpace( pcWhiteSpace
, pcContentEnd
);
186 if( pcAttribBeg
< pcContentEnd
)
187 lclProcessAttribs( rBuffer
, pcAttribBeg
, pcContentEnd
);
// restore the '/' of an empty element, then append the closing bracket
189 if( bIsEmptyElement
)
190 rBuffer
.append( '/' );
191 rBuffer
.append( '>' );
194 // append end elements without further processing
197 rBuffer
.append( rElement
);
// Processes the character data read in front of an XML element (rChars) and
// appends the result to rBuffer. Leading whitespace is skipped. Each
// following run of whitespace that is followed by further characters inside
// the range is replaced by one space character. A trailing '<' (the opening
// bracket of the next element) is not copied to the buffer.
//
// NOTE(review): the return statement is not part of this excerpt. The caller
// updateBuffer() stores the result in a variable named bHasOpeningBracket, so
// presumably bHasBracket is returned - confirm against the full source.
201 bool lclProcessCharacters( OStringBuffer
& rBuffer
, const OString
& rChars
)
203 /* MSO has a very weird way to store and handle whitespaces. The stream
204 may contain lots of spaces, tabs, and newlines which have to be handled
205 as single space character. This will be done in this function.
207 If the element text contains a literal line break, it will be stored as
208 <br> tag (without matching </br> element). This input stream wrapper
209 will replace this element with a literal LF character (see below).
211 A single space character on its own is stored as is. Example: The
214 represents a single space character. The XML parser will ignore this
215 space character completely without issuing a 'characters' event. The
216 VML import filter implementation has to react on this case manually.
218 A single space character following another character is stored
219 literally and must not be stripped away here. Example: The element
221 contains the three letters a, b, and c, followed by a space character.
223 Consecutive space characters, or a leading single space character, are
224 stored in a <span> element. If there are N space characters (N > 1),
225 then the <span> element contains exactly (N-1) NBSP (non-breaking
226 space) characters, followed by a regular space character. Examples:
228 <font><span style='mso-spacerun:yes'>\xA0\xA0\xA0 </span></font>
229 represents 4 consecutive space characters. Has to be handled by the
230 implementation. The element
231 <font><span style='mso-spacerun:yes'> abc</span></font>
232 represents a space character followed by the letters a, b, c. These
233 strings have to be handled by the VML import filter implementation.
236 // passed string ends with the leading opening bracket of an XML element
237 const sal_Char
* pcBeg
= rChars
.getStr();
238 const sal_Char
* pcEnd
= pcBeg
+ rChars
.getLength();
239 bool bHasBracket
= (pcBeg
< pcEnd
) && (pcEnd
[ -1 ] == '<');
240 if( bHasBracket
) --pcEnd
;
242 // skip leading whitespace
243 const sal_Char
* pcContentsBeg
= lclFindNonWhiteSpace( pcBeg
, pcEnd
);
244 while( pcContentsBeg
< pcEnd
)
246 const sal_Char
* pcWhitespaceBeg
= lclFindWhiteSpace( pcContentsBeg
+ 1, pcEnd
);
247 lclAppendToBuffer( rBuffer
, pcContentsBeg
, pcWhitespaceBeg
);
248 if( pcWhitespaceBeg
< pcEnd
)
249 rBuffer
.append( ' ' );
250 pcContentsBeg
= lclFindNonWhiteSpace( pcWhitespaceBeg
, pcEnd
);
258 // ============================================================================
// Constructs the input stream wrapper around the passed binary input stream.
//
// rxContext and rxInStrm are forwarded to
// TextInputStream::createXTextInputStream() together with the ISO-8859-1 text
// encoding; the resulting text stream is stored in mxTextStrm. The members
// maOpeningBracket and maClosingBracket are constructed with the argument 1
// and receive the characters '<' and '>' at index 0; they are passed as
// delimiters to readString() in readToElementBegin() / readToElementEnd().
// maOpeningCData and maClosingCData hold the CDATA start and end markers.
//
// NOTE(review): the initializer list is incomplete in this excerpt (it ends
// with a trailing comma); the initialization of the remaining members, e.g.
// mnBufferPos, must be confirmed against the full source.
260 InputStream::InputStream( const Reference
< XComponentContext
>& rxContext
, const Reference
< XInputStream
>& rxInStrm
) :
261 // use single-byte ISO-8859-1 encoding which maps all byte characters to the first 256 Unicode characters
262 mxTextStrm( TextInputStream::createXTextInputStream( rxContext
, rxInStrm
, RTL_TEXTENCODING_ISO_8859_1
) ),
263 maOpeningBracket( 1 ),
264 maClosingBracket( 1 ),
265 maOpeningCData( CREATE_OSTRING( "<![CDATA[" ) ),
266 maClosingCData( CREATE_OSTRING( "]]>" ) ),
269 maOpeningBracket
[ 0 ] = '<';
270 maClosingBracket
[ 0 ] = '>';
// Destructor. No statements of its body are visible in this excerpt.
273 InputStream::~InputStream()
// Reads up to nBytesToRead bytes of processed stream contents into rData.
//
// rData is resized to nBytesToRead first. While bytes are still requested and
// the text stream has not reached EOF, the unread part of maBuffer (starting
// at mnBufferPos) is copied into rData at the offset nRet, and mnBufferPos and
// nBytesToRead are adjusted by the number of bytes copied. Finally rData is
// shrunk to nRet bytes if fewer bytes than requested have been collected.
//
// NOTE(review): several lines are not part of this excerpt (the reaction to a
// negative nBytesToRead, the declaration and update of nRet, the refill of
// maBuffer inside the loop, and the return statement). Confirm against the
// full source.
// NOTE(review): the loop condition tests mxTextStrm->isEOF() only. If the
// text stream is already at EOF while maBuffer still contains unread bytes
// (e.g. a previous call requested fewer bytes than buffered), the loop is not
// entered and these bytes are not delivered - verify whether this can happen.
277 sal_Int32 SAL_CALL
InputStream::readBytes( Sequence
< sal_Int8
>& rData
, sal_Int32 nBytesToRead
)
278 throw (NotConnectedException
, BufferSizeExceededException
, IOException
, RuntimeException
)
280 if( nBytesToRead
< 0 )
283 rData
.realloc( nBytesToRead
);
284 sal_Int8
* pcDest
= rData
.getArray();
286 while( (nBytesToRead
> 0) && !mxTextStrm
->isEOF() )
// copy no more than requested and no more than is left in maBuffer
289 sal_Int32 nReadSize
= ::std::min( nBytesToRead
, maBuffer
.getLength() - mnBufferPos
);
292 memcpy( pcDest
+ nRet
, maBuffer
.getStr() + mnBufferPos
, static_cast< size_t >( nReadSize
) );
293 mnBufferPos
+= nReadSize
;
294 nBytesToRead
-= nReadSize
;
// shrink the sequence if fewer bytes than requested have been collected
298 if( nRet
< rData
.getLength() )
299 rData
.realloc( nRet
);
// Forwards to readBytes() with the same arguments and returns its result;
// this implementation does not treat the two read functions differently.
303 sal_Int32 SAL_CALL
InputStream::readSomeBytes( Sequence
< sal_Int8
>& rData
, sal_Int32 nMaxBytesToRead
)
304 throw (NotConnectedException
, BufferSizeExceededException
, IOException
, RuntimeException
)
306 return readBytes( rData
, nMaxBytesToRead
);
// Skips up to nBytesToSkip bytes of processed stream contents.
//
// While bytes are still to be skipped and the text stream has not reached
// EOF, mnBufferPos is advanced by the smaller of nBytesToSkip and the number
// of unread bytes in maBuffer, and nBytesToSkip is reduced accordingly.
//
// NOTE(review): some lines are not part of this excerpt (the reaction to a
// negative nBytesToSkip and the refill of maBuffer inside the loop). Confirm
// against the full source.
309 void SAL_CALL
InputStream::skipBytes( sal_Int32 nBytesToSkip
)
310 throw (NotConnectedException
, BufferSizeExceededException
, IOException
, RuntimeException
)
312 if( nBytesToSkip
< 0 )
315 while( (nBytesToSkip
> 0) && !mxTextStrm
->isEOF() )
// skip no more than requested and no more than is left in maBuffer
318 sal_Int32 nSkipSize
= ::std::min( nBytesToSkip
, maBuffer
.getLength() - mnBufferPos
);
319 mnBufferPos
+= nSkipSize
;
320 nBytesToSkip
-= nSkipSize
;
// Returns the number of bytes in maBuffer that have not been consumed yet
// (buffer length minus the current read position mnBufferPos).
// NOTE(review): a line between the signature and the return statement is not
// part of this excerpt; presumably the buffer is refilled there first -
// confirm against the full source.
324 sal_Int32 SAL_CALL
InputStream::available() throw (NotConnectedException
, IOException
, RuntimeException
)
327 return maBuffer
.getLength() - mnBufferPos
;
// Closes the wrapped text input stream by forwarding to its closeInput().
330 void SAL_CALL
InputStream::closeInput() throw (NotConnectedException
, IOException
, RuntimeException
)
332 mxTextStrm
->closeInput();
335 // private --------------------------------------------------------------------
// Refills maBuffer with the next portion of processed stream contents.
//
// The loop runs as long as maBuffer is exhausted (mnBufferPos has reached its
// length) and the text stream has not reached EOF. Each iteration
//  1. reads the characters up to the next '<' and passes them to
//     lclProcessCharacters(),
//  2. if an opening bracket has been found and the stream is not at EOF, reads
//     the element text up to the next '>'. A CDATA part (starting with
//     '<![CDATA[') is extended until it ends with ']]>' or EOF is reached, and
//     is copied unchanged; the code after the "no CDATA part" comment passes
//     the element to lclProcessElement(),
//  3. stores the collected text in maBuffer.
//
// NOTE(review): some lines are not part of this excerpt (e.g. the 'else' in
// front of the lclProcessElement() call and the reset of mnBufferPos after
// maBuffer has been replaced). Confirm against the full source.
337 void InputStream::updateBuffer() throw (IOException
, RuntimeException
)
339 while( (mnBufferPos
>= maBuffer
.getLength()) && !mxTextStrm
->isEOF() )
341 // collect new contents in a string buffer
342 OStringBuffer aBuffer
;
344 // read and process characters until the opening bracket of the next XML element
345 OString aChars
= readToElementBegin();
346 bool bHasOpeningBracket
= lclProcessCharacters( aBuffer
, aChars
);
348 // read and process characters until (and including) closing bracket (an XML element)
349 OSL_ENSURE( bHasOpeningBracket
|| mxTextStrm
->isEOF(), "InputStream::updateBuffer - missing opening bracket of XML element" );
350 if( bHasOpeningBracket
&& !mxTextStrm
->isEOF() )
352 // read the element text (add the leading opening bracket manually)
353 OString aElement
= OString( '<' ) + readToElementEnd();
354 // check for CDATA part, starting with '<![CDATA['
355 if( aElement
.match( maOpeningCData
) )
357 // search the end tag ']]>'
// a '>' inside the CDATA contents ends readToElementEnd() early, therefore
// keep reading until the collected text really ends with ']]>'
358 while( ((aElement
.getLength() < maClosingCData
.getLength()) || !aElement
.match( maClosingCData
, aElement
.getLength() - maClosingCData
.getLength() )) && !mxTextStrm
->isEOF() )
359 aElement
+= readToElementEnd();
360 // copy the entire CDATA part
361 aBuffer
.append( aElement
);
365 // no CDATA part - process the contents of the element
366 lclProcessElement( aBuffer
, aElement
);
370 maBuffer
= aBuffer
.makeStringAndClear();
// Reads a string from the text stream, using maOpeningBracket ('<') as
// delimiter, and converts it to a byte string with ISO-8859-1 encoding.
// sal_False is passed as second argument of readString(); the caller
// lclProcessCharacters() checks the returned text for a trailing '<'.
375 OString
InputStream::readToElementBegin() throw (IOException
, RuntimeException
)
377 return OUStringToOString( mxTextStrm
->readString( maOpeningBracket
, sal_False
), RTL_TEXTENCODING_ISO_8859_1
);
// Reads a string from the text stream, using maClosingBracket ('>') as
// delimiter, and converts it to a byte string with ISO-8859-1 encoding. An
// assertion (OSL_ENSURE) reports text that is empty or does not end with '>'.
// NOTE(review): the return statement is not part of this excerpt; presumably
// aText is returned - confirm against the full source.
380 OString
InputStream::readToElementEnd() throw (IOException
, RuntimeException
)
382 OString aText
= OUStringToOString( mxTextStrm
->readString( maClosingBracket
, sal_False
), RTL_TEXTENCODING_ISO_8859_1
);
383 OSL_ENSURE( !aText
.isEmpty() && (aText
[ aText
.getLength() - 1 ] == '>'), "InputStream::readToElementEnd - missing closing bracket of XML element" );
387 // ============================================================================
392 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */