1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <oox/vml/vmlinputstream.hxx>
22 #include <com/sun/star/io/XTextInputStream2.hpp>
25 #include <rtl/strbuf.hxx>
26 #include <osl/diagnose.h>
27 #include <oox/helper/helper.hxx>
28 #include <oox/helper/textinputstream.hxx>
33 using namespace ::com::sun::star::io
;
34 using namespace ::com::sun::star::uno
;
38 const sal_Char
* lclFindCharacter( const sal_Char
* pcBeg
, const sal_Char
* pcEnd
, sal_Char cChar
)
40 sal_Int32 nIndex
= rtl_str_indexOfChar_WithLength( pcBeg
, static_cast< sal_Int32
>( pcEnd
- pcBeg
), cChar
);
41 return (nIndex
< 0) ? pcEnd
: (pcBeg
+ nIndex
);
// Predicate that classifies a single character; the whitespace search and trim
// helpers in this file are built on top of it.
// NOTE(review): the function body is not part of this listing (the embedded
// line numbers jump from 44 to 49), so the exact set of characters treated as
// whitespace cannot be stated from here - confirm against the original file.
44 bool lclIsWhiteSpace( sal_Char cChar
)
49 const sal_Char
* lclFindWhiteSpace( const sal_Char
* pcBeg
, const sal_Char
* pcEnd
)
51 for( ; pcBeg
< pcEnd
; ++pcBeg
)
52 if( lclIsWhiteSpace( *pcBeg
) )
57 const sal_Char
* lclFindNonWhiteSpace( const sal_Char
* pcBeg
, const sal_Char
* pcEnd
)
59 for( ; pcBeg
< pcEnd
; ++pcBeg
)
60 if( !lclIsWhiteSpace( *pcBeg
) )
65 const sal_Char
* lclTrimWhiteSpaceFromEnd( const sal_Char
* pcBeg
, const sal_Char
* pcEnd
)
67 while( (pcBeg
< pcEnd
) && lclIsWhiteSpace( pcEnd
[ -1 ] ) )
72 void lclAppendToBuffer( OStringBuffer
& rBuffer
, const sal_Char
* pcBeg
, const sal_Char
* pcEnd
)
74 rBuffer
.append( pcBeg
, static_cast< sal_Int32
>( pcEnd
- pcBeg
) );
77 void lclProcessAttribs( OStringBuffer
& rBuffer
, const sal_Char
* pcBeg
, const sal_Char
* pcEnd
)
79 /* Map attribute names to char-pointer of all attributes. This map is used
80 to find multiple occurrences of attributes with the same name. The
81 mapped pointers are used as map key in the next map below. */
82 typedef ::std::map
< OString
, const sal_Char
* > AttributeNameMap
;
83 AttributeNameMap aAttributeNames
;
85 /* Map the char-pointers of all attributes to the full attribute definition
86 string. This preserves the original order of the used attributes. */
87 typedef ::std::map
< const sal_Char
*, OString
> AttributeDataMap
;
88 AttributeDataMap aAttributes
;
91 const sal_Char
* pcNameBeg
= pcBeg
;
92 while( bOk
&& (pcNameBeg
< pcEnd
) )
94 // pcNameBeg points to begin of attribute name, find equality sign
95 const sal_Char
* pcEqualSign
= lclFindCharacter( pcNameBeg
, pcEnd
, '=' );
96 bOk
= (pcEqualSign
< pcEnd
);
99 // find end of attribute name (ignore whitespace between name and equality sign)
100 const sal_Char
* pcNameEnd
= lclTrimWhiteSpaceFromEnd( pcNameBeg
, pcEqualSign
);
101 bOk
= (pcNameBeg
< pcNameEnd
);
104 // find begin of attribute value (must be single or double quote)
105 const sal_Char
* pcValueBeg
= lclFindNonWhiteSpace( pcEqualSign
+ 1, pcEnd
);
106 bOk
= (pcValueBeg
< pcEnd
) && ((*pcValueBeg
== '\'') || (*pcValueBeg
== '"'));
109 // find end of attribute value (matching quote character)
110 const sal_Char
* pcValueEnd
= lclFindCharacter( pcValueBeg
+ 1, pcEnd
, *pcValueBeg
);
111 bOk
= (pcValueEnd
< pcEnd
);
115 OString
aAttribName( pcNameBeg
, static_cast< sal_Int32
>( pcNameEnd
- pcNameBeg
) );
116 OString
aAttribData( pcNameBeg
, static_cast< sal_Int32
>( pcValueEnd
- pcNameBeg
) );
117 // search for an existing attribute with the same name
118 AttributeNameMap::iterator aIt
= aAttributeNames
.find( aAttribName
);
119 // remove its definition from the data map
120 if( aIt
!= aAttributeNames
.end() )
121 aAttributes
.erase( aIt
->second
);
122 // insert the attribute into both maps
123 aAttributeNames
[ aAttribName
] = pcNameBeg
;
124 aAttributes
[ pcNameBeg
] = aAttribData
;
125 // continue with next attribute (skip whitespace after this attribute)
126 pcNameBeg
= pcValueEnd
;
127 if( pcNameBeg
< pcEnd
)
129 bOk
= lclIsWhiteSpace( *pcNameBeg
);
131 pcNameBeg
= lclFindNonWhiteSpace( pcNameBeg
+ 1, pcEnd
);
139 // if no error has occurred, build the resulting attribute list
141 for (auto const& attrib
: aAttributes
)
142 rBuffer
.append( ' ' ).append( attrib
.second
);
143 // on error, just append the complete passed string
145 lclAppendToBuffer( rBuffer
, pcBeg
, pcEnd
);
148 void lclProcessElement( OStringBuffer
& rBuffer
, const OString
& rElement
)
150 // check that passed string starts and ends with the brackets of an XML element
151 sal_Int32 nElementLen
= rElement
.getLength();
152 if( nElementLen
== 0 )
155 const sal_Char
* pcOpen
= rElement
.getStr();
156 const sal_Char
* pcClose
= pcOpen
+ nElementLen
- 1;
158 // no complete element found
159 if( (pcOpen
>= pcClose
) || (*pcOpen
!= '<') || (*pcClose
!= '>') )
161 // just append all passed characters
162 rBuffer
.append( rElement
);
165 // skip parser instructions: '<![...]>'
166 else if( (nElementLen
>= 5) && (pcOpen
[ 1 ] == '!') && (pcOpen
[ 2 ] == '[') && (pcClose
[ -1 ] == ']') )
171 // replace '<br>' element with newline
172 else if( (nElementLen
>= 4) && (pcOpen
[ 1 ] == 'b') && (pcOpen
[ 2 ] == 'r') && (lclFindNonWhiteSpace( pcOpen
+ 3, pcClose
) == pcClose
) )
174 rBuffer
.append( '\n' );
177 // check start elements and simple elements for repeated attributes
178 else if( pcOpen
[ 1 ] != '/' )
180 // find positions of text content inside brackets, exclude '/' in '<simpleelement/>'
181 const sal_Char
* pcContentBeg
= pcOpen
+ 1;
182 bool bIsEmptyElement
= pcClose
[ -1 ] == '/';
183 const sal_Char
* pcContentEnd
= bIsEmptyElement
? (pcClose
- 1) : pcClose
;
184 // append opening bracket and element name to buffer
185 const sal_Char
* pcWhiteSpace
= lclFindWhiteSpace( pcContentBeg
, pcContentEnd
);
186 lclAppendToBuffer( rBuffer
, pcOpen
, pcWhiteSpace
);
187 // find begin of attributes, and process all attributes
188 const sal_Char
* pcAttribBeg
= lclFindNonWhiteSpace( pcWhiteSpace
, pcContentEnd
);
189 if( pcAttribBeg
< pcContentEnd
)
190 lclProcessAttribs( rBuffer
, pcAttribBeg
, pcContentEnd
);
192 if( bIsEmptyElement
)
193 rBuffer
.append( '/' );
194 rBuffer
.append( '>' );
197 // append end elements without further processing
200 rBuffer
.append( rElement
);
204 bool lclProcessCharacters( OStringBuffer
& rBuffer
, const OString
& rChars
)
206 /* MSO has a very weird way to store and handle whitespaces. The stream
207 may contain lots of spaces, tabs, and newlines which have to be handled
208 as single space character. This will be done in this function.
210 If the element text contains a literal line break, it will be stored as
211 <br> tag (without matching </br> element). This input stream wrapper
212 will replace this element with a literal LF character (see below).
214 A single space character for its own is stored as is. Example: The
217 represents a single space character. The XML parser will ignore this
218 space character completely without issuing a 'characters' event. The
219 VML import filter implementation has to react on this case manually.
221 A single space character following another character is stored
222 literally and must not be stripped away here. Example: The element
224 contains the three letters a, b, and c, followed by a space character.
226 Consecutive space characters, or a leading single space character, are
227 stored in a <span> element. If there are N space characters (N > 1),
228 then the <span> element contains exactly (N-1) NBSP (non-breaking
229 space) characters, followed by a regular space character. Examples:
231 <font><span style='mso-spacerun:yes'>\xA0\xA0\xA0 </span></font>
232 represents 4 consecutive space characters. Has to be handled by the
233 implementation. The element
234 <font><span style='mso-spacerun:yes'> abc</span></font>
235 represents a space characters followed by the letters a, b, c. These
236 strings have to be handled by the VML import filter implementation.
239 // passed string ends with the leading opening bracket of an XML element
240 const sal_Char
* pcBeg
= rChars
.getStr();
241 const sal_Char
* pcEnd
= pcBeg
+ rChars
.getLength();
242 bool bHasBracket
= (pcBeg
< pcEnd
) && (pcEnd
[ -1 ] == '<');
243 if( bHasBracket
) --pcEnd
;
245 // skip leading whitespace
246 const sal_Char
* pcContentsBeg
= lclFindNonWhiteSpace( pcBeg
, pcEnd
);
247 while( pcContentsBeg
< pcEnd
)
249 const sal_Char
* pcWhitespaceBeg
= lclFindWhiteSpace( pcContentsBeg
+ 1, pcEnd
);
250 lclAppendToBuffer( rBuffer
, pcContentsBeg
, pcWhitespaceBeg
);
251 if( pcWhitespaceBeg
< pcEnd
)
252 rBuffer
.append( ' ' );
253 pcContentsBeg
= lclFindNonWhiteSpace( pcWhitespaceBeg
, pcEnd
);
261 static const OString
gaOpeningCData( "<![CDATA[" );
262 static const OString
gaClosingCData( "]]>" );
// Constructs the stream wrapper: creates a text input stream on top of
// rxInStrm and prepares the two one-character delimiter sequences ('<' and
// '>') that are passed to readString() when scanning for XML elements.
//   rxContext - component context handed to TextInputStream::createXTextInputStream()
//   rxInStrm  - the input stream to be wrapped
// NOTE(review): this listing omits lines of the constructor (the embedded
// line numbers skip 269-270 and 272). The member initializer list visibly
// continues after maClosingBracket, and the statement executed when no text
// stream could be created is not shown - confirm both against the original file.
264 InputStream::InputStream( const Reference
< XComponentContext
>& rxContext
, const Reference
< XInputStream
>& rxInStrm
) :
265 // use single-byte ISO-8859-1 encoding which maps all byte characters to the first 256 Unicode characters
266 mxTextStrm( TextInputStream::createXTextInputStream( rxContext
, rxInStrm
, RTL_TEXTENCODING_ISO_8859_1
) ),
// both delimiter sequences are created with one element, which is filled in below
267 maOpeningBracket( 1 ),
268 maClosingBracket( 1 ),
// guard against a missing text stream (the reaction itself is not visible here)
271 if (!mxTextStrm
.is())
// the single delimiter characters used by readToElementBegin() / readToElementEnd()
273 maOpeningBracket
[ 0 ] = '<';
274 maClosingBracket
[ 0 ] = '>';
// Destructor.
// NOTE(review): the body is not part of this listing, so nothing can be said
// here about what it does - confirm against the original file.
277 InputStream::~InputStream()
// Copies up to nBytesToRead bytes of the preprocessed data held in maBuffer
// into rData.
//   rData        - resized to nBytesToRead up front and shrunk to nRet at the
//                  end if fewer bytes have been copied
//   nBytesToRead - number of bytes requested
// NOTE(review): several lines of this function are missing from this listing
// (the embedded line numbers skip 284, 288, 290-291, 293-294, 298-300, 303).
// Not visible are: the reaction to a negative nBytesToRead, the declaration
// and update of nRet, whatever refills maBuffer inside the loop, and the
// return statement - confirm all of these against the original file.
281 sal_Int32 SAL_CALL
InputStream::readBytes( Sequence
< sal_Int8
>& rData
, sal_Int32 nBytesToRead
)
// negative requests are treated separately (reaction not visible here)
283 if( nBytesToRead
< 0 )
// make room for the full request, pcDest addresses the start of that memory
286 rData
.realloc( nBytesToRead
);
287 sal_Int8
* pcDest
= rData
.getArray();
// loop until the request is satisfied or the text stream reports EOF
// NOTE(review): the condition tests the EOF state of the stream, not of
// maBuffer. If the stream is already at EOF while maBuffer still holds unread
// bytes (a previous call asked for less than was buffered), those bytes would
// not be delivered by this call - confirm whether that can happen.
289 while( (nBytesToRead
> 0) && !mxTextStrm
->isEOF() )
// never copy more than is left in maBuffer behind mnBufferPos
292 sal_Int32 nReadSize
= ::std::min( nBytesToRead
, maBuffer
.getLength() - mnBufferPos
);
// nRet is used as write offset into the destination
295 memcpy( pcDest
+ nRet
, maBuffer
.getStr() + mnBufferPos
, static_cast< size_t >( nReadSize
) );
// advance the read position and reduce the outstanding request
296 mnBufferPos
+= nReadSize
;
297 nBytesToRead
-= nReadSize
;
// shrink the sequence if less data than requested has been copied
301 if( nRet
< rData
.getLength() )
302 rData
.realloc( nRet
);
306 sal_Int32 SAL_CALL
InputStream::readSomeBytes( Sequence
< sal_Int8
>& rData
, sal_Int32 nMaxBytesToRead
)
308 return readBytes( rData
, nMaxBytesToRead
);
// Skips up to nBytesToSkip bytes of the preprocessed data by advancing the
// read position mnBufferPos inside maBuffer.
// NOTE(review): lines of this function are missing from this listing (the
// embedded line numbers skip 314-315, 317-318 and 322-323). Not visible are
// the reaction to a negative nBytesToSkip and whatever refills maBuffer inside
// the loop - confirm against the original file.
311 void SAL_CALL
InputStream::skipBytes( sal_Int32 nBytesToSkip
)
// negative requests are treated separately (reaction not visible here)
313 if( nBytesToSkip
< 0 )
// loop until the request is satisfied or the text stream reports EOF
316 while( (nBytesToSkip
> 0) && !mxTextStrm
->isEOF() )
// never skip more than is left in maBuffer behind mnBufferPos
319 sal_Int32 nSkipSize
= ::std::min( nBytesToSkip
, maBuffer
.getLength() - mnBufferPos
);
// advance the read position and reduce the outstanding request
320 mnBufferPos
+= nSkipSize
;
321 nBytesToSkip
-= nSkipSize
;
// Returns the number of bytes left in maBuffer behind the current read
// position mnBufferPos.
// NOTE(review): one line between the signature and the return statement is
// missing from this listing (the embedded line numbers skip 326-327), so the
// buffer may be modified before the value is computed - confirm against the
// original file.
325 sal_Int32 SAL_CALL
InputStream::available()
328 return maBuffer
.getLength() - mnBufferPos
;
331 void SAL_CALL
InputStream::closeInput()
333 mxTextStrm
->closeInput();
336 // private --------------------------------------------------------------------
338 void InputStream::updateBuffer()
340 while( (mnBufferPos
>= maBuffer
.getLength()) && !mxTextStrm
->isEOF() )
342 // collect new contents in a string buffer
343 OStringBuffer aBuffer
;
345 // read and process characters until the opening bracket of the next XML element
346 OString aChars
= readToElementBegin();
347 bool bHasOpeningBracket
= lclProcessCharacters( aBuffer
, aChars
);
349 // read and process characters until (and including) closing bracket (an XML element)
350 OSL_ENSURE( bHasOpeningBracket
|| mxTextStrm
->isEOF(), "InputStream::updateBuffer - missing opening bracket of XML element" );
351 if( bHasOpeningBracket
&& !mxTextStrm
->isEOF() )
353 // read the element text (add the leading opening bracket manually)
354 OString aElement
= OString( '<' ) + readToElementEnd();
355 // check for CDATA part, starting with '<![CDATA['
356 if( aElement
.match( gaOpeningCData
) )
358 // search the end tag ']]>'
359 while( ((aElement
.getLength() < gaClosingCData
.getLength()) || !aElement
.endsWith( gaClosingCData
)) && !mxTextStrm
->isEOF() )
360 aElement
+= readToElementEnd();
361 // copy the entire CDATA part
362 aBuffer
.append( aElement
);
366 // no CDATA part - process the contents of the element
367 lclProcessElement( aBuffer
, aElement
);
371 maBuffer
= aBuffer
.makeStringAndClear();
376 OString
InputStream::readToElementBegin()
378 return OUStringToOString( mxTextStrm
->readString( maOpeningBracket
, false ), RTL_TEXTENCODING_ISO_8859_1
);
381 OString
InputStream::readToElementEnd()
383 OString aText
= OUStringToOString( mxTextStrm
->readString( maClosingBracket
, false ), RTL_TEXTENCODING_ISO_8859_1
);
384 OSL_ENSURE( aText
.endsWith(">"), "InputStream::readToElementEnd - missing closing bracket of XML element" );
391 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */