/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
20 #include <oox/vml/vmlinputstream.hxx>
22 #include <com/sun/star/io/XTextInputStream2.hpp>
25 #include <rtl/strbuf.hxx>
26 #include <osl/diagnose.h>
27 #include <oox/helper/helper.hxx>
28 #include <oox/helper/textinputstream.hxx>
33 using namespace ::com::sun::star::io
;
34 using namespace ::com::sun::star::uno
;
38 inline const sal_Char
* lclFindCharacter( const sal_Char
* pcBeg
, const sal_Char
* pcEnd
, sal_Char cChar
)
40 sal_Int32 nIndex
= rtl_str_indexOfChar_WithLength( pcBeg
, static_cast< sal_Int32
>( pcEnd
- pcBeg
), cChar
);
41 return (nIndex
< 0) ? pcEnd
: (pcBeg
+ nIndex
);
// Predicate used by the scanning helpers in this file to decide whether a
// single character is treated as whitespace.
// NOTE(review): the function body is not visible in this excerpt - confirm the
// exact character range it accepts against the complete file.
44 inline bool lclIsWhiteSpace( sal_Char cChar
)
49 const sal_Char
* lclFindWhiteSpace( const sal_Char
* pcBeg
, const sal_Char
* pcEnd
)
51 for( ; pcBeg
< pcEnd
; ++pcBeg
)
52 if( lclIsWhiteSpace( *pcBeg
) )
57 const sal_Char
* lclFindNonWhiteSpace( const sal_Char
* pcBeg
, const sal_Char
* pcEnd
)
59 for( ; pcBeg
< pcEnd
; ++pcBeg
)
60 if( !lclIsWhiteSpace( *pcBeg
) )
65 const sal_Char
* lclTrimWhiteSpaceFromEnd( const sal_Char
* pcBeg
, const sal_Char
* pcEnd
)
67 while( (pcBeg
< pcEnd
) && lclIsWhiteSpace( pcEnd
[ -1 ] ) )
72 inline void lclAppendToBuffer( OStringBuffer
& rBuffer
, const sal_Char
* pcBeg
, const sal_Char
* pcEnd
)
74 rBuffer
.append( pcBeg
, static_cast< sal_Int32
>( pcEnd
- pcBeg
) );
77 void lclProcessAttribs( OStringBuffer
& rBuffer
, const sal_Char
* pcBeg
, const sal_Char
* pcEnd
)
79 /* Map attribute names to char-pointer of all attributes. This map is used
80 to find multiple occurrences of attributes with the same name. The
81 mapped pointers are used as map key in the next map below. */
82 typedef ::std::map
< OString
, const sal_Char
* > AttributeNameMap
;
83 AttributeNameMap aAttributeNames
;
85 /* Map the char-pointers of all attributes to the full attribute definition
86 string. This preserves the original order of the used attributes. */
87 typedef ::std::map
< const sal_Char
*, OString
> AttributeDataMap
;
88 AttributeDataMap aAttributes
;
91 const sal_Char
* pcNameBeg
= pcBeg
;
92 while( bOk
&& (pcNameBeg
< pcEnd
) )
94 // pcNameBeg points to begin of attribute name, find equality sign
95 const sal_Char
* pcEqualSign
= lclFindCharacter( pcNameBeg
, pcEnd
, '=' );
96 bOk
= (pcEqualSign
< pcEnd
);
99 // find end of attribute name (ignore whitespace between name and equality sign)
100 const sal_Char
* pcNameEnd
= lclTrimWhiteSpaceFromEnd( pcNameBeg
, pcEqualSign
);
101 bOk
= (pcNameBeg
< pcNameEnd
);
104 // find begin of attribute value (must be single or double quote)
105 const sal_Char
* pcValueBeg
= lclFindNonWhiteSpace( pcEqualSign
+ 1, pcEnd
);
106 bOk
= (pcValueBeg
< pcEnd
) && ((*pcValueBeg
== '\'') || (*pcValueBeg
== '"'));
109 // find end of attribute value (matching quote character)
110 const sal_Char
* pcValueEnd
= lclFindCharacter( pcValueBeg
+ 1, pcEnd
, *pcValueBeg
);
111 bOk
= (pcValueEnd
< pcEnd
);
115 OString
aAttribName( pcNameBeg
, static_cast< sal_Int32
>( pcNameEnd
- pcNameBeg
) );
116 OString
aAttribData( pcNameBeg
, static_cast< sal_Int32
>( pcValueEnd
- pcNameBeg
) );
117 // search for an existing attribute with the same name
118 AttributeNameMap::iterator aIt
= aAttributeNames
.find( aAttribName
);
119 // remove its definition from the data map
120 if( aIt
!= aAttributeNames
.end() )
121 aAttributes
.erase( aIt
->second
);
122 // insert the attribute into both maps
123 aAttributeNames
[ aAttribName
] = pcNameBeg
;
124 aAttributes
[ pcNameBeg
] = aAttribData
;
125 // continue with next attribute (skip whitespace after this attribute)
126 pcNameBeg
= pcValueEnd
;
127 if( pcNameBeg
< pcEnd
)
129 bOk
= lclIsWhiteSpace( *pcNameBeg
);
131 pcNameBeg
= lclFindNonWhiteSpace( pcNameBeg
+ 1, pcEnd
);
139 // if no error has occurred, build the resulting attribute list
141 for (auto const& attrib
: aAttributes
)
142 rBuffer
.append( ' ' ).append( attrib
.second
);
143 // on error, just append the complete passed string
145 lclAppendToBuffer( rBuffer
, pcBeg
, pcEnd
);
148 void lclProcessElement( OStringBuffer
& rBuffer
, const OString
& rElement
)
150 // check that passed string starts and ends with the brackets of an XML element
151 sal_Int32 nElementLen
= rElement
.getLength();
152 if( nElementLen
== 0 )
155 const sal_Char
* pcOpen
= rElement
.getStr();
156 const sal_Char
* pcClose
= pcOpen
+ nElementLen
- 1;
158 // no complete element found
159 if( (pcOpen
>= pcClose
) || (*pcOpen
!= '<') || (*pcClose
!= '>') )
161 // just append all passed characters
162 rBuffer
.append( rElement
);
165 // skip parser instructions: '<![...]>'
166 else if( (nElementLen
>= 5) && (pcOpen
[ 1 ] == '!') && (pcOpen
[ 2 ] == '[') && (pcClose
[ -1 ] == ']') )
171 // replace '<br>' element with newline
172 else if( (nElementLen
>= 4) && (pcOpen
[ 1 ] == 'b') && (pcOpen
[ 2 ] == 'r') && (lclFindNonWhiteSpace( pcOpen
+ 3, pcClose
) == pcClose
) )
174 rBuffer
.append( '\n' );
177 // check start elements and simple elements for repeated attributes
178 else if( pcOpen
[ 1 ] != '/' )
180 // find positions of text content inside brackets, exclude '/' in '<simpleelement/>'
181 const sal_Char
* pcContentBeg
= pcOpen
+ 1;
182 bool bIsEmptyElement
= pcClose
[ -1 ] == '/';
183 const sal_Char
* pcContentEnd
= bIsEmptyElement
? (pcClose
- 1) : pcClose
;
184 // append opening bracket and element name to buffer
185 const sal_Char
* pcWhiteSpace
= lclFindWhiteSpace( pcContentBeg
, pcContentEnd
);
186 lclAppendToBuffer( rBuffer
, pcOpen
, pcWhiteSpace
);
187 // find begin of attributes, and process all attributes
188 const sal_Char
* pcAttribBeg
= lclFindNonWhiteSpace( pcWhiteSpace
, pcContentEnd
);
189 if( pcAttribBeg
< pcContentEnd
)
190 lclProcessAttribs( rBuffer
, pcAttribBeg
, pcContentEnd
);
192 if( bIsEmptyElement
)
193 rBuffer
.append( '/' );
194 rBuffer
.append( '>' );
197 // append end elements without further processing
200 rBuffer
.append( rElement
);
204 bool lclProcessCharacters( OStringBuffer
& rBuffer
, const OString
& rChars
)
206 /* MSO has a very weird way to store and handle whitespaces. The stream
207 may contain lots of spaces, tabs, and newlines which have to be handled
208 as single space character. This will be done in this function.
210 If the element text contains a literal line break, it will be stored as
211 <br> tag (without matching </br> element). This input stream wrapper
212 will replace this element with a literal LF character (see below).
214 A single space character for its own is stored as is. Example: The
217 represents a single space character. The XML parser will ignore this
218 space character completely without issuing a 'characters' event. The
219 VML import filter implementation has to react on this case manually.
221 A single space character following another character is stored
222 literally and must not be stripped away here. Example: The element
224 contains the three letters a, b, and c, followed by a space character.
226 Consecutive space characters, or a leading single space character, are
227 stored in a <span> element. If there are N space characters (N > 1),
228 then the <span> element contains exactly (N-1) NBSP (non-breaking
229 space) characters, followed by a regular space character. Examples:
231 <font><span style='mso-spacerun:yes'>\xA0\xA0\xA0 </span></font>
232 represents 4 consecutive space characters. Has to be handled by the
233 implementation. The element
234 <font><span style='mso-spacerun:yes'> abc</span></font>
235 represents a space characters followed by the letters a, b, c. These
236 strings have to be handled by the VML import filter implementation.
239 // passed string ends with the leading opening bracket of an XML element
240 const sal_Char
* pcBeg
= rChars
.getStr();
241 const sal_Char
* pcEnd
= pcBeg
+ rChars
.getLength();
242 bool bHasBracket
= (pcBeg
< pcEnd
) && (pcEnd
[ -1 ] == '<');
243 if( bHasBracket
) --pcEnd
;
245 // skip leading whitespace
246 const sal_Char
* pcContentsBeg
= lclFindNonWhiteSpace( pcBeg
, pcEnd
);
247 while( pcContentsBeg
< pcEnd
)
249 const sal_Char
* pcWhitespaceBeg
= lclFindWhiteSpace( pcContentsBeg
+ 1, pcEnd
);
250 lclAppendToBuffer( rBuffer
, pcContentsBeg
, pcWhitespaceBeg
);
251 if( pcWhitespaceBeg
< pcEnd
)
252 rBuffer
.append( ' ' );
253 pcContentsBeg
= lclFindNonWhiteSpace( pcWhitespaceBeg
, pcEnd
);
261 InputStream::InputStream( const Reference
< XComponentContext
>& rxContext
, const Reference
< XInputStream
>& rxInStrm
) :
262 // use single-byte ISO-8859-1 encoding which maps all byte characters to the first 256 Unicode characters
263 mxTextStrm( TextInputStream::createXTextInputStream( rxContext
, rxInStrm
, RTL_TEXTENCODING_ISO_8859_1
) ),
264 maOpeningBracket( 1 ),
265 maClosingBracket( 1 ),
266 maOpeningCData( "<![CDATA[" ),
267 maClosingCData( "]]>" ),
270 if (!mxTextStrm
.is())
272 maOpeningBracket
[ 0 ] = '<';
273 maClosingBracket
[ 0 ] = '>';
276 InputStream::~InputStream()
280 sal_Int32 SAL_CALL
InputStream::readBytes( Sequence
< sal_Int8
>& rData
, sal_Int32 nBytesToRead
)
282 if( nBytesToRead
< 0 )
285 rData
.realloc( nBytesToRead
);
286 sal_Int8
* pcDest
= rData
.getArray();
288 while( (nBytesToRead
> 0) && !mxTextStrm
->isEOF() )
291 sal_Int32 nReadSize
= ::std::min( nBytesToRead
, maBuffer
.getLength() - mnBufferPos
);
294 memcpy( pcDest
+ nRet
, maBuffer
.getStr() + mnBufferPos
, static_cast< size_t >( nReadSize
) );
295 mnBufferPos
+= nReadSize
;
296 nBytesToRead
-= nReadSize
;
300 if( nRet
< rData
.getLength() )
301 rData
.realloc( nRet
);
305 sal_Int32 SAL_CALL
InputStream::readSomeBytes( Sequence
< sal_Int8
>& rData
, sal_Int32 nMaxBytesToRead
)
307 return readBytes( rData
, nMaxBytesToRead
);
310 void SAL_CALL
InputStream::skipBytes( sal_Int32 nBytesToSkip
)
312 if( nBytesToSkip
< 0 )
315 while( (nBytesToSkip
> 0) && !mxTextStrm
->isEOF() )
318 sal_Int32 nSkipSize
= ::std::min( nBytesToSkip
, maBuffer
.getLength() - mnBufferPos
);
319 mnBufferPos
+= nSkipSize
;
320 nBytesToSkip
-= nSkipSize
;
324 sal_Int32 SAL_CALL
InputStream::available()
327 return maBuffer
.getLength() - mnBufferPos
;
330 void SAL_CALL
InputStream::closeInput()
332 mxTextStrm
->closeInput();
335 // private --------------------------------------------------------------------
337 void InputStream::updateBuffer()
339 while( (mnBufferPos
>= maBuffer
.getLength()) && !mxTextStrm
->isEOF() )
341 // collect new contents in a string buffer
342 OStringBuffer aBuffer
;
344 // read and process characters until the opening bracket of the next XML element
345 OString aChars
= readToElementBegin();
346 bool bHasOpeningBracket
= lclProcessCharacters( aBuffer
, aChars
);
348 // read and process characters until (and including) closing bracket (an XML element)
349 OSL_ENSURE( bHasOpeningBracket
|| mxTextStrm
->isEOF(), "InputStream::updateBuffer - missing opening bracket of XML element" );
350 if( bHasOpeningBracket
&& !mxTextStrm
->isEOF() )
352 // read the element text (add the leading opening bracket manually)
353 OString aElement
= OString( '<' ) + readToElementEnd();
354 // check for CDATA part, starting with '<![CDATA['
355 if( aElement
.match( maOpeningCData
) )
357 // search the end tag ']]>'
358 while( ((aElement
.getLength() < maClosingCData
.getLength()) || !aElement
.endsWith( maClosingCData
)) && !mxTextStrm
->isEOF() )
359 aElement
+= readToElementEnd();
360 // copy the entire CDATA part
361 aBuffer
.append( aElement
);
365 // no CDATA part - process the contents of the element
366 lclProcessElement( aBuffer
, aElement
);
370 maBuffer
= aBuffer
.makeStringAndClear();
375 OString
InputStream::readToElementBegin()
377 return OUStringToOString( mxTextStrm
->readString( maOpeningBracket
, false ), RTL_TEXTENCODING_ISO_8859_1
);
380 OString
InputStream::readToElementEnd()
382 OString aText
= OUStringToOString( mxTextStrm
->readString( maClosingBracket
, false ), RTL_TEXTENCODING_ISO_8859_1
);
383 OSL_ENSURE( aText
.endsWith(">"), "InputStream::readToElementEnd - missing closing bracket of XML element" );
390 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */