1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
23 #include <sal/types.h>
25 #include <rtl/textenc.h>
26 #include <rtl/tencinfo.h>
28 #include <com/sun/star/io/XInputStream.hpp>
30 using namespace ::com::sun::star::uno
;
31 using namespace ::com::sun::star::io
;
34 #include "xml2utf.hxx"
35 #include <boost/scoped_array.hpp>
37 namespace sax_expatwrap
{
39 sal_Int32
XMLFile2UTFConverter::readAndConvert( Sequence
<sal_Int8
> &seq
, sal_Int32 nMaxToRead
)
40 throw ( IOException
, NotConnectedException
, BufferSizeExceededException
, RuntimeException
)
43 throw NotConnectedException();
46 // it should be possible to find the encoding attribute
47 // within the first 512 bytes == 128 chars in UCS-4
48 nMaxToRead
= ::std::max( sal_Int32(512) , nMaxToRead
);
52 Sequence
< sal_Int8
> seqStart
;
55 nRead
= m_in
->readSomeBytes( seq
, nMaxToRead
);
57 if( nRead
+ seqStart
.getLength())
59 // if nRead is 0, the file is already eof.
60 if( ! m_bStarted
&& nRead
)
62 // ensure that enough data is available to parse encoding
63 if( seqStart
.getLength() )
65 // prefix with what we had so far.
66 sal_Int32 nLength
= seq
.getLength();
67 seq
.realloc( seqStart
.getLength() + nLength
);
69 memmove (seq
.getArray() + seqStart
.getLength(),
72 memcpy (seq
.getArray(),
73 seqStart
.getConstArray(),
74 seqStart
.getLength());
77 // autodetection with the first bytes
78 if( ! isEncodingRecognizable( seq
) )
80 // remember what we have so far.
86 if( scanForEncoding( seq
) || !m_sEncoding
.isEmpty() ) {
87 // initialize decoding
90 seqStart
= Sequence
< sal_Int8
> ();
94 if( m_pText2Unicode
&& m_pUnicode2Text
&&
95 m_pText2Unicode
->canContinue() && m_pUnicode2Text
->canContinue() ) {
97 Sequence
<sal_Unicode
> seqUnicode
= m_pText2Unicode
->convert( seq
);
98 seq
= m_pUnicode2Text
->convert( seqUnicode
.getConstArray(), seqUnicode
.getLength() );
103 // it must now be ensured, that no encoding attribute exist anymore
104 // ( otherwise the expat-Parser will crash )
105 // This must be done after decoding !
106 // ( e.g. Files decoded in ucs-4 cannot be read properly )
108 removeEncoding( seq
);
110 nRead
= seq
.getLength();
119 XMLFile2UTFConverter::~XMLFile2UTFConverter()
121 if( m_pText2Unicode
)
122 delete m_pText2Unicode
;
123 if( m_pUnicode2Text
)
124 delete m_pUnicode2Text
;
128 void XMLFile2UTFConverter::removeEncoding( Sequence
<sal_Int8
> &seq
)
130 const sal_Int8
*pSource
= seq
.getArray();
131 if( ! strncmp( reinterpret_cast<const char *>(pSource
), "<?xml", 4) )
135 OString
str( reinterpret_cast<char const *>(pSource
), seq
.getLength() );
137 // cut sequence to first line break
138 // find first line break;
139 int nMax
= str
.indexOf( 10 );
142 str
= str
.copy( 0 , nMax
);
145 int nFound
= str
.indexOf( " encoding" );
148 int nStart
= str
.indexOf( "\"" , nFound
);
149 if( nStart
< 0 || str
.indexOf( "'" , nFound
) < nStart
)
151 nStart
= str
.indexOf( "'" , nFound
);
152 nStop
= str
.indexOf( "'" , nStart
+1 );
156 nStop
= str
.indexOf( "\"" , nStart
+1);
159 if( nStart
>= 0 && nStop
>= 0 && nStart
+1 < nStop
)
161 // remove encoding tag from file
162 memmove( &( seq
.getArray()[nFound
] ) ,
163 &( seq
.getArray()[nStop
+1]) ,
164 seq
.getLength() - nStop
-1);
165 seq
.realloc( seq
.getLength() - ( nStop
+1 - nFound
) );
166 // str = String( (char * ) seq.getArray() , seq.getLen() );
172 // Checks, if enough data has been accumulated to recognize the encoding
173 bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence
< sal_Int8
> &seq
)
175 const sal_Int8
*pSource
= seq
.getConstArray();
176 bool bCheckIfFirstClosingBracketExsists
= false;
178 if( seq
.getLength() < 8 ) {
179 // no recognition possible, when less than 8 bytes are available
183 if( ! strncmp( reinterpret_cast<const char *>(pSource
), "<?xml", 4 ) ) {
184 // scan if the <?xml tag finishes within this buffer
185 bCheckIfFirstClosingBracketExsists
= true;
187 else if( ('<' == pSource
[0] || '<' == pSource
[2] ) &&
188 ( ('?' == pSource
[4] || '?' == pSource
[6] ) ) )
191 bCheckIfFirstClosingBracketExsists
= true;
193 else if( ( '<' == pSource
[1] || '<' == pSource
[3] ) &&
194 ( '?' == pSource
[5] || '?' == pSource
[7] ) )
197 bCheckIfFirstClosingBracketExsists
= true;
200 if( bCheckIfFirstClosingBracketExsists
)
202 for( sal_Int32 i
= 0; i
< seq
.getLength() ; i
++ )
204 // whole <?xml tag is valid
205 if( '>' == pSource
[ i
] )
213 // No <? tag in front, no need for a bigger buffer
217 bool XMLFile2UTFConverter::scanForEncoding( Sequence
< sal_Int8
> &seq
)
219 const sal_uInt8
*pSource
= reinterpret_cast<const sal_uInt8
*>( seq
.getConstArray() );
222 if( seq
.getLength() < 4 ) {
223 // no recognition possible, when less than 4 bytes are available
227 // first level : detect possible file formats
228 if( ! strncmp( reinterpret_cast<const char *>(pSource
), "<?xml", 4 ) ) {
231 OString
str( reinterpret_cast<const char *>(pSource
), seq
.getLength() );
233 // cut sequence to first line break
234 //find first line break;
235 int nMax
= str
.indexOf( 10 );
238 str
= str
.copy( 0 , nMax
);
241 int nFound
= str
.indexOf( " encoding" );
244 int nStart
= str
.indexOf( "\"" , nFound
);
245 if( nStart
< 0 || str
.indexOf( "'" , nFound
) < nStart
)
247 nStart
= str
.indexOf( "'" , nFound
);
248 nStop
= str
.indexOf( "'" , nStart
+1 );
252 nStop
= str
.indexOf( "\"" , nStart
+1);
254 if( nStart
>= 0 && nStop
>= 0 && nStart
+1 < nStop
)
256 // encoding found finally
257 m_sEncoding
= str
.copy( nStart
+1 , nStop
- nStart
- 1 );
261 else if( 0xFE == pSource
[0] &&
262 0xFF == pSource
[1] ) {
264 // conversion is done so that encoding information can be easily extracted
265 m_sEncoding
= "utf-16";
267 else if( 0xFF == pSource
[0] &&
268 0xFE == pSource
[1] ) {
269 // UTF-16 little endian
270 // conversion is done so that encoding information can be easily extracted
271 m_sEncoding
= "utf-16";
273 else if( 0x00 == pSource
[0] && 0x3c == pSource
[1] && 0x00 == pSource
[2] && 0x3f == pSource
[3] ) {
274 // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
275 // The byte order mark is simply added
277 // simply add the byte order mark !
278 seq
.realloc( seq
.getLength() + 2 );
279 memmove( &( seq
.getArray()[2] ) , seq
.getArray() , seq
.getLength() - 2 );
280 reinterpret_cast<sal_uInt8
*>(seq
.getArray())[0] = 0xFE;
281 reinterpret_cast<sal_uInt8
*>(seq
.getArray())[1] = 0xFF;
283 m_sEncoding
= "utf-16";
285 else if( 0x3c == pSource
[0] && 0x00 == pSource
[1] && 0x3f == pSource
[2] && 0x00 == pSource
[3] ) {
286 // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
287 // The byte order mark is simply added
289 seq
.realloc( seq
.getLength() + 2 );
290 memmove( &( seq
.getArray()[2] ) , seq
.getArray() , seq
.getLength() - 2 );
291 reinterpret_cast<sal_uInt8
*>(seq
.getArray())[0] = 0xFF;
292 reinterpret_cast<sal_uInt8
*>(seq
.getArray())[1] = 0xFE;
294 m_sEncoding
= "utf-16";
296 else if( 0xEF == pSource
[0] &&
297 0xBB == pSource
[1] &&
300 // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
301 // The BOM is removed.
302 memmove( seq
.getArray(), &( seq
.getArray()[3] ), seq
.getLength()-3 );
303 seq
.realloc( seq
.getLength() - 3 );
304 m_sEncoding
= "utf-8";
306 else if( 0x00 == pSource
[0] && 0x00 == pSource
[1] && 0x00 == pSource
[2] && 0x3c == pSource
[3] ) {
308 m_sEncoding
= "ucs-4";
310 else if( 0x3c == pSource
[0] && 0x00 == pSource
[1] && 0x00 == pSource
[2] && 0x00 == pSource
[3] ) {
311 // UCS-4 little endian
312 m_sEncoding
= "ucs-4";
314 /* TODO: no need to test for the moment since we return sal_False like default case anyway
315 else if( 0x4c == pSource[0] && 0x6f == pSource[1] &&
316 0xa7 == static_cast<unsigned char> (pSource[2]) &&
317 0x94 == static_cast<unsigned char> (pSource[3]) ) {
319 bReturn = sal_False; // must be extended
324 // UTF8 is directly recognized by the parser.
331 void XMLFile2UTFConverter::initializeDecoding()
334 if( !m_sEncoding
.isEmpty() )
336 rtl_TextEncoding encoding
= rtl_getTextEncodingFromMimeCharset( m_sEncoding
.getStr() );
337 if( encoding
!= RTL_TEXTENCODING_UTF8
)
339 m_pText2Unicode
= new Text2UnicodeConverter( m_sEncoding
);
340 m_pUnicode2Text
= new Unicode2TextConverter( RTL_TEXTENCODING_UTF8
);
348 // Text2UnicodeConverter
351 Text2UnicodeConverter::Text2UnicodeConverter( const OString
&sEncoding
)
352 : m_convText2Unicode(NULL
)
353 , m_contextText2Unicode(NULL
)
354 , m_rtlEncoding(RTL_TEXTENCODING_DONTKNOW
)
356 rtl_TextEncoding encoding
= rtl_getTextEncodingFromMimeCharset( sEncoding
.getStr() );
357 if( RTL_TEXTENCODING_DONTKNOW
== encoding
)
359 m_bCanContinue
= false;
360 m_bInitialized
= false;
368 Text2UnicodeConverter::~Text2UnicodeConverter()
372 rtl_destroyTextToUnicodeContext( m_convText2Unicode
, m_contextText2Unicode
);
373 rtl_destroyUnicodeToTextConverter( m_convText2Unicode
);
377 void Text2UnicodeConverter::init( rtl_TextEncoding encoding
)
379 m_bCanContinue
= true;
380 m_bInitialized
= true;
382 m_convText2Unicode
= rtl_createTextToUnicodeConverter(encoding
);
383 m_contextText2Unicode
= rtl_createTextToUnicodeContext( m_convText2Unicode
);
384 m_rtlEncoding
= encoding
;
388 Sequence
<sal_Unicode
> Text2UnicodeConverter::convert( const Sequence
<sal_Int8
> &seqText
)
391 sal_Size nSrcCvtBytes
= 0;
392 sal_Size nTargetCount
= 0;
393 sal_Size nSourceCount
= 0;
395 // the whole source size
396 sal_Int32 nSourceSize
= seqText
.getLength() + m_seqSource
.getLength();
397 Sequence
<sal_Unicode
> seqUnicode ( nSourceSize
);
399 const sal_Int8
*pbSource
= seqText
.getConstArray();
400 boost::scoped_array
<sal_Int8
> pbTempMem
;
402 if( m_seqSource
.getLength() ) {
403 // put old rest and new byte sequence into one array
404 pbTempMem
.reset(new sal_Int8
[ nSourceSize
]);
405 memcpy( pbTempMem
.get() , m_seqSource
.getConstArray() , m_seqSource
.getLength() );
406 memcpy( &(pbTempMem
[ m_seqSource
.getLength() ]) , seqText
.getConstArray() , seqText
.getLength() );
407 pbSource
= pbTempMem
.get();
410 m_seqSource
= Sequence
< sal_Int8
>();
415 /* All invalid characters are transformed to the unicode undefined char */
416 nTargetCount
+= rtl_convertTextToUnicode(
418 m_contextText2Unicode
,
419 reinterpret_cast<const char *>(&( pbSource
[nSourceCount
] )),
420 nSourceSize
- nSourceCount
,
421 &( seqUnicode
.getArray()[ nTargetCount
] ),
422 seqUnicode
.getLength() - nTargetCount
,
423 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT
|
424 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT
|
425 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
,
428 nSourceCount
+= nSrcCvtBytes
;
430 if( uiInfo
& RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
) {
431 // save necessary bytes for next conversion
432 seqUnicode
.realloc( seqUnicode
.getLength() * 2 );
437 if( uiInfo
& RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
) {
438 m_seqSource
.realloc( nSourceSize
- nSourceCount
);
439 memcpy( m_seqSource
.getArray() , &(pbSource
[nSourceCount
]) , nSourceSize
-nSourceCount
);
442 // set to correct unicode size
443 seqUnicode
.realloc( nTargetCount
);
452 // Unicode2TextConverter
455 Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding
)
461 Unicode2TextConverter::~Unicode2TextConverter()
463 if( m_bInitialized
) {
464 rtl_destroyUnicodeToTextContext( m_convUnicode2Text
, m_contextUnicode2Text
);
465 rtl_destroyUnicodeToTextConverter( m_convUnicode2Text
);
470 Sequence
<sal_Int8
> Unicode2TextConverter::convert(const sal_Unicode
*puSource
, sal_Int32 nSourceSize
)
472 boost::scoped_array
<sal_Unicode
> puTempMem
;
474 if( m_seqSource
.getLength() ) {
476 // put old rest and new byte sequence into one array
477 // In general when surrogates are used, they should be rarely
478 // cut off between two convert()-calls. So this code is used
479 // rarely and the extra copy is acceptable.
480 puTempMem
.reset(new sal_Unicode
[ nSourceSize
+ m_seqSource
.getLength()]);
481 memcpy( puTempMem
.get() ,
482 m_seqSource
.getConstArray() ,
483 m_seqSource
.getLength() * sizeof( sal_Unicode
) );
485 &(puTempMem
[ m_seqSource
.getLength() ]) ,
487 nSourceSize
*sizeof( sal_Unicode
) );
488 puSource
= puTempMem
.get();
489 nSourceSize
+= m_seqSource
.getLength();
491 m_seqSource
= Sequence
< sal_Unicode
> ();
495 sal_Size nTargetCount
= 0;
496 sal_Size nSourceCount
= 0;
499 sal_Size nSrcCvtChars
;
501 // take nSourceSize * 3 as preference
502 // this is an upper boundary for converting to utf8,
503 // which most often used as the target.
504 sal_Int32 nSeqSize
= nSourceSize
* 3;
506 Sequence
<sal_Int8
> seqText( nSeqSize
);
507 sal_Char
*pTarget
= reinterpret_cast<char *>(seqText
.getArray());
510 nTargetCount
+= rtl_convertUnicodeToText(
512 m_contextUnicode2Text
,
513 &( puSource
[nSourceCount
] ),
514 nSourceSize
- nSourceCount
,
515 &( pTarget
[nTargetCount
] ),
516 nSeqSize
- nTargetCount
,
517 RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT
|
518 RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT
,
521 nSourceCount
+= nSrcCvtChars
;
523 if( uiInfo
& RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
) {
524 nSeqSize
= nSeqSize
*2;
525 seqText
.realloc( nSeqSize
); // double array size
526 pTarget
= reinterpret_cast<char *>(seqText
.getArray());
533 if( uiInfo
& RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL
) {
534 m_seqSource
.realloc( nSourceSize
- nSourceCount
);
535 memcpy( m_seqSource
.getArray() ,
536 &(puSource
[nSourceCount
]),
537 (nSourceSize
- nSourceCount
) * sizeof( sal_Unicode
) );
540 // reduce the size of the buffer (fast, no copy necessary)
541 seqText
.realloc( nTargetCount
);
546 void Unicode2TextConverter::init( rtl_TextEncoding encoding
)
548 m_bCanContinue
= true;
549 m_bInitialized
= true;
551 m_convUnicode2Text
= rtl_createUnicodeToTextConverter( encoding
);
552 m_contextUnicode2Text
= rtl_createUnicodeToTextContext( m_convUnicode2Text
);
553 m_rtlEncoding
= encoding
;
559 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */