Bump version to 5.0-14
[LibreOffice.git] / sax / source / expatwrap / xml2utf.cxx
blobaca05ed937d893a45a8a869a0371fc1822966e62
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
19 #include <string.h>
21 #include <algorithm>
23 #include <sal/types.h>
25 #include <rtl/textenc.h>
26 #include <rtl/tencinfo.h>
28 #include <com/sun/star/io/XInputStream.hpp>
30 using namespace ::com::sun::star::uno;
31 using namespace ::com::sun::star::io;
34 #include "xml2utf.hxx"
35 #include <boost/scoped_array.hpp>
37 namespace sax_expatwrap {
39 sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
40 throw ( IOException, NotConnectedException , BufferSizeExceededException , RuntimeException )
42 if( ! m_in.is() ) {
43 throw NotConnectedException();
45 if( ! m_bStarted ) {
46 // it should be possible to find the encoding attribute
47 // within the first 512 bytes == 128 chars in UCS-4
48 nMaxToRead = ::std::max( sal_Int32(512) , nMaxToRead );
51 sal_Int32 nRead;
52 Sequence< sal_Int8 > seqStart;
53 while( true )
55 nRead = m_in->readSomeBytes( seq , nMaxToRead );
57 if( nRead + seqStart.getLength())
59 // if nRead is 0, the file is already eof.
60 if( ! m_bStarted && nRead )
62 // ensure that enough data is available to parse encoding
63 if( seqStart.getLength() )
65 // prefix with what we had so far.
66 sal_Int32 nLength = seq.getLength();
67 seq.realloc( seqStart.getLength() + nLength );
69 memmove (seq.getArray() + seqStart.getLength(),
70 seq.getConstArray(),
71 nLength);
72 memcpy (seq.getArray(),
73 seqStart.getConstArray(),
74 seqStart.getLength());
77 // autodetection with the first bytes
78 if( ! isEncodingRecognizable( seq ) )
80 // remember what we have so far.
81 seqStart = seq;
83 // read more !
84 continue;
86 if( scanForEncoding( seq ) || !m_sEncoding.isEmpty() ) {
87 // initialize decoding
88 initializeDecoding();
90 seqStart = Sequence < sal_Int8 > ();
93 // do the encoding
94 if( m_pText2Unicode && m_pUnicode2Text &&
95 m_pText2Unicode->canContinue() && m_pUnicode2Text->canContinue() ) {
97 Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
98 seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() );
101 if( ! m_bStarted )
103 // it must now be ensured, that no encoding attribute exist anymore
104 // ( otherwise the expat-Parser will crash )
105 // This must be done after decoding !
106 // ( e.g. Files decoded in ucs-4 cannot be read properly )
107 m_bStarted = true;
108 removeEncoding( seq );
110 nRead = seq.getLength();
113 break;
115 return nRead;
119 XMLFile2UTFConverter::~XMLFile2UTFConverter()
121 if( m_pText2Unicode )
122 delete m_pText2Unicode;
123 if( m_pUnicode2Text )
124 delete m_pUnicode2Text;
128 void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
130 const sal_Int8 *pSource = seq.getArray();
131 if( ! strncmp( reinterpret_cast<const char *>(pSource), "<?xml", 4) )
134 // scan for encoding
135 OString str( reinterpret_cast<char const *>(pSource), seq.getLength() );
137 // cut sequence to first line break
138 // find first line break;
139 int nMax = str.indexOf( 10 );
140 if( nMax >= 0 )
142 str = str.copy( 0 , nMax );
145 int nFound = str.indexOf( " encoding" );
146 if( nFound >= 0 ) {
147 int nStop;
148 int nStart = str.indexOf( "\"" , nFound );
149 if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
151 nStart = str.indexOf( "'" , nFound );
152 nStop = str.indexOf( "'" , nStart +1 );
154 else
156 nStop = str.indexOf( "\"" , nStart +1);
159 if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
161 // remove encoding tag from file
162 memmove( &( seq.getArray()[nFound] ) ,
163 &( seq.getArray()[nStop+1]) ,
164 seq.getLength() - nStop -1);
165 seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
166 // str = String( (char * ) seq.getArray() , seq.getLen() );
172 // Checks, if enough data has been accumulated to recognize the encoding
173 bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
175 const sal_Int8 *pSource = seq.getConstArray();
176 bool bCheckIfFirstClosingBracketExsists = false;
178 if( seq.getLength() < 8 ) {
179 // no recognition possible, when less than 8 bytes are available
180 return false;
183 if( ! strncmp( reinterpret_cast<const char *>(pSource), "<?xml", 4 ) ) {
184 // scan if the <?xml tag finishes within this buffer
185 bCheckIfFirstClosingBracketExsists = true;
187 else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
188 ( ('?' == pSource[4] || '?' == pSource[6] ) ) )
190 // check for utf-16
191 bCheckIfFirstClosingBracketExsists = true;
193 else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
194 ( '?' == pSource[5] || '?' == pSource[7] ) )
196 // check for
197 bCheckIfFirstClosingBracketExsists = true;
200 if( bCheckIfFirstClosingBracketExsists )
202 for( sal_Int32 i = 0; i < seq.getLength() ; i ++ )
204 // whole <?xml tag is valid
205 if( '>' == pSource[ i ] )
207 return true;
210 return false;
213 // No <? tag in front, no need for a bigger buffer
214 return true;
217 bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
219 const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
220 bool bReturn = true;
222 if( seq.getLength() < 4 ) {
223 // no recognition possible, when less than 4 bytes are available
224 return false;
227 // first level : detect possible file formats
228 if( ! strncmp( reinterpret_cast<const char *>(pSource), "<?xml", 4 ) ) {
230 // scan for encoding
231 OString str( reinterpret_cast<const char *>(pSource), seq.getLength() );
233 // cut sequence to first line break
234 //find first line break;
235 int nMax = str.indexOf( 10 );
236 if( nMax >= 0 )
238 str = str.copy( 0 , nMax );
241 int nFound = str.indexOf( " encoding" );
242 if( nFound >= 0 ) {
243 int nStop;
244 int nStart = str.indexOf( "\"" , nFound );
245 if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
247 nStart = str.indexOf( "'" , nFound );
248 nStop = str.indexOf( "'" , nStart +1 );
250 else
252 nStop = str.indexOf( "\"" , nStart +1);
254 if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
256 // encoding found finally
257 m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
261 else if( 0xFE == pSource[0] &&
262 0xFF == pSource[1] ) {
263 // UTF-16 big endian
264 // conversion is done so that encoding information can be easily extracted
265 m_sEncoding = "utf-16";
267 else if( 0xFF == pSource[0] &&
268 0xFE == pSource[1] ) {
269 // UTF-16 little endian
270 // conversion is done so that encoding information can be easily extracted
271 m_sEncoding = "utf-16";
273 else if( 0x00 == pSource[0] && 0x3c == pSource[1] && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
274 // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
275 // The byte order mark is simply added
277 // simply add the byte order mark !
278 seq.realloc( seq.getLength() + 2 );
279 memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
280 reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFE;
281 reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFF;
283 m_sEncoding = "utf-16";
285 else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
286 // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
287 // The byte order mark is simply added
289 seq.realloc( seq.getLength() + 2 );
290 memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
291 reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFF;
292 reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFE;
294 m_sEncoding = "utf-16";
296 else if( 0xEF == pSource[0] &&
297 0xBB == pSource[1] &&
298 0xBF == pSource[2] )
300 // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
301 // The BOM is removed.
302 memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
303 seq.realloc( seq.getLength() - 3 );
304 m_sEncoding = "utf-8";
306 else if( 0x00 == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
307 // UCS-4 big endian
308 m_sEncoding = "ucs-4";
310 else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
311 // UCS-4 little endian
312 m_sEncoding = "ucs-4";
314 /* TODO: no need to test for the moment since we return sal_False like default case anyway
315 else if( 0x4c == pSource[0] && 0x6f == pSource[1] &&
316 0xa7 == static_cast<unsigned char> (pSource[2]) &&
317 0x94 == static_cast<unsigned char> (pSource[3]) ) {
318 // EBCDIC
319 bReturn = sal_False; // must be extended
322 else {
323 // other
324 // UTF8 is directly recognized by the parser.
325 bReturn = false;
328 return bReturn;
331 void XMLFile2UTFConverter::initializeDecoding()
334 if( !m_sEncoding.isEmpty() )
336 rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
337 if( encoding != RTL_TEXTENCODING_UTF8 )
339 m_pText2Unicode = new Text2UnicodeConverter( m_sEncoding );
340 m_pUnicode2Text = new Unicode2TextConverter( RTL_TEXTENCODING_UTF8 );
348 // Text2UnicodeConverter
351 Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
352 : m_convText2Unicode(NULL)
353 , m_contextText2Unicode(NULL)
354 , m_rtlEncoding(RTL_TEXTENCODING_DONTKNOW)
356 rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
357 if( RTL_TEXTENCODING_DONTKNOW == encoding )
359 m_bCanContinue = false;
360 m_bInitialized = false;
362 else
364 init( encoding );
368 Text2UnicodeConverter::~Text2UnicodeConverter()
370 if( m_bInitialized )
372 rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
373 rtl_destroyUnicodeToTextConverter( m_convText2Unicode );
377 void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
379 m_bCanContinue = true;
380 m_bInitialized = true;
382 m_convText2Unicode = rtl_createTextToUnicodeConverter(encoding);
383 m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );
384 m_rtlEncoding = encoding;
388 Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
390 sal_uInt32 uiInfo;
391 sal_Size nSrcCvtBytes = 0;
392 sal_Size nTargetCount = 0;
393 sal_Size nSourceCount = 0;
395 // the whole source size
396 sal_Int32 nSourceSize = seqText.getLength() + m_seqSource.getLength();
397 Sequence<sal_Unicode> seqUnicode ( nSourceSize );
399 const sal_Int8 *pbSource = seqText.getConstArray();
400 boost::scoped_array<sal_Int8> pbTempMem;
402 if( m_seqSource.getLength() ) {
403 // put old rest and new byte sequence into one array
404 pbTempMem.reset(new sal_Int8[ nSourceSize ]);
405 memcpy( pbTempMem.get() , m_seqSource.getConstArray() , m_seqSource.getLength() );
406 memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() );
407 pbSource = pbTempMem.get();
409 // set to zero again
410 m_seqSource = Sequence< sal_Int8 >();
413 while( true ) {
415 /* All invalid characters are transformed to the unicode undefined char */
416 nTargetCount += rtl_convertTextToUnicode(
417 m_convText2Unicode,
418 m_contextText2Unicode,
419 reinterpret_cast<const char *>(&( pbSource[nSourceCount] )),
420 nSourceSize - nSourceCount ,
421 &( seqUnicode.getArray()[ nTargetCount ] ),
422 seqUnicode.getLength() - nTargetCount,
423 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
424 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
425 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
426 &uiInfo,
427 &nSrcCvtBytes );
428 nSourceCount += nSrcCvtBytes;
430 if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ) {
431 // save necessary bytes for next conversion
432 seqUnicode.realloc( seqUnicode.getLength() * 2 );
433 continue;
435 break;
437 if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ) {
438 m_seqSource.realloc( nSourceSize - nSourceCount );
439 memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount );
442 // set to correct unicode size
443 seqUnicode.realloc( nTargetCount );
445 return seqUnicode;
452 // Unicode2TextConverter
455 Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
457 init( encoding );
461 Unicode2TextConverter::~Unicode2TextConverter()
463 if( m_bInitialized ) {
464 rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );
465 rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
470 Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
472 boost::scoped_array<sal_Unicode> puTempMem;
474 if( m_seqSource.getLength() ) {
475 // For surrogates !
476 // put old rest and new byte sequence into one array
477 // In general when surrogates are used, they should be rarely
478 // cut off between two convert()-calls. So this code is used
479 // rarely and the extra copy is acceptable.
480 puTempMem.reset(new sal_Unicode[ nSourceSize + m_seqSource.getLength()]);
481 memcpy( puTempMem.get() ,
482 m_seqSource.getConstArray() ,
483 m_seqSource.getLength() * sizeof( sal_Unicode ) );
484 memcpy(
485 &(puTempMem[ m_seqSource.getLength() ]) ,
486 puSource ,
487 nSourceSize*sizeof( sal_Unicode ) );
488 puSource = puTempMem.get();
489 nSourceSize += m_seqSource.getLength();
491 m_seqSource = Sequence< sal_Unicode > ();
495 sal_Size nTargetCount = 0;
496 sal_Size nSourceCount = 0;
498 sal_uInt32 uiInfo;
499 sal_Size nSrcCvtChars;
501 // take nSourceSize * 3 as preference
502 // this is an upper boundary for converting to utf8,
503 // which most often used as the target.
504 sal_Int32 nSeqSize = nSourceSize * 3;
506 Sequence<sal_Int8> seqText( nSeqSize );
507 sal_Char *pTarget = reinterpret_cast<char *>(seqText.getArray());
508 while( true ) {
510 nTargetCount += rtl_convertUnicodeToText(
511 m_convUnicode2Text,
512 m_contextUnicode2Text,
513 &( puSource[nSourceCount] ),
514 nSourceSize - nSourceCount ,
515 &( pTarget[nTargetCount] ),
516 nSeqSize - nTargetCount,
517 RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
518 RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
519 &uiInfo,
520 &nSrcCvtChars);
521 nSourceCount += nSrcCvtChars;
523 if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) {
524 nSeqSize = nSeqSize *2;
525 seqText.realloc( nSeqSize ); // double array size
526 pTarget = reinterpret_cast<char *>(seqText.getArray());
527 continue;
529 break;
532 // for surrogates
533 if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
534 m_seqSource.realloc( nSourceSize - nSourceCount );
535 memcpy( m_seqSource.getArray() ,
536 &(puSource[nSourceCount]),
537 (nSourceSize - nSourceCount) * sizeof( sal_Unicode ) );
540 // reduce the size of the buffer (fast, no copy necessary)
541 seqText.realloc( nTargetCount );
543 return seqText;
546 void Unicode2TextConverter::init( rtl_TextEncoding encoding )
548 m_bCanContinue = true;
549 m_bInitialized = true;
551 m_convUnicode2Text = rtl_createUnicodeToTextConverter( encoding );
552 m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );
553 m_rtlEncoding = encoding;
559 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */