Update ooo320-m1
[ooovba.git] / sax / source / expatwrap / xml2utf.cxx
blob1c24a6c0c4250a86e239237fa084fe1c1e17898a
1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: xml2utf.cxx,v $
10 * $Revision: 1.11.10.1 $
12 * This file is part of OpenOffice.org.
14 * OpenOffice.org is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License version 3
16 * only, as published by the Free Software Foundation.
18 * OpenOffice.org is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU Lesser General Public License version 3 for more details
22 * (a copy is included in the LICENSE file that accompanied this code).
24 * You should have received a copy of the GNU Lesser General Public License
25 * version 3 along with OpenOffice.org. If not, see
26 * <http://www.openoffice.org/license.html>
27 * for a copy of the LGPLv3 License.
29 ************************************************************************/
30 #include <string.h>
32 #include <sal/types.h>
34 #include <rtl/textenc.h>
35 #include <rtl/tencinfo.h>
38 #include <com/sun/star/io/XInputStream.hpp>
40 using namespace rtl;
41 using namespace ::com::sun::star::uno;
42 using namespace ::com::sun::star::io;
44 #include "xml2utf.hxx"
46 namespace sax_expatwrap {
48 sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
49 throw ( IOException, NotConnectedException , BufferSizeExceededException , RuntimeException )
52 Sequence<sal_Int8> seqIn;
54 if( ! m_in.is() ) {
55 throw NotConnectedException();
57 if( ! m_bStarted ) {
58 nMaxToRead = Max( 512 , nMaxToRead ); // it should be possible to find the encoding attribute
59 // within the first 512 bytes == 128 chars in UCS-4
62 sal_Int32 nRead;
63 Sequence< sal_Int8 > seqStart;
64 while( sal_True )
66 nRead = m_in->readSomeBytes( seq , nMaxToRead );
68 if( nRead + seqStart.getLength())
70 // if nRead is 0, the file is already eof.
71 if( ! m_bStarted && nRead )
73 // ensure that enough data is available to parse encoding
74 if( seqStart.getLength() )
76 // prefix with what we had so far.
77 sal_Int32 nLength = seq.getLength();
78 seq.realloc( seqStart.getLength() + nLength );
80 memmove (seq.getArray() + seqStart.getLength(),
81 seq.getConstArray(),
82 nLength);
83 memcpy (seq.getArray(),
84 seqStart.getConstArray(),
85 seqStart.getLength());
88 // autodetection with the first bytes
89 if( ! isEncodingRecognizable( seq ) )
91 // remember what we have so far.
92 seqStart = seq;
94 // read more !
95 continue;
97 if( scanForEncoding( seq ) || m_sEncoding.getLength() ) {
98 // initialize decoding
99 initializeDecoding();
101 nRead = seq.getLength();
102 seqStart = Sequence < sal_Int8 > ();
105 // do the encoding
106 if( m_pText2Unicode && m_pUnicode2Text &&
107 m_pText2Unicode->canContinue() && m_pUnicode2Text->canContinue() ) {
109 Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
110 seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() );
113 if( ! m_bStarted )
115 // it must now be ensured, that no encoding attribute exist anymore
116 // ( otherwise the expat-Parser will crash )
117 // This must be done after decoding !
118 // ( e.g. Files decoded in ucs-4 cannot be read properly )
119 m_bStarted = sal_True;
120 removeEncoding( seq );
122 nRead = seq.getLength();
125 break;
127 return nRead;
131 XMLFile2UTFConverter::~XMLFile2UTFConverter()
133 if( m_pText2Unicode )
134 delete m_pText2Unicode;
135 if( m_pUnicode2Text )
136 delete m_pUnicode2Text;
140 void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
142 const sal_Int8 *pSource = seq.getArray();
143 if( ! strncmp( (const char * ) pSource , "<?xml" , 4) )
146 // scan for encoding
147 OString str( (sal_Char * ) pSource , seq.getLength() );
149 // cut sequence to first line break
150 // find first line break;
151 int nMax = str.indexOf( 10 );
152 if( nMax >= 0 )
154 str = str.copy( 0 , nMax );
157 int nFound = str.indexOf( " encoding" );
158 if( nFound >= 0 ) {
159 int nStop;
160 int nStart = str.indexOf( "\"" , nFound );
161 if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
163 nStart = str.indexOf( "'" , nFound );
164 nStop = str.indexOf( "'" , nStart +1 );
166 else
168 nStop = str.indexOf( "\"" , nStart +1);
171 if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
173 // remove encoding tag from file
174 memmove( &( seq.getArray()[nFound] ) ,
175 &( seq.getArray()[nStop+1]) ,
176 seq.getLength() - nStop -1);
177 seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
178 // str = String( (char * ) seq.getArray() , seq.getLen() );
184 // Checks, if enough data has been accumulated to recognize the encoding
185 sal_Bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
187 const sal_Int8 *pSource = seq.getConstArray();
188 sal_Bool bCheckIfFirstClosingBracketExsists = sal_False;
190 if( seq.getLength() < 8 ) {
191 // no recognition possible, when less than 8 bytes are available
192 return sal_False;
195 if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
196 // scan if the <?xml tag finishes within this buffer
197 bCheckIfFirstClosingBracketExsists = sal_True;
199 else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
200 ( ('?' == pSource[4] || '?' == pSource[6] ) ) )
202 // check for utf-16
203 bCheckIfFirstClosingBracketExsists = sal_True;
205 else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
206 ( '?' == pSource[5] || '?' == pSource[7] ) )
208 // check for
209 bCheckIfFirstClosingBracketExsists = sal_True;
212 if( bCheckIfFirstClosingBracketExsists )
214 for( sal_Int32 i = 0; i < seq.getLength() ; i ++ )
216 // whole <?xml tag is valid
217 if( '>' == pSource[ i ] )
219 return sal_True;
222 return sal_False;
225 // No <? tag in front, no need for a bigger buffer
226 return sal_True;
229 sal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
231 const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
232 sal_Bool bReturn = sal_True;
234 if( seq.getLength() < 4 ) {
235 // no recognition possible, when less than 4 bytes are available
236 return sal_False;
239 // first level : detect possible file formats
240 if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
242 // scan for encoding
243 OString str( (const sal_Char *) pSource , seq.getLength() );
245 // cut sequence to first line break
246 //find first line break;
247 int nMax = str.indexOf( 10 );
248 if( nMax >= 0 )
250 str = str.copy( 0 , nMax );
253 int nFound = str.indexOf( " encoding" );
254 if( nFound < str.getLength() ) {
255 int nStop;
256 int nStart = str.indexOf( "\"" , nFound );
257 if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
259 nStart = str.indexOf( "'" , nFound );
260 nStop = str.indexOf( "'" , nStart +1 );
262 else
264 nStop = str.indexOf( "\"" , nStart +1);
266 if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
268 // encoding found finally
269 m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
273 else if( 0xFE == pSource[0] &&
274 0xFF == pSource[1] ) {
275 // UTF-16 big endian
276 // conversion is done so that encoding information can be easily extracted
277 m_sEncoding = "utf-16";
279 else if( 0xFF == pSource[0] &&
280 0xFE == pSource[1] ) {
281 // UTF-16 little endian
282 // conversion is done so that encoding information can be easily extracted
283 m_sEncoding = "utf-16";
285 else if( 0x00 == pSource[0] && 0x3c == pSource[1] && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
286 // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
287 // The byte order mark is simply added
289 // simply add the byte order mark !
290 seq.realloc( seq.getLength() + 2 );
291 memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
292 ((sal_uInt8*)seq.getArray())[0] = 0xFE;
293 ((sal_uInt8*)seq.getArray())[1] = 0xFF;
295 m_sEncoding = "utf-16";
297 else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
298 // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
299 // The byte order mark is simply added
301 seq.realloc( seq.getLength() + 2 );
302 memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
303 ((sal_uInt8*)seq.getArray())[0] = 0xFF;
304 ((sal_uInt8*)seq.getArray())[1] = 0xFE;
306 m_sEncoding = "utf-16";
308 else if( 0xEF == pSource[0] &&
309 0xBB == pSource[1] &&
310 0xBF == pSource[2] )
312 // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
313 // The BOM is removed.
314 memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
315 seq.realloc( seq.getLength() - 3 );
316 m_sEncoding = "utf-8";
318 else if( 0x00 == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
319 // UCS-4 big endian
320 m_sEncoding = "ucs-4";
322 else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
323 // UCS-4 little endian
324 m_sEncoding = "ucs-4";
326 else if( 0x4c == pSource[0] && 0x6f == pSource[1] &&
327 0xa7 == static_cast<unsigned char> (pSource[2]) &&
328 0x94 == static_cast<unsigned char> (pSource[3]) ) {
329 // EBCDIC
330 bReturn = sal_False; // must be extended
332 else {
333 // other
334 // UTF8 is directly recognized by the parser.
335 bReturn = sal_False;
338 return bReturn;
341 void XMLFile2UTFConverter::initializeDecoding()
344 if( m_sEncoding.getLength() )
346 rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
347 if( encoding != RTL_TEXTENCODING_UTF8 )
349 m_pText2Unicode = new Text2UnicodeConverter( m_sEncoding );
350 m_pUnicode2Text = new Unicode2TextConverter( RTL_TEXTENCODING_UTF8 );
356 //----------------------------------------------
358 // Text2UnicodeConverter
360 //----------------------------------------------
361 Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
363 rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
364 if( RTL_TEXTENCODING_DONTKNOW == encoding )
366 m_bCanContinue = sal_False;
367 m_bInitialized = sal_False;
369 else
371 init( encoding );
375 Text2UnicodeConverter::~Text2UnicodeConverter()
377 if( m_bInitialized )
379 rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
380 rtl_destroyUnicodeToTextConverter( m_convText2Unicode );
384 void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
386 m_bCanContinue = sal_True;
387 m_bInitialized = sal_True;
389 m_convText2Unicode = rtl_createTextToUnicodeConverter(encoding);
390 m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );
391 m_rtlEncoding = encoding;
395 Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
397 sal_uInt32 uiInfo;
398 sal_Size nSrcCvtBytes = 0;
399 sal_Size nTargetCount = 0;
400 sal_Size nSourceCount = 0;
402 // the whole source size
403 sal_Int32 nSourceSize = seqText.getLength() + m_seqSource.getLength();
404 Sequence<sal_Unicode> seqUnicode ( nSourceSize );
406 const sal_Int8 *pbSource = seqText.getConstArray();
407 sal_Int8 *pbTempMem = 0;
409 if( m_seqSource.getLength() ) {
410 // put old rest and new byte sequence into one array
411 pbTempMem = new sal_Int8[ nSourceSize ];
412 memcpy( pbTempMem , m_seqSource.getConstArray() , m_seqSource.getLength() );
413 memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() );
414 pbSource = pbTempMem;
416 // set to zero again
417 m_seqSource = Sequence< sal_Int8 >();
420 while( sal_True ) {
422 /* All invalid characters are transformed to the unicode undefined char */
423 nTargetCount += rtl_convertTextToUnicode(
424 m_convText2Unicode,
425 m_contextText2Unicode,
426 ( const sal_Char * ) &( pbSource[nSourceCount] ),
427 nSourceSize - nSourceCount ,
428 &( seqUnicode.getArray()[ nTargetCount ] ),
429 seqUnicode.getLength() - nTargetCount,
430 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
431 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
432 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
433 &uiInfo,
434 &nSrcCvtBytes );
435 nSourceCount += nSrcCvtBytes;
437 if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ) {
438 // save necessary bytes for next conversion
439 seqUnicode.realloc( seqUnicode.getLength() * 2 );
440 continue;
442 break;
444 if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ) {
445 m_seqSource.realloc( nSourceSize - nSourceCount );
446 memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount );
450 if( pbTempMem ) {
451 delete [] pbTempMem;
454 // set to correct unicode size
455 seqUnicode.realloc( nTargetCount );
457 return seqUnicode;
462 //----------------------------------------------
464 // Unicode2TextConverter
466 //----------------------------------------------
467 Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
469 init( encoding );
473 Unicode2TextConverter::~Unicode2TextConverter()
475 if( m_bInitialized ) {
476 rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );
477 rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
482 Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
484 sal_Unicode *puTempMem = 0;
486 if( m_seqSource.getLength() ) {
487 // For surrogates !
488 // put old rest and new byte sequence into one array
489 // In general when surrogates are used, they should be rarely
490 // cut off between two convert()-calls. So this code is used
491 // rarely and the extra copy is acceptable.
492 puTempMem = new sal_Unicode[ nSourceSize + m_seqSource.getLength()];
493 memcpy( puTempMem ,
494 m_seqSource.getConstArray() ,
495 m_seqSource.getLength() * sizeof( sal_Unicode ) );
496 memcpy(
497 &(puTempMem[ m_seqSource.getLength() ]) ,
498 puSource ,
499 nSourceSize*sizeof( sal_Unicode ) );
500 puSource = puTempMem;
501 nSourceSize += m_seqSource.getLength();
503 m_seqSource = Sequence< sal_Unicode > ();
507 sal_Size nTargetCount = 0;
508 sal_Size nSourceCount = 0;
510 sal_uInt32 uiInfo;
511 sal_Size nSrcCvtChars;
513 // take nSourceSize * 3 as preference
514 // this is an upper boundary for converting to utf8,
515 // which most often used as the target.
516 sal_Int32 nSeqSize = nSourceSize * 3;
518 Sequence<sal_Int8> seqText( nSeqSize );
519 sal_Char *pTarget = (sal_Char *) seqText.getArray();
520 while( sal_True ) {
522 nTargetCount += rtl_convertUnicodeToText(
523 m_convUnicode2Text,
524 m_contextUnicode2Text,
525 &( puSource[nSourceCount] ),
526 nSourceSize - nSourceCount ,
527 &( pTarget[nTargetCount] ),
528 nSeqSize - nTargetCount,
529 RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
530 RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
531 &uiInfo,
532 &nSrcCvtChars);
533 nSourceCount += nSrcCvtChars;
535 if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) {
536 nSeqSize = nSeqSize *2;
537 seqText.realloc( nSeqSize ); // double array size
538 pTarget = ( sal_Char * ) seqText.getArray();
539 continue;
541 break;
544 // for surrogates
545 if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
546 m_seqSource.realloc( nSourceSize - nSourceCount );
547 memcpy( m_seqSource.getArray() ,
548 &(puSource[nSourceCount]),
549 (nSourceSize - nSourceCount) * sizeof( sal_Unicode ) );
552 if( puTempMem ) {
553 delete [] puTempMem;
556 // reduce the size of the buffer (fast, no copy necessary)
557 seqText.realloc( nTargetCount );
559 return seqText;
562 void Unicode2TextConverter::init( rtl_TextEncoding encoding )
564 m_bCanContinue = sal_True;
565 m_bInitialized = sal_True;
567 m_convUnicode2Text = rtl_createUnicodeToTextConverter( encoding );
568 m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );
569 m_rtlEncoding = encoding;