1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: xml2utf.cxx,v $
10 * $Revision: 1.11.10.1 $
12 * This file is part of OpenOffice.org.
14 * OpenOffice.org is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License version 3
16 * only, as published by the Free Software Foundation.
18 * OpenOffice.org is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU Lesser General Public License version 3 for more details
22 * (a copy is included in the LICENSE file that accompanied this code).
24 * You should have received a copy of the GNU Lesser General Public License
25 * version 3 along with OpenOffice.org. If not, see
26 * <http://www.openoffice.org/license.html>
27 * for a copy of the LGPLv3 License.
29 ************************************************************************/
32 #include <sal/types.h>
34 #include <rtl/textenc.h>
35 #include <rtl/tencinfo.h>
38 #include <com/sun/star/io/XInputStream.hpp>
41 using namespace ::com::sun::star::uno
;
42 using namespace ::com::sun::star::io
;
44 #include "xml2utf.hxx"
46 namespace sax_expatwrap
{
// Reads the next chunk from the input stream m_in into seq and, when both
// converters exist and can continue, re-encodes the chunk by passing it
// through m_pText2Unicode and then m_pUnicode2Text.
// For the first data (m_bStarted not yet set) the chunk is additionally
// examined with isEncodingRecognizable() / scanForEncoding() and handed to
// removeEncoding().
//
// seq        - receives the bytes read; replaced by the converted bytes when
//              a conversion takes place
// nMaxToRead - number of bytes requested from the stream; raised to at least
//              512 by the Max() call below (any condition around that call
//              is not visible in this listing)
//
// NOTE(review): the embedded source line numbers skip in several places, so
// some statements of this function (for example whatever guards the throw
// below, the declaration of nRead and the return statement) are not part of
// this listing.
48 sal_Int32
XMLFile2UTFConverter::readAndConvert( Sequence
<sal_Int8
> &seq
, sal_Int32 nMaxToRead
)
49 throw ( IOException
, NotConnectedException
, BufferSizeExceededException
, RuntimeException
)
52 Sequence
<sal_Int8
> seqIn
;
55 throw NotConnectedException();
58 nMaxToRead
= Max( 512 , nMaxToRead
); // it should be possible to find the encoding attribute
59 // within the first 512 bytes == 128 chars in UCS-4
// seqStart holds bytes that were read earlier but not yet handed on
63 Sequence
< sal_Int8
> seqStart
;
66 nRead
= m_in
->readSomeBytes( seq
, nMaxToRead
);
// proceed only if there is new data or data remembered in seqStart
68 if( nRead
+ seqStart
.getLength())
70 // if nRead is 0, the file is already at eof.
71 if( ! m_bStarted
&& nRead
)
73 // ensure that enough data is available to parse the encoding
74 if( seqStart
.getLength() )
76 // prefix the new bytes with what we had so far.
77 sal_Int32 nLength
= seq
.getLength();
78 seq
.realloc( seqStart
.getLength() + nLength
);
// shift the freshly read bytes back to make room at the front ...
80 memmove (seq
.getArray() + seqStart
.getLength(),
// ... and copy the remembered bytes into that room
83 memcpy (seq
.getArray(),
84 seqStart
.getConstArray(),
85 seqStart
.getLength());
88 // autodetection with the first bytes
89 if( ! isEncodingRecognizable( seq
) )
91 // remember what we have so far.
97 if( scanForEncoding( seq
) || m_sEncoding
.getLength() ) {
98 // initialize decoding
// scanForEncoding() may have changed the length of seq
101 nRead
= seq
.getLength();
102 seqStart
= Sequence
< sal_Int8
> ();
// convert only when both converters exist and both report canContinue()
106 if( m_pText2Unicode
&& m_pUnicode2Text
&&
107 m_pText2Unicode
->canContinue() && m_pUnicode2Text
->canContinue() ) {
// source encoding -> Unicode -> target encoding of m_pUnicode2Text
109 Sequence
<sal_Unicode
> seqUnicode
= m_pText2Unicode
->convert( seq
);
110 seq
= m_pUnicode2Text
->convert( seqUnicode
.getConstArray(), seqUnicode
.getLength() );
115 // it must now be ensured that no encoding attribute exists anymore
116 // ( otherwise the expat parser will crash )
117 // This must be done after decoding !
118 // ( e.g. files encoded in ucs-4 cannot be read properly )
119 m_bStarted
= sal_True
;
120 removeEncoding( seq
);
// the conversion and removeEncoding() may have changed the length of seq
122 nRead
= seq
.getLength();
131 XMLFile2UTFConverter::~XMLFile2UTFConverter()
133 if( m_pText2Unicode
)
134 delete m_pText2Unicode
;
135 if( m_pUnicode2Text
)
136 delete m_pUnicode2Text
;
// Removes the encoding attribute from the beginning of seq, in place.
// If the buffer starts like an XML declaration, the bytes from the match of
// " encoding" up to and including the closing quote of its value are cut
// out of seq and seq is shortened accordingly.
//
// seq - byte buffer that is modified in place
//
// NOTE(review): the embedded source line numbers skip in several places, so
// some statements (for example the declaration of nStop and any checks on
// nMax / nFound) are not part of this listing.
140 void XMLFile2UTFConverter::removeEncoding( Sequence
<sal_Int8
> &seq
)
142 const sal_Int8
*pSource
= seq
.getArray();
// NOTE(review): only the first 4 characters ("<?xm") are compared, and no
// length check on seq is visible before this call - confirm.
143 if( ! strncmp( (const char * ) pSource
, "<?xml" , 4) )
// work on a string copy of the buffer for searching
147 OString
str( (sal_Char
* ) pSource
, seq
.getLength() );
149 // cut the string at the first line break
150 // find the first line break (character code 10, line feed)
151 int nMax
= str
.indexOf( 10 );
154 str
= str
.copy( 0 , nMax
);
157 int nFound
= str
.indexOf( " encoding" );
// the attribute value may be delimited by double or by single quotes
160 int nStart
= str
.indexOf( "\"" , nFound
);
// NOTE(review): if the single-quote search yields a negative index while a
// double quote was found, this condition is still true and the single-quote
// search below is used - presumably indexOf reports "not found" as a
// negative value; confirm the intended behaviour.
161 if( nStart
< 0 || str
.indexOf( "'" , nFound
) < nStart
)
163 nStart
= str
.indexOf( "'" , nFound
);
164 nStop
= str
.indexOf( "'" , nStart
+1 );
168 nStop
= str
.indexOf( "\"" , nStart
+1);
// both quotes found and at least one character between them
171 if( nStart
>= 0 && nStop
>= 0 && nStart
+1 < nStop
)
173 // remove the encoding attribute from the buffer:
// move everything behind the closing quote to the start of " encoding" ...
174 memmove( &( seq
.getArray()[nFound
] ) ,
175 &( seq
.getArray()[nStop
+1]) ,
176 seq
.getLength() - nStop
-1);
// ... and shrink the buffer by the number of removed bytes
177 seq
.realloc( seq
.getLength() - ( nStop
+1 - nFound
) );
178 // str = String( (char * ) seq.getArray() , seq.getLen() );
184 // Checks whether enough data has been accumulated to recognize the encoding.
// seq - the bytes collected so far
// NOTE(review): the return statements are not part of this listing; the
// comments below describe only the visible checks.
185 sal_Bool
XMLFile2UTFConverter::isEncodingRecognizable( const Sequence
< sal_Int8
> &seq
)
187 const sal_Int8
*pSource
= seq
.getConstArray();
// set when the buffer appears to start with a "<?" declaration, in which
// case a closing '>' is searched for further below
188 sal_Bool bCheckIfFirstClosingBracketExsists
= sal_False
;
190 if( seq
.getLength() < 8 ) {
191 // no recognition possible when fewer than 8 bytes are available
// note: only the first 4 characters ("<?xm") are compared
195 if( ! strncmp( (const char * ) pSource
, "<?xml" , 4 ) ) {
196 // scan whether the <?xml tag finishes within this buffer
197 bCheckIfFirstClosingBracketExsists
= sal_True
;
// '<' at byte 0 or 2 and '?' at byte 4 or 6 - presumably "<?" stored in a
// multi-byte (16- or 32-bit) encoding; confirm
199 else if( ('<' == pSource
[0] || '<' == pSource
[2] ) &&
200 ( ('?' == pSource
[4] || '?' == pSource
[6] ) ) )
203 bCheckIfFirstClosingBracketExsists
= sal_True
;
// same pattern shifted by one byte ('<' at 1 or 3, '?' at 5 or 7)
205 else if( ( '<' == pSource
[1] || '<' == pSource
[3] ) &&
206 ( '?' == pSource
[5] || '?' == pSource
[7] ) )
209 bCheckIfFirstClosingBracketExsists
= sal_True
;
212 if( bCheckIfFirstClosingBracketExsists
)
// look for a '>' byte anywhere in the buffer
214 for( sal_Int32 i
= 0; i
< seq
.getLength() ; i
++ )
216 // the whole <?xml tag is available
217 if( '>' == pSource
[ i
] )
225 // No <? tag in front, no need for a bigger buffer
// Inspects the first bytes of seq to determine the text encoding and stores
// the result in m_sEncoding. Depending on the detected byte pattern the
// buffer itself is changed: a UTF-16 byte order mark is inserted where it is
// missing, and a UTF-8 byte order mark is removed.
//
// seq - byte buffer holding the beginning of the document; may be modified
//
// bReturn starts as sal_True and is set to sal_False in the last visible
// branch.
// NOTE(review): the embedded source line numbers skip in several places, so
// some statements (for example the declaration of nStop, the return
// statements and the third byte test of the UTF-8 BOM branch) are not part
// of this listing.
229 sal_Bool
XMLFile2UTFConverter::scanForEncoding( Sequence
< sal_Int8
> &seq
)
// unsigned view of the buffer so that byte values >= 0x80 compare as expected
231 const sal_uInt8
*pSource
= reinterpret_cast<const sal_uInt8
*>( seq
.getConstArray() );
232 sal_Bool bReturn
= sal_True
;
234 if( seq
.getLength() < 4 ) {
235 // no recognition possible when fewer than 4 bytes are available
239 // first level : detect possible file formats
// note: only the first 4 characters ("<?xm") are compared
240 if( ! strncmp( (const char * ) pSource
, "<?xml" , 4 ) ) {
// work on a string copy of the buffer for searching
243 OString
str( (const sal_Char
*) pSource
, seq
.getLength() );
245 // cut the string at the first line break
246 // find the first line break (character code 10, line feed)
247 int nMax
= str
.indexOf( 10 );
250 str
= str
.copy( 0 , nMax
);
253 int nFound
= str
.indexOf( " encoding" );
// NOTE(review): this compares nFound with the string length, whereas the
// checks further below test for a negative index; the condition also holds
// for a negative nFound - confirm the intended "found" test.
254 if( nFound
< str
.getLength() ) {
// the attribute value may be delimited by double or by single quotes
256 int nStart
= str
.indexOf( "\"" , nFound
);
// NOTE(review): if the single-quote search yields a negative index while a
// double quote was found, this condition is still true and the single-quote
// search below is used - presumably indexOf reports "not found" as a
// negative value; confirm the intended behaviour.
257 if( nStart
< 0 || str
.indexOf( "'" , nFound
) < nStart
)
259 nStart
= str
.indexOf( "'" , nFound
);
260 nStop
= str
.indexOf( "'" , nStart
+1 );
264 nStop
= str
.indexOf( "\"" , nStart
+1);
// both quotes found and at least one character between them
266 if( nStart
>= 0 && nStop
>= 0 && nStart
+1 < nStop
)
268 // encoding found finally: take the text between the quotes
269 m_sEncoding
= str
.copy( nStart
+1 , nStop
- nStart
- 1 );
// bytes FE FF: byte order mark in big-endian order
273 else if( 0xFE == pSource
[0] &&
274 0xFF == pSource
[1] ) {
276 // conversion is done so that encoding information can be easily extracted
277 m_sEncoding
= "utf-16";
279 else if( 0xFF == pSource
[0] &&
280 0xFE == pSource
[1] ) {
281 // UTF-16 little endian
282 // conversion is done so that encoding information can be easily extracted
283 m_sEncoding
= "utf-16";
// bytes 00 3C 00 3F: "<?" as big-endian 16-bit units
285 else if( 0x00 == pSource
[0] && 0x3c == pSource
[1] && 0x00 == pSource
[2] && 0x3f == pSource
[3] ) {
286 // UTF-16 big endian without byte order mark (strictly speaking, this is an error)
287 // The byte order mark is simply added
289 // grow the buffer by two bytes, shift the content and write FE FF in front
290 seq
.realloc( seq
.getLength() + 2 );
291 memmove( &( seq
.getArray()[2] ) , seq
.getArray() , seq
.getLength() - 2 );
292 ((sal_uInt8
*)seq
.getArray())[0] = 0xFE;
293 ((sal_uInt8
*)seq
.getArray())[1] = 0xFF;
295 m_sEncoding
= "utf-16";
// bytes 3C 00 3F 00: "<?" as little-endian 16-bit units
297 else if( 0x3c == pSource
[0] && 0x00 == pSource
[1] && 0x3f == pSource
[2] && 0x00 == pSource
[3] ) {
298 // UTF-16 little endian without byte order mark (strictly speaking, this is an error)
299 // The byte order mark is simply added
// grow the buffer by two bytes, shift the content and write FF FE in front
301 seq
.realloc( seq
.getLength() + 2 );
302 memmove( &( seq
.getArray()[2] ) , seq
.getArray() , seq
.getLength() - 2 );
303 ((sal_uInt8
*)seq
.getArray())[0] = 0xFF;
304 ((sal_uInt8
*)seq
.getArray())[1] = 0xFE;
306 m_sEncoding
= "utf-16";
308 else if( 0xEF == pSource
[0] &&
309 0xBB == pSource
[1] &&
312 // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
313 // The BOM is removed: shift the content three bytes to the front and shrink.
314 memmove( seq
.getArray(), &( seq
.getArray()[3] ), seq
.getLength()-3 );
315 seq
.realloc( seq
.getLength() - 3 );
316 m_sEncoding
= "utf-8";
// bytes 00 00 00 3C: '<' as a big-endian 32-bit unit
318 else if( 0x00 == pSource
[0] && 0x00 == pSource
[1] && 0x00 == pSource
[2] && 0x3c == pSource
[3] ) {
320 m_sEncoding
= "ucs-4";
322 else if( 0x3c == pSource
[0] && 0x00 == pSource
[1] && 0x00 == pSource
[2] && 0x00 == pSource
[3] ) {
323 // UCS-4 little endian
324 m_sEncoding
= "ucs-4";
// bytes 4C 6F A7 94 - presumably "<?xm" in EBCDIC; confirm
326 else if( 0x4c == pSource
[0] && 0x6f == pSource
[1] &&
327 0xa7 == static_cast<unsigned char> (pSource
[2]) &&
328 0x94 == static_cast<unsigned char> (pSource
[3]) ) {
330 bReturn
= sal_False
; // must be extended
334 // UTF8 is directly recognized by the parser.
341 void XMLFile2UTFConverter::initializeDecoding()
344 if( m_sEncoding
.getLength() )
346 rtl_TextEncoding encoding
= rtl_getTextEncodingFromMimeCharset( m_sEncoding
.getStr() );
347 if( encoding
!= RTL_TEXTENCODING_UTF8
)
349 m_pText2Unicode
= new Text2UnicodeConverter( m_sEncoding
);
350 m_pUnicode2Text
= new Unicode2TextConverter( RTL_TEXTENCODING_UTF8
);
356 //----------------------------------------------
358 // Text2UnicodeConverter
360 //----------------------------------------------
// Constructs a converter for the encoding named by sEncoding.
// sEncoding - charset name, resolved with rtl_getTextEncodingFromMimeCharset()
// If the name resolves to RTL_TEXTENCODING_DONTKNOW, the object is marked as
// neither initialized nor able to continue.
// NOTE(review): the branch taken for a known encoding is not part of this
// listing - presumably it calls init( encoding ); confirm.
361 Text2UnicodeConverter::Text2UnicodeConverter( const OString
&sEncoding
)
363 rtl_TextEncoding encoding
= rtl_getTextEncodingFromMimeCharset( sEncoding
.getStr() );
// unknown charset name: leave the converter unusable
364 if( RTL_TEXTENCODING_DONTKNOW
== encoding
)
366 m_bCanContinue
= sal_False
;
367 m_bInitialized
= sal_False
;
375 Text2UnicodeConverter::~Text2UnicodeConverter()
379 rtl_destroyTextToUnicodeContext( m_convText2Unicode
, m_contextText2Unicode
);
380 rtl_destroyUnicodeToTextConverter( m_convText2Unicode
);
384 void Text2UnicodeConverter::init( rtl_TextEncoding encoding
)
386 m_bCanContinue
= sal_True
;
387 m_bInitialized
= sal_True
;
389 m_convText2Unicode
= rtl_createTextToUnicodeConverter(encoding
);
390 m_contextText2Unicode
= rtl_createTextToUnicodeContext( m_convText2Unicode
);
391 m_rtlEncoding
= encoding
;
// Converts a chunk of encoded bytes to Unicode.
// Bytes left unconverted by an earlier call (kept in m_seqSource) are put in
// front of seqText before converting; bytes that cannot be converted yet are
// stored in m_seqSource again.
//
// seqText - the next chunk of encoded input bytes
//
// The Unicode sequence seqUnicode is built here and trimmed to the number of
// converted characters.
// NOTE(review): the embedded source line numbers skip in several places, so
// some statements (for example the declaration of uiInfo, the loop around
// the conversion call, some call arguments, the release of pbTempMem and the
// return statement) are not part of this listing.
395 Sequence
<sal_Unicode
> Text2UnicodeConverter::convert( const Sequence
<sal_Int8
> &seqText
)
// nSrcCvtBytes: source bytes consumed by one conversion call
// nTargetCount: Unicode characters written so far
// nSourceCount: source bytes consumed so far
398 sal_Size nSrcCvtBytes
= 0;
399 sal_Size nTargetCount
= 0;
400 sal_Size nSourceCount
= 0;
402 // the whole source size: new bytes plus the remainder of the previous call
403 sal_Int32 nSourceSize
= seqText
.getLength() + m_seqSource
.getLength();
// first guess for the output: one Unicode character per source byte
404 Sequence
<sal_Unicode
> seqUnicode ( nSourceSize
);
406 const sal_Int8
*pbSource
= seqText
.getConstArray();
407 sal_Int8
*pbTempMem
= 0;
409 if( m_seqSource
.getLength() ) {
410 // put the old remainder and the new byte sequence into one array
// NOTE(review): allocated with new[]; its release is not visible in this
// listing and needs delete[] - confirm.
411 pbTempMem
= new sal_Int8
[ nSourceSize
];
412 memcpy( pbTempMem
, m_seqSource
.getConstArray() , m_seqSource
.getLength() );
413 memcpy( &(pbTempMem
[ m_seqSource
.getLength() ]) , seqText
.getConstArray() , seqText
.getLength() );
414 pbSource
= pbTempMem
;
// the remainder has been taken over, so clear it
417 m_seqSource
= Sequence
< sal_Int8
>();
422 /* All invalid characters are transformed to the unicode undefined char */
// convert from the current source position into the free part of seqUnicode
423 nTargetCount
+= rtl_convertTextToUnicode(
425 m_contextText2Unicode
,
426 ( const sal_Char
* ) &( pbSource
[nSourceCount
] ),
427 nSourceSize
- nSourceCount
,
428 &( seqUnicode
.getArray()[ nTargetCount
] ),
429 seqUnicode
.getLength() - nTargetCount
,
430 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT
|
431 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT
|
432 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
,
435 nSourceCount
+= nSrcCvtBytes
;
437 if( uiInfo
& RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
) {
438 // the destination buffer was too small: double the output sequence
439 seqUnicode
.realloc( seqUnicode
.getLength() * 2 );
// the input ends in the middle of a character: keep the unconverted tail in
// m_seqSource for the next call
444 if( uiInfo
& RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
) {
445 m_seqSource
.realloc( nSourceSize
- nSourceCount
);
446 memcpy( m_seqSource
.getArray() , &(pbSource
[nSourceCount
]) , nSourceSize
-nSourceCount
);
454 // shrink to the number of Unicode characters actually written
455 seqUnicode
.realloc( nTargetCount
);
462 //----------------------------------------------
464 // Unicode2TextConverter
466 //----------------------------------------------
// Constructs a converter from Unicode to the given target encoding.
// encoding - the target text encoding
// NOTE(review): the constructor body is not part of this listing -
// presumably it calls init( encoding ); confirm.
467 Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding
)
473 Unicode2TextConverter::~Unicode2TextConverter()
475 if( m_bInitialized
) {
476 rtl_destroyUnicodeToTextContext( m_convUnicode2Text
, m_contextUnicode2Text
);
477 rtl_destroyUnicodeToTextConverter( m_convUnicode2Text
);
// Converts Unicode characters to bytes in the target encoding.
// Characters left unconverted by an earlier call (kept in m_seqSource) are
// put in front of the new input before converting; characters that cannot
// be converted yet are stored in m_seqSource again.
//
// puSource    - pointer to the Unicode characters to convert
// nSourceSize - number of characters at puSource
//
// The byte sequence seqText is built here and trimmed to the number of
// bytes written.
// NOTE(review): the embedded source line numbers skip in several places, so
// some statements (for example the two memcpy call heads, the declaration of
// uiInfo, the loop around the conversion call, some call arguments, the
// release of puTempMem and the return statement) are not part of this
// listing.
482 Sequence
<sal_Int8
> Unicode2TextConverter::convert(const sal_Unicode
*puSource
, sal_Int32 nSourceSize
)
484 sal_Unicode
*puTempMem
= 0;
486 if( m_seqSource
.getLength() ) {
488 // put the old remainder and the new character sequence into one array.
489 // In general, when surrogates are used, they should rarely be
490 // cut off between two convert() calls. So this code is used
491 // rarely and the extra copy is acceptable.
// NOTE(review): allocated with new[]; its release is not visible in this
// listing and needs delete[] - confirm.
492 puTempMem
= new sal_Unicode
[ nSourceSize
+ m_seqSource
.getLength()];
// arguments of a copy of the old remainder (sizes are in bytes)
494 m_seqSource
.getConstArray() ,
495 m_seqSource
.getLength() * sizeof( sal_Unicode
) );
// arguments of a copy of the new characters, placed behind the remainder
497 &(puTempMem
[ m_seqSource
.getLength() ]) ,
499 nSourceSize
*sizeof( sal_Unicode
) );
500 puSource
= puTempMem
;
501 nSourceSize
+= m_seqSource
.getLength();
// the remainder has been taken over, so clear it
503 m_seqSource
= Sequence
< sal_Unicode
> ();
// nTargetCount: bytes written so far; nSourceCount: characters consumed so far
507 sal_Size nTargetCount
= 0;
508 sal_Size nSourceCount
= 0;
// characters consumed by one conversion call
511 sal_Size nSrcCvtChars
;
513 // take nSourceSize * 3 as the initial size;
514 // this is an upper boundary for converting to utf8,
515 // which is most often used as the target.
516 sal_Int32 nSeqSize
= nSourceSize
* 3;
518 Sequence
<sal_Int8
> seqText( nSeqSize
);
519 sal_Char
*pTarget
= (sal_Char
*) seqText
.getArray();
// convert from the current source position into the free part of the target
522 nTargetCount
+= rtl_convertUnicodeToText(
524 m_contextUnicode2Text
,
525 &( puSource
[nSourceCount
] ),
526 nSourceSize
- nSourceCount
,
527 &( pTarget
[nTargetCount
] ),
528 nSeqSize
- nTargetCount
,
529 RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT
|
530 RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT
,
533 nSourceCount
+= nSrcCvtChars
;
// the destination buffer was too small: enlarge it and refresh pTarget,
// because the array pointer is fetched again after the realloc
535 if( uiInfo
& RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
) {
536 nSeqSize
= nSeqSize
*2;
537 seqText
.realloc( nSeqSize
); // double array size
538 pTarget
= ( sal_Char
* ) seqText
.getArray();
// the input ends in the middle of a character: keep the unconverted tail in
// m_seqSource for the next call
545 if( uiInfo
& RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL
) {
546 m_seqSource
.realloc( nSourceSize
- nSourceCount
);
547 memcpy( m_seqSource
.getArray() ,
548 &(puSource
[nSourceCount
]),
549 (nSourceSize
- nSourceCount
) * sizeof( sal_Unicode
) );
556 // reduce the size of the buffer (fast, no copy necessary)
557 seqText
.realloc( nTargetCount
);
562 void Unicode2TextConverter::init( rtl_TextEncoding encoding
)
564 m_bCanContinue
= sal_True
;
565 m_bInitialized
= sal_True
;
567 m_convUnicode2Text
= rtl_createUnicodeToTextConverter( encoding
);
568 m_contextUnicode2Text
= rtl_createUnicodeToTextContext( m_convUnicode2Text
);
569 m_rtlEncoding
= encoding
;