1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
23 #include <sal/types.h>
25 #include <rtl/textenc.h>
26 #include <rtl/tencinfo.h>
27 #include <com/sun/star/io/NotConnectedException.hpp>
28 #include <com/sun/star/io/XInputStream.hpp>
29 #include <xml2utf.hxx>
33 using namespace ::com::sun::star::uno
;
34 using namespace ::com::sun::star::io
;
37 namespace sax_expatwrap
{
// XMLFile2UTFConverter::readAndConvert
//
// Reads bytes from m_in into seq and, when both converters are set and
// m_pText2Unicode->canContinue() holds, re-encodes seq by running it through
// m_pText2Unicode->convert() and then m_pUnicode2Text->convert().
// removeEncoding() is applied to seq afterwards and nRead is set to the
// resulting length of seq.
//
// seq         in/out buffer; receives the bytes read and is replaced by the
//             converted bytes when a conversion takes place.
// nMaxToRead  requested read size; the visible code raises it to at least 512.
//
// Throws NotConnectedException (the guarding condition is not visible here;
// presumably it fires when m_in is not set -- TODO confirm).
//
// NOTE(review): several statements of this function are absent from this
// listing (braces, the guards in front of the throw and of the 512-byte
// minimum, the declaration of nRead, the loop around the read, the trailing
// memmove arguments, the return statement). The exact conditions under which
// each step runs cannot be confirmed from here; check the complete file.
39 sal_Int32
XMLFile2UTFConverter::readAndConvert( Sequence
<sal_Int8
> &seq
, sal_Int32 nMaxToRead
)
42 throw NotConnectedException();
45 // it should be possible to find the encoding attribute
46 // within the first 512 bytes == 128 chars in UCS-4
47 nMaxToRead
= ::std::max( sal_Int32(512) , nMaxToRead
);
// seqStart holds bytes from an earlier read that were not yet sufficient for
// encoding detection (see the "prefix with what we had so far" step below).
51 Sequence
< sal_Int8
> seqStart
;
// Fetch the next chunk from the input stream; the call's result becomes nRead.
54 nRead
= m_in
->readSomeBytes( seq
, nMaxToRead
);
// Proceed only when there is data: either freshly read or previously saved.
56 if( nRead
+ seqStart
.getLength())
58 // if nRead is 0, the file is already eof.
59 if( ! m_bStarted
&& nRead
)
61 // ensure that enough data is available to parse encoding
62 if( seqStart
.hasElements() )
64 // prefix with what we had so far.
65 sal_Int32 nLength
= seq
.getLength();
66 seq
.realloc( seqStart
.getLength() + nLength
);
// The memmove destination is the position just past the room reserved for the
// saved prefix. NOTE(review): its source and length arguments are not visible
// in this listing.
68 memmove (seq
.getArray() + seqStart
.getLength(),
// Copy the saved prefix to the front of seq.
71 memcpy (seq
.getArray(),
72 seqStart
.getConstArray(),
73 seqStart
.getLength());
76 // autodetection with the first bytes
77 if( ! isEncodingRecognizable( seq
) )
79 // remember what we have so far.
// Detection either found an encoding in the data (scanForEncoding) or one is
// already present in m_sEncoding.
85 if( scanForEncoding( seq
) || !m_sEncoding
.isEmpty() ) {
86 // initialize decoding
// The saved prefix is reset to an empty sequence.
89 seqStart
= Sequence
< sal_Int8
> ();
// Convert only when both converter objects exist and the source-side converter
// reports that it can continue.
93 if( m_pText2Unicode
&& m_pUnicode2Text
&&
94 m_pText2Unicode
->canContinue() ) {
// Two-step conversion: source bytes -> sal_Unicode -> target bytes.
96 Sequence
<sal_Unicode
> seqUnicode
= m_pText2Unicode
->convert( seq
);
97 seq
= m_pUnicode2Text
->convert( seqUnicode
.getConstArray(), seqUnicode
.getLength() );
102 // it must now be ensured, that no encoding attribute exists anymore
103 // ( otherwise the expat-Parser will crash )
104 // This must be done after decoding !
105 // ( e.g. Files decoded in ucs-4 cannot be read properly )
107 removeEncoding( seq
);
// Report the length of the (possibly converted and shortened) buffer.
109 nRead
= seq
.getLength();
// XMLFile2UTFConverter::removeEncoding
//
// Cuts the encoding attribute out of an XML declaration held in seq.
// The buffer is inspected only if it has at least 5 bytes and starts with
// "<?xml". The text is cut at the first line feed (character 10), searched
// for " encoding", and the quoted value that follows is located. The bytes
// from the start of " encoding" through the closing quote are then removed by
// moving the tail of seq forward and shrinking seq by the removed length.
//
// seq  in/out buffer; shortened in place when an encoding attribute is found.
//
// NOTE(review): some statements are absent from this listing (braces, the
// action taken when the "<?xml" test fails, a possible guard around the
// copy( 0 , nMax ) call, the declaration of nStop, the else keyword between
// the two quote branches). Check the complete file before relying on details.
117 void XMLFile2UTFConverter::removeEncoding( Sequence
<sal_Int8
> &seq
)
119 const sal_Int8
*pSource
= seq
.getArray();
// Condition is true when seq is too short or does not begin with "<?xml"
// (strncmp yields non-zero on mismatch).
120 if (seq
.getLength() < 5 || strncmp(reinterpret_cast<const char *>(pSource
), "<?xml", 5))
124 OString
str( reinterpret_cast<char const *>(pSource
), seq
.getLength() );
126 // cut sequence to first line break
127 // find first line break;
128 int nMax
= str
.indexOf( 10 );
131 str
= str
.copy( 0 , nMax
);
134 int nFound
= str
.indexOf( " encoding" );
135 if( nFound
< 0 ) return;
// Locate the opening quote of the attribute value, starting with '"'.
138 int nStart
= str
.indexOf( "\"" , nFound
);
// NOTE(review): when no "'" follows nFound, indexOf yields a negative value
// (this code treats negative results as "not found"), which also compares
// less than a valid nStart. A double-quoted value without any single quote
// on the line would then take the single-quote branch and end with
// nStart < 0 -- confirm whether that is intended. The same test appears in
// scanForEncoding; both would have to change together.
139 if( nStart
< 0 || str
.indexOf( "'" , nFound
) < nStart
)
// Single-quoted value: opening and closing "'".
141 nStart
= str
.indexOf( "'" , nFound
);
142 nStop
= str
.indexOf( "'" , nStart
+1 );
// Double-quoted value: closing '"'.
146 nStop
= str
.indexOf( "\"" , nStart
+1);
// Require both quotes and at least one character between them.
149 if( nStart
>= 0 && nStop
>= 0 && nStart
+1 < nStop
)
151 // remove encoding tag from file
// Move everything after the closing quote to the position of " encoding".
152 memmove( &( seq
.getArray()[nFound
] ) ,
153 &( seq
.getArray()[nStop
+1]) ,
154 seq
.getLength() - nStop
-1);
// Shrink by the number of bytes removed (nFound .. nStop inclusive).
155 seq
.realloc( seq
.getLength() - ( nStop
+1 - nFound
) );
159 // Checks, if enough data has been accumulated to recognize the encoding
//
// The first 8 bytes of seq are examined. If seq starts with "<?xml", or if
// '<' and '?' appear at the byte offsets tested below, the function returns
// whether a '>' occurs anywhere in seq.
//
// seq  bytes collected so far; not modified.
//
// NOTE(review): the return statements for the "fewer than 8 bytes" case and
// for the final "no <? tag in front" case are not visible in this listing;
// their values cannot be confirmed from here.
160 bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence
< sal_Int8
> &seq
)
162 const sal_Int8
*pSource
= seq
.getConstArray();
163 bool bCheckIfFirstClosingBracketExists
= false;
165 if( seq
.getLength() < 8 ) {
166 // no recognition possible, when less than 8 bytes are available
170 if( ! strncmp( reinterpret_cast<const char *>(pSource
), "<?xml", 5 ) ) {
171 // scan if the <?xml tag finishes within this buffer
172 bCheckIfFirstClosingBracketExists
= true;
// '<' at offset 0 or 2 together with '?' at offset 4 or 6 -- presumably "<?"
// in a multi-byte-per-character encoding; TODO confirm which ones are meant.
174 else if( ('<' == pSource
[0] || '<' == pSource
[2] ) &&
175 ('?' == pSource
[4] || '?' == pSource
[6] ) )
178 bCheckIfFirstClosingBracketExists
= true;
// Same test shifted by one byte: '<' at offset 1 or 3, '?' at offset 5 or 7.
180 else if( ( '<' == pSource
[1] || '<' == pSource
[3] ) &&
181 ( '?' == pSource
[5] || '?' == pSource
[7] ) )
184 bCheckIfFirstClosingBracketExists
= true;
187 if( bCheckIfFirstClosingBracketExists
)
189 // whole <?xml tag is valid
// True when a '>' byte occurs anywhere in seq.
190 return std::find(seq
.begin(), seq
.end(), '>') != seq
.end();
193 // No <? tag in front, no need for a bigger buffer
// XMLFile2UTFConverter::scanForEncoding
//
// Inspects the first bytes of seq and stores the detected encoding name in
// m_sEncoding:
//   - "<?xml" prefix: the quoted value following " encoding" on the first
//     line is copied into m_sEncoding.
//   - byte patterns FE FF / FF FE, and "<?" encoded with two bytes per
//     character: "utf-16" (for the latter, a two-byte mark is inserted in
//     front of seq).
//   - pattern starting EF BB: "utf-8", and three leading bytes are removed.
//   - '<' encoded in four bytes: "ucs-4".
//
// seq  in/out buffer; may grow by 2 bytes or shrink by 3 bytes as described.
//
// NOTE(review): pSource is taken from seq before the realloc() calls below;
// the visible code does not read pSource after a realloc, which is what keeps
// this safe.
// NOTE(review): several statements are absent from this listing (braces, the
// declaration of nStop, the else keyword between the quote branches, the
// third byte of the EF BB test, and all return statements), so the returned
// value cannot be documented from here.
197 bool XMLFile2UTFConverter::scanForEncoding( Sequence
< sal_Int8
> &seq
)
199 const sal_uInt8
*pSource
= reinterpret_cast<const sal_uInt8
*>( seq
.getConstArray() );
202 if( seq
.getLength() < 4 ) {
203 // no recognition possible, when less than 4 bytes are available
207 // first level : detect possible file formats
208 if (seq
.getLength() >= 5 && !strncmp(reinterpret_cast<const char *>(pSource
), "<?xml", 5)) {
210 OString
str( reinterpret_cast<const char *>(pSource
), seq
.getLength() );
212 // cut sequence to first line break
213 //find first line break;
214 int nMax
= str
.indexOf( 10 );
217 str
= str
.copy( 0 , nMax
);
220 int nFound
= str
.indexOf( " encoding" );
// Locate the opening quote of the attribute value, starting with '"'.
223 int nStart
= str
.indexOf( "\"" , nFound
);
// NOTE(review): when no "'" follows nFound, indexOf yields a negative value,
// which also compares less than a valid nStart. A double-quoted value without
// any single quote on the line would then take the single-quote branch and
// end with nStart < 0 -- confirm whether that is intended. The same test
// appears in removeEncoding; both would have to change together.
224 if( nStart
< 0 || str
.indexOf( "'" , nFound
) < nStart
)
226 nStart
= str
.indexOf( "'" , nFound
);
227 nStop
= str
.indexOf( "'" , nStart
+1 );
231 nStop
= str
.indexOf( "\"" , nStart
+1);
// Require both quotes and at least one character between them.
233 if( nStart
>= 0 && nStop
>= 0 && nStart
+1 < nStop
)
235 // encoding found finally
// Copy the text strictly between the two quotes.
236 m_sEncoding
= str
.copy( nStart
+1 , nStop
- nStart
- 1 );
// Bytes FE FF -- presumably the big-endian counterpart of the branch labelled
// "UTF-16 little endian" below.
240 else if( 0xFE == pSource
[0] &&
241 0xFF == pSource
[1] ) {
243 // conversion is done so that encoding information can be easily extracted
244 m_sEncoding
= "utf-16"_ostr
;
246 else if( 0xFF == pSource
[0] &&
247 0xFE == pSource
[1] ) {
248 // UTF-16 little endian
249 // conversion is done so that encoding information can be easily extracted
250 m_sEncoding
= "utf-16"_ostr
;
252 else if( 0x00 == pSource
[0] && 0x3c == pSource
[1] && 0x00 == pSource
[2] && 0x3f == pSource
[3] ) {
253 // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
254 // The byte order mark is simply added
256 // simply add the byte order mark !
// Grow by two bytes, shift the old content back by two, then write FE FF.
257 seq
.realloc( seq
.getLength() + 2 );
258 memmove( &( seq
.getArray()[2] ) , seq
.getArray() , seq
.getLength() - 2 );
259 reinterpret_cast<sal_uInt8
*>(seq
.getArray())[0] = 0xFE;
260 reinterpret_cast<sal_uInt8
*>(seq
.getArray())[1] = 0xFF;
262 m_sEncoding
= "utf-16"_ostr
;
264 else if( 0x3c == pSource
[0] && 0x00 == pSource
[1] && 0x3f == pSource
[2] && 0x00 == pSource
[3] ) {
265 // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
266 // The byte order mark is simply added
// Grow by two bytes, shift the old content back by two, then write FF FE.
268 seq
.realloc( seq
.getLength() + 2 );
269 memmove( &( seq
.getArray()[2] ) , seq
.getArray() , seq
.getLength() - 2 );
270 reinterpret_cast<sal_uInt8
*>(seq
.getArray())[0] = 0xFF;
271 reinterpret_cast<sal_uInt8
*>(seq
.getArray())[1] = 0xFE;
273 m_sEncoding
= "utf-16"_ostr
;
// NOTE(review): the test on the third byte is not visible in this listing.
275 else if( 0xEF == pSource
[0] &&
276 0xBB == pSource
[1] &&
279 // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
280 // The BOM is removed.
// Shift the content forward by three bytes, then drop the last three.
281 memmove( seq
.getArray(), &( seq
.getArray()[3] ), seq
.getLength()-3 );
282 seq
.realloc( seq
.getLength() - 3 );
283 m_sEncoding
= "utf-8"_ostr
;
// 00 00 00 3c -- presumably the big-endian counterpart of the branch labelled
// "UCS-4 little endian" below.
285 else if( 0x00 == pSource
[0] && 0x00 == pSource
[1] && 0x00 == pSource
[2] && 0x3c == pSource
[3] ) {
287 m_sEncoding
= "ucs-4"_ostr
;
289 else if( 0x3c == pSource
[0] && 0x00 == pSource
[1] && 0x00 == pSource
[2] && 0x00 == pSource
[3] ) {
290 // UCS-4 little endian
291 m_sEncoding
= "ucs-4"_ostr
;
293 /* TODO: no need to test for the moment since we return sal_False like default case anyway
294 else if( 0x4c == pSource[0] && 0x6f == pSource[1] &&
295 0xa7 == static_cast<unsigned char> (pSource[2]) &&
296 0x94 == static_cast<unsigned char> (pSource[3]) ) {
298 bReturn = sal_False; // must be extended
303 // UTF8 is directly recognized by the parser.
// XMLFile2UTFConverter::initializeDecoding
//
// Sets up the converter pair used by readAndConvert(). When m_sEncoding is
// not empty, its name is mapped to an rtl_TextEncoding; unless that encoding
// is RTL_TEXTENCODING_UTF8, m_pText2Unicode is created for m_sEncoding and
// m_pUnicode2Text is created for RTL_TEXTENCODING_UTF8.
//
// NOTE(review): braces are not visible in this listing; the nesting described
// above is read from the order of the statements -- confirm against the
// complete file.
310 void XMLFile2UTFConverter::initializeDecoding()
313 if( !m_sEncoding
.isEmpty() )
// Translate the encoding name into the rtl encoding identifier.
315 rtl_TextEncoding encoding
= rtl_getTextEncodingFromMimeCharset( m_sEncoding
.getStr() );
// No converters are created here for UTF-8 input.
316 if( encoding
!= RTL_TEXTENCODING_UTF8
)
318 m_pText2Unicode
= std::make_unique
<Text2UnicodeConverter
>( m_sEncoding
);
319 m_pUnicode2Text
= std::make_unique
<Unicode2TextConverter
>( RTL_TEXTENCODING_UTF8
);
325 // Text2UnicodeConverter
// Text2UnicodeConverter constructor
//
// Starts with both rtl handles set to nullptr and maps sEncoding to an
// rtl_TextEncoding. When the name is unknown (RTL_TEXTENCODING_DONTKNOW),
// m_bCanContinue and m_bInitialized are set to false.
//
// sEncoding  encoding name, passed to rtl_getTextEncodingFromMimeCharset().
//
// NOTE(review): the branch taken for a known encoding is not visible in this
// listing (presumably it calls init( encoding ) -- TODO confirm).
328 Text2UnicodeConverter::Text2UnicodeConverter( const OString
&sEncoding
)
329 : m_convText2Unicode(nullptr)
330 , m_contextText2Unicode(nullptr)
332 rtl_TextEncoding encoding
= rtl_getTextEncodingFromMimeCharset( sEncoding
.getStr() );
333 if( RTL_TEXTENCODING_DONTKNOW
== encoding
)
// Unknown encoding: mark the converter as unusable and uninitialized.
335 m_bCanContinue
= false;
336 m_bInitialized
= false;
// Text2UnicodeConverter destructor
//
// Releases the conversion context and then the converter handle that init()
// obtained from rtl_createTextToUnicodeContext() and
// rtl_createTextToUnicodeConverter().
//
// The converter handle is a text-to-unicode converter, so it is released with
// rtl_destroyTextToUnicodeConverter(), the counterpart of the create call
// (previously the unicode-to-text destroy function was called on it).
//
// NOTE(review): braces and any guard around these calls (for the case where
// init() never ran and both handles are still nullptr) are not visible in
// this listing -- confirm against the complete file.
344 Text2UnicodeConverter::~Text2UnicodeConverter()
// The context depends on the converter, so it is destroyed first.
348 rtl_destroyTextToUnicodeContext( m_convText2Unicode
, m_contextText2Unicode
);
349 rtl_destroyTextToUnicodeConverter( m_convText2Unicode
);
// Text2UnicodeConverter::init
//
// Marks the converter as usable (m_bCanContinue, m_bInitialized) and creates
// the rtl text-to-unicode converter for the given encoding together with its
// conversion context.
//
// encoding  rtl encoding identifier of the source text.
353 void Text2UnicodeConverter::init( rtl_TextEncoding encoding
)
355 m_bCanContinue
= true;
356 m_bInitialized
= true;
// The context is created from the converter, so the converter comes first.
358 m_convText2Unicode
= rtl_createTextToUnicodeConverter(encoding
);
359 m_contextText2Unicode
= rtl_createTextToUnicodeContext( m_convText2Unicode
);
// Text2UnicodeConverter::convert
//
// Converts the bytes in seqText to sal_Unicode using
// rtl_convertTextToUnicode(). Bytes kept in m_seqSource from an earlier call
// are placed in front of seqText before converting. Source bytes that could
// not be consumed (RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) are stored in
// m_seqSource. The result sequence is trimmed to the number of converted
// characters.
//
// seqText  source bytes for this call; not modified.
//
// NOTE(review): some statements are absent from this listing (braces, the
// declaration of uiInfo, the loop around the conversion call, the first and
// the last arguments of rtl_convertTextToUnicode, and the return statement).
// Check the complete file before relying on details.
363 Sequence
<sal_Unicode
> Text2UnicodeConverter::convert( const Sequence
<sal_Int8
> &seqText
)
366 sal_Size nSrcCvtBytes
= 0;
367 sal_Size nTargetCount
= 0;
368 sal_Size nSourceCount
= 0;
370 // the whole source size
371 sal_Int32 nSourceSize
= seqText
.getLength() + m_seqSource
.getLength();
// Initial target size: one sal_Unicode per source byte.
372 Sequence
<sal_Unicode
> seqUnicode ( nSourceSize
);
// By default convert straight from seqText; replaced below when leftover
// bytes have to be prepended.
374 const sal_Int8
*pbSource
= seqText
.getConstArray();
375 std::unique_ptr
<sal_Int8
[]> pbTempMem
;
377 if( m_seqSource
.hasElements() ) {
378 // put old rest and new byte sequence into one array
379 pbTempMem
.reset(new sal_Int8
[ nSourceSize
]);
380 memcpy( pbTempMem
.get() , m_seqSource
.getConstArray() , m_seqSource
.getLength() );
381 memcpy( &(pbTempMem
[ m_seqSource
.getLength() ]) , seqText
.getConstArray() , seqText
.getLength() );
382 pbSource
= pbTempMem
.get();
// The leftover bytes are now part of the combined source; clear the store.
385 m_seqSource
= Sequence
< sal_Int8
>();
390 /* All invalid characters are transformed to the unicode undefined char */
// Convert from the current source position into the current target position.
391 nTargetCount
+= rtl_convertTextToUnicode(
393 m_contextText2Unicode
,
394 reinterpret_cast<const char *>(&( pbSource
[nSourceCount
] )),
395 nSourceSize
- nSourceCount
,
396 &( seqUnicode
.getArray()[ nTargetCount
] ),
397 seqUnicode
.getLength() - nTargetCount
,
398 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT
|
399 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT
|
400 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
,
// Advance the source position by the number of bytes consumed.
403 nSourceCount
+= nSrcCvtBytes
;
405 if( uiInfo
& RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL
) {
406 // target buffer was too small: double its size
407 seqUnicode
.realloc( seqUnicode
.getLength() * 2 );
412 if( uiInfo
& RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL
) {
// Keep the unconsumed tail of the source for the next convert() call.
413 m_seqSource
.realloc( nSourceSize
- nSourceCount
);
414 memcpy( m_seqSource
.getArray() , &(pbSource
[nSourceCount
]) , nSourceSize
-nSourceCount
);
417 // set to correct unicode size
418 seqUnicode
.realloc( nTargetCount
);
424 // Unicode2TextConverter
// Unicode2TextConverter constructor
//
// Creates the rtl unicode-to-text converter for the given target encoding
// together with its conversion context.
//
// encoding  rtl encoding identifier of the target text.
427 Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding
)
// The context is created from the converter, so the converter comes first.
429 m_convUnicode2Text
= rtl_createUnicodeToTextConverter( encoding
);
430 m_contextUnicode2Text
= rtl_createUnicodeToTextContext( m_convUnicode2Text
);
// Unicode2TextConverter destructor
//
// Releases the conversion context and then the converter handle created in
// the constructor.
434 Unicode2TextConverter::~Unicode2TextConverter()
// The context is destroyed before the converter it was created from.
436 rtl_destroyUnicodeToTextContext( m_convUnicode2Text
, m_contextUnicode2Text
);
437 rtl_destroyUnicodeToTextConverter( m_convUnicode2Text
);
// Unicode2TextConverter::convert
//
// Converts nSourceSize sal_Unicode characters starting at puSource to bytes
// using rtl_convertUnicodeToText(). Characters kept in m_seqSource from an
// earlier call are placed in front of the new input before converting.
// Characters that could not be consumed
// (RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL) are stored in m_seqSource. The
// result sequence is trimmed to the number of bytes written.
//
// puSource     first character of the input.
// nSourceSize  number of characters at puSource.
//
// NOTE(review): some statements are absent from this listing (braces, the
// memcpy call name and source argument for the second copy into puTempMem,
// the declaration of uiInfo, the loop around the conversion call, the first
// and the last arguments of rtl_convertUnicodeToText, and the return
// statement). Check the complete file before relying on details.
441 Sequence
<sal_Int8
> Unicode2TextConverter::convert(const sal_Unicode
*puSource
, sal_Int32 nSourceSize
)
443 std::unique_ptr
<sal_Unicode
[]> puTempMem
;
445 if( m_seqSource
.hasElements() ) {
447 // put old rest and new byte sequence into one array
448 // In general when surrogates are used, they should be rarely
449 // cut off between two convert()-calls. So this code is used
450 // rarely and the extra copy is acceptable.
451 puTempMem
.reset(new sal_Unicode
[ nSourceSize
+ m_seqSource
.getLength()]);
// Leftover characters first ...
452 memcpy( puTempMem
.get() ,
453 m_seqSource
.getConstArray() ,
454 m_seqSource
.getLength() * sizeof( sal_Unicode
) );
// ... followed by nSourceSize characters placed directly behind them.
456 &(puTempMem
[ m_seqSource
.getLength() ]) ,
458 nSourceSize
*sizeof( sal_Unicode
) );
// From here on the combined buffer is the source.
459 puSource
= puTempMem
.get();
460 nSourceSize
+= m_seqSource
.getLength();
462 m_seqSource
= Sequence
< sal_Unicode
> ();
466 sal_Size nTargetCount
= 0;
467 sal_Size nSourceCount
= 0;
470 sal_Size nSrcCvtChars
;
472 // take nSourceSize * 3 as preference
473 // this is an upper boundary for converting to utf8,
474 // which most often used as the target.
475 sal_Int32 nSeqSize
= nSourceSize
* 3;
477 Sequence
<sal_Int8
> seqText( nSeqSize
);
478 char *pTarget
= reinterpret_cast<char *>(seqText
.getArray());
// Convert from the current source position into the current target position.
481 nTargetCount
+= rtl_convertUnicodeToText(
483 m_contextUnicode2Text
,
484 &( puSource
[nSourceCount
] ),
485 nSourceSize
- nSourceCount
,
486 &( pTarget
[nTargetCount
] ),
487 nSeqSize
- nTargetCount
,
488 RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT
|
489 RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT
,
// Advance the source position by the number of characters consumed.
492 nSourceCount
+= nSrcCvtChars
;
494 if( uiInfo
& RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
) {
495 nSeqSize
= nSeqSize
*2;
496 seqText
.realloc( nSeqSize
); // double array size
// realloc may have moved the storage, so the raw pointer is fetched again.
497 pTarget
= reinterpret_cast<char *>(seqText
.getArray());
504 if( uiInfo
& RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL
) {
// Keep the unconsumed tail of the source for the next convert() call.
505 m_seqSource
.realloc( nSourceSize
- nSourceCount
);
506 memcpy( m_seqSource
.getArray() ,
507 &(puSource
[nSourceCount
]),
508 (nSourceSize
- nSourceCount
) * sizeof( sal_Unicode
) );
511 // reduce the size of the buffer (fast, no copy necessary)
512 seqText
.realloc( nTargetCount
);
519 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */