sax/source/expatwrap/xml2utf.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19 #include <string.h>
  20
  21 #include <algorithm>
  22
  23 #include <sal/types.h>
  24
  25 #include <rtl/textenc.h>
  26 #include <rtl/tencinfo.h>
  27 #include <com/sun/star/io/NotConnectedException.hpp>
  28 #include <com/sun/star/io/XInputStream.hpp>
  29 #include <xml2utf.hxx>
  30 #include <memory>
  31
  32
  33 using namespace ::com::sun::star::uno;
  34 using namespace ::com::sun::star::io;
  35
  36
  37 namespace sax_expatwrap {
  38
  39 sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
  40 {
  41     if( ! m_in.is() ) {
  42         throw NotConnectedException();
  43     }
  44     if( ! m_bStarted ) {
  45         // it should be possible to find the encoding attribute
  46         // within the first 512 bytes == 128 chars in UCS-4
  47         nMaxToRead = ::std::max( sal_Int32(512) , nMaxToRead );
  48     }
  49
  50     sal_Int32 nRead;
  51     Sequence< sal_Int8 > seqStart;
  52     while( true )
  53     {
  54         nRead = m_in->readSomeBytes( seq , nMaxToRead );
  55
  56         if( nRead + seqStart.getLength())
  57         {
  58             // if nRead is 0, the file is already eof.
  59             if( ! m_bStarted && nRead )
  60             {
  61                 // ensure that enough data is available to parse encoding
  62                 if( seqStart.hasElements() )
  63                 {
  64                   // prefix with what we had so far.
  65                   sal_Int32 nLength = seq.getLength();
  66                   seq.realloc( seqStart.getLength() + nLength );
  67
  68                   memmove (seq.getArray() + seqStart.getLength(),
  69                        seq.getConstArray(),
  70                        nLength);
  71                   memcpy  (seq.getArray(),
  72                        seqStart.getConstArray(),
  73                        seqStart.getLength());
  74                 }
  75
  76                 // autodetection with the first bytes
  77                 if( ! isEncodingRecognizable( seq ) )
  78                 {
  79                   // remember what we have so far.
  80                   seqStart = seq;
  81
  82                   // read more !
  83                   continue;
  84                 }
  85                 if( scanForEncoding( seq ) || !m_sEncoding.isEmpty() ) {
  86                     // initialize decoding
  87                     initializeDecoding();
  88                 }
  89                 seqStart = Sequence < sal_Int8 > ();
  90             }
  91
  92             // do the encoding
  93             if( m_pText2Unicode && m_pUnicode2Text &&
  94                 m_pText2Unicode->canContinue() ) {
  95
  96                 Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
  97                 seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() );
  98             }
  99
 100             if( ! m_bStarted )
 101             {
 102                 // it must now be ensured, that no encoding attribute exist anymore
 103                 // ( otherwise the expat-Parser will crash )
 104                 // This must be done after decoding !
 105                 // ( e.g. Files decoded in ucs-4 cannot be read properly )
 106                 m_bStarted = true;
 107                 removeEncoding( seq );
 108             }
 109             nRead = seq.getLength();
 110         }
 111
 112         break;
 113     }
 114     return nRead;
 115 }
 116
 117 void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
 118 {
 119     const sal_Int8 *pSource = seq.getArray();
 120     if (seq.getLength() < 5 || strncmp(reinterpret_cast<const char *>(pSource), "<?xml", 5))
 121         return;
 122
 123     // scan for encoding
 124     OString str( reinterpret_cast<char const *>(pSource), seq.getLength() );
 125
 126     // cut sequence to first line break
 127     // find first line break;
 128     int nMax = str.indexOf( 10 );
 129     if( nMax >= 0 )
 130     {
 131         str = str.copy( 0 , nMax );
 132     }
 133
 134     int nFound = str.indexOf( " encoding" );
 135     if( nFound < 0 )        return;
 136
 137     int nStop;
 138     int nStart = str.indexOf( "\"" , nFound );
 139     if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
 140     {
 141         nStart = str.indexOf( "'" , nFound );
 142         nStop  = str.indexOf( "'" , nStart +1 );
 143     }
 144     else
 145     {
 146         nStop  = str.indexOf( "\"" , nStart +1);
 147     }
 148
 149     if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
 150     {
 151         // remove encoding tag from file
 152         memmove(        &( seq.getArray()[nFound] ) ,
 153                         &( seq.getArray()[nStop+1]) ,
 154                         seq.getLength() - nStop -1);
 155         seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
 156     }
 157 }
 158
 159 // Checks, if enough data has been accumulated to recognize the encoding
 160 bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
 161 {
 162     const sal_Int8 *pSource = seq.getConstArray();
 163     bool bCheckIfFirstClosingBracketExists = false;
 164
 165     if( seq.getLength() < 8 ) {
 166         // no recognition possible, when less than 8 bytes are available
 167         return false;
 168     }
 169
 170     if( ! strncmp( reinterpret_cast<const char *>(pSource), "<?xml", 5 ) ) {
 171         // scan if the <?xml tag finishes within this buffer
 172         bCheckIfFirstClosingBracketExists = true;
 173     }
 174     else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
 175              ('?' == pSource[4] || '?' == pSource[6] ) )
 176     {
 177         // check for utf-16
 178         bCheckIfFirstClosingBracketExists = true;
 179     }
 180     else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
 181              ( '?' == pSource[5] || '?' == pSource[7] ) )
 182     {
 183         // check for
 184         bCheckIfFirstClosingBracketExists = true;
 185     }
 186
 187     if( bCheckIfFirstClosingBracketExists )
 188     {
 189         // whole <?xml tag is valid
 190         return std::find(seq.begin(), seq.end(), '>') != seq.end();
 191     }
 192
 193     // No <? tag in front, no need for a bigger buffer
 194     return true;
 195 }
 196
 197 bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
 198 {
 199     const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
 200     bool bReturn = true;
 201
 202     if( seq.getLength() < 4 ) {
 203         // no recognition possible, when less than 4 bytes are available
 204         return false;
 205     }
 206
 207     // first level : detect possible file formats
 208     if (seq.getLength() >= 5 && !strncmp(reinterpret_cast<const char *>(pSource), "<?xml", 5)) {
 209         // scan for encoding
 210         OString str( reinterpret_cast<const char *>(pSource), seq.getLength() );
 211
 212         // cut sequence to first line break
 213         //find first line break;
 214         int nMax = str.indexOf( 10 );
 215         if( nMax >= 0 )
 216         {
 217             str = str.copy( 0 , nMax );
 218         }
 219
 220         int nFound = str.indexOf( " encoding" );
 221         if( nFound >= 0 ) {
 222             int nStop;
 223             int nStart = str.indexOf( "\"" , nFound );
 224             if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
 225             {
 226                 nStart = str.indexOf( "'" , nFound );
 227                 nStop  = str.indexOf( "'" , nStart +1 );
 228             }
 229             else
 230             {
 231                 nStop  = str.indexOf( "\"" , nStart +1);
 232             }
 233             if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
 234             {
 235                 // encoding found finally
 236                 m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
 237             }
 238         }
 239     }
 240     else if( 0xFE == pSource[0] &&
 241              0xFF == pSource[1] ) {
 242         // UTF-16 big endian
 243         // conversion is done so that encoding information can be easily extracted
 244         m_sEncoding = "utf-16"_ostr;
 245     }
 246     else if( 0xFF == pSource[0] &&
 247              0xFE == pSource[1] ) {
 248         // UTF-16 little endian
 249         // conversion is done so that encoding information can be easily extracted
 250         m_sEncoding = "utf-16"_ostr;
 251     }
 252     else if( 0x00 == pSource[0] && 0x3c == pSource[1]  && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
 253         // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
 254         // The byte order mark is simply added
 255
 256         // simply add the byte order mark !
 257         seq.realloc( seq.getLength() + 2 );
 258         memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
 259         reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFE;
 260         reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFF;
 261
 262         m_sEncoding = "utf-16"_ostr;
 263     }
 264     else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
 265         // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
 266         // The byte order mark is simply added
 267
 268         seq.realloc( seq.getLength() + 2 );
 269         memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
 270         reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFF;
 271         reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFE;
 272
 273         m_sEncoding = "utf-16"_ostr;
 274     }
 275     else if( 0xEF == pSource[0] &&
 276              0xBB == pSource[1] &&
 277              0xBF == pSource[2] )
 278     {
 279         // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
 280         // The BOM is removed.
 281         memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
 282         seq.realloc( seq.getLength() - 3 );
 283         m_sEncoding = "utf-8"_ostr;
 284     }
 285     else if( 0x00 == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
 286         // UCS-4 big endian
 287         m_sEncoding = "ucs-4"_ostr;
 288     }
 289     else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
 290         // UCS-4 little endian
 291         m_sEncoding = "ucs-4"_ostr;
 292     }
 293 /* TODO: no need to test for the moment since we return sal_False like default case anyway
 294     else if( 0x4c == pSource[0] && 0x6f == pSource[1]  &&
 295              0xa7 == static_cast<unsigned char> (pSource[2]) &&
 296              0x94 == static_cast<unsigned char> (pSource[3]) ) {
 297         // EBCDIC
 298         bReturn = sal_False;   // must be extended
 299     }
 300 */
 301     else {
 302         // other
 303         // UTF8 is directly recognized by the parser.
 304         bReturn = false;
 305     }
 306
 307     return bReturn;
 308 }
 309
 310 void XMLFile2UTFConverter::initializeDecoding()
 311 {
 312
 313     if( !m_sEncoding.isEmpty() )
 314     {
 315         rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
 316         if( encoding != RTL_TEXTENCODING_UTF8 )
 317         {
 318             m_pText2Unicode = std::make_unique<Text2UnicodeConverter>( m_sEncoding );
 319             m_pUnicode2Text = std::make_unique<Unicode2TextConverter>( RTL_TEXTENCODING_UTF8 );
 320         }
 321     }
 322 }
 323
 324
 325 // Text2UnicodeConverter
 326
 327
 328 Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
 329     : m_convText2Unicode(nullptr)
 330     , m_contextText2Unicode(nullptr)
 331 {
 332     rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
 333     if( RTL_TEXTENCODING_DONTKNOW == encoding )
 334     {
 335         m_bCanContinue = false;
 336         m_bInitialized = false;
 337     }
 338     else
 339     {
 340         init( encoding );
 341     }
 342 }
 343
 344 Text2UnicodeConverter::~Text2UnicodeConverter()
 345 {
 346     if( m_bInitialized )
 347     {
 348         rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
 349         rtl_destroyUnicodeToTextConverter( m_convText2Unicode );
 350     }
 351 }
 352
 353 void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
 354 {
 355     m_bCanContinue = true;
 356     m_bInitialized = true;
 357
 358     m_convText2Unicode  = rtl_createTextToUnicodeConverter(encoding);
 359     m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );
 360 }
 361
 362
 363 Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
 364 {
 365     sal_uInt32 uiInfo;
 366     sal_Size nSrcCvtBytes   = 0;
 367     sal_Size nTargetCount   = 0;
 368     sal_Size nSourceCount   = 0;
 369
 370     // the whole source size
 371     sal_Int32   nSourceSize = seqText.getLength() + m_seqSource.getLength();
 372     Sequence<sal_Unicode>   seqUnicode ( nSourceSize );
 373
 374     const sal_Int8 *pbSource = seqText.getConstArray();
 375     std::unique_ptr<sal_Int8[]> pbTempMem;
 376
 377     if( m_seqSource.hasElements() ) {
 378         // put old rest and new byte sequence into one array
 379         pbTempMem.reset(new sal_Int8[ nSourceSize ]);
 380         memcpy( pbTempMem.get() , m_seqSource.getConstArray() , m_seqSource.getLength() );
 381         memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() );
 382         pbSource = pbTempMem.get();
 383
 384         // set to zero again
 385         m_seqSource = Sequence< sal_Int8 >();
 386     }
 387
 388     while( true ) {
 389
 390         /* All invalid characters are transformed to the unicode undefined char */
 391         nTargetCount +=     rtl_convertTextToUnicode(
 392                                     m_convText2Unicode,
 393                                     m_contextText2Unicode,
 394                                     reinterpret_cast<const char *>(&( pbSource[nSourceCount] )),
 395                                     nSourceSize - nSourceCount ,
 396                                     &( seqUnicode.getArray()[ nTargetCount ] ),
 397                                     seqUnicode.getLength() - nTargetCount,
 398                                     RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT   |
 399                                     RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
 400                                     RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
 401                                     &uiInfo,
 402                                     &nSrcCvtBytes );
 403         nSourceCount += nSrcCvtBytes;
 404
 405         if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL ) {
 406             // save necessary bytes for next conversion
 407             seqUnicode.realloc( seqUnicode.getLength() * 2 );
 408             continue;
 409         }
 410         break;
 411     }
 412     if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL ) {
 413         m_seqSource.realloc( nSourceSize - nSourceCount );
 414         memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount );
 415     }
 416
 417     // set to correct unicode size
 418     seqUnicode.realloc( nTargetCount );
 419
 420     return seqUnicode;
 421 }
 422
 423
 424 // Unicode2TextConverter
 425
 426
 427 Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
 428 {
 429     m_convUnicode2Text  = rtl_createUnicodeToTextConverter( encoding );
 430     m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );
 431 }
 432
 433
 434 Unicode2TextConverter::~Unicode2TextConverter()
 435 {
 436     rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );
 437     rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
 438 }
 439
 440
 441 Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
 442 {
 443     std::unique_ptr<sal_Unicode[]> puTempMem;
 444
 445     if( m_seqSource.hasElements() ) {
 446         // For surrogates !
 447         // put old rest and new byte sequence into one array
 448         // In general when surrogates are used, they should be rarely
 449         // cut off between two convert()-calls. So this code is used
 450         // rarely and the extra copy is acceptable.
 451         puTempMem.reset(new sal_Unicode[ nSourceSize + m_seqSource.getLength()]);
 452         memcpy( puTempMem.get() ,
 453                 m_seqSource.getConstArray() ,
 454                 m_seqSource.getLength() * sizeof( sal_Unicode ) );
 455         memcpy(
 456             &(puTempMem[ m_seqSource.getLength() ]) ,
 457             puSource ,
 458             nSourceSize*sizeof( sal_Unicode ) );
 459         puSource = puTempMem.get();
 460         nSourceSize += m_seqSource.getLength();
 461
 462         m_seqSource = Sequence< sal_Unicode > ();
 463     }
 464
 465
 466     sal_Size nTargetCount = 0;
 467     sal_Size nSourceCount = 0;
 468
 469     sal_uInt32 uiInfo;
 470     sal_Size nSrcCvtChars;
 471
 472     // take nSourceSize * 3 as preference
 473     // this is an upper boundary for converting to utf8,
 474     // which most often used as the target.
 475     sal_Int32 nSeqSize =  nSourceSize * 3;
 476
 477     Sequence<sal_Int8>  seqText( nSeqSize );
 478     char *pTarget = reinterpret_cast<char *>(seqText.getArray());
 479     while( true ) {
 480
 481         nTargetCount += rtl_convertUnicodeToText(
 482                                     m_convUnicode2Text,
 483                                     m_contextUnicode2Text,
 484                                     &( puSource[nSourceCount] ),
 485                                     nSourceSize - nSourceCount ,
 486                                     &( pTarget[nTargetCount] ),
 487                                     nSeqSize - nTargetCount,
 488                                     RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
 489                                     RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
 490                                     &uiInfo,
 491                                     &nSrcCvtChars);
 492         nSourceCount += nSrcCvtChars;
 493
 494         if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) {
 495             nSeqSize = nSeqSize *2;
 496             seqText.realloc( nSeqSize );  // double array size
 497             pTarget = reinterpret_cast<char *>(seqText.getArray());
 498             continue;
 499         }
 500         break;
 501     }
 502
 503     // for surrogates
 504     if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
 505         m_seqSource.realloc( nSourceSize - nSourceCount );
 506         memcpy( m_seqSource.getArray() ,
 507                 &(puSource[nSourceCount]),
 508                 (nSourceSize - nSourceCount) * sizeof( sal_Unicode ) );
 509     }
 510
 511     // reduce the size of the buffer (fast, no copy necessary)
 512     seqText.realloc( nTargetCount );
 513
 514     return seqText;
 515 }
 516
 517 }
 518
 519 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */