sax/source/expatwrap/xml2utf.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19 #include <string.h>
  20
  21 #include <algorithm>
  22
  23 #include <sal/types.h>
  24
  25 #include <rtl/textenc.h>
  26 #include <rtl/tencinfo.h>
  27
  28 #include <com/sun/star/io/XInputStream.hpp>
  29
  30 using namespace ::com::sun::star::uno;
  31 using namespace ::com::sun::star::io;
  32
  33
  34 #include "xml2utf.hxx"
  35 #include <boost/scoped_array.hpp>
  36
  37 namespace sax_expatwrap {
  38
  39 sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
  40     throw ( IOException, NotConnectedException , BufferSizeExceededException , RuntimeException )
  41 {
  42     if( ! m_in.is() ) {
  43         throw NotConnectedException();
  44     }
  45     if( ! m_bStarted ) {
  46         // it should be possible to find the encoding attribute
  47         // within the first 512 bytes == 128 chars in UCS-4
  48         nMaxToRead = ::std::max( sal_Int32(512) , nMaxToRead );
  49     }
  50
  51     sal_Int32 nRead;
  52     Sequence< sal_Int8 > seqStart;
  53     while( true )
  54     {
  55         nRead = m_in->readSomeBytes( seq , nMaxToRead );
  56
  57         if( nRead + seqStart.getLength())
  58         {
  59             // if nRead is 0, the file is already eof.
  60             if( ! m_bStarted && nRead )
  61             {
  62                 // ensure that enough data is available to parse encoding
  63                 if( seqStart.getLength() )
  64                 {
  65                   // prefix with what we had so far.
  66                   sal_Int32 nLength = seq.getLength();
  67                   seq.realloc( seqStart.getLength() + nLength );
  68
  69                   memmove (seq.getArray() + seqStart.getLength(),
  70                        seq.getConstArray(),
  71                        nLength);
  72                   memcpy  (seq.getArray(),
  73                        seqStart.getConstArray(),
  74                        seqStart.getLength());
  75                 }
  76
  77                 // autodetection with the first bytes
  78                 if( ! isEncodingRecognizable( seq ) )
  79                 {
  80                   // remember what we have so far.
  81                   seqStart = seq;
  82
  83                   // read more !
  84                   continue;
  85                 }
  86                 if( scanForEncoding( seq ) || !m_sEncoding.isEmpty() ) {
  87                     // initialize decoding
  88                     initializeDecoding();
  89                 }
  90                 seqStart = Sequence < sal_Int8 > ();
  91             }
  92
  93             // do the encoding
  94             if( m_pText2Unicode && m_pUnicode2Text &&
  95                 m_pText2Unicode->canContinue() && m_pUnicode2Text->canContinue() ) {
  96
  97                 Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
  98                 seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() );
  99             }
 100
 101             if( ! m_bStarted )
 102             {
 103                 // it must now be ensured, that no encoding attribute exist anymore
 104                 // ( otherwise the expat-Parser will crash )
 105                 // This must be done after decoding !
 106                 // ( e.g. Files decoded in ucs-4 cannot be read properly )
 107                 m_bStarted = true;
 108                 removeEncoding( seq );
 109             }
 110             nRead = seq.getLength();
 111         }
 112
 113         break;
 114     }
 115     return nRead;
 116 }
 117
 118
 119 XMLFile2UTFConverter::~XMLFile2UTFConverter()
 120 {
 121     if( m_pText2Unicode )
 122         delete m_pText2Unicode;
 123     if( m_pUnicode2Text )
 124         delete m_pUnicode2Text;
 125 }
 126
 127
 128 void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
 129 {
 130     const sal_Int8 *pSource = seq.getArray();
 131     if( ! strncmp( reinterpret_cast<const char *>(pSource), "<?xml", 4) )
 132     {
 133
 134         // scan for encoding
 135         OString str( reinterpret_cast<char const *>(pSource), seq.getLength() );
 136
 137         // cut sequence to first line break
 138         // find first line break;
 139         int nMax = str.indexOf( 10 );
 140         if( nMax >= 0 )
 141         {
 142             str = str.copy( 0 , nMax );
 143         }
 144
 145         int nFound = str.indexOf( " encoding" );
 146         if( nFound >= 0 ) {
 147             int nStop;
 148             int nStart = str.indexOf( "\"" , nFound );
 149             if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
 150             {
 151                 nStart = str.indexOf( "'" , nFound );
 152                 nStop  = str.indexOf( "'" , nStart +1 );
 153             }
 154             else
 155             {
 156                 nStop  = str.indexOf( "\"" , nStart +1);
 157             }
 158
 159             if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
 160             {
 161                 // remove encoding tag from file
 162                 memmove(        &( seq.getArray()[nFound] ) ,
 163                                 &( seq.getArray()[nStop+1]) ,
 164                                 seq.getLength() - nStop -1);
 165                 seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
 166 //              str = String( (char * ) seq.getArray() , seq.getLen() );
 167             }
 168         }
 169     }
 170 }
 171
 172 // Checks, if enough data has been accumulated to recognize the encoding
 173 bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
 174 {
 175     const sal_Int8 *pSource = seq.getConstArray();
 176     bool bCheckIfFirstClosingBracketExsists = false;
 177
 178     if( seq.getLength() < 8 ) {
 179         // no recognition possible, when less than 8 bytes are available
 180         return false;
 181     }
 182
 183     if( ! strncmp( reinterpret_cast<const char *>(pSource), "<?xml", 4 ) ) {
 184         // scan if the <?xml tag finishes within this buffer
 185         bCheckIfFirstClosingBracketExsists = true;
 186     }
 187     else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
 188              ( ('?' == pSource[4] || '?' == pSource[6] ) ) )
 189     {
 190         // check for utf-16
 191         bCheckIfFirstClosingBracketExsists = true;
 192     }
 193     else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
 194              ( '?' == pSource[5] || '?' == pSource[7] ) )
 195     {
 196         // check for
 197         bCheckIfFirstClosingBracketExsists = true;
 198     }
 199
 200     if( bCheckIfFirstClosingBracketExsists )
 201     {
 202         for( sal_Int32 i = 0; i < seq.getLength() ; i ++ )
 203         {
 204             // whole <?xml tag is valid
 205             if( '>' == pSource[ i ] )
 206             {
 207                 return true;
 208             }
 209         }
 210         return false;
 211     }
 212
 213     // No <? tag in front, no need for a bigger buffer
 214     return true;
 215 }
 216
 217 bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
 218 {
 219     const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
 220     bool bReturn = true;
 221
 222     if( seq.getLength() < 4 ) {
 223         // no recognition possible, when less than 4 bytes are available
 224         return false;
 225     }
 226
 227     // first level : detect possible file formats
 228     if( ! strncmp( reinterpret_cast<const char *>(pSource), "<?xml", 4 ) ) {
 229
 230         // scan for encoding
 231         OString str( reinterpret_cast<const char *>(pSource), seq.getLength() );
 232
 233         // cut sequence to first line break
 234         //find first line break;
 235         int nMax = str.indexOf( 10 );
 236         if( nMax >= 0 )
 237         {
 238             str = str.copy( 0 , nMax );
 239         }
 240
 241         int nFound = str.indexOf( " encoding" );
 242         if( nFound >= 0 ) {
 243             int nStop;
 244             int nStart = str.indexOf( "\"" , nFound );
 245             if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
 246             {
 247                 nStart = str.indexOf( "'" , nFound );
 248                 nStop  = str.indexOf( "'" , nStart +1 );
 249             }
 250             else
 251             {
 252                 nStop  = str.indexOf( "\"" , nStart +1);
 253             }
 254             if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
 255             {
 256                 // encoding found finally
 257                 m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
 258             }
 259         }
 260     }
 261     else if( 0xFE == pSource[0] &&
 262              0xFF == pSource[1] ) {
 263         // UTF-16 big endian
 264         // conversion is done so that encoding information can be easily extracted
 265         m_sEncoding = "utf-16";
 266     }
 267     else if( 0xFF == pSource[0] &&
 268              0xFE == pSource[1] ) {
 269         // UTF-16 little endian
 270         // conversion is done so that encoding information can be easily extracted
 271         m_sEncoding = "utf-16";
 272     }
 273     else if( 0x00 == pSource[0] && 0x3c == pSource[1]  && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
 274         // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
 275         // The byte order mark is simply added
 276
 277         // simply add the byte order mark !
 278         seq.realloc( seq.getLength() + 2 );
 279         memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
 280         reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFE;
 281         reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFF;
 282
 283         m_sEncoding = "utf-16";
 284     }
 285     else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
 286         // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
 287         // The byte order mark is simply added
 288
 289         seq.realloc( seq.getLength() + 2 );
 290         memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
 291         reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFF;
 292         reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFE;
 293
 294         m_sEncoding = "utf-16";
 295     }
 296     else if( 0xEF == pSource[0] &&
 297              0xBB == pSource[1] &&
 298              0xBF == pSource[2] )
 299     {
 300         // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
 301         // The BOM is removed.
 302         memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
 303         seq.realloc( seq.getLength() - 3 );
 304         m_sEncoding = "utf-8";
 305     }
 306     else if( 0x00 == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
 307         // UCS-4 big endian
 308         m_sEncoding = "ucs-4";
 309     }
 310     else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
 311         // UCS-4 little endian
 312         m_sEncoding = "ucs-4";
 313     }
 314 /* TODO: no need to test for the moment since we return sal_False like default case anyway
 315     else if( 0x4c == pSource[0] && 0x6f == pSource[1]  &&
 316              0xa7 == static_cast<unsigned char> (pSource[2]) &&
 317              0x94 == static_cast<unsigned char> (pSource[3]) ) {
 318         // EBCDIC
 319         bReturn = sal_False;   // must be extended
 320     }
 321 */
 322     else {
 323         // other
 324         // UTF8 is directly recognized by the parser.
 325         bReturn = false;
 326     }
 327
 328     return bReturn;
 329 }
 330
 331 void XMLFile2UTFConverter::initializeDecoding()
 332 {
 333
 334     if( !m_sEncoding.isEmpty() )
 335     {
 336         rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
 337         if( encoding != RTL_TEXTENCODING_UTF8 )
 338         {
 339             m_pText2Unicode = new Text2UnicodeConverter( m_sEncoding );
 340             m_pUnicode2Text = new Unicode2TextConverter( RTL_TEXTENCODING_UTF8 );
 341         }
 342     }
 343 }
 344
 345
 346
 347
 348 // Text2UnicodeConverter
 349
 350
 351 Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
 352     : m_convText2Unicode(NULL)
 353     , m_contextText2Unicode(NULL)
 354     , m_rtlEncoding(RTL_TEXTENCODING_DONTKNOW)
 355 {
 356     rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
 357     if( RTL_TEXTENCODING_DONTKNOW == encoding )
 358     {
 359         m_bCanContinue = false;
 360         m_bInitialized = false;
 361     }
 362     else
 363     {
 364         init( encoding );
 365     }
 366 }
 367
 368 Text2UnicodeConverter::~Text2UnicodeConverter()
 369 {
 370     if( m_bInitialized )
 371     {
 372         rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
 373         rtl_destroyUnicodeToTextConverter( m_convText2Unicode );
 374     }
 375 }
 376
 377 void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
 378 {
 379     m_bCanContinue = true;
 380     m_bInitialized = true;
 381
 382     m_convText2Unicode  = rtl_createTextToUnicodeConverter(encoding);
 383     m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );
 384     m_rtlEncoding = encoding;
 385 }
 386
 387
 388 Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
 389 {
 390     sal_uInt32 uiInfo;
 391     sal_Size nSrcCvtBytes   = 0;
 392     sal_Size nTargetCount   = 0;
 393     sal_Size nSourceCount   = 0;
 394
 395     // the whole source size
 396     sal_Int32   nSourceSize = seqText.getLength() + m_seqSource.getLength();
 397     Sequence<sal_Unicode>   seqUnicode ( nSourceSize );
 398
 399     const sal_Int8 *pbSource = seqText.getConstArray();
 400     boost::scoped_array<sal_Int8> pbTempMem;
 401
 402     if( m_seqSource.getLength() ) {
 403         // put old rest and new byte sequence into one array
 404         pbTempMem.reset(new sal_Int8[ nSourceSize ]);
 405         memcpy( pbTempMem.get() , m_seqSource.getConstArray() , m_seqSource.getLength() );
 406         memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() );
 407         pbSource = pbTempMem.get();
 408
 409         // set to zero again
 410         m_seqSource = Sequence< sal_Int8 >();
 411     }
 412
 413     while( true ) {
 414
 415         /* All invalid characters are transformed to the unicode undefined char */
 416         nTargetCount +=     rtl_convertTextToUnicode(
 417                                     m_convText2Unicode,
 418                                     m_contextText2Unicode,
 419                                     reinterpret_cast<const char *>(&( pbSource[nSourceCount] )),
 420                                     nSourceSize - nSourceCount ,
 421                                     &( seqUnicode.getArray()[ nTargetCount ] ),
 422                                     seqUnicode.getLength() - nTargetCount,
 423                                     RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT   |
 424                                     RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
 425                                     RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
 426                                     &uiInfo,
 427                                     &nSrcCvtBytes );
 428         nSourceCount += nSrcCvtBytes;
 429
 430         if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ) {
 431             // save necessary bytes for next conversion
 432             seqUnicode.realloc( seqUnicode.getLength() * 2 );
 433             continue;
 434         }
 435         break;
 436     }
 437     if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ) {
 438         m_seqSource.realloc( nSourceSize - nSourceCount );
 439         memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount );
 440     }
 441
 442     // set to correct unicode size
 443     seqUnicode.realloc( nTargetCount );
 444
 445     return seqUnicode;
 446 }
 447
 448
 449
 450
 451
 452 // Unicode2TextConverter
 453
 454
 455 Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
 456 {
 457     init( encoding );
 458 }
 459
 460
 461 Unicode2TextConverter::~Unicode2TextConverter()
 462 {
 463     if( m_bInitialized ) {
 464         rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );
 465         rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
 466     }
 467 }
 468
 469
 470 Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
 471 {
 472     boost::scoped_array<sal_Unicode> puTempMem;
 473
 474     if( m_seqSource.getLength() ) {
 475         // For surrogates !
 476         // put old rest and new byte sequence into one array
 477         // In general when surrogates are used, they should be rarely
 478         // cut off between two convert()-calls. So this code is used
 479         // rarely and the extra copy is acceptable.
 480         puTempMem.reset(new sal_Unicode[ nSourceSize + m_seqSource.getLength()]);
 481         memcpy( puTempMem.get() ,
 482                 m_seqSource.getConstArray() ,
 483                 m_seqSource.getLength() * sizeof( sal_Unicode ) );
 484         memcpy(
 485             &(puTempMem[ m_seqSource.getLength() ]) ,
 486             puSource ,
 487             nSourceSize*sizeof( sal_Unicode ) );
 488         puSource = puTempMem.get();
 489         nSourceSize += m_seqSource.getLength();
 490
 491         m_seqSource = Sequence< sal_Unicode > ();
 492     }
 493
 494
 495     sal_Size nTargetCount = 0;
 496     sal_Size nSourceCount = 0;
 497
 498     sal_uInt32 uiInfo;
 499     sal_Size nSrcCvtChars;
 500
 501     // take nSourceSize * 3 as preference
 502     // this is an upper boundary for converting to utf8,
 503     // which most often used as the target.
 504     sal_Int32 nSeqSize =  nSourceSize * 3;
 505
 506     Sequence<sal_Int8>  seqText( nSeqSize );
 507     sal_Char *pTarget = reinterpret_cast<char *>(seqText.getArray());
 508     while( true ) {
 509
 510         nTargetCount += rtl_convertUnicodeToText(
 511                                     m_convUnicode2Text,
 512                                     m_contextUnicode2Text,
 513                                     &( puSource[nSourceCount] ),
 514                                     nSourceSize - nSourceCount ,
 515                                     &( pTarget[nTargetCount] ),
 516                                     nSeqSize - nTargetCount,
 517                                     RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
 518                                     RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
 519                                     &uiInfo,
 520                                     &nSrcCvtChars);
 521         nSourceCount += nSrcCvtChars;
 522
 523         if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) {
 524             nSeqSize = nSeqSize *2;
 525             seqText.realloc( nSeqSize );  // double array size
 526             pTarget = reinterpret_cast<char *>(seqText.getArray());
 527             continue;
 528         }
 529         break;
 530     }
 531
 532     // for surrogates
 533     if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
 534         m_seqSource.realloc( nSourceSize - nSourceCount );
 535         memcpy( m_seqSource.getArray() ,
 536                 &(puSource[nSourceCount]),
 537                 (nSourceSize - nSourceCount) * sizeof( sal_Unicode ) );
 538     }
 539
 540     // reduce the size of the buffer (fast, no copy necessary)
 541     seqText.realloc( nTargetCount );
 542
 543     return seqText;
 544 }
 545
 546 void Unicode2TextConverter::init( rtl_TextEncoding encoding )
 547 {
 548     m_bCanContinue = true;
 549     m_bInitialized = true;
 550
 551     m_convUnicode2Text  = rtl_createUnicodeToTextConverter( encoding );
 552     m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );
 553     m_rtlEncoding = encoding;
 554 };
 555
 556
 557 }
 558
 559 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */