sax/source/expatwrap/xml2utf.cxx

   1 /*************************************************************************
   2  *
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * Copyright 2008 by Sun Microsystems, Inc.
   6  *
   7  * OpenOffice.org - a multi-platform office productivity suite
   8  *
   9  * $RCSfile: xml2utf.cxx,v $
  10  * $Revision: 1.11.10.1 $
  11  *
  12  * This file is part of OpenOffice.org.
  13  *
  14  * OpenOffice.org is free software: you can redistribute it and/or modify
  15  * it under the terms of the GNU Lesser General Public License version 3
  16  * only, as published by the Free Software Foundation.
  17  *
  18  * OpenOffice.org is distributed in the hope that it will be useful,
  19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21  * GNU Lesser General Public License version 3 for more details
  22  * (a copy is included in the LICENSE file that accompanied this code).
  23  *
  24  * You should have received a copy of the GNU Lesser General Public License
  25  * version 3 along with OpenOffice.org.  If not, see
  26  * <http://www.openoffice.org/license.html>
  27  * for a copy of the LGPLv3 License.
  28  *
  29  ************************************************************************/
  30 #include <string.h>
  31
  32 #include <sal/types.h>
  33
  34 #include <rtl/textenc.h>
  35 #include <rtl/tencinfo.h>
  36
  37
  38 #include <com/sun/star/io/XInputStream.hpp>
  39
  40 using namespace rtl;
  41 using namespace ::com::sun::star::uno;
  42 using namespace ::com::sun::star::io;
  43
  44 #include "xml2utf.hxx"
  45
  46 namespace sax_expatwrap {
  47
  48 sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
  49     throw ( IOException, NotConnectedException , BufferSizeExceededException , RuntimeException )
  50 {
  51
  52     Sequence<sal_Int8> seqIn;
  53
  54     if( ! m_in.is() ) {
  55         throw NotConnectedException();
  56     }
  57     if( ! m_bStarted ) {
  58         nMaxToRead = Max( 512 , nMaxToRead );   // it should be possible to find the encoding attribute
  59                                                  // within the first 512 bytes == 128 chars in UCS-4
  60     }
  61
  62     sal_Int32 nRead;
  63     Sequence< sal_Int8 > seqStart;
  64     while( sal_True )
  65     {
  66         nRead = m_in->readSomeBytes( seq , nMaxToRead );
  67
  68         if( nRead + seqStart.getLength())
  69         {
  70             // if nRead is 0, the file is already eof.
  71             if( ! m_bStarted && nRead )
  72             {
  73                 // ensure that enough data is available to parse encoding
  74                 if( seqStart.getLength() )
  75                 {
  76                   // prefix with what we had so far.
  77                   sal_Int32 nLength = seq.getLength();
  78                   seq.realloc( seqStart.getLength() + nLength );
  79
  80                   memmove (seq.getArray() + seqStart.getLength(),
  81                        seq.getConstArray(),
  82                        nLength);
  83                   memcpy  (seq.getArray(),
  84                        seqStart.getConstArray(),
  85                        seqStart.getLength());
  86                 }
  87
  88                 // autodetection with the first bytes
  89                 if( ! isEncodingRecognizable( seq ) )
  90                 {
  91                   // remember what we have so far.
  92                   seqStart = seq;
  93
  94                   // read more !
  95                   continue;
  96                 }
  97                 if( scanForEncoding( seq ) || m_sEncoding.getLength() ) {
  98                     // initialize decoding
  99                     initializeDecoding();
 100                 }
 101                 nRead = seq.getLength();
 102                 seqStart = Sequence < sal_Int8 > ();
 103             }
 104
 105             // do the encoding
 106             if( m_pText2Unicode && m_pUnicode2Text &&
 107                 m_pText2Unicode->canContinue() && m_pUnicode2Text->canContinue() ) {
 108
 109                 Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
 110                 seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(),     seqUnicode.getLength() );
 111             }
 112
 113             if( ! m_bStarted )
 114             {
 115                 // it must now be ensured, that no encoding attribute exist anymore
 116                 // ( otherwise the expat-Parser will crash )
 117                 // This must be done after decoding !
 118                 // ( e.g. Files decoded in ucs-4 cannot be read properly )
 119                 m_bStarted = sal_True;
 120                 removeEncoding( seq );
 121             }
 122             nRead = seq.getLength();
 123         }
 124
 125         break;
 126     }
 127     return nRead;
 128 }
 129
 130
 131 XMLFile2UTFConverter::~XMLFile2UTFConverter()
 132 {
 133     if( m_pText2Unicode )
 134         delete m_pText2Unicode;
 135     if( m_pUnicode2Text )
 136         delete m_pUnicode2Text;
 137 }
 138
 139
 140 void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
 141 {
 142     const sal_Int8 *pSource = seq.getArray();
 143     if( ! strncmp( (const char * ) pSource , "<?xml" , 4) )
 144     {
 145
 146         // scan for encoding
 147         OString str( (sal_Char * ) pSource , seq.getLength() );
 148
 149         // cut sequence to first line break
 150         // find first line break;
 151         int nMax = str.indexOf( 10 );
 152         if( nMax >= 0 )
 153         {
 154             str = str.copy( 0 , nMax );
 155         }
 156
 157         int nFound = str.indexOf( " encoding" );
 158         if( nFound >= 0 ) {
 159             int nStop;
 160             int nStart = str.indexOf( "\"" , nFound );
 161             if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
 162             {
 163                 nStart = str.indexOf( "'" , nFound );
 164                 nStop  = str.indexOf( "'" , nStart +1 );
 165             }
 166             else
 167             {
 168                 nStop  = str.indexOf( "\"" , nStart +1);
 169             }
 170
 171             if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
 172             {
 173                 // remove encoding tag from file
 174                 memmove(        &( seq.getArray()[nFound] ) ,
 175                                 &( seq.getArray()[nStop+1]) ,
 176                                 seq.getLength() - nStop -1);
 177                 seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
 178 //                              str = String( (char * ) seq.getArray() , seq.getLen() );
 179             }
 180         }
 181     }
 182 }
 183
 184 // Checks, if enough data has been accumulated to recognize the encoding
 185 sal_Bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
 186 {
 187     const sal_Int8 *pSource = seq.getConstArray();
 188     sal_Bool bCheckIfFirstClosingBracketExsists = sal_False;
 189
 190     if( seq.getLength() < 8 ) {
 191         // no recognition possible, when less than 8 bytes are available
 192         return sal_False;
 193     }
 194
 195     if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
 196         // scan if the <?xml tag finishes within this buffer
 197         bCheckIfFirstClosingBracketExsists = sal_True;
 198     }
 199     else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
 200              ( ('?' == pSource[4] || '?' == pSource[6] ) ) )
 201     {
 202         // check for utf-16
 203         bCheckIfFirstClosingBracketExsists = sal_True;
 204     }
 205     else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
 206              ( '?' == pSource[5] || '?' == pSource[7] ) )
 207     {
 208         // check for
 209         bCheckIfFirstClosingBracketExsists = sal_True;
 210     }
 211
 212     if( bCheckIfFirstClosingBracketExsists )
 213     {
 214         for( sal_Int32 i = 0; i < seq.getLength() ; i ++ )
 215         {
 216             // whole <?xml tag is valid
 217             if( '>' == pSource[ i ] )
 218             {
 219                 return sal_True;
 220             }
 221         }
 222         return sal_False;
 223     }
 224
 225     // No <? tag in front, no need for a bigger buffer
 226     return sal_True;
 227 }
 228
 229 sal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
 230 {
 231     const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
 232     sal_Bool bReturn = sal_True;
 233
 234     if( seq.getLength() < 4 ) {
 235         // no recognition possible, when less than 4 bytes are available
 236         return sal_False;
 237     }
 238
 239     // first level : detect possible file formats
 240     if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
 241
 242         // scan for encoding
 243         OString str( (const sal_Char *) pSource , seq.getLength() );
 244
 245         // cut sequence to first line break
 246         //find first line break;
 247         int nMax = str.indexOf( 10 );
 248         if( nMax >= 0 )
 249         {
 250             str = str.copy( 0 , nMax );
 251         }
 252
 253         int nFound = str.indexOf( " encoding" );
 254         if( nFound < str.getLength() ) {
 255             int nStop;
 256             int nStart = str.indexOf( "\"" , nFound );
 257             if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
 258             {
 259                 nStart = str.indexOf( "'" , nFound );
 260                 nStop  = str.indexOf( "'" , nStart +1 );
 261             }
 262             else
 263             {
 264                 nStop  = str.indexOf( "\"" , nStart +1);
 265             }
 266             if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
 267             {
 268                 // encoding found finally
 269                 m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
 270             }
 271         }
 272     }
 273     else if( 0xFE == pSource[0] &&
 274              0xFF == pSource[1] ) {
 275         // UTF-16 big endian
 276         // conversion is done so that encoding information can be easily extracted
 277         m_sEncoding = "utf-16";
 278     }
 279     else if( 0xFF == pSource[0] &&
 280              0xFE == pSource[1] ) {
 281         // UTF-16 little endian
 282         // conversion is done so that encoding information can be easily extracted
 283         m_sEncoding = "utf-16";
 284     }
 285     else if( 0x00 == pSource[0] && 0x3c == pSource[1]  && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
 286         // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
 287         // The byte order mark is simply added
 288
 289         // simply add the byte order mark !
 290         seq.realloc( seq.getLength() + 2 );
 291         memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
 292         ((sal_uInt8*)seq.getArray())[0] = 0xFE;
 293         ((sal_uInt8*)seq.getArray())[1] = 0xFF;
 294
 295         m_sEncoding = "utf-16";
 296     }
 297     else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
 298         // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
 299         // The byte order mark is simply added
 300
 301         seq.realloc( seq.getLength() + 2 );
 302         memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
 303         ((sal_uInt8*)seq.getArray())[0] = 0xFF;
 304         ((sal_uInt8*)seq.getArray())[1] = 0xFE;
 305
 306         m_sEncoding = "utf-16";
 307     }
 308     else if( 0xEF == pSource[0] &&
 309              0xBB == pSource[1] &&
 310              0xBF == pSource[2] )
 311     {
 312         // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
 313         // The BOM is removed.
 314         memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
 315         seq.realloc( seq.getLength() - 3 );
 316         m_sEncoding = "utf-8";
 317     }
 318     else if( 0x00 == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
 319         // UCS-4 big endian
 320         m_sEncoding = "ucs-4";
 321     }
 322     else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
 323         // UCS-4 little endian
 324         m_sEncoding = "ucs-4";
 325     }
 326     else if( 0x4c == pSource[0] && 0x6f == pSource[1]  &&
 327              0xa7 == static_cast<unsigned char> (pSource[2]) &&
 328              0x94 == static_cast<unsigned char> (pSource[3]) ) {
 329         // EBCDIC
 330         bReturn = sal_False;   // must be extended
 331     }
 332     else {
 333         // other
 334         // UTF8 is directly recognized by the parser.
 335         bReturn = sal_False;
 336     }
 337
 338     return bReturn;
 339 }
 340
 341 void XMLFile2UTFConverter::initializeDecoding()
 342 {
 343
 344     if( m_sEncoding.getLength() )
 345     {
 346         rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
 347         if( encoding != RTL_TEXTENCODING_UTF8 )
 348         {
 349             m_pText2Unicode = new Text2UnicodeConverter( m_sEncoding );
 350             m_pUnicode2Text = new Unicode2TextConverter( RTL_TEXTENCODING_UTF8 );
 351         }
 352     }
 353 }
 354
 355
 356 //----------------------------------------------
 357 //
 358 // Text2UnicodeConverter
 359 //
 360 //----------------------------------------------
 361 Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
 362 {
 363     rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
 364     if( RTL_TEXTENCODING_DONTKNOW == encoding )
 365     {
 366         m_bCanContinue = sal_False;
 367         m_bInitialized = sal_False;
 368     }
 369     else
 370     {
 371         init( encoding );
 372     }
 373 }
 374
 375 Text2UnicodeConverter::~Text2UnicodeConverter()
 376 {
 377     if( m_bInitialized )
 378     {
 379         rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
 380         rtl_destroyUnicodeToTextConverter( m_convText2Unicode );
 381     }
 382 }
 383
 384 void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
 385 {
 386     m_bCanContinue = sal_True;
 387     m_bInitialized = sal_True;
 388
 389     m_convText2Unicode  = rtl_createTextToUnicodeConverter(encoding);
 390     m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );
 391     m_rtlEncoding = encoding;
 392 }
 393
 394
 395 Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
 396 {
 397     sal_uInt32 uiInfo;
 398     sal_Size nSrcCvtBytes       = 0;
 399     sal_Size nTargetCount       = 0;
 400     sal_Size nSourceCount   = 0;
 401
 402     // the whole source size
 403     sal_Int32   nSourceSize = seqText.getLength() + m_seqSource.getLength();
 404     Sequence<sal_Unicode>       seqUnicode ( nSourceSize );
 405
 406     const sal_Int8 *pbSource = seqText.getConstArray();
 407     sal_Int8 *pbTempMem = 0;
 408
 409     if( m_seqSource.getLength() ) {
 410         // put old rest and new byte sequence into one array
 411         pbTempMem = new sal_Int8[ nSourceSize ];
 412         memcpy( pbTempMem , m_seqSource.getConstArray() , m_seqSource.getLength() );
 413         memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() );
 414         pbSource = pbTempMem;
 415
 416         // set to zero again
 417         m_seqSource = Sequence< sal_Int8 >();
 418     }
 419
 420     while( sal_True ) {
 421
 422         /* All invalid characters are transformed to the unicode undefined char */
 423         nTargetCount +=         rtl_convertTextToUnicode(
 424                                     m_convText2Unicode,
 425                                     m_contextText2Unicode,
 426                                     ( const sal_Char * ) &( pbSource[nSourceCount] ),
 427                                     nSourceSize - nSourceCount ,
 428                                     &( seqUnicode.getArray()[ nTargetCount ] ),
 429                                     seqUnicode.getLength() - nTargetCount,
 430                                     RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT   |
 431                                     RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
 432                                     RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
 433                                     &uiInfo,
 434                                     &nSrcCvtBytes );
 435         nSourceCount += nSrcCvtBytes;
 436
 437         if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ) {
 438             // save necessary bytes for next conversion
 439             seqUnicode.realloc( seqUnicode.getLength() * 2 );
 440             continue;
 441         }
 442         break;
 443     }
 444     if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ) {
 445         m_seqSource.realloc( nSourceSize - nSourceCount );
 446         memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount );
 447     }
 448
 449
 450     if( pbTempMem ) {
 451         delete [] pbTempMem;
 452     }
 453
 454     // set to correct unicode size
 455     seqUnicode.realloc( nTargetCount );
 456
 457     return seqUnicode;
 458 }
 459
 460
 461
 462 //----------------------------------------------
 463 //
 464 // Unicode2TextConverter
 465 //
 466 //----------------------------------------------
 467 Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
 468 {
 469     init( encoding );
 470 }
 471
 472
 473 Unicode2TextConverter::~Unicode2TextConverter()
 474 {
 475     if( m_bInitialized ) {
 476         rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );
 477         rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
 478     }
 479 }
 480
 481
 482 Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
 483 {
 484     sal_Unicode *puTempMem = 0;
 485
 486     if( m_seqSource.getLength() ) {
 487         // For surrogates !
 488         // put old rest and new byte sequence into one array
 489         // In general when surrogates are used, they should be rarely
 490         // cut off between two convert()-calls. So this code is used
 491         // rarely and the extra copy is acceptable.
 492         puTempMem = new sal_Unicode[ nSourceSize + m_seqSource.getLength()];
 493         memcpy( puTempMem ,
 494                 m_seqSource.getConstArray() ,
 495                 m_seqSource.getLength() * sizeof( sal_Unicode ) );
 496         memcpy(
 497             &(puTempMem[ m_seqSource.getLength() ]) ,
 498             puSource ,
 499             nSourceSize*sizeof( sal_Unicode ) );
 500         puSource = puTempMem;
 501         nSourceSize += m_seqSource.getLength();
 502
 503         m_seqSource = Sequence< sal_Unicode > ();
 504     }
 505
 506
 507     sal_Size nTargetCount = 0;
 508     sal_Size nSourceCount = 0;
 509
 510     sal_uInt32 uiInfo;
 511     sal_Size nSrcCvtChars;
 512
 513     // take nSourceSize * 3 as preference
 514     // this is an upper boundary for converting to utf8,
 515     // which most often used as the target.
 516     sal_Int32 nSeqSize =  nSourceSize * 3;
 517
 518     Sequence<sal_Int8>  seqText( nSeqSize );
 519     sal_Char *pTarget = (sal_Char *) seqText.getArray();
 520     while( sal_True ) {
 521
 522         nTargetCount += rtl_convertUnicodeToText(
 523                                     m_convUnicode2Text,
 524                                     m_contextUnicode2Text,
 525                                     &( puSource[nSourceCount] ),
 526                                     nSourceSize - nSourceCount ,
 527                                     &( pTarget[nTargetCount] ),
 528                                     nSeqSize - nTargetCount,
 529                                     RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
 530                                     RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
 531                                     &uiInfo,
 532                                     &nSrcCvtChars);
 533         nSourceCount += nSrcCvtChars;
 534
 535         if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) {
 536             nSeqSize = nSeqSize *2;
 537             seqText.realloc( nSeqSize );  // double array size
 538             pTarget = ( sal_Char * ) seqText.getArray();
 539             continue;
 540         }
 541         break;
 542     }
 543
 544     // for surrogates
 545     if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
 546         m_seqSource.realloc( nSourceSize - nSourceCount );
 547         memcpy( m_seqSource.getArray() ,
 548                 &(puSource[nSourceCount]),
 549                 (nSourceSize - nSourceCount) * sizeof( sal_Unicode ) );
 550     }
 551
 552     if( puTempMem ) {
 553         delete [] puTempMem;
 554     }
 555
 556     // reduce the size of the buffer (fast, no copy necessary)
 557     seqText.realloc( nTargetCount );
 558
 559     return seqText;
 560 }
 561
 562 void Unicode2TextConverter::init( rtl_TextEncoding encoding )
 563 {
 564     m_bCanContinue = sal_True;
 565     m_bInitialized = sal_True;
 566
 567     m_convUnicode2Text  = rtl_createUnicodeToTextConverter( encoding );
 568     m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );
 569     m_rtlEncoding = encoding;
 570 };
 571
 572
 573 }