svtools/source/svrtf/parrtf.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20
  21 #include <rtl/tencinfo.h>
  22 #include <tools/stream.hxx>
  23 #include <tools/debug.hxx>
  24 #include <svtools/rtftoken.h>
  25 #include <svtools/rtfkeywd.hxx>
  26 #include <svtools/parrtf.hxx>
  27 #include <comphelper/string.hxx>
  28
  29 const int MAX_STRING_LEN = 1024;
  30 const int MAX_TOKEN_LEN = 128;
  31
  32 #define RTF_ISDIGIT( c ) comphelper::string::isdigitAscii(c)
  33 #define RTF_ISALPHA( c ) comphelper::string::isalphaAscii(c)
  34
  35 SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize )
  36     : SvParser( rIn, nStackSize )
  37     , nOpenBrakets(0)
  38     , eCodeSet(RTL_TEXTENCODING_MS_1252)
  39     , eUNICodeSet(RTL_TEXTENCODING_MS_1252)    // default ist ANSI-CodeSet
  40     , nUCharOverread(1)
  41 {
  42     // default ist ANSI-CodeSet
  43     SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
  44     bRTF_InTextRead = false;
  45 }
  46
  47 SvRTFParser::~SvRTFParser()
  48 {
  49 }
  50
  51
  52
  53
  54 int SvRTFParser::_GetNextToken()
  55 {
  56     int nRet = 0;
  57     do {
  58         bool bNextCh = true;
  59         switch( nNextCh )
  60         {
  61         case '\\':
  62             {
  63                 // control charaters
  64                 switch( nNextCh = GetNextChar() )
  65                 {
  66                 case '{':
  67                 case '}':
  68                 case '\\':
  69                 case '+':       // I found it in a RTF-file
  70                 case '~':       // nonbreaking space
  71                 case '-':       // optional hyphen
  72                 case '_':       // nonbreaking hyphen
  73                 case '\'':      // HexValue
  74                     nNextCh = '\\';
  75                     rInput.SeekRel( -1 );
  76                     ScanText();
  77                     nRet = RTF_TEXTTOKEN;
  78                     bNextCh = 0 == nNextCh;
  79                     break;
  80
  81                 case '*':       // ignoreflag
  82                     nRet = RTF_IGNOREFLAG;
  83                     break;
  84                 case ':':       // subentry in an index entry
  85                     nRet = RTF_SUBENTRYINDEX;
  86                     break;
  87                 case '|':       // formula-character
  88                     nRet = RTF_FORMULA;
  89                     break;
  90
  91                 case 0x0a:
  92                 case 0x0d:
  93                     nRet = RTF_PAR;
  94                     break;
  95
  96                 default:
  97                     if( RTF_ISALPHA( nNextCh ) )
  98                     {
  99                         aToken = "\\";
 100                         {
 101                             OUStringBuffer aStrBuffer;
 102                             aStrBuffer.setLength( MAX_TOKEN_LEN );
 103                             sal_Int32 nStrLen = 0;
 104                             do {
 105                                 aStrBuffer[nStrLen++] = nNextCh;
 106                                 if( MAX_TOKEN_LEN == nStrLen )
 107                                 {
 108                                     aToken += aStrBuffer.toString();
 109                                     nStrLen = 0;
 110                                 }
 111                                 nNextCh = GetNextChar();
 112                             } while( RTF_ISALPHA( nNextCh ) );
 113                             if( nStrLen )
 114                             {
 115                                 aToken += aStrBuffer.makeStringAndClear();
 116                             }
 117                         }
 118
 119                         // minus before numeric parameters
 120                         bool bNegValue = false;
 121                         if( '-' == nNextCh )
 122                         {
 123                             bNegValue = true;
 124                             nNextCh = GetNextChar();
 125                         }
 126
 127                         // possible numeric parameter
 128                         if( RTF_ISDIGIT( nNextCh ) )
 129                         {
 130                             nTokenValue = 0;
 131                             do {
 132                                 nTokenValue *= 10;
 133                                 nTokenValue += nNextCh - '0';
 134                                 nNextCh = GetNextChar();
 135                             } while( RTF_ISDIGIT( nNextCh ) );
 136                             if( bNegValue )
 137                                 nTokenValue = -nTokenValue;
 138                             bTokenHasValue=true;
 139                         }
 140                         else if( bNegValue )        // restore minus
 141                         {
 142                             nNextCh = '-';
 143                             rInput.SeekRel( -1 );
 144                         }
 145                         if( ' ' == nNextCh )        // blank is part of token!
 146                             nNextCh = GetNextChar();
 147
 148                         // search for the token in the table:
 149                         if( 0 == (nRet = GetRTFToken( aToken )) )
 150                             // Unknown Control
 151                             nRet = RTF_UNKNOWNCONTROL;
 152
 153                         // bug 76812 - unicode token handled as normal text
 154                         bNextCh = false;
 155                         switch( nRet )
 156                         {
 157                         case RTF_UC:
 158                             if( 0 <= nTokenValue )
 159                             {
 160                                 nUCharOverread = (sal_uInt8)nTokenValue;
 161                                 //cmc: other ifdef breaks #i3584
 162                                 aParserStates.top().
 163                                     nUCharOverread = nUCharOverread;
 164                             }
 165                             aToken.clear(); // #i47831# erase token to prevent the token from being treated as text
 166                             // read next token
 167                             nRet = 0;
 168                             break;
 169
 170                         case RTF_UPR:
 171                             if (!_inSkipGroup) {
 172                             // UPR - overread the group with the ansi
 173                             //       information
 174                             while( '{' != _GetNextToken() )
 175                                 ;
 176                             SkipGroup();
 177                             _GetNextToken();  // overread the last bracket
 178                             nRet = 0;
 179                             }
 180                             break;
 181
 182                         case RTF_U:
 183                             if( !bRTF_InTextRead )
 184                             {
 185                                 nRet = RTF_TEXTTOKEN;
 186                                 aToken = OUString( (sal_Unicode)nTokenValue );
 187
 188                                 // overread the next n "RTF" characters. This
 189                                 // can be also \{, \}, \'88
 190                                 for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
 191                                 {
 192                                     sal_Unicode cAnsi = nNextCh;
 193                                     while( 0xD == cAnsi )
 194                                         cAnsi = GetNextChar();
 195                                     while( 0xA == cAnsi )
 196                                         cAnsi = GetNextChar();
 197
 198                                     if( '\\' == cAnsi &&
 199                                         '\'' == ( cAnsi = GetNextChar() ))
 200                                         // read on HexValue
 201                                         cAnsi = GetHexValue();
 202                                     nNextCh = GetNextChar();
 203                                 }
 204                                 ScanText();
 205                                 bNextCh = 0 == nNextCh;
 206                             }
 207                             break;
 208                         }
 209                     }
 210                     else if( SVPAR_PENDING != eState )
 211                     {
 212                         // Bug 34631 - "\ " read on - Blank as character
 213                         // eState = SVPAR_ERROR;
 214                         bNextCh = false;
 215                     }
 216                     break;
 217                 }
 218             }
 219             break;
 220
 221         case sal_Unicode(EOF):
 222             eState = SVPAR_ACCEPTED;
 223             nRet = nNextCh;
 224             break;
 225
 226         case '{':
 227             {
 228                 if( 0 <= nOpenBrakets )
 229                 {
 230                     RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
 231                     aParserStates.push( aState );
 232                 }
 233                 ++nOpenBrakets;
 234                 DBG_ASSERT(
 235                     static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
 236                     "ParserStateStack unequal to bracket count" );
 237                 nRet = nNextCh;
 238             }
 239             break;
 240
 241         case '}':
 242             --nOpenBrakets;
 243             if( 0 <= nOpenBrakets )
 244             {
 245                 aParserStates.pop();
 246                 if( !aParserStates.empty() )
 247                 {
 248                     const RtfParserState_Impl& rRPS =
 249                             aParserStates.top();
 250                     nUCharOverread = rRPS.nUCharOverread;
 251                     SetSrcEncoding( rRPS.eCodeSet );
 252                 }
 253                 else
 254                 {
 255                     nUCharOverread = 1;
 256                     SetSrcEncoding( GetCodeSet() );
 257                 }
 258             }
 259             DBG_ASSERT(
 260                 static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
 261                 "ParserStateStack unequal to bracket count" );
 262             nRet = nNextCh;
 263             break;
 264
 265         case 0x0d:
 266         case 0x0a:
 267             break;
 268
 269         default:
 270             // now normal text follows
 271             ScanText();
 272             nRet = RTF_TEXTTOKEN;
 273             bNextCh = 0 == nNextCh;
 274             break;
 275         }
 276
 277         if( bNextCh )
 278             nNextCh = GetNextChar();
 279
 280     } while( !nRet && SVPAR_WORKING == eState );
 281     return nRet;
 282 }
 283
 284
 285 sal_Unicode SvRTFParser::GetHexValue()
 286 {
 287     // collect Hex values
 288     int n;
 289     sal_Unicode nHexVal = 0;
 290
 291     for( n = 0; n < 2; ++n )
 292     {
 293         nHexVal *= 16;
 294         nNextCh = GetNextChar();
 295         if( nNextCh >= '0' && nNextCh <= '9' )
 296             nHexVal += (nNextCh - 48);
 297         else if( nNextCh >= 'a' && nNextCh <= 'f' )
 298             nHexVal += (nNextCh - 87);
 299         else if( nNextCh >= 'A' && nNextCh <= 'F' )
 300             nHexVal += (nNextCh - 55);
 301     }
 302     return nHexVal;
 303 }
 304
 305 void SvRTFParser::ScanText( const sal_Unicode cBreak )
 306 {
 307     OUStringBuffer aStrBuffer;
 308     bool bContinue = true;
 309     while( bContinue && IsParserWorking() && aStrBuffer.getLength() < MAX_STRING_LEN)
 310     {
 311         bool bNextCh = true;
 312         switch( nNextCh )
 313         {
 314         case '\\':
 315             {
 316                 switch (nNextCh = GetNextChar())
 317                 {
 318                 case '\'':
 319                     {
 320
 321                         OStringBuffer aByteString;
 322                         while (true)
 323                         {
 324                             char c = (char)GetHexValue();
 325                             /*
 326                              * Note: \'00 is a valid internal character in  a
 327                              * string in RTF. OStringBuffer supports
 328                              * appending nulls fine
 329                              */
 330                             aByteString.append(c);
 331
 332                             bool bBreak = false;
 333                             sal_Char nSlash = '\\';
 334                             while (!bBreak)
 335                             {
 336                                 wchar_t __next=GetNextChar();
 337                                 if (__next>0xFF) // fix for #i43933# and #i35653#
 338                                 {
 339                                     if (!aByteString.isEmpty())
 340                                         aStrBuffer.append( OStringToOUString(aByteString.makeStringAndClear(), GetSrcEncoding()) );
 341                                     aStrBuffer.append((sal_Unicode)__next);
 342
 343                                     continue;
 344                                 }
 345                                 nSlash = (sal_Char)__next;
 346                                 while (nSlash == 0xD || nSlash == 0xA)
 347                                     nSlash = (sal_Char)GetNextChar();
 348
 349                                 switch (nSlash)
 350                                 {
 351                                     case '{':
 352                                     case '}':
 353                                     case '\\':
 354                                         bBreak = true;
 355                                         break;
 356                                     default:
 357                                         aByteString.append(nSlash);
 358                                         break;
 359                                 }
 360                             }
 361
 362                             nNextCh = GetNextChar();
 363
 364                             if (nSlash != '\\' || nNextCh != '\'')
 365                             {
 366                                 rInput.SeekRel(-1);
 367                                 nNextCh = nSlash;
 368                                 break;
 369                             }
 370                         }
 371
 372                         bNextCh = false;
 373
 374                         if (!aByteString.isEmpty())
 375                             aStrBuffer.append( OStringToOUString(aByteString.makeStringAndClear(), GetSrcEncoding()) );
 376                     }
 377                     break;
 378                 case '\\':
 379                 case '}':
 380                 case '{':
 381                 case '+':       // I found in a RTF file
 382                     aStrBuffer.append(nNextCh);
 383                     break;
 384                 case '~':       // nonbreaking space
 385                     aStrBuffer.append(static_cast< sal_Unicode >(0xA0));
 386                     break;
 387                 case '-':       // optional hyphen
 388                     aStrBuffer.append(static_cast< sal_Unicode >(0xAD));
 389                     break;
 390                 case '_':       // nonbreaking hyphen
 391                     aStrBuffer.append(static_cast< sal_Unicode >(0x2011));
 392                     break;
 393
 394                 case 'u':
 395                     // read UNI-Code characters
 396                     {
 397                         nNextCh = GetNextChar();
 398                         rInput.SeekRel( -2 );
 399
 400                         if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
 401                         {
 402                             bRTF_InTextRead = true;
 403
 404                             OUString sSave( aToken );
 405                             nNextCh = '\\';
 406                             #ifdef DBG_UTIL
 407                             int nToken =
 408                             #endif
 409                                 _GetNextToken();
 410                             DBG_ASSERT( RTF_U == nToken, "doch kein UNI-Code Zeichen" );
 411                             // dont convert symbol chars
 412                             aStrBuffer.append(static_cast< sal_Unicode >(nTokenValue));
 413
 414                             // overread the next n "RTF" characters. This
 415                             // can be also \{, \}, \'88
 416                             for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
 417                             {
 418                                 sal_Unicode cAnsi = nNextCh;
 419                                 while( 0xD == cAnsi )
 420                                     cAnsi = GetNextChar();
 421                                 while( 0xA == cAnsi )
 422                                     cAnsi = GetNextChar();
 423
 424                                 if( '\\' == cAnsi &&
 425                                     '\'' == ( cAnsi = GetNextChar() ))
 426                                     // HexValue ueberlesen
 427                                     cAnsi = GetHexValue();
 428                                 nNextCh = GetNextChar();
 429                             }
 430                             bNextCh = false;
 431                             aToken = sSave;
 432                             bRTF_InTextRead = false;
 433                         }
 434                         else if ( 'c' == nNextCh )
 435                         {
 436                             // Prevent text breaking into multiple tokens.
 437                             rInput.SeekRel( 2 );
 438                             nNextCh = GetNextChar();
 439                             if (RTF_ISDIGIT( nNextCh ))
 440                             {
 441                                 sal_uInt8 nNewOverread = 0 ;
 442                                 do {
 443                                     nNewOverread *= 10;
 444                                     nNewOverread += nNextCh - '0';
 445                                     nNextCh = GetNextChar();
 446                                 } while ( RTF_ISDIGIT( nNextCh ) );
 447                                 nUCharOverread = nNewOverread;
 448                                 aParserStates.top().nUCharOverread = nNewOverread;
 449                             }
 450                             bNextCh = 0x20 == nNextCh;
 451                         }
 452                         else
 453                         {
 454                             nNextCh = '\\';
 455                             bContinue = false;        // abort, string together
 456                         }
 457                     }
 458                     break;
 459
 460                 default:
 461                     rInput.SeekRel( -1 );
 462                     nNextCh = '\\';
 463                     bContinue = false;        // abort, string together
 464                     break;
 465                 }
 466             }
 467             break;
 468
 469         case sal_Unicode(EOF): eState = SVPAR_ERROR;
 470                 // continue
 471         case '{':
 472         case '}':
 473             bContinue = false;
 474             break;
 475
 476         case 0x0a:
 477         case 0x0d:
 478             break;
 479
 480         default:
 481             if( nNextCh == cBreak || aStrBuffer.getLength() >= MAX_STRING_LEN)
 482                 bContinue = false;
 483             else
 484             {
 485                 do {
 486                     // all other characters end up in the text
 487                     aStrBuffer.append(nNextCh);
 488
 489                     if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
 490                     {
 491                         if (!aStrBuffer.isEmpty())
 492                             aToken += aStrBuffer.toString();
 493                         return;
 494                     }
 495                 } while
 496                 (
 497                     (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
 498                     (aStrBuffer.getLength() < MAX_STRING_LEN)
 499                 );
 500                 bNextCh = false;
 501             }
 502         }
 503
 504         if( bContinue && bNextCh )
 505             nNextCh = GetNextChar();
 506     }
 507
 508     if (!aStrBuffer.isEmpty())
 509         aToken += aStrBuffer.makeStringAndClear();
 510 }
 511
 512
 513 short SvRTFParser::_inSkipGroup=0;
 514
 515 void SvRTFParser::SkipGroup()
 516 {
 517 short nBrackets=1;
 518 if (_inSkipGroup>0)
 519     return;
 520 _inSkipGroup++;
 521 //#i16185# fecking \bin keyword
 522     do
 523     {
 524         switch (nNextCh)
 525         {
 526             case '{':
 527                 ++nBrackets;
 528                 break;
 529             case '}':
 530                 if (!--nBrackets) {
 531                     _inSkipGroup--;
 532                     return;
 533                 }
 534                 break;
 535         }
 536         int nToken = _GetNextToken();
 537         if (nToken == RTF_BIN)
 538         {
 539             rInput.SeekRel(-1);
 540             rInput.SeekRel(nTokenValue);
 541             nNextCh = GetNextChar();
 542         }
 543         while (nNextCh==0xa || nNextCh==0xd)
 544         {
 545             nNextCh = GetNextChar();
 546         }
 547     } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
 548
 549     if( SVPAR_PENDING != eState && '}' != nNextCh )
 550         eState = SVPAR_ERROR;
 551     _inSkipGroup--;
 552 }
 553
 554 void SvRTFParser::ReadUnknownData() { SkipGroup(); }
 555 void SvRTFParser::ReadBitmapData()  { SkipGroup(); }
 556 void SvRTFParser::ReadOLEData()     { SkipGroup(); }
 557
 558
 559 SvParserState SvRTFParser::CallParser()
 560 {
 561     sal_Char cFirstCh;
 562     nNextChPos = rInput.Tell();
 563     rInput.ReadChar( cFirstCh ); nNextCh = cFirstCh;
 564     eState = SVPAR_WORKING;
 565     nOpenBrakets = 0;
 566     SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
 567     eUNICodeSet = RTL_TEXTENCODING_MS_1252;     // default is ANSI-CodeSet
 568
 569     // the first two tokens should be '{' and \\rtf !!
 570     if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
 571     {
 572         AddFirstRef();
 573         Continue( 0 );
 574         if( SVPAR_PENDING != eState )
 575             ReleaseRef();       // now parser is not needed anymore
 576     }
 577     else
 578         eState = SVPAR_ERROR;
 579
 580     return eState;
 581 }
 582
 583 void SvRTFParser::Continue( int nToken )
 584 {
 585 //  DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
 586 //              "Characterset was changed." );
 587
 588     if( !nToken )
 589         nToken = GetNextToken();
 590
 591     while( IsParserWorking() )
 592     {
 593         SaveState( nToken );
 594         switch( nToken )
 595         {
 596         case '}':
 597             if( nOpenBrakets )
 598                 goto NEXTTOKEN;
 599             eState = SVPAR_ACCEPTED;
 600             break;
 601
 602         case '{':
 603             // a unknown group ?
 604             {
 605                 if( RTF_IGNOREFLAG != GetNextToken() )
 606                     nToken = SkipToken( -1 );
 607                 else if( RTF_UNKNOWNCONTROL != GetNextToken() )
 608                     nToken = SkipToken( -2 );
 609                 else
 610                 {
 611                     // filter immediately
 612                     ReadUnknownData();
 613                     nToken = GetNextToken();
 614                     if( '}' != nToken )
 615                         eState = SVPAR_ERROR;
 616                     break;      // move to next token!!
 617                 }
 618             }
 619             goto NEXTTOKEN;
 620
 621         case RTF_UNKNOWNCONTROL:
 622             break;      // skip unknown token
 623         case RTF_NEXTTYPE:
 624         case RTF_ANSITYPE:
 625             SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
 626             break;
 627         case RTF_MACTYPE:
 628             SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN );
 629             break;
 630         case RTF_PCTYPE:
 631             SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_437 );
 632             break;
 633         case RTF_PCATYPE:
 634             SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_850 );
 635             break;
 636         case RTF_ANSICPG:
 637             eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
 638             SetSrcEncoding(eCodeSet);
 639             break;
 640         default:
 641 NEXTTOKEN:
 642             NextToken( nToken );
 643             break;
 644         }
 645         if( IsParserWorking() )
 646             SaveState( 0 );         // processed till here,
 647                                     // continue with new token!
 648         nToken = GetNextToken();
 649     }
 650     if( SVPAR_ACCEPTED == eState && 0 < nOpenBrakets )
 651         eState = SVPAR_ERROR;
 652 }
 653
 654 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
 655 {
 656     if (eEnc == RTL_TEXTENCODING_DONTKNOW)
 657         eEnc = GetCodeSet();
 658
 659     if (!aParserStates.empty())
 660         aParserStates.top().eCodeSet = eEnc;
 661     SetSrcEncoding(eEnc);
 662 }
 663
 664 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */