svtools/source/svrtf/parrtf.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include <sal/config.h>
  21 #include <sal/log.hxx>
  22
  23 #include <comphelper/scopeguard.hxx>
  24
  25 #include <rtl/character.hxx>
  26 #include <rtl/strbuf.hxx>
  27 #include <rtl/tencinfo.h>
  28 #include <rtl/ustrbuf.hxx>
  29 #include <tools/stream.hxx>
  30 #include <tools/debug.hxx>
  31 #include <svtools/rtftoken.h>
  32 #include <svtools/parrtf.hxx>
  33
  34 const int MAX_STRING_LEN = 1024; // ScanText() stops collecting once its local buffer reaches this length
  35
     // ASCII-only character tests used while scanning control words and their numeric parameters
  36 #define RTF_ISDIGIT( c ) rtl::isAsciiDigit(c)
  37 #define RTF_ISALPHA( c ) rtl::isAsciiAlpha(c)
  38
  39 SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize )
  40     : SvParser<int>( rIn, nStackSize ),
  41       nOpenBrackets( 0 ),
  42       eCodeSet( RTL_TEXTENCODING_MS_1252 ),
  43       nUCharOverread( 1 )
  44 {
  45     // Windows-1252 (ANSI) is assumed until the input selects another code set
  46     SetSrcEncoding( eCodeSet );
  47     bRTF_InTextRead = false;
  48 }
  49
  50 // Defined out of line; no teardown is written by hand here.
  51 SvRTFParser::~SvRTFParser() = default;
  52
  53
  54
  55 int SvRTFParser::GetNextToken_()
  56 {
         // Produces the next raw token, starting from the look-ahead character
         // in nNextCh. The return value is one of:
         //  - '{' or '}' (the parser-state stack is pushed/popped accordingly),
         //  - the EOF value (eState becomes SvParserState::Accepted),
         //  - RTF_TEXTTOKEN, with the text collected in aToken,
         //  - the id GetRTFToken() reports for a control word (its spelling is
         //    left in aToken), or RTF_UNKNOWNCONTROL if GetRTFToken() returns 0,
         //  - RTF_IGNOREFLAG, RTF_SUBENTRYINDEX, RTF_FORMULA or RTF_PAR for a
         //    backslash followed by '*', ':', '|' or CR/LF.
         // A numeric parameter, if present, is stored in nTokenValue and
         // bTokenHasValue is set. The loop repeats until a non-zero token has
         // been found or eState is no longer SvParserState::Working.
  57     int nRet = 0;
  58     do {
  59         bool bNextCh = true;
  60         switch( nNextCh )
  61         {
  62         case '\\':
  63             {
  64                 // control characters
  65                 nNextCh = GetNextChar();
  66                 switch( nNextCh )
  67                 {
  68                 case '{':
  69                 case '}':
  70                 case '\\':
  71                 case '+':       // I found it in a RTF-file
  72                 case '~':       // nonbreaking space
  73                 case '-':       // optional hyphen
  74                 case '_':       // nonbreaking hyphen
  75                 case '\'':      // HexValue
                         // These escapes are part of text: make the backslash
                         // the look-ahead again and step the stream back one
                         // position, so that ScanText() reads the escaped
                         // character itself.
  76                     nNextCh = '\\';
  77                     rInput.SeekRel( -1 );
  78                     ScanText();
  79                     nRet = RTF_TEXTTOKEN;
  80                     bNextCh = 0 == nNextCh; // read a new look-ahead only if nNextCh is 0 after ScanText()
  81                     break;
  82
  83                 case '*':       // ignoreflag
  84                     nRet = RTF_IGNOREFLAG;
  85                     break;
  86                 case ':':       // subentry in an index entry
  87                     nRet = RTF_SUBENTRYINDEX;
  88                     break;
  89                 case '|':       // formula-character
  90                     nRet = RTF_FORMULA;
  91                     break;
  92
  93                 case 0x0a:
  94                 case 0x0d:
  95                     nRet = RTF_PAR;
  96                     break;
  97
  98                 default:
  99                     if( RTF_ISALPHA( nNextCh ) )
 100                     {
                             // control word: backslash plus a run of ASCII letters
 101                         aToken = "\\";
 102                         {
 103                             do {
 104                                 aToken.appendUtf32(nNextCh);
 105                                 nNextCh = GetNextChar();
 106                             } while( RTF_ISALPHA( nNextCh ) );
 107                         }
 108
 109                         // minus before numeric parameters
 110                         bool bNegValue = false;
 111                         if( '-' == nNextCh )
 112                         {
 113                             bNegValue = true;
 114                             nNextCh = GetNextChar();
 115                         }
 116
 117                         // possible numeric parameter
 118                         if( RTF_ISDIGIT( nNextCh ) )
 119                         {
 120                             OUStringBuffer aNumber;
 121                             do {
 122                                 aNumber.append(static_cast<sal_Unicode>(nNextCh));
 123                                 nNextCh = GetNextChar();
 124                             } while( RTF_ISDIGIT( nNextCh ) );
 125                             nTokenValue = OUString::unacquired(aNumber).toInt32();
 126                             if( bNegValue )
 127                                 nTokenValue = -nTokenValue;
 128                             bTokenHasValue=true;
 129                         }
 130                         else if( bNegValue )        // restore minus
 131                         {
 132                             nNextCh = '-';
 133                             rInput.SeekRel( -1 );
 134                         }
 135                         if( ' ' == nNextCh )        // blank is part of token!
 136                             nNextCh = GetNextChar();
 137
 138                         // search for the token in the table:
 139                         if( 0 == (nRet = GetRTFToken( aToken )) )
 140                             // Unknown Control
 141                             nRet = RTF_UNKNOWNCONTROL;
 142
 143                         // bug 76812 - unicode token handled as normal text
 144                         bNextCh = false; // nNextCh already holds the character after the control word
 145                         switch( nRet )
 146                         {
 147                         case RTF_UC:
                                 // Remember how many characters are skipped after
                                 // each RTF_U token; a negative parameter is
                                 // ignored and the value is narrowed to sal_uInt8.
 148                             if( 0 <= nTokenValue )
 149                             {
 150                                 nUCharOverread = static_cast<sal_uInt8>(nTokenValue);
 151                                 if (!aParserStates.empty())
 152                                 {
 153                                     //cmc: other ifdef breaks #i3584
 154                                     aParserStates.top().nUCharOverread = nUCharOverread;
 155                                 }
 156                             }
 157                             aToken.setLength( 0 ); // #i47831# erase token to prevent the token from being treated as text
 158                             // read next token
 159                             nRet = 0;
 160                             break;
 161
 162                         case RTF_UPR:
                                 // Only acted upon outside SkipGroup(); while a
                                 // skip is running RTF_UPR is returned unchanged.
 163                             if (!_inSkipGroup) {
 164                             // UPR - overread the group with the ansi
 165                             //       information
 166                             int nNextToken;
 167                             do
 168                             {
 169                                 nNextToken = GetNextToken_();
 170                             }
 171                             while (nNextToken != '{' && nNextToken != sal_Unicode(EOF) && IsParserWorking());
 172
 173                             SkipGroup();
 174                             GetNextToken_();  // overread the last bracket
 175                             nRet = 0;
 176                             }
 177                             break;
 178
 179                         case RTF_U:
                                 // When called from ScanText() (bRTF_InTextRead
                                 // set) RTF_U is returned as is and ScanText()
                                 // handles it; otherwise the character starts a
                                 // text token right here.
 180                             if( !bRTF_InTextRead )
 181                             {
 182                                 nRet = RTF_TEXTTOKEN;
 183                                 aToken = OUStringChar( static_cast<sal_Unicode>(nTokenValue) );
 184
 185                                 // overread the next n "RTF" characters. This
 186                                 // can be also \{, \}, \'88
 187                                 for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
 188                                 {
 189                                     sal_uInt32 cAnsi = nNextCh;
 190                                     while( 0xD == cAnsi )
 191                                         cAnsi = GetNextChar();
 192                                     while( 0xA == cAnsi )
 193                                         cAnsi = GetNextChar();
 194
 195                                     if( '\\' == cAnsi &&
 196                                         '\'' == GetNextChar() )
 197                                         // skip HexValue
 198                                         GetHexValue();
 199                                     nNextCh = GetNextChar();
 200                                 }
                                     // append whatever plain text follows to the same token
 201                                 ScanText();
 202                                 bNextCh = 0 == nNextCh;
 203                             }
 204                             break;
 205                         }
 206                     }
 207                     else if( SvParserState::Pending != eState )
 208                     {
 209                         // Bug 34631 - "\ " read on - Blank as character
 210                         // eState = SvParserState::Error;
 211                         bNextCh = false;
 212                     }
 213                     break;
 214                 }
 215             }
 216             break;
 217
 218         case sal_Unicode(EOF):
 219             eState = SvParserState::Accepted;
 220             nRet = nNextCh;
 221             break;
 222
 223         case '{':
 224             {
                     // Save the current skip count and source encoding for the
                     // group being opened; nothing is pushed while the bracket
                     // count is negative.
 225                 if( 0 <= nOpenBrackets )
 226                 {
 227                     RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
 228                     aParserStates.push( aState );
 229                 }
 230                 ++nOpenBrackets;
 231                 DBG_ASSERT(
 232                     static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
 233                     "ParserStateStack unequal to bracket count" );
 234                 nRet = nNextCh;
 235             }
 236             break;
 237
 238         case '}':
                 // Drop the state of the group being closed and restore skip
                 // count and encoding from the enclosing group, or the defaults
                 // (1 and GetCodeSet()) once the stack is empty.
 239             --nOpenBrackets;
 240             if( 0 <= nOpenBrackets )
 241             {
 242                 aParserStates.pop();
 243                 if( !aParserStates.empty() )
 244                 {
 245                     const RtfParserState_Impl& rRPS =
 246                             aParserStates.top();
 247                     nUCharOverread = rRPS.nUCharOverread;
 248                     SetSrcEncoding( rRPS.eCodeSet );
 249                 }
 250                 else
 251                 {
 252                     nUCharOverread = 1;
 253                     SetSrcEncoding( GetCodeSet() );
 254                 }
 255             }
 256             DBG_ASSERT(
 257                 static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
 258                 "ParserStateStack unequal to bracket count" );
 259             nRet = nNextCh;
 260             break;
 261
 262         case 0x0d:
 263         case 0x0a:
 264             break; // CR/LF between tokens yields no token; the loop just moves on
 265
 266         default:
 267             // now normal text follows
 268             ScanText();
 269             nRet = RTF_TEXTTOKEN;
 270             bNextCh = 0 == nNextCh;
 271             break;
 272         }
 273
 274         if( bNextCh )
 275             nNextCh = GetNextChar();
 276
 277     } while( !nRet && SvParserState::Working == eState );
 278     return nRet;
 279 }
 280
 281
 282 sal_Unicode SvRTFParser::GetHexValue()
 283 {
 284     // Reads the next two characters as one byte value, high nibble first.
 285     // A character that is not a hex digit contributes 0 for its nibble.
 286     sal_Unicode nValue = 0;
 287     for( int nNibble = 0; nNibble != 2; ++nNibble )
 288     {
 289         nNextCh = GetNextChar();
 290         sal_Unicode nDigit = 0;
 291         if( '0' <= nNextCh && nNextCh <= '9' )
 292             nDigit = static_cast<sal_Unicode>( nNextCh - '0' );
 293         else if( 'a' <= nNextCh && nNextCh <= 'f' )
 294             nDigit = static_cast<sal_Unicode>( nNextCh - 'a' + 10 );
 295         else if( 'A' <= nNextCh && nNextCh <= 'F' )
 296             nDigit = static_cast<sal_Unicode>( nNextCh - 'A' + 10 );
 297         nValue = static_cast<sal_Unicode>( nValue * 16 + nDigit );
 298     }
 299     return nValue;
 300 }
 301
 302 void SvRTFParser::ScanText()
 303 {
         // Collects a run of text, beginning at the look-ahead character
         // nNextCh, and appends it to aToken. A backslash may be followed by:
         //  - an apostrophe and two hex digits (bytes, converted with
         //    GetSrcEncoding()),
         //  - a backslash, '{', '}' or '+' (taken literally),
         //  - '~', '-' or '_' (U+00A0, U+00AD, U+2011),
         //  - 'u' plus a number, or "uc" plus a number.
         // Collection ends at '{', '}', a NUL character, EOF, any other
         // backslash sequence, or once MAX_STRING_LEN characters are buffered.
         // EOF met by the outer switch sets eState to SvParserState::Error.
 304     const sal_Unicode cBreak = 0; // a NUL look-ahead ends the text run
 305     OUStringBuffer aStrBuffer;
 306     bool bContinue = true;
 307     while( bContinue && IsParserWorking() && aStrBuffer.getLength() < MAX_STRING_LEN)
 308     {
 309         bool bNextCh = true;
 310         switch( nNextCh )
 311         {
 312         case '\\':
 313             {
 314                 nNextCh = GetNextChar();
 315                 switch (nNextCh)
 316                 {
 317                 case '\'':
 318                     {
 319
                             // Consecutive hex escapes, and the plain bytes
                             // between them, are gathered into one byte string
                             // that is converted to Unicode in one go.
 320                         OStringBuffer aByteString;
 321                         while (true)
 322                         {
 323                             char c = static_cast<char>(GetHexValue());
 324                             /*
 325                              * Note: \'00 is a valid internal character in  a
 326                              * string in RTF. OStringBuffer supports
 327                              * appending nulls fine
 328                              */
 329                             aByteString.append(c);
 330
 331                             bool bBreak = false;
 332                             bool bEOF = false;
 333                             char nSlash = '\\';
                                 // take the bytes that follow until '{', '}' or a
                                 // backslash shows up
 334                             while (!bBreak)
 335                             {
 336                                 auto next = GetNextChar();
 337                                 if (sal_Unicode(EOF) == next)
 338                                 {
 339                                     bEOF = true;
 340                                     break;
 341                                 }
 342                                 if (next>0xFF) // fix for #i43933# and #i35653#
 343                                 {
                                         // not a single byte: convert what has been
                                         // gathered so far, then append the
                                         // character directly
 344                                     if (!aByteString.isEmpty())
 345                                     {
 346                                         aStrBuffer.append( OStringToOUString(aByteString, GetSrcEncoding()) );
 347                                         aByteString.setLength(0);
 348                                     }
 349                                     aStrBuffer.append(static_cast<sal_Unicode>(next));
 350
 351                                     continue;
 352                                 }
 353                                 nSlash = static_cast<char>(next);
 354                                 while (nSlash == 0xD || nSlash == 0xA) // CR/LF inside the run is skipped
 355                                     nSlash = static_cast<char>(GetNextChar());
 356
 357                                 switch (nSlash)
 358                                 {
 359                                     case '{':
 360                                     case '}':
 361                                     case '\\':
 362                                         bBreak = true;
 363                                         break;
 364                                     default:
 365                                         aByteString.append(nSlash);
 366                                         break;
 367                                 }
 368                             }
 369
 370                             if (bEOF)
 371                             {
 372                                 bContinue = false;        // abort, string together
 373                                 break;
 374                             }
 375
 376                             nNextCh = GetNextChar();
 377
                                 // Anything but a backslash followed by an
                                 // apostrophe ends the gathering: step back one
                                 // character and leave the delimiter as look-ahead.
 378                             if (nSlash != '\\' || nNextCh != '\'')
 379                             {
 380                                 rInput.SeekRel(-1);
 381                                 nNextCh = static_cast<unsigned char>(nSlash);
 382                                 break;
 383                             }
 384                         }
 385
 386                         bNextCh = false;
 387
 388                         if (!aByteString.isEmpty())
 389                         {
 390                             aStrBuffer.append( OStringToOUString(aByteString, GetSrcEncoding()) );
 391                             aByteString.setLength(0);
 392                         }
 393                     }
 394                     break;
 395                 case '\\':
 396                 case '}':
 397                 case '{':
 398                 case '+':       // I found in a RTF file
 399                     aStrBuffer.append(sal_Unicode(nNextCh));
 400                     break;
 401                 case '~':       // nonbreaking space
 402                     aStrBuffer.append(u'\x00A0');
 403                     break;
 404                 case '-':       // optional hyphen
 405                     aStrBuffer.append(u'\x00AD');
 406                     break;
 407                 case '_':       // nonbreaking hyphen
 408                     aStrBuffer.append(u'\x2011');
 409                     break;
 410
 411                 case 'u':
 412                     // read UNI-Code characters
 413                     {
                             // peek at the character after the 'u', then seek
                             // back two positions
 414                         nNextCh = GetNextChar();
 415                         rInput.SeekRel( -2 );
 416
 417                         if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
 418                         {
                                 // makes GetNextToken_() return RTF_U instead of
                                 // building a text token of its own
 419                             bRTF_InTextRead = true;
 420
 421                             OUString sSave( aToken ); // GetNextToken_() overwrites this
 422                             nNextCh = '\\';
 423                             int nToken = GetNextToken_();
 424                             DBG_ASSERT( RTF_U == nToken, "still not a UNI-Code character" );
 425                             // don't convert symbol chars
 426                             aStrBuffer.append(static_cast< sal_Unicode >(nTokenValue));
 427
 428                             // overread the next n "RTF" characters. This
 429                             // can be also \{, \}, \'88
 430                             for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
 431                             {
 432                                 sal_Unicode cAnsi = nNextCh;
 433                                 while( 0xD == cAnsi )
 434                                     cAnsi = GetNextChar();
 435                                 while( 0xA == cAnsi )
 436                                     cAnsi = GetNextChar();
 437
 438                                 if( '\\' == cAnsi &&
 439                                     '\'' == GetNextChar() )
 440                                     // skip HexValue
 441                                     GetHexValue();
 442                                 nNextCh = GetNextChar();
 443                             }
 444                             bNextCh = false;
 445                             aToken = sSave;
 446                             bRTF_InTextRead = false;
 447                         }
 448                         else if ( 'c' == nNextCh )
 449                         {
 450                             // Prevent text breaking into multiple tokens.
 451                             rInput.SeekRel( 2 ); // undo the seek back from above
 452                             nNextCh = GetNextChar();
 453                             if (RTF_ISDIGIT( nNextCh ))
 454                             {
                                     // parse the decimal skip count (accumulated in
                                     // a sal_uInt8) and store it for the current group
 455                                 sal_uInt8 nNewOverread = 0 ;
 456                                 do {
 457                                     nNewOverread *= 10;
 458                                     nNewOverread += nNextCh - '0';
 459                                     nNextCh = GetNextChar();
 460                                 } while ( RTF_ISDIGIT( nNextCh ) );
 461                                 nUCharOverread = nNewOverread;
 462                                 if (!aParserStates.empty())
 463                                     aParserStates.top().nUCharOverread = nNewOverread;
 464                             }
 465                             bNextCh = 0x20 == nNextCh; // a blank right after the control word is consumed with it
 466                         }
 467                         else
 468                         {
 469                             nNextCh = '\\';
 470                             bContinue = false;        // abort, string together
 471                         }
 472                     }
 473                     break;
 474
 475                 default:
                         // any other backslash sequence is left for the caller:
                         // step back and restore the backslash as look-ahead
 476                     rInput.SeekRel( -1 );
 477                     nNextCh = '\\';
 478                     bContinue = false;        // abort, string together
 479                     break;
 480                 }
 481             }
 482             break;
 483
 484         case sal_Unicode(EOF):
 485             eState = SvParserState::Error;
 486             [[fallthrough]];
 487         case '{':
 488         case '}':
 489             bContinue = false;
 490             break;
 491
 492         case 0x0a:
 493         case 0x0d:
 494             break; // CR/LF is not added to the text
 495
 496         default:
 497             if( nNextCh == cBreak || aStrBuffer.getLength() >= MAX_STRING_LEN)
 498                 bContinue = false;
 499             else
 500             {
                     // The first character is appended whatever it is; the loop
                     // then keeps going only while ASCII letters or digits
                     // follow. Any other character goes back through the switch.
 501                 do {
 502                     // all other characters end up in the text
 503                     aStrBuffer.appendUtf32(nNextCh);
 504
 505                     if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
 506                     {
                             // EOF here: hand over what was collected and
                             // return; eState is not changed at this point
 507                         if (!aStrBuffer.isEmpty())
 508                             aToken.append( aStrBuffer );
 509                         return;
 510                     }
 511                 } while
 512                 (
 513                     (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
 514                     (aStrBuffer.getLength() < MAX_STRING_LEN)
 515                 );
 516                 bNextCh = false;
 517             }
 518         }
 519
 520         if( bContinue && bNextCh )
 521             nNextCh = GetNextChar();
 522     }
 523
 524     if (!aStrBuffer.isEmpty())
 525         aToken.append( aStrBuffer );
 526 }
 527
 528
 529 short SvRTFParser::_inSkipGroup=0; // non-zero while SkipGroup() is running; a single counter, not one per parser object
 530
 531 void SvRTFParser::SkipGroup()
 532 {
 533     // Skips the rest of the current group, up to the '}' that balances it
 534     // (nested groups included). Does nothing if a skip is already in progress.
 535     if (_inSkipGroup>0)
 536         return;
 537     ++_inSkipGroup;
 538     // Reset the shared counter on every way out - return or exception alike.
 539     comphelper::ScopeGuard aLeave([] { --_inSkipGroup; });
 540     short nBrackets=1;
 541 //#i16185# faking \bin keyword
 542     do
 543     {
 544         switch (nNextCh)
 545         {
 546             case '{':
 547                 ++nBrackets;
 548                 break;
 549             case '}':
 550                 if (!--nBrackets)
 551                     return;     // balancing bracket reached
 552                 break;
 553         }
 554         int nToken = GetNextToken_();
 555         if (nToken == RTF_BIN)
 556         {
 557             // seek past the nTokenValue positions given with the keyword instead of tokenizing them
 558             rInput.SeekRel(-1);
 559             SAL_WARN_IF(nTokenValue < 0, "svtools", "negative value argument for rtf \\bin keyword");
 560             if (nTokenValue > 0)
 561                 rInput.SeekRel(nTokenValue);
 562             nNextCh = GetNextChar();
 563         }
 564         while (nNextCh==0xa || nNextCh==0xd)
 565             nNextCh = GetNextChar();
 566     } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
 567
 568     if( SvParserState::Pending != eState && '}' != nNextCh )
 569         eState = SvParserState::Error;
 570 }
 571
 572 void SvRTFParser::ReadUnknownData() { SkipGroup(); } // the data is not interpreted here: the group is skipped
 573 void SvRTFParser::ReadBitmapData()  { SkipGroup(); } // the data is not interpreted here: the group is skipped
 574
 575
 576 SvParserState SvRTFParser::CallParser()
 577 {
 578     // Load the first byte as look-ahead, then reset state, bracket count and code set.
 579     nNextChPos = rInput.Tell();
 580     char cLead = 0;
 581     rInput.ReadChar( cLead );
 582     nNextCh = static_cast<unsigned char>(cLead);
 583     eState = SvParserState::Working;
 584     nOpenBrackets = 0;
 585     eCodeSet = RTL_TEXTENCODING_MS_1252;
 586     SetSrcEncoding( eCodeSet );
 587     // Input whose first two tokens are not '{' and RTF_RTF is rejected outright.
 588     const bool bLooksLikeRtf = '{' == GetNextToken() && RTF_RTF == GetNextToken();
 589     if( !bLooksLikeRtf )
 590         eState = SvParserState::Error;
 591     else
 592     {
 593         AddFirstRef();
 594         // Released when this scope ends, exceptions included, unless parsing is pending.
 595         comphelper::ScopeGuard aRelease([this] {
 596             if( eState != SvParserState::Pending )
 597                 ReleaseRef();
 598         });
 599         Continue( 0 );
 600     }
 601     return eState;
 602 }
 603
 604 void SvRTFParser::Continue( int nToken )
 605 {
 606     // Main dispatch loop: brackets and the character-set keywords are dealt
 607     // with here, every other token is passed on to NextToken().
 608     if( 0 == nToken )
 609         nToken = GetNextToken();
 610
 611     // switches the code set member and the source encoding together
 612     auto applyCodeSet = [this]( rtl_TextEncoding eNew ) {
 613         eCodeSet = eNew;
 614         SetSrcEncoding( eCodeSet );
 615     };
 616     bool bStuck = false;
 617     while( IsParserWorking() && !bStuck )
 618     {
 619         const auto nIndexBefore = m_nTokenIndex;    // snapshot for the no-progress check below
 620         const auto nTokenBefore = nToken;
 621         SaveState( nToken );
 622         bool bDispatch = false;     // pass nToken on to NextToken()?
 623         switch( nToken )
 624         {
 625         case '}':
 626             // a closing bracket seen while the bracket count is zero ends the parse
 627             if( nOpenBrackets )
 628                 bDispatch = true;
 629             else
 630                 eState = SvParserState::Accepted;
 631             break;
 632         case '{':
 633             if( RTF_IGNOREFLAG != GetNextToken() )
 634             {
 635                 nToken = SkipToken();
 636                 bDispatch = true;
 637             }
 638             else if( RTF_UNKNOWNCONTROL != GetNextToken() )
 639             {
 640                 nToken = SkipToken( -2 );
 641                 bDispatch = true;
 642             }
 643             else
 644             {
 645                 // ignore flag followed by an unknown control word: filter the group out right away
 646                 ReadUnknownData();
 647                 nToken = GetNextToken();
 648                 if( '}' != nToken )
 649                     eState = SvParserState::Error;
 650             }
 651             break;
 652         case RTF_UNKNOWNCONTROL:    // unknown control words are dropped
 653             break;
 654         case RTF_NEXTTYPE:
 655         case RTF_ANSITYPE:
 656             applyCodeSet( RTL_TEXTENCODING_MS_1252 );
 657             break;
 658         case RTF_MACTYPE:
 659             applyCodeSet( RTL_TEXTENCODING_APPLE_ROMAN );
 660             break;
 661         case RTF_PCTYPE:
 662             applyCodeSet( RTL_TEXTENCODING_IBM_437 );
 663             break;
 664         case RTF_PCATYPE:
 665             applyCodeSet( RTL_TEXTENCODING_IBM_850 );
 666             break;
 667         case RTF_ANSICPG:
 668             applyCodeSet( rtl_getTextEncodingFromWindowsCodePage(nTokenValue) );
 669             break;
 670         default:
 671             bDispatch = true;
 672             break;
 673         }
 674         if( bDispatch )
 675             NextToken( nToken );
 676         if( IsParserWorking() )
 677             SaveState( 0 );     // processed up to here, carry on with a new token
 678         nToken = GetNextToken();
 679         bStuck = nIndexBefore == m_nTokenIndex && nTokenBefore == nToken;
 680     }
 681     if( 0 < nOpenBrackets && SvParserState::Accepted == eState )
 682         eState = SvParserState::Error;
 683 }
 684
 685 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
 686 {
 687     // "Don't know" means: use whatever GetCodeSet() reports instead.
 688     const rtl_TextEncoding eEffective
 689         = ( RTL_TEXTENCODING_DONTKNOW == eEnc ) ? GetCodeSet() : eEnc;
 690     if( !aParserStates.empty() )
 691         aParserStates.top().eCodeSet = eEffective;    // keep the state on top of the stack in sync
 692     SetSrcEncoding( eEffective );
 693 }
 694
 695 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */