svtools/source/svrtf/parrtf.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include <sal/config.h>
  21 #include <sal/log.hxx>
  22
  23 #include <comphelper/scopeguard.hxx>
  24
  25 #include <rtl/character.hxx>
  26 #include <rtl/strbuf.hxx>
  27 #include <rtl/tencinfo.h>
  28 #include <rtl/ustrbuf.hxx>
  29 #include <tools/stream.hxx>
  30 #include <tools/debug.hxx>
  31 #include <svtools/rtftoken.h>
  32 #include <svtools/parrtf.hxx>
  33
// Soft limit on the length of the text collected by one ScanText() call:
// ScanText() stops gathering characters once its buffer has reached this
// length.
const int MAX_STRING_LEN = 1024;

// ASCII-only character classification used by the tokenizer for keyword
// letters and numeric parameters.
#define RTF_ISDIGIT( c ) rtl::isAsciiDigit(c)
#define RTF_ISALPHA( c ) rtl::isAsciiAlpha(c)
  38
  39 SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize )
  40     : SvParser<int>( rIn, nStackSize )
  41     , nOpenBrackets(0)
  42     , nUPRLevel(0)
  43     , eCodeSet(RTL_TEXTENCODING_MS_1252)
  44     , nUCharOverread(1)
  45 {
  46     // default is ANSI-CodeSet
  47     SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
  48     bRTF_InTextRead = false;
  49 }
  50
  51 SvRTFParser::~SvRTFParser()
  52 {
  53 }
  54
  55
/**
 * Reads the next token from the input, starting at the look-ahead character
 * nNextCh.
 *
 * The loop keeps reading as long as no token has been produced (nRet == 0)
 * and the parser state is SvParserState::Working. Line ends outside of
 * control words are skipped, and so is the \uc keyword, which only updates
 * nUCharOverread.
 *
 * Side effects visible in this function:
 *  - aToken receives the control word (including the leading backslash) or,
 *    via ScanText(), the collected text;
 *  - nTokenValue / bTokenHasValue are set when a control word carries a
 *    numeric parameter;
 *  - aParserStates / nOpenBrackets track '{' and '}';
 *  - eState becomes Accepted at end of input and Error when \upr groups are
 *    nested too deeply.
 *
 * @return '{', '}' or the EOF character value for those inputs,
 *         RTF_TEXTTOKEN for text, the id found by GetRTFToken() for a known
 *         control word, RTF_UNKNOWNCONTROL for an unknown one, or 0 if the
 *         loop ended without producing a token.
 */
int SvRTFParser::GetNextToken_()
{
    int nRet = 0;
    do {
        // true: fetch a fresh look-ahead character at the end of this round;
        // false: nNextCh already holds the next unprocessed character
        bool bNextCh = true;
        switch( nNextCh )
        {
        case '\\':
            {
                // control characters
                nNextCh = GetNextChar();
                switch( nNextCh )
                {
                case '{':
                case '}':
                case '\\':
                case '+':       // I found it in a RTF-file
                case '~':       // nonbreaking space
                case '-':       // optional hyphen
                case '_':       // nonbreaking hyphen
                case '\'':      // HexValue
                    // these escapes are text: restore the backslash as
                    // look-ahead, step the stream back by one and let
                    // ScanText() decode the escape together with any
                    // following text
                    nNextCh = '\\';
                    rInput.SeekRel( -1 );
                    ScanText();
                    nRet = RTF_TEXTTOKEN;
                    bNextCh = 0 == nNextCh;
                    break;

                case '*':       // ignoreflag
                    nRet = RTF_IGNOREFLAG;
                    break;
                case ':':       // subentry in an index entry
                    nRet = RTF_SUBENTRYINDEX;
                    break;
                case '|':       // formula-character
                    nRet = RTF_FORMULA;
                    break;

                // a backslash directly followed by a line end is reported
                // as a paragraph token
                case 0x0a:
                case 0x0d:
                    nRet = RTF_PAR;
                    break;

                default:
                    if( RTF_ISALPHA( nNextCh ) )
                    {
                        // control word: backslash followed by ASCII letters
                        aToken = "\\";
                        {
                            do {
                                aToken.appendUtf32(nNextCh);
                                nNextCh = GetNextChar();
                            } while( RTF_ISALPHA( nNextCh ) );
                        }

                        // minus before numeric parameters
                        bool bNegValue = false;
                        if( '-' == nNextCh )
                        {
                            bNegValue = true;
                            nNextCh = GetNextChar();
                        }

                        // possible numeric parameter
                        if( RTF_ISDIGIT( nNextCh ) )
                        {
                            OUStringBuffer aNumber;
                            do {
                                aNumber.append(static_cast<sal_Unicode>(nNextCh));
                                nNextCh = GetNextChar();
                            } while( RTF_ISDIGIT( nNextCh ) );
                            nTokenValue = OUString::unacquired(aNumber).toInt32();
                            if( bNegValue )
                                nTokenValue = -nTokenValue;
                            bTokenHasValue=true;
                        }
                        else if( bNegValue )        // restore minus
                        {
                            // the '-' was not a sign: make it the look-ahead
                            // again and step the stream back by one
                            nNextCh = '-';
                            rInput.SeekRel( -1 );
                        }
                        if( ' ' == nNextCh )        // blank is part of token!
                            nNextCh = GetNextChar();

                        // search for the token in the table:
                        if( 0 == (nRet = GetRTFToken( aToken )) )
                            // Unknown Control
                            nRet = RTF_UNKNOWNCONTROL;

                        // bug 76812 - unicode token handled as normal text
                        // (nNextCh already is the character after the
                        // control word, so do not read another one)
                        bNextCh = false;
                        switch( nRet )
                        {
                        case RTF_UC:
                            // \ucN: remember how many characters to skip
                            // after each \u; negative values are ignored
                            if( 0 <= nTokenValue )
                            {
                                nUCharOverread = static_cast<sal_uInt8>(nTokenValue);
                                if (!aParserStates.empty())
                                {
                                    //cmc: other ifdef breaks #i3584
                                    aParserStates.top().nUCharOverread = nUCharOverread;
                                }
                            }
                            aToken.setLength( 0 ); // #i47831# erase token to prevent the token from being treated as text
                            // read next token
                            nRet = 0;
                            break;

                        case RTF_UPR:
                            // not handled while SkipGroup() is running
                            if (!_inSkipGroup)
                            {
                                if (nUPRLevel > 256) // fairly sure > 1 is probably an error, but provide some leeway
                                {
                                    SAL_WARN("svtools", "urp stack too deep");
                                    eState = SvParserState::Error;
                                    break;
                                }

                                // depth counter for the recursive
                                // GetNextToken_() calls below
                                ++nUPRLevel;

                                // UPR - overread the group with the ansi
                                //       information
                                int nNextToken;
                                do
                                {
                                    nNextToken = GetNextToken_();
                                }
                                while (nNextToken != '{' && nNextToken != sal_Unicode(EOF) && IsParserWorking());

                                SkipGroup();
                                GetNextToken_();  // overread the last bracket
                                nRet = 0;

                                --nUPRLevel;
                            }
                            break;

                        case RTF_U:
                            // When ScanText() itself asked for this token
                            // (bRTF_InTextRead set) only nTokenValue is
                            // delivered; ScanText() does the skipping.
                            if( !bRTF_InTextRead )
                            {
                                // deliver the character as a text token
                                nRet = RTF_TEXTTOKEN;
                                aToken = OUStringChar( static_cast<sal_Unicode>(nTokenValue) );

                                // overread the next n "RTF" characters. This
                                // can be also \{, \}, \'88
                                for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
                                {
                                    // line ends do not count as characters
                                    sal_uInt32 cAnsi = nNextCh;
                                    while( 0xD == cAnsi )
                                        cAnsi = GetNextChar();
                                    while( 0xA == cAnsi )
                                        cAnsi = GetNextChar();

                                    if( '\\' == cAnsi &&
                                        '\'' == GetNextChar() )
                                        // skip HexValue
                                        GetHexValue();
                                    nNextCh = GetNextChar();
                                }
                                // append any text that follows directly
                                ScanText();
                                bNextCh = 0 == nNextCh;
                            }
                            break;
                        }
                    }
                    else if( SvParserState::Pending != eState )
                    {
                        // Bug 34631 - "\ " read on - Blank as character
                        // eState = SvParserState::Error;
                        bNextCh = false;
                    }
                    break;
                }
            }
            break;

        case sal_Unicode(EOF):
            // end of input: finish parsing and hand the EOF value out
            eState = SvParserState::Accepted;
            nRet = nNextCh;
            break;

        case '{':
            {
                // group start: save the skip count and source encoding so
                // that the matching '}' can restore them
                if( 0 <= nOpenBrackets )
                {
                    RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
                    aParserStates.push( aState );
                }
                ++nOpenBrackets;
                DBG_ASSERT(
                    static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
                    "ParserStateStack unequal to bracket count" );
                nRet = nNextCh;
            }
            break;

        case '}':
            // group end: drop the state of the closed group and continue
            // with that of the enclosing group, or with the defaults when
            // no group is left
            --nOpenBrackets;
            if( 0 <= nOpenBrackets )
            {
                aParserStates.pop();
                if( !aParserStates.empty() )
                {
                    const RtfParserState_Impl& rRPS =
                            aParserStates.top();
                    nUCharOverread = rRPS.nUCharOverread;
                    SetSrcEncoding( rRPS.eCodeSet );
                }
                else
                {
                    nUCharOverread = 1;
                    SetSrcEncoding( GetCodeSet() );
                }
            }
            DBG_ASSERT(
                static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
                "ParserStateStack unequal to bracket count" );
            nRet = nNextCh;
            break;

        // plain line ends produce no token; just move on
        case 0x0d:
        case 0x0a:
            break;

        default:
            // now normal text follows
            ScanText();
            nRet = RTF_TEXTTOKEN;
            bNextCh = 0 == nNextCh;
            break;
        }

        if( bNextCh )
            nNextCh = GetNextChar();

    } while( !nRet && SvParserState::Working == eState );
    return nRet;
}
 293
 294
 295 sal_Unicode SvRTFParser::GetHexValue()
 296 {
 297     // collect Hex values
 298     int n;
 299     sal_Unicode nHexVal = 0;
 300
 301     for( n = 0; n < 2; ++n )
 302     {
 303         nHexVal *= 16;
 304         nNextCh = GetNextChar();
 305         if( nNextCh >= '0' && nNextCh <= '9' )
 306             nHexVal += (nNextCh - 48);
 307         else if( nNextCh >= 'a' && nNextCh <= 'f' )
 308             nHexVal += (nNextCh - 87);
 309         else if( nNextCh >= 'A' && nNextCh <= 'F' )
 310             nHexVal += (nNextCh - 55);
 311     }
 312     return nHexVal;
 313 }
 314
/**
 * Collects a run of text, starting at the look-ahead character nNextCh, and
 * appends it to aToken.
 *
 * Besides plain characters the following escapes are decoded on the way:
 * \'hh byte sequences (converted with the current source encoding), the
 * escaped characters \\ \{ \} \+, the special characters \~ \- \_ and \uN
 * unicode characters. The \ucN keyword is processed in place so that it does
 * not split the text. Line ends are skipped.
 *
 * Scanning stops at '{', '}', a NUL character, any other control word, end
 * of input, when the parser is no longer working, or once the collected
 * text has reached MAX_STRING_LEN.
 */
void SvRTFParser::ScanText()
{
    // a NUL character ends the text
    const sal_Unicode cBreak = 0;
    // text collected by this call; appended to aToken at the end
    OUStringBuffer aStrBuffer;
    bool bContinue = true;
    while( bContinue && IsParserWorking() && aStrBuffer.getLength() < MAX_STRING_LEN)
    {
        // true: read a new look-ahead character at the end of this round
        bool bNextCh = true;
        switch( nNextCh )
        {
        case '\\':
            {
                nNextCh = GetNextChar();
                switch (nNextCh)
                {
                case '\'':
                    {

                        // Gather the hex-escaped bytes and the raw characters
                        // between them, so that the whole run is converted
                        // with the source encoding in one go.
                        OStringBuffer aByteString;
                        while (true)
                        {
                            char c = static_cast<char>(GetHexValue());
                            /*
                             * Note: \'00 is a valid internal character in  a
                             * string in RTF. OStringBuffer supports
                             * appending nulls fine
                             */
                            aByteString.append(c);

                            bool bBreak = false;
                            bool bEOF = false;
                            // last character seen by the loop below
                            char nSlash = '\\';
                            // take over the characters that follow the hex
                            // value until '{', '}' or a backslash shows up
                            while (!bBreak)
                            {
                                auto next = GetNextChar();
                                if (sal_Unicode(EOF) == next)
                                {
                                    bEOF = true;
                                    break;
                                }
                                if (next>0xFF) // fix for #i43933# and #i35653#
                                {
                                    // does not fit into a byte: convert the
                                    // bytes collected so far and append this
                                    // character as it is
                                    if (!aByteString.isEmpty())
                                    {
                                        aStrBuffer.append( OStringToOUString(aByteString, GetSrcEncoding()) );
                                        aByteString.setLength(0);
                                    }
                                    aStrBuffer.append(static_cast<sal_Unicode>(next));

                                    continue;
                                }
                                nSlash = static_cast<char>(next);
                                // line ends are not part of the text
                                while (nSlash == 0xD || nSlash == 0xA)
                                    nSlash = static_cast<char>(GetNextChar());

                                switch (nSlash)
                                {
                                    case '{':
                                    case '}':
                                    case '\\':
                                        bBreak = true;
                                        break;
                                    default:
                                        aByteString.append(nSlash);
                                        break;
                                }
                            }

                            if (bEOF)
                            {
                                bContinue = false;        // abort, string together
                                break;
                            }

                            nNextCh = GetNextChar();

                            // Only a directly following \' continues the byte
                            // run. Otherwise step the stream back by one and
                            // resume the outer loop at the character that
                            // ended the run.
                            if (nSlash != '\\' || nNextCh != '\'')
                            {
                                rInput.SeekRel(-1);
                                nNextCh = static_cast<unsigned char>(nSlash);
                                break;
                            }
                        }

                        // nNextCh has been set above already
                        bNextCh = false;

                        if (!aByteString.isEmpty())
                        {
                            aStrBuffer.append( OStringToOUString(aByteString, GetSrcEncoding()) );
                            aByteString.setLength(0);
                        }
                    }
                    break;
                case '\\':
                case '}':
                case '{':
                case '+':       // I found in a RTF file
                    // escaped character: take it literally
                    aStrBuffer.append(sal_Unicode(nNextCh));
                    break;
                case '~':       // nonbreaking space
                    aStrBuffer.append(u'\x00A0');
                    break;
                case '-':       // optional hyphen
                    aStrBuffer.append(u'\x00AD');
                    break;
                case '_':       // nonbreaking hyphen
                    aStrBuffer.append(u'\x2011');
                    break;

                case 'u':
                    // read UNI-Code characters
                    {
                        // look at the character after the 'u' and then move
                        // the stream back by two, so that the keyword can be
                        // read again from its first letter
                        nNextCh = GetNextChar();
                        rInput.SeekRel( -2 );

                        if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
                        {
                            // \uN: let GetNextToken_() parse the keyword;
                            // the flag tells it to deliver only nTokenValue
                            bRTF_InTextRead = true;

                            OUString sSave( aToken ); // GetNextToken_() overwrites this
                            nNextCh = '\\';
                            int nToken = GetNextToken_();
                            DBG_ASSERT( RTF_U == nToken, "still not a UNI-Code character" );
                            // don't convert symbol chars
                            aStrBuffer.append(static_cast< sal_Unicode >(nTokenValue));

                            // overread the next n "RTF" characters. This
                            // can be also \{, \}, \'88
                            for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
                            {
                                // line ends do not count as characters
                                sal_Unicode cAnsi = nNextCh;
                                while( 0xD == cAnsi )
                                    cAnsi = GetNextChar();
                                while( 0xA == cAnsi )
                                    cAnsi = GetNextChar();

                                if( '\\' == cAnsi &&
                                    '\'' == GetNextChar() )
                                    // skip HexValue
                                    GetHexValue();
                                nNextCh = GetNextChar();
                            }
                            bNextCh = false;
                            aToken = sSave;
                            bRTF_InTextRead = false;
                        }
                        else if ( 'c' == nNextCh )
                        {
                            // Prevent text breaking into multiple tokens.
                            // Move forward over the two keyword letters
                            // again and read the number that follows.
                            rInput.SeekRel( 2 );
                            nNextCh = GetNextChar();
                            if (RTF_ISDIGIT( nNextCh ))
                            {
                                sal_uInt8 nNewOverread = 0 ;
                                do {
                                    nNewOverread *= 10;
                                    nNewOverread += nNextCh - '0';
                                    nNextCh = GetNextChar();
                                } while ( RTF_ISDIGIT( nNextCh ) );
                                nUCharOverread = nNewOverread;
                                if (!aParserStates.empty())
                                    aParserStates.top().nUCharOverread = nNewOverread;
                            }
                            // a blank after the keyword is consumed with it
                            bNextCh = 0x20 == nNextCh;
                        }
                        else
                        {
                            // some other keyword starting with 'u': the text
                            // ends here, the backslash becomes the look-ahead
                            nNextCh = '\\';
                            bContinue = false;        // abort, string together
                        }
                    }
                    break;

                default:
                    // any other control word ends the text; step the stream
                    // back by one and restore the backslash as look-ahead
                    rInput.SeekRel( -1 );
                    nNextCh = '\\';
                    bContinue = false;        // abort, string together
                    break;
                }
            }
            break;

        case sal_Unicode(EOF):
            eState = SvParserState::Error;
            [[fallthrough]];
        case '{':
        case '}':
            // group delimiters end the text and stay in nNextCh
            bContinue = false;
            break;

        // line ends are not part of the text
        case 0x0a:
        case 0x0d:
            break;

        default:
            if( nNextCh == cBreak || aStrBuffer.getLength() >= MAX_STRING_LEN)
                bContinue = false;
            else
            {
                // Take this character and all ASCII letters and digits that
                // follow directly; anything else goes back through the
                // switch above.
                do {
                    // all other characters end up in the text
                    aStrBuffer.appendUtf32(nNextCh);

                    if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
                    {
                        // end of input: hand over what has been collected
                        if (!aStrBuffer.isEmpty())
                            aToken.append( aStrBuffer );
                        return;
                    }
                } while
                (
                    (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
                    (aStrBuffer.getLength() < MAX_STRING_LEN)
                );
                bNextCh = false;
            }
        }

        if( bContinue && bNextCh )
            nNextCh = GetNextChar();
    }

    if (!aStrBuffer.isEmpty())
        aToken.append( aStrBuffer );
}
 540
 541
 542 short SvRTFParser::_inSkipGroup=0;
 543
 544 void SvRTFParser::SkipGroup()
 545 {
 546     short nBrackets=1;
 547     if (_inSkipGroup>0)
 548         return;
 549     _inSkipGroup++;
 550 //#i16185# faking \bin keyword
 551     do
 552     {
 553         switch (nNextCh)
 554         {
 555             case '{':
 556                 ++nBrackets;
 557                 break;
 558             case '}':
 559                 if (!--nBrackets) {
 560                     _inSkipGroup--;
 561                     return;
 562                 }
 563                 break;
 564         }
 565         int nToken = GetNextToken_();
 566         if (nToken == RTF_BIN)
 567         {
 568             rInput.SeekRel(-1);
 569             SAL_WARN_IF(nTokenValue < 0, "svtools", "negative value argument for rtf \\bin keyword");
 570             if (nTokenValue > 0)
 571                 rInput.SeekRel(nTokenValue);
 572             nNextCh = GetNextChar();
 573         }
 574         while (nNextCh==0xa || nNextCh==0xd)
 575         {
 576             nNextCh = GetNextChar();
 577         }
 578     } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
 579
 580     if( SvParserState::Pending != eState && '}' != nNextCh )
 581         eState = SvParserState::Error;
 582     _inSkipGroup--;
 583 }
 584
 585 void SvRTFParser::ReadUnknownData() { SkipGroup(); }
 586 void SvRTFParser::ReadBitmapData()  { SkipGroup(); }
 587
 588
 589 SvParserState SvRTFParser::CallParser()
 590 {
 591     char cFirstCh(0);
 592     nNextChPos = rInput.Tell();
 593     rInput.ReadChar( cFirstCh );
 594     nNextCh = static_cast<unsigned char>(cFirstCh);
 595     eState = SvParserState::Working;
 596     nOpenBrackets = 0;
 597     eCodeSet = RTL_TEXTENCODING_MS_1252;
 598     SetSrcEncoding( eCodeSet );
 599
 600     // the first two tokens should be '{' and \\rtf !!
 601     if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
 602     {
 603         AddFirstRef();
 604         // call ReleaseRef at end of this scope, even in the face of exceptions
 605         comphelper::ScopeGuard g([this] {
 606             if( SvParserState::Pending != eState )
 607                 ReleaseRef();       // now parser is not needed anymore
 608         });
 609         Continue( 0 );
 610     }
 611     else
 612         eState = SvParserState::Error;
 613
 614     return eState;
 615 }
 616
 617 void SvRTFParser::Continue( int nToken )
 618 {
 619 //  DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
 620 //              "Characterset was changed." );
 621
 622     if( !nToken )
 623         nToken = GetNextToken();
 624
 625     bool bLooping = false;
 626
 627     while (IsParserWorking() && !bLooping)
 628     {
 629         auto nCurrentTokenIndex = m_nTokenIndex;
 630         auto nCurrentToken = nToken;
 631
 632         SaveState( nToken );
 633         switch( nToken )
 634         {
 635         case '}':
 636             if( nOpenBrackets )
 637                 goto NEXTTOKEN;
 638             eState = SvParserState::Accepted;
 639             break;
 640
 641         case '{':
 642             // an unknown group ?
 643             {
 644                 if( RTF_IGNOREFLAG != GetNextToken() )
 645                     nToken = SkipToken();
 646                 else if( RTF_UNKNOWNCONTROL != GetNextToken() )
 647                     nToken = SkipToken( -2 );
 648                 else
 649                 {
 650                     // filter immediately
 651                     ReadUnknownData();
 652                     nToken = GetNextToken();
 653                     if( '}' != nToken )
 654                         eState = SvParserState::Error;
 655                     break;      // move to next token!!
 656                 }
 657             }
 658             goto NEXTTOKEN;
 659
 660         case RTF_UNKNOWNCONTROL:
 661             break;      // skip unknown token
 662         case RTF_NEXTTYPE:
 663         case RTF_ANSITYPE:
 664             eCodeSet = RTL_TEXTENCODING_MS_1252;
 665             SetSrcEncoding( eCodeSet );
 666             break;
 667         case RTF_MACTYPE:
 668             eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN;
 669             SetSrcEncoding( eCodeSet );
 670             break;
 671         case RTF_PCTYPE:
 672             eCodeSet = RTL_TEXTENCODING_IBM_437;
 673             SetSrcEncoding( eCodeSet );
 674             break;
 675         case RTF_PCATYPE:
 676             eCodeSet = RTL_TEXTENCODING_IBM_850;
 677             SetSrcEncoding( eCodeSet );
 678             break;
 679         case RTF_ANSICPG:
 680             eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
 681             SetSrcEncoding(eCodeSet);
 682             break;
 683         default:
 684 NEXTTOKEN:
 685             NextToken( nToken );
 686             break;
 687         }
 688         if( IsParserWorking() )
 689             SaveState( 0 );         // processed till here,
 690                                     // continue with new token!
 691         nToken = GetNextToken();
 692         bLooping = nCurrentTokenIndex == m_nTokenIndex && nToken == nCurrentToken;
 693     }
 694     if( SvParserState::Accepted == eState && 0 < nOpenBrackets )
 695         eState = SvParserState::Error;
 696 }
 697
 698 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
 699 {
 700     if (eEnc == RTL_TEXTENCODING_DONTKNOW)
 701         eEnc = GetCodeSet();
 702
 703     if (!aParserStates.empty())
 704         aParserStates.top().eCodeSet = eEnc;
 705     SetSrcEncoding(eEnc);
 706 }
 707
 708 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */