svtools/source/svrtf/parrtf.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20
  21 #include <stdio.h>                      // for EOF
  22 #include <rtl/tencinfo.h>
  23 #include <tools/stream.hxx>
  24 #include <tools/debug.hxx>
  25 #include <svtools/rtftoken.h>
  26 #include <svtools/rtfkeywd.hxx>
  27 #include <svtools/parrtf.hxx>
  28 #include <comphelper/string.hxx>
  29
  30 const int MAX_STRING_LEN = 1024;
  31 const int MAX_TOKEN_LEN = 128;
  32
  33 #define RTF_ISDIGIT( c ) comphelper::string::isdigitAscii(c)
  34 #define RTF_ISALPHA( c ) comphelper::string::isalphaAscii(c)
  35
  36 SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize )
  37     : SvParser( rIn, nStackSize ),
  38     eUNICodeSet( RTL_TEXTENCODING_MS_1252 ),    // default ist ANSI-CodeSet
  39     nUCharOverread( 1 )
  40 {
  41     // default ist ANSI-CodeSet
  42     SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
  43     bRTF_InTextRead = false;
  44 }
  45
  46 SvRTFParser::~SvRTFParser()
  47 {
  48 }
  49
  50
  51
  52
  53 int SvRTFParser::_GetNextToken()
  54 {
  55     int nRet = 0;
  56     do {
  57         int bNextCh = true;
  58         switch( nNextCh )
  59         {
  60         case '\\':
  61             {
  62                 // Steuerzeichen
  63                 switch( nNextCh = GetNextChar() )
  64                 {
  65                 case '{':
  66                 case '}':
  67                 case '\\':
  68                 case '+':       // habe ich in einem RTF-File gefunden
  69                 case '~':       // nonbreaking space
  70                 case '-':       // optional hyphen
  71                 case '_':       // nonbreaking hyphen
  72                 case '\'':      // HexValue
  73                     nNextCh = '\\';
  74                     rInput.SeekRel( -1 );
  75                     ScanText();
  76                     nRet = RTF_TEXTTOKEN;
  77                     bNextCh = 0 == nNextCh;
  78                     break;
  79
  80                 case '*':       // ignoreflag
  81                     nRet = RTF_IGNOREFLAG;
  82                     break;
  83                 case ':':       // subentry in an index entry
  84                     nRet = RTF_SUBENTRYINDEX;
  85                     break;
  86                 case '|':       // formula-charakter
  87                     nRet = RTF_FORMULA;
  88                     break;
  89
  90                 case 0x0a:
  91                 case 0x0d:
  92                     nRet = RTF_PAR;
  93                     break;
  94
  95                 default:
  96                     if( RTF_ISALPHA( nNextCh ) )
  97                     {
  98                         aToken = '\\';
  99                         {
 100                             String aStrBuffer;
 101                             sal_Unicode* pStr = aStrBuffer.AllocBuffer(
 102                                                             MAX_TOKEN_LEN );
 103                             xub_StrLen nStrLen = 0;
 104                             do {
 105                                 *(pStr + nStrLen++) = nNextCh;
 106                                 if( MAX_TOKEN_LEN == nStrLen )
 107                                 {
 108                                     aToken += aStrBuffer;
 109                                     aToken.GetBufferAccess();  // make unique string!
 110                                     nStrLen = 0;
 111                                 }
 112                                 nNextCh = GetNextChar();
 113                             } while( RTF_ISALPHA( nNextCh ) );
 114                             if( nStrLen )
 115                             {
 116                                 aStrBuffer.ReleaseBufferAccess( nStrLen );
 117                                 aToken += aStrBuffer;
 118                             }
 119                         }
 120
 121                         // Minus fuer numerischen Parameter
 122                         int bNegValue = false;
 123                         if( '-' == nNextCh )
 124                         {
 125                             bNegValue = true;
 126                             nNextCh = GetNextChar();
 127                         }
 128
 129                         // evt. Numerischer Parameter
 130                         if( RTF_ISDIGIT( nNextCh ) )
 131                         {
 132                             nTokenValue = 0;
 133                             do {
 134                                 nTokenValue *= 10;
 135                                 nTokenValue += nNextCh - '0';
 136                                 nNextCh = GetNextChar();
 137                             } while( RTF_ISDIGIT( nNextCh ) );
 138                             if( bNegValue )
 139                                 nTokenValue = -nTokenValue;
 140                             bTokenHasValue=true;
 141                         }
 142                         else if( bNegValue )        // das Minus wieder zurueck
 143                         {
 144                             nNextCh = '-';
 145                             rInput.SeekRel( -1 );
 146                         }
 147                         if( ' ' == nNextCh )        // Blank gehoert zum Token!
 148                             nNextCh = GetNextChar();
 149
 150                         // suche das Token in der Tabelle:
 151                         if( 0 == (nRet = GetRTFToken( aToken )) )
 152                             // Unknown Control
 153                             nRet = RTF_UNKNOWNCONTROL;
 154
 155                         // bug 76812 - unicode token handled as normal text
 156                         bNextCh = false;
 157                         switch( nRet )
 158                         {
 159                         case RTF_UC:
 160                             if( 0 <= nTokenValue )
 161                             {
 162                                 nUCharOverread = (sal_uInt8)nTokenValue;
 163                                 //cmc: other ifdef breaks #i3584
 164                                 aParserStates.top().
 165                                     nUCharOverread = nUCharOverread;
 166                             }
 167                             aToken.Erase(); // #i47831# erase token to prevent the token from beeing treated as text
 168                             // read next token
 169                             nRet = 0;
 170                             break;
 171
 172                         case RTF_UPR:
 173                             if (!_inSkipGroup) {
 174                             // UPR - overread the group with the ansi
 175                             //       information
 176                             while( '{' != _GetNextToken() )
 177                                 ;
 178                             SkipGroup();
 179                             _GetNextToken();  // overread the last bracket
 180                             nRet = 0;
 181                             }
 182                             break;
 183
 184                         case RTF_U:
 185                             if( !bRTF_InTextRead )
 186                             {
 187                                 nRet = RTF_TEXTTOKEN;
 188                                 aToken = (sal_Unicode)nTokenValue;
 189
 190                                 // overread the next n "RTF" characters. This
 191                                 // can be also \{, \}, \'88
 192                                 for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
 193                                 {
 194                                     sal_Unicode cAnsi = nNextCh;
 195                                     while( 0xD == cAnsi )
 196                                         cAnsi = GetNextChar();
 197                                     while( 0xA == cAnsi )
 198                                         cAnsi = GetNextChar();
 199
 200                                     if( '\\' == cAnsi &&
 201                                         '\'' == ( cAnsi = GetNextChar() ))
 202                                         // HexValue ueberlesen
 203                                         cAnsi = GetHexValue();
 204                                     nNextCh = GetNextChar();
 205                                 }
 206                                 ScanText();
 207                                 bNextCh = 0 == nNextCh;
 208                             }
 209                             break;
 210                         }
 211                     }
 212                     else if( SVPAR_PENDING != eState )
 213                     {
 214                         // Bug 34631 - "\ " ueberlesen - Blank als Zeichen
 215                         // eState = SVPAR_ERROR;
 216                         bNextCh = false;
 217                     }
 218                     break;
 219                 }
 220             }
 221             break;
 222
 223         case sal_Unicode(EOF):
 224             eState = SVPAR_ACCEPTED;
 225             nRet = nNextCh;
 226             break;
 227
 228         case '{':
 229             {
 230                 if( 0 <= nOpenBrakets )
 231                 {
 232                     RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
 233                     aParserStates.push( aState );
 234                 }
 235                 ++nOpenBrakets;
 236                 DBG_ASSERT(
 237                     static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
 238                     "ParserStateStack unequal to bracket count" );
 239                 nRet = nNextCh;
 240             }
 241             break;
 242
 243         case '}':
 244             --nOpenBrakets;
 245             if( 0 <= nOpenBrakets )
 246             {
 247                 aParserStates.pop();
 248                 if( !aParserStates.empty() )
 249                 {
 250                     const RtfParserState_Impl& rRPS =
 251                             aParserStates.top();
 252                     nUCharOverread = rRPS.nUCharOverread;
 253                     SetSrcEncoding( rRPS.eCodeSet );
 254                 }
 255                 else
 256                 {
 257                     nUCharOverread = 1;
 258                     SetSrcEncoding( GetCodeSet() );
 259                 }
 260             }
 261             DBG_ASSERT(
 262                 static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
 263                 "ParserStateStack unequal to bracket count" );
 264             nRet = nNextCh;
 265             break;
 266
 267         case 0x0d:
 268         case 0x0a:
 269             break;
 270
 271         default:
 272             // es folgt normaler Text
 273             ScanText();
 274             nRet = RTF_TEXTTOKEN;
 275             bNextCh = 0 == nNextCh;
 276             break;
 277         }
 278
 279         if( bNextCh )
 280             nNextCh = GetNextChar();
 281
 282     } while( !nRet && SVPAR_WORKING == eState );
 283     return nRet;
 284 }
 285
 286
 287 sal_Unicode SvRTFParser::GetHexValue()
 288 {
 289     // Hex-Wert sammeln
 290     register int n;
 291     register sal_Unicode nHexVal = 0;
 292
 293     for( n = 0; n < 2; ++n )
 294     {
 295         nHexVal *= 16;
 296         nNextCh = GetNextChar();
 297         if( nNextCh >= '0' && nNextCh <= '9' )
 298             nHexVal += (nNextCh - 48);
 299         else if( nNextCh >= 'a' && nNextCh <= 'f' )
 300             nHexVal += (nNextCh - 87);
 301         else if( nNextCh >= 'A' && nNextCh <= 'F' )
 302             nHexVal += (nNextCh - 55);
 303     }
 304     return nHexVal;
 305 }
 306
 307 void SvRTFParser::ScanText( const sal_Unicode cBreak )
 308 {
 309     String aStrBuffer;
 310     int bWeiter = true;
 311     while( bWeiter && IsParserWorking() && aStrBuffer.Len() < MAX_STRING_LEN)
 312     {
 313         int bNextCh = true;
 314         switch( nNextCh )
 315         {
 316         case '\\':
 317             {
 318                 switch (nNextCh = GetNextChar())
 319                 {
 320                 case '\'':
 321                     {
 322
 323                         OStringBuffer aByteString;
 324                         while (1)
 325                         {
 326                             char c = (char)GetHexValue();
 327                             /*
 328                              * Note: \'00 is a valid internal character in  a
 329                              * string in RTF. OStringBuffer supports
 330                              * appending nulls fine
 331                              */
 332                             aByteString.append(c);
 333
 334                             bool bBreak = false;
 335                             sal_Char nSlash = '\\';
 336                             while (!bBreak)
 337                             {
 338                                 wchar_t __next=GetNextChar();
 339                                 if (__next>0xFF) // fix for #i43933# and #i35653#
 340                                 {
 341                                     if (aByteString.getLength())
 342                                         aStrBuffer.Append(String(OStringToOUString(aByteString.makeStringAndClear(), GetSrcEncoding())));
 343                                     aStrBuffer.Append((sal_Unicode)__next);
 344
 345                                     continue;
 346                                 }
 347                                 nSlash = (sal_Char)__next;
 348                                 while (nSlash == 0xD || nSlash == 0xA)
 349                                     nSlash = (sal_Char)GetNextChar();
 350
 351                                 switch (nSlash)
 352                                 {
 353                                     case '{':
 354                                     case '}':
 355                                     case '\\':
 356                                         bBreak = true;
 357                                         break;
 358                                     default:
 359                                         aByteString.append(nSlash);
 360                                         break;
 361                                 }
 362                             }
 363
 364                             nNextCh = GetNextChar();
 365
 366                             if (nSlash != '\\' || nNextCh != '\'')
 367                             {
 368                                 rInput.SeekRel(-1);
 369                                 nNextCh = nSlash;
 370                                 break;
 371                             }
 372                         }
 373
 374                         bNextCh = false;
 375
 376                         if (aByteString.getLength())
 377                             aStrBuffer.Append(String(OStringToOUString(aByteString.makeStringAndClear(), GetSrcEncoding())));
 378                     }
 379                     break;
 380                 case '\\':
 381                 case '}':
 382                 case '{':
 383                 case '+':       // habe ich in einem RTF-File gefunden
 384                     aStrBuffer.Append(nNextCh);
 385                     break;
 386                 case '~':       // nonbreaking space
 387                     aStrBuffer.Append(static_cast< sal_Unicode >(0xA0));
 388                     break;
 389                 case '-':       // optional hyphen
 390                     aStrBuffer.Append(static_cast< sal_Unicode >(0xAD));
 391                     break;
 392                 case '_':       // nonbreaking hyphen
 393                     aStrBuffer.Append(static_cast< sal_Unicode >(0x2011));
 394                     break;
 395
 396                 case 'u':
 397                     // UNI-Code Zeichen lesen
 398                     {
 399                         nNextCh = GetNextChar();
 400                         rInput.SeekRel( -2 );
 401
 402                         if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
 403                         {
 404                             bRTF_InTextRead = true;
 405
 406                             String sSave( aToken );
 407                             nNextCh = '\\';
 408                             #ifdef DBG_UTIL
 409                             int nToken =
 410                             #endif
 411                                 _GetNextToken();
 412                             DBG_ASSERT( RTF_U == nToken, "doch kein UNI-Code Zeichen" );
 413                             // dont convert symbol chars
 414                             aStrBuffer.Append(
 415                                 static_cast< sal_Unicode >(nTokenValue));
 416
 417                             // overread the next n "RTF" characters. This
 418                             // can be also \{, \}, \'88
 419                             for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
 420                             {
 421                                 sal_Unicode cAnsi = nNextCh;
 422                                 while( 0xD == cAnsi )
 423                                     cAnsi = GetNextChar();
 424                                 while( 0xA == cAnsi )
 425                                     cAnsi = GetNextChar();
 426
 427                                 if( '\\' == cAnsi &&
 428                                     '\'' == ( cAnsi = GetNextChar() ))
 429                                     // HexValue ueberlesen
 430                                     cAnsi = GetHexValue();
 431                                 nNextCh = GetNextChar();
 432                             }
 433                             bNextCh = false;
 434                             aToken = sSave;
 435                             bRTF_InTextRead = false;
 436                         }
 437                         else
 438                         {
 439                             nNextCh = '\\';
 440                             bWeiter = false;        // Abbrechen, String zusammen
 441                         }
 442                     }
 443                     break;
 444
 445                 default:
 446                     rInput.SeekRel( -1 );
 447                     nNextCh = '\\';
 448                     bWeiter = false;        // Abbrechen, String zusammen
 449                     break;
 450                 }
 451             }
 452             break;
 453
 454         case sal_Unicode(EOF):
 455                 eState = SVPAR_ERROR;
 456                 // weiter
 457         case '{':
 458         case '}':
 459             bWeiter = false;
 460             break;
 461
 462         case 0x0a:
 463         case 0x0d:
 464             break;
 465
 466         default:
 467             if( nNextCh == cBreak || aStrBuffer.Len() >= MAX_STRING_LEN)
 468                 bWeiter = false;
 469             else
 470             {
 471                 do {
 472                     // alle anderen Zeichen kommen in den Text
 473                     aStrBuffer.Append(nNextCh);
 474
 475                     if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
 476                     {
 477                         if (aStrBuffer.Len())
 478                             aToken += aStrBuffer;
 479                         return;
 480                     }
 481                 } while
 482                 (
 483                     (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
 484                     (aStrBuffer.Len() < MAX_STRING_LEN)
 485                 );
 486                 bNextCh = false;
 487             }
 488         }
 489
 490         if( bWeiter && bNextCh )
 491             nNextCh = GetNextChar();
 492     }
 493
 494     if (aStrBuffer.Len())
 495         aToken += aStrBuffer;
 496 }
 497
 498
 499 short SvRTFParser::_inSkipGroup=0;
 500
 501 void SvRTFParser::SkipGroup()
 502 {
 503 short nBrackets=1;
 504 if (_inSkipGroup>0)
 505     return;
 506 _inSkipGroup++;
 507 //#i16185# fecking \bin keyword
 508     do
 509     {
 510         switch (nNextCh)
 511         {
 512             case '{':
 513                 ++nBrackets;
 514                 break;
 515             case '}':
 516                 if (!--nBrackets) {
 517                     _inSkipGroup--;
 518                     return;
 519                 }
 520                 break;
 521         }
 522         int nToken = _GetNextToken();
 523         if (nToken == RTF_BIN)
 524         {
 525             rInput.SeekRel(-1);
 526             rInput.SeekRel(nTokenValue);
 527             nNextCh = GetNextChar();
 528         }
 529         while (nNextCh==0xa || nNextCh==0xd)
 530         {
 531             nNextCh = GetNextChar();
 532         }
 533     } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
 534
 535     if( SVPAR_PENDING != eState && '}' != nNextCh )
 536         eState = SVPAR_ERROR;
 537     _inSkipGroup--;
 538 }
 539
 540 void SvRTFParser::ReadUnknownData() { SkipGroup(); }
 541 void SvRTFParser::ReadBitmapData()  { SkipGroup(); }
 542 void SvRTFParser::ReadOLEData()     { SkipGroup(); }
 543
 544
 545 SvParserState SvRTFParser::CallParser()
 546 {
 547     sal_Char cFirstCh;
 548     nNextChPos = rInput.Tell();
 549     rInput >> cFirstCh; nNextCh = cFirstCh;
 550     eState = SVPAR_WORKING;
 551     nOpenBrakets = 0;
 552     SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
 553     eUNICodeSet = RTL_TEXTENCODING_MS_1252;     // default ist ANSI-CodeSet
 554
 555     // die 1. beiden Token muessen '{' und \\rtf sein !!
 556     if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
 557     {
 558         AddRef();
 559         Continue( 0 );
 560         if( SVPAR_PENDING != eState )
 561             ReleaseRef();       // dann brauchen wir den Parser nicht mehr!
 562     }
 563     else
 564         eState = SVPAR_ERROR;
 565
 566     return eState;
 567 }
 568
 569 void SvRTFParser::Continue( int nToken )
 570 {
 571 //  DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
 572 //              "Zeichensatz wurde geaendert." );
 573
 574     if( !nToken )
 575         nToken = GetNextToken();
 576
 577     while( IsParserWorking() )
 578     {
 579         SaveState( nToken );
 580         switch( nToken )
 581         {
 582         case '}':
 583             if( nOpenBrakets )
 584                 goto NEXTTOKEN;
 585             eState = SVPAR_ACCEPTED;
 586             break;
 587
 588         case '{':
 589             // eine unbekannte Gruppe ?
 590             {
 591                 if( RTF_IGNOREFLAG != GetNextToken() )
 592                     nToken = SkipToken( -1 );
 593                 else if( RTF_UNKNOWNCONTROL != GetNextToken() )
 594                     nToken = SkipToken( -2 );
 595                 else
 596                 {
 597                     // gleich herausfiltern
 598                     ReadUnknownData();
 599                     nToken = GetNextToken();
 600                     if( '}' != nToken )
 601                         eState = SVPAR_ERROR;
 602                     break;      // auf zum naechsten Token!!
 603                 }
 604             }
 605             goto NEXTTOKEN;
 606
 607         case RTF_UNKNOWNCONTROL:
 608             break;      // unbekannte Token ueberspringen
 609         case RTF_NEXTTYPE:
 610         case RTF_ANSITYPE:
 611             SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
 612             break;
 613         case RTF_MACTYPE:
 614             SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN );
 615             break;
 616         case RTF_PCTYPE:
 617             SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_437 );
 618             break;
 619         case RTF_PCATYPE:
 620             SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_850 );
 621             break;
 622         case RTF_ANSICPG:
 623             eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
 624             SetSrcEncoding(eCodeSet);
 625             break;
 626         default:
 627 NEXTTOKEN:
 628             NextToken( nToken );
 629             break;
 630         }
 631         if( IsParserWorking() )
 632             SaveState( 0 );         // bis hierhin abgearbeitet,
 633                                     // weiter mit neuem Token!
 634         nToken = GetNextToken();
 635     }
 636     if( SVPAR_ACCEPTED == eState && 0 < nOpenBrakets )
 637         eState = SVPAR_ERROR;
 638 }
 639
 640 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
 641 {
 642     if (eEnc == RTL_TEXTENCODING_DONTKNOW)
 643         eEnc = GetCodeSet();
 644
 645     if (!aParserStates.empty())
 646         aParserStates.top().eCodeSet = eEnc;
 647     SetSrcEncoding(eEnc);
 648 }
 649
 650 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */