comphelper/source/misc/syntaxhighlight.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include <sal/config.h>
  21
  22 #include <cassert>
  23
  24 #include <unicode/uchar.h>
  25 #include <comphelper/syntaxhighlight.hxx>
  26 #include <comphelper/string.hxx>
  27
  28 // Flags for character properties
  29 #define CHAR_START_IDENTIFIER   0x0001
  30 #define CHAR_IN_IDENTIFIER      0x0002
  31 #define CHAR_START_NUMBER       0x0004
  32 #define CHAR_IN_NUMBER          0x0008
  33 #define CHAR_IN_HEX_NUMBER      0x0010
  34 #define CHAR_IN_OCT_NUMBER      0x0020
  35 #define CHAR_START_STRING       0x0040
  36 #define CHAR_OPERATOR           0x0080
  37 #define CHAR_SPACE              0x0100
  38 #define CHAR_EOL                0x0200
  39
  40 // ##########################################################################
  41 // ATTENTION: all these words need to be in lower case
  42 // ##########################################################################
  43 static const char* strListBasicKeyWords[] = {
  44     "access",
  45     "alias",
  46     "and",
  47     "any",
  48     "append",
  49     "as",
  50     "attribute",
  51     "base",
  52     "binary",
  53     "boolean",
  54     "byref",
  55     "byte",
  56     "byval",
  57     "call",
  58     "case",
  59     "cdecl",
  60     "classmodule",
  61     "close",
  62     "compare",
  63     "compatible",
  64     "const",
  65     "currency",
  66     "date",
  67     "declare",
  68     "defbool",
  69     "defcur",
  70     "defdate",
  71     "defdbl",
  72     "deferr",
  73     "defint",
  74     "deflng",
  75     "defobj",
  76     "defsng",
  77     "defstr",
  78     "defvar",
  79     "dim",
  80     "do",
  81     "doevents",
  82     "double",
  83     "each",
  84     "else",
  85     "elseif",
  86     "end",
  87     "end enum",
  88     "end function",
  89     "end if",
  90     "end property",
  91     "end select",
  92     "end sub",
  93     "end type",
  94     "endif",
  95     "enum",
  96     "eqv",
  97     "erase",
  98     "error",
  99     "exit",
 100     "explicit",
 101     "for",
 102     "function",
 103     "get",
 104     "global",
 105     "gosub",
 106     "goto",
 107     "if",
 108     "imp",
 109     "implements",
 110     "in",
 111     "input",
 112     "integer",
 113     "is",
 114     "let",
 115     "lib",
 116     "like",
 117     "line",
 118     "line input",
 119     "local",
 120     "lock",
 121     "long",
 122     "loop",
 123     "lprint",
 124     "lset",
 125     "mod",
 126     "name",
 127     "new",
 128     "next",
 129     "not",
 130     "object",
 131     "on",
 132     "open",
 133     "option",
 134     "optional",
 135     "or",
 136     "output",
 137     "paramarray",
 138     "preserve",
 139     "print",
 140     "private",
 141     "property",
 142     "public",
 143     "random",
 144     "read",
 145     "redim",
 146     "rem",
 147     "resume",
 148     "return",
 149     "rset",
 150     "select",
 151     "set",
 152     "shared",
 153     "single",
 154     "static",
 155     "step",
 156     "stop",
 157     "string",
 158     "sub",
 159     "system",
 160     "text",
 161     "then",
 162     "to",
 163     "type",
 164     "typeof",
 165     "until",
 166     "variant",
 167     "vbasupport",
 168     "wend",
 169     "while",
 170     "with",
 171     "withevents",
 172     "write",
 173     "xor"
 174 };
 175
 176
 177 static const char* strListSqlKeyWords[] = {
 178     "all",
 179     "and",
 180     "any",
 181     "as",
 182     "asc",
 183     "avg",
 184     "between",
 185     "by",
 186     "cast",
 187     "corresponding",
 188     "count",
 189     "create",
 190     "cross",
 191     "delete",
 192     "desc",
 193     "distinct",
 194     "drop",
 195     "escape",
 196     "except",
 197     "exists",
 198     "false",
 199     "from",
 200     "full",
 201     "global",
 202     "group",
 203     "having",
 204     "in",
 205     "inner",
 206     "insert",
 207     "intersect",
 208     "into",
 209     "is",
 210     "join",
 211     "left",
 212     "like",
 213     "limit",
 214     "local",
 215     "match",
 216     "max",
 217     "min",
 218     "natural",
 219     "not",
 220     "null",
 221     "on",
 222     "or",
 223     "order",
 224     "outer",
 225     "right",
 226     "select",
 227     "set",
 228     "some",
 229     "sum",
 230     "table",
 231     "temporary",
 232     "true",
 233     "union",
 234     "unique",
 235     "unknown",
 236     "update",
 237     "using",
 238     "values",
 239     "where"
 240 };
 241
 242
 243 extern "C" int compare_strings( const void *arg1, const void *arg2 )
 244 {
 245     return strcmp( static_cast<char const *>(arg1), *static_cast<char * const *>(arg2) );
 246 }
 247
 248
 249 namespace
 250 {
 251     bool isAlpha(sal_Unicode c)
 252     {
 253         if (comphelper::string::isalphaAscii(c))
 254             return true;
 255         return u_isalpha(c);
 256     }
 257 }
 258
 259 class SyntaxHighlighter::Tokenizer
 260 {
 261     // Character information tables
 262     sal_uInt16 aCharTypeTab[256];
 263
 264     // Auxiliary function: testing of the character flags
 265     bool testCharFlags(sal_Unicode c, sal_uInt16 nTestFlags) const;
 266
 267     // Get new token, EmptyString == nothing more over there
 268     bool getNextToken(const sal_Unicode*& pos, /*out*/TokenTypes& reType,
 269         /*out*/const sal_Unicode*& rpStartPos, /*out*/const sal_Unicode*& rpEndPos) const;
 270
 271     const char** ppListKeyWords;
 272     sal_uInt16 nKeyWordCount;
 273
 274 public:
 275     HighlighterLanguage const aLanguage;
 276
 277     Tokenizer( HighlighterLanguage aLang );
 278     ~Tokenizer();
 279
 280     void getHighlightPortions(const OUString& rLine,
 281                                /*out*/std::vector<HighlightPortion>& portions) const;
 282     void setKeyWords( const char** ppKeyWords, sal_uInt16 nCount );
 283 };
 284
 285 // Helper function: test character flag
 286 bool SyntaxHighlighter::Tokenizer::testCharFlags(sal_Unicode c, sal_uInt16 nTestFlags) const
 287 {
 288     bool bRet = false;
 289     if( c != 0 && c <= 255 )
 290     {
 291         bRet = ( (aCharTypeTab[c] & nTestFlags) != 0 );
 292     }
 293     else if( c > 255 )
 294     {
 295         bRet = (( CHAR_START_IDENTIFIER | CHAR_IN_IDENTIFIER ) & nTestFlags) != 0
 296             && isAlpha(c);
 297     }
 298     return bRet;
 299 }
 300
 301 void SyntaxHighlighter::Tokenizer::setKeyWords( const char** ppKeyWords, sal_uInt16 nCount )
 302 {
 303     ppListKeyWords = ppKeyWords;
 304     nKeyWordCount = nCount;
 305 }
 306
 307 bool SyntaxHighlighter::Tokenizer::getNextToken(const sal_Unicode*& pos, /*out*/TokenTypes& reType,
 308     /*out*/const sal_Unicode*& rpStartPos, /*out*/const sal_Unicode*& rpEndPos) const
 309 {
 310     reType = TT_UNKNOWN;
 311
 312     rpStartPos = pos;
 313
 314     sal_Unicode c = *pos;
 315     if( c == 0 )
 316         return false;
 317
 318     ++pos;
 319
 320     //*** Go through all possibilities ***
 321     // Space?
 322     if ( testCharFlags( c, CHAR_SPACE ) )
 323     {
 324         while( testCharFlags( *pos, CHAR_SPACE ) )
 325             ++pos;
 326
 327         reType = TT_WHITESPACE;
 328     }
 329
 330     // Identifier?
 331     else if ( testCharFlags( c, CHAR_START_IDENTIFIER ) )
 332     {
 333         bool bIdentifierChar;
 334         do
 335         {
 336             // Naechstes Zeichen holen
 337             c = *pos;
 338             bIdentifierChar = testCharFlags( c, CHAR_IN_IDENTIFIER );
 339             if( bIdentifierChar )
 340                 ++pos;
 341         }
 342         while( bIdentifierChar );
 343
 344         reType = TT_IDENTIFIER;
 345
 346         // Keyword table
 347         if (ppListKeyWords != NULL)
 348         {
 349             int nCount = pos - rpStartPos;
 350
 351             // No keyword if string contains char > 255
 352             bool bCanBeKeyword = true;
 353             for( int i = 0 ; i < nCount ; i++ )
 354             {
 355                 if( rpStartPos[i] > 255 )
 356                 {
 357                     bCanBeKeyword = false;
 358                     break;
 359                 }
 360             }
 361
 362             if( bCanBeKeyword )
 363             {
 364                 OUString aKWString(rpStartPos, nCount);
 365                 OString aByteStr = OUStringToOString(aKWString,
 366                     RTL_TEXTENCODING_ASCII_US).toAsciiLowerCase();
 367                 if ( bsearch( aByteStr.getStr(), ppListKeyWords, nKeyWordCount, sizeof( char* ),
 368                                                                         compare_strings ) )
 369                 {
 370                     reType = TT_KEYWORDS;
 371
 372                     if( aByteStr == "rem" )
 373                     {
 374                         // Remove all characters until end of line or EOF
 375                         sal_Unicode cPeek = *pos;
 376                         while( cPeek != 0 && !testCharFlags( cPeek, CHAR_EOL ) )
 377                         {
 378                             c = *pos++;
 379                             cPeek = *pos;
 380                         }
 381
 382                         reType = TT_COMMENT;
 383                     }
 384                 }
 385             }
 386         }
 387     }
 388
 389     // Operator?
 390     // only for BASIC '\'' should be a comment, otherwise it is a normal string and handled there
 391     else if ( testCharFlags( c, CHAR_OPERATOR ) || ( (c == '\'') && (aLanguage==HIGHLIGHT_BASIC)) )
 392     {
 393         // parameters for SQL view
 394         if ( (c==':') || (c=='?'))
 395         {
 396             if (c!='?')
 397             {
 398                 bool bIdentifierChar;
 399                 do
 400                 {
 401                     // Get next character
 402                     c = *pos;
 403                     bIdentifierChar = isAlpha(c);
 404                     if( bIdentifierChar )
 405                         ++pos;
 406                 }
 407                 while( bIdentifierChar );
 408             }
 409             reType = TT_PARAMETER;
 410         }
 411         else if (c=='-')
 412         {
 413             sal_Unicode cPeekNext = *pos;
 414             if (cPeekNext=='-')
 415             {
 416                 // Remove all characters until end of line or EOF
 417                 while( cPeekNext != 0 && !testCharFlags( cPeekNext, CHAR_EOL ) )
 418                 {
 419                     ++pos;
 420                     cPeekNext = *pos;
 421                 }
 422                 reType = TT_COMMENT;
 423             }
 424         }
 425        else if (c=='/')
 426        {
 427            sal_Unicode cPeekNext = *pos;
 428            if (cPeekNext=='/')
 429            {
 430                // Remove all characters until end of line or EOF
 431                while( cPeekNext != 0 && !testCharFlags( cPeekNext, CHAR_EOL ) )
 432                {
 433                    ++pos;
 434                    cPeekNext = *pos;
 435                }
 436                reType = TT_COMMENT;
 437            }
 438        }
 439         else
 440         {
 441             // Comment?
 442             if ( c == '\'' )
 443             {
 444                 // Skip all characters until end of input or end of line:
 445                 for (;;) {
 446                     c = *pos;
 447                     if (c == 0 || testCharFlags(c, CHAR_EOL)) {
 448                         break;
 449                     }
 450                     ++pos;
 451                 }
 452
 453                 reType = TT_COMMENT;
 454             }
 455
 456             // The real operator; can be easily used since not the actual
 457             // operator (e.g. +=) is concerned, but the fact that it is one
 458             if( reType != TT_COMMENT )
 459             {
 460                 reType = TT_OPERATOR;
 461             }
 462
 463         }
 464     }
 465
 466     // Object separator? Must be handled before Number
 467     else if( c == '.' && ( *pos < '0' || *pos > '9' ) )
 468     {
 469         reType = TT_OPERATOR;
 470     }
 471
 472     // Number?
 473     else if( testCharFlags( c, CHAR_START_NUMBER ) )
 474     {
 475         reType = TT_NUMBER;
 476
 477         // Number system, 10 = normal, it is changed for Oct/Hex
 478         int nRadix = 10;
 479
 480         // Is it an Oct or a Hex number?
 481         if( c == '&' )
 482         {
 483             // Octal?
 484             if( *pos == 'o' || *pos == 'O' )
 485             {
 486                 // remove o
 487                 ++pos;
 488                 nRadix = 8;     // Octal base
 489
 490                 // Read all numbers
 491                 while( testCharFlags( *pos, CHAR_IN_OCT_NUMBER ) )
 492                     ++pos;
 493             }
 494             // Hexadecimal?
 495             else if( *pos == 'h' || *pos == 'H' )
 496             {
 497                 // remove x
 498                 ++pos;
 499                 nRadix = 16;     // Hexadecimal base
 500
 501                 // Read all numbers
 502                 while( testCharFlags( *pos, CHAR_IN_HEX_NUMBER ) )
 503                     ++pos;
 504             }
 505             else
 506             {
 507                 reType = TT_OPERATOR;
 508             }
 509         }
 510
 511         // When it is not Oct or Hex, then it is double
 512         if( reType == TT_NUMBER && nRadix == 10 )
 513         {
 514             // Flag if the last character is an exponent
 515             bool bAfterExpChar = false;
 516
 517             // Read all numbers
 518             while( testCharFlags( *pos, CHAR_IN_NUMBER ) ||
 519                     (bAfterExpChar && *pos == '+' ) ||
 520                     (bAfterExpChar && *pos == '-' ) )
 521                     // After exponent +/- are OK, too
 522             {
 523                 c = *pos++;
 524                 bAfterExpChar = ( c == 'e' || c == 'E' );
 525             }
 526         }
 527     }
 528
 529     // String?
 530     else if( testCharFlags( c, CHAR_START_STRING ) )
 531     {
 532         // Remember which character has opened the string
 533         sal_Unicode cEndString = c;
 534         if( c == '[' )
 535             cEndString = ']';
 536
 537         // Read all characters
 538         while( *pos != cEndString )
 539         {
 540             // Detect EOF before reading next char, so we do not lose EOF
 541             if( *pos == 0 )
 542             {
 543                 // ERROR: unterminated string literal
 544                 reType = TT_ERROR;
 545                 break;
 546             }
 547             c = *pos++;
 548             if( testCharFlags( c, CHAR_EOL ) )
 549             {
 550                 // ERROR: unterminated string literal
 551                 reType = TT_ERROR;
 552                 break;
 553             }
 554         }
 555
 556         if( reType != TT_ERROR )
 557         {
 558             ++pos;
 559             if( cEndString == ']' )
 560                 reType = TT_IDENTIFIER;
 561             else
 562                 reType = TT_STRING;
 563         }
 564     }
 565
 566     // End of line?
 567     else if( testCharFlags( c, CHAR_EOL ) )
 568     {
 569         // If another EOL character comes, read it
 570         sal_Unicode cNext = *pos;
 571         if( cNext != c && testCharFlags( cNext, CHAR_EOL ) )
 572             ++pos;
 573
 574         reType = TT_EOL;
 575     }
 576
 577     // All other will remain TT_UNKNOWN
 578
 579     // Save end position
 580     rpEndPos = pos;
 581     return true;
 582 }
 583
 584 SyntaxHighlighter::Tokenizer::Tokenizer( HighlighterLanguage aLang ): aLanguage(aLang)
 585 {
 586     memset( aCharTypeTab, 0, sizeof( aCharTypeTab ) );
 587
 588     // Fill character table
 589     sal_uInt16 i;
 590
 591     // Allowed characters for identifiers
 592     sal_uInt16 nHelpMask = (sal_uInt16)( CHAR_START_IDENTIFIER | CHAR_IN_IDENTIFIER );
 593     for( i = 'a' ; i <= 'z' ; i++ )
 594         aCharTypeTab[i] |= nHelpMask;
 595     for( i = 'A' ; i <= 'Z' ; i++ )
 596         aCharTypeTab[i] |= nHelpMask;
 597     aCharTypeTab[(int)'_'] |= nHelpMask;
 598     aCharTypeTab[(int)'$'] |= nHelpMask;
 599
 600     // Digit (can be identifier and number)
 601     nHelpMask = (sal_uInt16)( CHAR_IN_IDENTIFIER | CHAR_START_NUMBER |
 602                          CHAR_IN_NUMBER | CHAR_IN_HEX_NUMBER );
 603     for( i = '0' ; i <= '9' ; i++ )
 604         aCharTypeTab[i] |= nHelpMask;
 605
 606     // Add e, E, . and & here manually
 607     aCharTypeTab[(int)'e'] |= CHAR_IN_NUMBER;
 608     aCharTypeTab[(int)'E'] |= CHAR_IN_NUMBER;
 609     aCharTypeTab[(int)'.'] |= (sal_uInt16)( CHAR_IN_NUMBER | CHAR_START_NUMBER );
 610     aCharTypeTab[(int)'&'] |= CHAR_START_NUMBER;
 611
 612     // Hexadecimal digit
 613     for( i = 'a' ; i <= 'f' ; i++ )
 614         aCharTypeTab[i] |= CHAR_IN_HEX_NUMBER;
 615     for( i = 'A' ; i <= 'F' ; i++ )
 616         aCharTypeTab[i] |= CHAR_IN_HEX_NUMBER;
 617
 618     // Octal digit
 619     for( i = '0' ; i <= '7' ; i++ )
 620         aCharTypeTab[i] |= CHAR_IN_OCT_NUMBER;
 621
 622     // String literal start/end characters
 623     aCharTypeTab[(int)'\''] |= CHAR_START_STRING;
 624     aCharTypeTab[(int)'\"'] |= CHAR_START_STRING;
 625     aCharTypeTab[(int)'[']  |= CHAR_START_STRING;
 626     aCharTypeTab[(int)'`']  |= CHAR_START_STRING;
 627
 628     // Operator characters
 629     aCharTypeTab[(int)'!'] |= CHAR_OPERATOR;
 630     aCharTypeTab[(int)'%'] |= CHAR_OPERATOR;
 631     // aCharTypeTab[(int)'&'] |= CHAR_OPERATOR;     Removed because of #i14140
 632     aCharTypeTab[(int)'('] |= CHAR_OPERATOR;
 633     aCharTypeTab[(int)')'] |= CHAR_OPERATOR;
 634     aCharTypeTab[(int)'*'] |= CHAR_OPERATOR;
 635     aCharTypeTab[(int)'+'] |= CHAR_OPERATOR;
 636     aCharTypeTab[(int)','] |= CHAR_OPERATOR;
 637     aCharTypeTab[(int)'-'] |= CHAR_OPERATOR;
 638     aCharTypeTab[(int)'/'] |= CHAR_OPERATOR;
 639     aCharTypeTab[(int)':'] |= CHAR_OPERATOR;
 640     aCharTypeTab[(int)'<'] |= CHAR_OPERATOR;
 641     aCharTypeTab[(int)'='] |= CHAR_OPERATOR;
 642     aCharTypeTab[(int)'>'] |= CHAR_OPERATOR;
 643     aCharTypeTab[(int)'?'] |= CHAR_OPERATOR;
 644     aCharTypeTab[(int)'^'] |= CHAR_OPERATOR;
 645     aCharTypeTab[(int)'|'] |= CHAR_OPERATOR;
 646     aCharTypeTab[(int)'~'] |= CHAR_OPERATOR;
 647     aCharTypeTab[(int)'{'] |= CHAR_OPERATOR;
 648     aCharTypeTab[(int)'}'] |= CHAR_OPERATOR;
 649     // aCharTypeTab[(int)'['] |= CHAR_OPERATOR;     Removed because of #i17826
 650     aCharTypeTab[(int)']'] |= CHAR_OPERATOR;
 651     aCharTypeTab[(int)';'] |= CHAR_OPERATOR;
 652
 653     // Space
 654     aCharTypeTab[(int)' ' ] |= CHAR_SPACE;
 655     aCharTypeTab[(int)'\t'] |= CHAR_SPACE;
 656
 657     // End of line characters
 658     aCharTypeTab[(int)'\r'] |= CHAR_EOL;
 659     aCharTypeTab[(int)'\n'] |= CHAR_EOL;
 660
 661     ppListKeyWords = NULL;
 662     nKeyWordCount = 0;
 663 }
 664
 665 SyntaxHighlighter::Tokenizer::~Tokenizer()
 666 {
 667 }
 668
 669 void SyntaxHighlighter::Tokenizer::getHighlightPortions(const OUString& rLine,
 670                                                  /*out*/std::vector<HighlightPortion>& portions) const
 671 {
 672     // Set the position to the beginning of the source string
 673     const sal_Unicode* pos = rLine.getStr();
 674
 675     // Variables for the out parameter
 676     TokenTypes eType;
 677     const sal_Unicode* pStartPos;
 678     const sal_Unicode* pEndPos;
 679
 680     // Loop over all the tokens
 681     while( getNextToken( pos, eType, pStartPos, pEndPos ) )
 682     {
 683         portions.push_back(
 684             HighlightPortion(
 685                 pStartPos - rLine.getStr(), pEndPos - rLine.getStr(), eType));
 686     }
 687 }
 688
 689
 690 SyntaxHighlighter::SyntaxHighlighter(HighlighterLanguage language):
 691     eLanguage(language), m_tokenizer(new SyntaxHighlighter::Tokenizer(language))
 692 {
 693     switch (eLanguage)
 694     {
 695         case HIGHLIGHT_BASIC:
 696             m_tokenizer->setKeyWords( strListBasicKeyWords,
 697                                             sizeof( strListBasicKeyWords ) / sizeof( char* ));
 698             break;
 699         case HIGHLIGHT_SQL:
 700             m_tokenizer->setKeyWords( strListSqlKeyWords,
 701                                             sizeof( strListSqlKeyWords ) / sizeof( char* ));
 702             break;
 703         default:
 704             assert(false); // this cannot happen
 705     }
 706 }
 707
 708 SyntaxHighlighter::~SyntaxHighlighter() {}
 709
 710 void SyntaxHighlighter::getHighlightPortions(const OUString& rLine,
 711                                               /*out*/std::vector<HighlightPortion>& portions) const
 712 {
 713     m_tokenizer->getHighlightPortions( rLine, portions );
 714 }
 715
 716 HighlighterLanguage SyntaxHighlighter::GetLanguage()
 717 {
 718     return m_tokenizer->aLanguage;
 719 }
 720
 721 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */