comphelper/source/misc/syntaxhighlight.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include <sal/config.h>
  21
  22 #include <cassert>
  23
  24 #include <rtl/character.hxx>
  25 #include <unicode/uchar.h>
  26 #include <comphelper/syntaxhighlight.hxx>
  27 #include <o3tl/typed_flags_set.hxx>
  28
  29 namespace {
  30
  31 // Flags for character properties
  32 enum class CharFlags {
  33     StartIdentifier   = 0x0001,
  34     InIdentifier      = 0x0002,
  35     StartNumber       = 0x0004,
  36     InNumber          = 0x0008,
  37     InHexNumber       = 0x0010,
  38     InOctNumber       = 0x0020,
  39     StartString       = 0x0040,
  40     Operator          = 0x0080,
  41     Space             = 0x0100,
  42     EOL               = 0x0200
  43 };
  44
  45 }
  46
namespace o3tl {
    // Registers CharFlags with o3tl's typed-flags machinery. The mask 0x03ff
    // is the union of all ten CharFlags bits (0x0001 .. 0x0200).
    // NOTE(review): presumably this is what provides the |, & and |= operators
    // used on CharFlags further down in this file - confirm in
    // o3tl/typed_flags_set.hxx.
    template<> struct typed_flags<CharFlags> : is_typed_flags<CharFlags, 0x03ff> {};
}
  50
// ##########################################################################
// Keyword table for HighlighterLanguage::Basic.
//
// ATTENTION: all these words need to be in lower case, because
// getNextToken() lower-cases an identifier before looking it up.
//
// ATTENTION: the list must stay sorted in strcmp() order, because the lookup
// is a bsearch() using compare_strings().
//
// NOTE(review): the entries containing a space ("end enum", "line input", ...)
// can apparently never match through getNextToken(), since the identifier
// tokens it looks up contain no space characters - confirm whether they are
// still needed.
// ##########################################################################
static const char* strListBasicKeyWords[] = {
    "access",
    "alias",
    "and",
    "any",
    "append",
    "as",
    "attribute",
    "base",
    "binary",
    "boolean",
    "byref",
    "byte",
    "byval",
    "call",
    "case",
    "cdecl",
    "classmodule",
    "close",
    "compare",
    "compatible",
    "const",
    "currency",
    "date",
    "declare",
    "defbool",
    "defcur",
    "defdate",
    "defdbl",
    "deferr",
    "defint",
    "deflng",
    "defobj",
    "defsng",
    "defstr",
    "defvar",
    "dim",
    "do",
    "doevents",
    "double",
    "each",
    "else",
    "elseif",
    "end",
    "end enum",
    "end function",
    "end if",
    "end property",
    "end select",
    "end sub",
    "end type",
    "endif",
    "enum",
    "eqv",
    "erase",
    "error",
    "exit",
    "explicit",
    "for",
    "function",
    "get",
    "global",
    "gosub",
    "goto",
    "if",
    "imp",
    "implements",
    "in",
    "input",
    "integer",
    "is",
    "let",
    "lib",
    "like",
    "line",
    "line input",
    "local",
    "lock",
    "long",
    "loop",
    "lprint",
    "lset",
    "mod",
    "name",
    "new",
    "next",
    "not",
    "object",
    "on",
    "open",
    "option",
    "optional",
    "or",
    "output",
    "paramarray",
    "preserve",
    "print",
    "private",
    "property",
    "public",
    "random",
    "read",
    "redim",
    "rem",
    "resume",
    "return",
    "rset",
    "select",
    "set",
    "shared",
    "single",
    "static",
    "step",
    "stop",
    "string",
    "sub",
    "system",
    "text",
    "then",
    "to",
    "type",
    "typeof",
    "until",
    "variant",
    "vbasupport",
    "wend",
    "while",
    "with",
    "withevents",
    "write",
    "xor"
};
 186
 187
// Keyword table for HighlighterLanguage::SQL.
// Same rules as for the Basic table: all words in lower case (identifiers are
// lower-cased before lookup) and sorted in strcmp() order (the lookup is a
// bsearch() using compare_strings()).
static const char* strListSqlKeyWords[] = {
    "all",
    "and",
    "any",
    "as",
    "asc",
    "avg",
    "between",
    "by",
    "cast",
    "corresponding",
    "count",
    "create",
    "cross",
    "delete",
    "desc",
    "distinct",
    "drop",
    "escape",
    "except",
    "exists",
    "false",
    "from",
    "full",
    "global",
    "group",
    "having",
    "in",
    "inner",
    "insert",
    "intersect",
    "into",
    "is",
    "join",
    "left",
    "like",
    "limit",
    "local",
    "match",
    "max",
    "min",
    "natural",
    "not",
    "null",
    "on",
    "or",
    "order",
    "outer",
    "right",
    "select",
    "set",
    "some",
    "sum",
    "table",
    "temporary",
    "true",
    "union",
    "unique",
    "unknown",
    "update",
    "using",
    "values",
    "where"
};
 252
 253
 254 extern "C" {
 255
 256 static int compare_strings( const void *arg1, const void *arg2 )
 257 {
 258     return strcmp( static_cast<char const *>(arg1), *static_cast<char * const *>(arg2) );
 259 }
 260
 261 }
 262
 263 namespace
 264 {
 265     bool isAlpha(sal_Unicode c)
 266     {
 267         if (rtl::isAsciiAlpha(c))
 268             return true;
 269         return u_isalpha(c);
 270     }
 271 }
 272
// Splits a line of source text into typed tokens for one language.
// The per-character classification table is built in the constructor; the
// keyword table is supplied afterwards via setKeyWords().
class SyntaxHighlighter::Tokenizer
{
    // Character information table: one CharFlags set per code unit 0..255,
    // zero-initialized here and filled in by the constructor
    CharFlags aCharTypeTab[256] = {};

    // Auxiliary function: tests whether c has any of the flags in nTestFlags
    bool testCharFlags(sal_Unicode c, CharFlags nTestFlags) const;

    // Get the next token starting at pos and advance pos behind it.
    // Returns false when pos == end, i.e. there is nothing more to read.
    bool getNextToken(std::u16string_view::const_iterator& pos, std::u16string_view::const_iterator end, /*out*/TokenType& reType,
        /*out*/std::u16string_view::const_iterator& rpStartPos, /*out*/std::u16string_view::const_iterator& rpEndPos) const;

    // Keyword table (not owned) and its number of entries; nullptr / 0 until
    // setKeyWords() is called
    const char** ppListKeyWords;
    sal_uInt16 nKeyWordCount;

public:
    // Language this tokenizer was created for; fixed at construction
    HighlighterLanguage const aLanguage;

    explicit Tokenizer( HighlighterLanguage aLang );

    // Tokenizes rLine and appends one HighlightPortion per token to portions
    void getHighlightPortions(std::u16string_view rLine,
                               /*out*/std::vector<HighlightPortion>& portions) const;
    // Installs the keyword table used to tell keywords from plain identifiers;
    // the table is searched with bsearch(), so it has to be sorted
    void setKeyWords( const char** ppKeyWords, sal_uInt16 nCount );
};
 297
 298 // Helper function: test character flag
 299 bool SyntaxHighlighter::Tokenizer::testCharFlags(sal_Unicode c, CharFlags nTestFlags) const
 300 {
 301     bool bRet = false;
 302     if( c != 0 && c <= 255 )
 303     {
 304         bRet = bool(aCharTypeTab[c] & nTestFlags);
 305     }
 306     else if( c > 255 )
 307     {
 308         bRet = (( CharFlags::StartIdentifier | CharFlags::InIdentifier ) & nTestFlags)
 309             && isAlpha(c);
 310     }
 311     return bRet;
 312 }
 313
// Installs the keyword table used by getNextToken() to tell keywords from
// plain identifiers. Only the pointer is stored, nothing is copied.
// ppKeyWords  array of nCount C strings; getNextToken() searches it with
//             bsearch()/strcmp() after lower-casing the identifier, so it has
//             to be sorted in strcmp() order and be all lower case
// nCount      number of entries in ppKeyWords
void SyntaxHighlighter::Tokenizer::setKeyWords( const char** ppKeyWords, sal_uInt16 nCount )
{
    ppListKeyWords = ppKeyWords;
    nKeyWordCount = nCount;
}
 319
// Scans one token starting at pos and advances pos behind it.
//
// pos         in/out: current read position; on return it points just behind
//             the recognised token
// end         end of the input
// reType      out: type of the token; stays TokenType::Unknown if none of the
//             branches below matched the first character
// rpStartPos  out: start of the token (the value pos had on entry)
// rpEndPos    out: one past the last character of the token; NOT written when
//             the function returns false
//
// Returns false if pos == end on entry (nothing left to scan), true otherwise.
//
// The branches form a single if / else-if chain, so their order is
// significant: the first character class that matches decides the token type.
bool SyntaxHighlighter::Tokenizer::getNextToken(std::u16string_view::const_iterator& pos, std::u16string_view::const_iterator end,
    /*out*/TokenType& reType,
    /*out*/std::u16string_view::const_iterator& rpStartPos, /*out*/std::u16string_view::const_iterator& rpEndPos) const
{
    reType = TokenType::Unknown;

    rpStartPos = pos;

    if( pos == end )
        return false;

    // Consume the first character; every token is at least one character long
    sal_Unicode c = *pos;
    ++pos;

    //*** Go through all possibilities ***
    // Space?
    if ( testCharFlags( c, CharFlags::Space ) )
    {
        // A run of consecutive space characters forms one whitespace token
        while( pos != end && testCharFlags( *pos, CharFlags::Space ) )
            ++pos;

        reType = TokenType::Whitespace;
    }

    // Identifier?
    else if ( testCharFlags( c, CharFlags::StartIdentifier ) )
    {
        // Extend the token while the next character may appear inside an
        // identifier (bIdentifierChar is always assigned before it is read,
        // because the only other way out of the loop is the break)
        bool bIdentifierChar;
        do
        {
            if (pos == end)
                break;
            // Fetch next character
            c = *pos;
            bIdentifierChar = testCharFlags( c, CharFlags::InIdentifier );
            if( bIdentifierChar )
                ++pos;
        }
        while( bIdentifierChar );

        reType = TokenType::Identifier;

        // Keyword table
        if (ppListKeyWords != nullptr)
        {
            int nCount = pos - rpStartPos;

            // No keyword if string contains char > 255
            bool bCanBeKeyword = true;
            for( int i = 0 ; i < nCount ; i++ )
            {
                if( rpStartPos[i] > 255 )
                {
                    bCanBeKeyword = false;
                    break;
                }
            }

            if( bCanBeKeyword )
            {
                // Keywords are matched case-insensitively: lower-case the
                // identifier and binary-search the (sorted, lower-case) table
                std::u16string_view aKWString(&*rpStartPos, nCount);
                OString aByteStr = OUStringToOString(aKWString,
                    RTL_TEXTENCODING_ASCII_US).toAsciiLowerCase();
                if ( bsearch( aByteStr.getStr(), ppListKeyWords, nKeyWordCount, sizeof( char* ),
                                                                        compare_strings ) )
                {
                    reType = TokenType::Keywords;

                    // The keyword "rem" starts a comment that extends to the
                    // end of the line
                    if( aByteStr == "rem" )
                    {
                        // Skip all characters until end of line or end of input
                        for (;;)
                        {
                            if (pos == end)
                                break;
                            sal_Unicode cPeek = *pos;
                            if ( testCharFlags( cPeek, CharFlags::EOL ) )
                                break;
                            ++pos;
                        }

                        reType = TokenType::Comment;
                    }
                }
            }
        }
    }

    // Operator?
    // only for BASIC '\'' should be a comment, otherwise it is a normal string and handled there
    else if ( testCharFlags( c, CharFlags::Operator ) || ( (c == '\'') && (aLanguage==HighlighterLanguage::Basic)) )
    {
        // parameters for SQL view
        if (((c==':') || (c=='?')) && (aLanguage == HighlighterLanguage::SQL))
        {
            // '?' is a parameter on its own; ':' is followed by a name made
            // of letters
            if (c!='?')
            {
                bool bIdentifierChar;
                do
                {
                    // Get next character
                    if (pos == end)
                        break;
                    c = *pos;
                    bIdentifierChar = isAlpha(c);
                    if( bIdentifierChar )
                        ++pos;
                }
                while( bIdentifierChar );
            }
            reType = TokenType::Parameter;
        }
        else if ((c=='-') && (aLanguage == HighlighterLanguage::SQL))
        {
            // "--" starts an SQL comment; a single '-' is an operator
            if (pos != end && *pos=='-')
            {
                // Skip all characters until end of line or end of input
                while( pos != end && !testCharFlags( *pos, CharFlags::EOL ) )
                {
                    ++pos;
                }
                reType = TokenType::Comment;
            }
            else
                reType = TokenType::Operator;
        }
        else if ((c=='/') && (aLanguage == HighlighterLanguage::SQL))
        {
            // "//" is treated as a comment as well; a single '/' is an operator
            if (pos != end && *pos=='/')
            {
                // Skip all characters until end of line or end of input
                while( pos != end && !testCharFlags( *pos, CharFlags::EOL ) )
                {
                    ++pos;
                }
                reType = TokenType::Comment;
            }
            else
                reType = TokenType::Operator;
        }
        else
        {
            // Apostrophe is Basic comment
            if (( c == '\'') && (aLanguage == HighlighterLanguage::Basic))
            {
                // Skip all characters until end of input or end of line:
                for (;;) {
                    if (pos == end)
                        break;
                    c = *pos;
                    if (testCharFlags(c, CharFlags::EOL)) {
                        break;
                    }
                    ++pos;
                }

                reType = TokenType::Comment;
            }

            // The real operator; can be easily used since not the actual
            // operator (e.g. +=) is concerned, but the fact that it is one
            if( reType != TokenType::Comment )
            {
                reType = TokenType::Operator;
            }

        }
    }

    // Object separator? Must be handled before Number
    // (a '.' that is not followed by a digit is an operator, not a number)
    else if( c == '.' && ( pos == end || *pos < '0' || *pos > '9' ) )
    {
        reType = TokenType::Operator;
    }

    // Number?
    else if( testCharFlags( c, CharFlags::StartNumber ) )
    {
        reType = TokenType::Number;

        // Number system, 10 = normal, it is changed for Oct/Hex
        int nRadix = 10;

        // Is it an Oct or a Hex number?
        if( c == '&' )
        {
            // Octal?
            if( pos != end && (*pos == 'o' || *pos == 'O' ))
            {
                // remove o
                ++pos;
                nRadix = 8;     // Octal base

                // Read all numbers
                while( pos != end && testCharFlags( *pos, CharFlags::InOctNumber ) )
                    ++pos;
            }
            // Hexadecimal?
            else if( pos != end && (*pos == 'h' || *pos == 'H' ))
            {
                // remove h
                ++pos;
                nRadix = 16;     // Hexadecimal base

                // Read all numbers
                while( pos != end && testCharFlags( *pos, CharFlags::InHexNumber ) )
                    ++pos;
            }
            else
            {
                // A '&' that introduces neither &O... nor &H... is an operator
                reType = TokenType::Operator;
            }
        }

        // When it is not Oct or Hex, then it is double
        if( reType == TokenType::Number && nRadix == 10 )
        {
            // Flag if the last character is an exponent
            bool bAfterExpChar = false;

            // Read all numbers
            while( pos != end && (testCharFlags( *pos, CharFlags::InNumber ) ||
                    (bAfterExpChar && *pos == '+' ) ||
                    (bAfterExpChar && *pos == '-' ) ))
                    // After exponent +/- are OK, too
            {
                c = *pos++;
                bAfterExpChar = ( c == 'e' || c == 'E' );
            }
        }
    }

    // String?
    else if( testCharFlags( c, CharFlags::StartString ) )
    {
        // Remember which character has opened the string
        // (a string is closed by the character that opened it, except that
        // '[' is closed by ']')
        sal_Unicode cEndString = c;
        if( c == '[' )
            cEndString = ']';

        // Read all characters
        // (the "pos == end" part of the condition lets the loop body run once
        // more at the end of input so that the error can be flagged there)
        while( pos == end || *pos != cEndString )
        {
            // Detect EOF before reading next char, so we do not lose EOF
            if( pos == end )
            {
                // ERROR: unterminated string literal
                reType = TokenType::Error;
                break;
            }
            c = *pos++;
            if( testCharFlags( c, CharFlags::EOL ) )
            {
                // ERROR: unterminated string literal
                reType = TokenType::Error;
                break;
            }
        }

        if( reType != TokenType::Error )
        {
            // Consume the closing character; [...] counts as an identifier,
            // everything else as a string
            ++pos;
            if( cEndString == ']' )
                reType = TokenType::Identifier;
            else
                reType = TokenType::String;
        }
    }

    // End of line?
    else if( testCharFlags( c, CharFlags::EOL ) )
    {
        // If another EOL character comes, read it
        // (only if it differs from the first one, so "\r\n" and "\n\r" are
        // one token while "\n\n" is two)
        if (pos != end)
        {
            sal_Unicode cNext = *pos;
            if( cNext != c && testCharFlags( cNext, CharFlags::EOL ) )
                ++pos;
        }

        reType = TokenType::EOL;
    }

    // All other will remain TokenType::Unknown

    // Save end position
    rpEndPos = pos;
    return true;
}
 609
 610 SyntaxHighlighter::Tokenizer::Tokenizer( HighlighterLanguage aLang ): aLanguage(aLang)
 611 {
 612     // Fill character table
 613     sal_uInt16 i;
 614
 615     // Allowed characters for identifiers
 616     CharFlags nHelpMask = CharFlags::StartIdentifier | CharFlags::InIdentifier;
 617     for( i = 'a' ; i <= 'z' ; i++ )
 618         aCharTypeTab[i] |= nHelpMask;
 619     for( i = 'A' ; i <= 'Z' ; i++ )
 620         aCharTypeTab[i] |= nHelpMask;
 621     aCharTypeTab[int('_')] |= nHelpMask;
 622     aCharTypeTab[int('$')] |= nHelpMask;
 623
 624     // Digit (can be identifier and number)
 625     nHelpMask = CharFlags::InIdentifier | CharFlags::StartNumber |
 626                          CharFlags::InNumber | CharFlags::InHexNumber;
 627     for( i = '0' ; i <= '9' ; i++ )
 628         aCharTypeTab[i] |= nHelpMask;
 629
 630     // Add e, E, . and & here manually
 631     aCharTypeTab[int('e')] |= CharFlags::InNumber;
 632     aCharTypeTab[int('E')] |= CharFlags::InNumber;
 633     aCharTypeTab[int('.')] |= CharFlags::InNumber | CharFlags::StartNumber;
 634     aCharTypeTab[int('&')] |= CharFlags::StartNumber;
 635
 636     // Hexadecimal digit
 637     for( i = 'a' ; i <= 'f' ; i++ )
 638         aCharTypeTab[i] |= CharFlags::InHexNumber;
 639     for( i = 'A' ; i <= 'F' ; i++ )
 640         aCharTypeTab[i] |= CharFlags::InHexNumber;
 641
 642     // Octal digit
 643     for( i = '0' ; i <= '7' ; i++ )
 644         aCharTypeTab[i] |= CharFlags::InOctNumber;
 645
 646     // String literal start/end characters
 647     aCharTypeTab[int('\'')] |= CharFlags::StartString;
 648     aCharTypeTab[int('\"')] |= CharFlags::StartString;
 649     aCharTypeTab[int('[')]  |= CharFlags::StartString;
 650     aCharTypeTab[int('`')]  |= CharFlags::StartString;
 651
 652     // Operator characters
 653     aCharTypeTab[int('!')] |= CharFlags::Operator;
 654     aCharTypeTab[int('%')] |= CharFlags::Operator;
 655     // aCharTypeTab[(int)'&'] |= CharFlags::Operator;     Removed because of #i14140
 656     aCharTypeTab[int('(')] |= CharFlags::Operator;
 657     aCharTypeTab[int(')')] |= CharFlags::Operator;
 658     aCharTypeTab[int('*')] |= CharFlags::Operator;
 659     aCharTypeTab[int('+')] |= CharFlags::Operator;
 660     aCharTypeTab[int(',')] |= CharFlags::Operator;
 661     aCharTypeTab[int('-')] |= CharFlags::Operator;
 662     aCharTypeTab[int('/')] |= CharFlags::Operator;
 663     aCharTypeTab[int(':')] |= CharFlags::Operator;
 664     aCharTypeTab[int('<')] |= CharFlags::Operator;
 665     aCharTypeTab[int('=')] |= CharFlags::Operator;
 666     aCharTypeTab[int('>')] |= CharFlags::Operator;
 667     aCharTypeTab[int('?')] |= CharFlags::Operator;
 668     aCharTypeTab[int('^')] |= CharFlags::Operator;
 669     aCharTypeTab[int('|')] |= CharFlags::Operator;
 670     aCharTypeTab[int('~')] |= CharFlags::Operator;
 671     aCharTypeTab[int('{')] |= CharFlags::Operator;
 672     aCharTypeTab[int('}')] |= CharFlags::Operator;
 673     // aCharTypeTab[(int)'['] |= CharFlags::Operator;     Removed because of #i17826
 674     aCharTypeTab[int(']')] |= CharFlags::Operator;
 675     aCharTypeTab[int(';')] |= CharFlags::Operator;
 676
 677     // Space
 678     aCharTypeTab[int(' ') ] |= CharFlags::Space;
 679     aCharTypeTab[int('\t')] |= CharFlags::Space;
 680
 681     // End of line characters
 682     aCharTypeTab[int('\r')] |= CharFlags::EOL;
 683     aCharTypeTab[int('\n')] |= CharFlags::EOL;
 684
 685     ppListKeyWords = nullptr;
 686     nKeyWordCount = 0;
 687 }
 688
 689 void SyntaxHighlighter::Tokenizer::getHighlightPortions(std::u16string_view rLine,
 690                                                  /*out*/std::vector<HighlightPortion>& portions) const
 691 {
 692     // Set the position to the beginning of the source string
 693     auto pos = rLine.begin();
 694
 695     // Variables for the out parameter
 696     TokenType eType;
 697     std::u16string_view::const_iterator pStartPos;
 698     std::u16string_view::const_iterator pEndPos;
 699
 700     // Loop over all the tokens
 701     while( getNextToken( pos, rLine.end(), eType, pStartPos, pEndPos ) )
 702     {
 703         portions.emplace_back(
 704                 pStartPos - rLine.begin(), pEndPos - rLine.begin(), eType);
 705     }
 706 }
 707
 708
 709 SyntaxHighlighter::SyntaxHighlighter(HighlighterLanguage language):
 710     m_tokenizer(new SyntaxHighlighter::Tokenizer(language))
 711 {
 712     switch (language)
 713     {
 714         case HighlighterLanguage::Basic:
 715             m_tokenizer->setKeyWords( strListBasicKeyWords,
 716                                       std::size( strListBasicKeyWords ));
 717             break;
 718         case HighlighterLanguage::SQL:
 719             m_tokenizer->setKeyWords( strListSqlKeyWords,
 720                                       std::size( strListSqlKeyWords ));
 721             break;
 722         default:
 723             assert(false); // this cannot happen
 724     }
 725 }
 726
// Defined here, in the file where SyntaxHighlighter::Tokenizer is a complete
// type, rather than inline.
SyntaxHighlighter::~SyntaxHighlighter() {}
 728
// Tokenizes rLine and appends one HighlightPortion per token to portions.
// Simply forwards to the tokenizer created in the constructor.
void SyntaxHighlighter::getHighlightPortions(std::u16string_view rLine,
                                              /*out*/std::vector<HighlightPortion>& portions) const
{
    m_tokenizer->getHighlightPortions( rLine, portions );
}
 734
// Returns the language this highlighter was constructed for, as stored in
// its tokenizer.
HighlighterLanguage SyntaxHighlighter::GetLanguage() const
{
    return m_tokenizer->aLanguage;
}
 739
 740 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */