// Version 7.6.3.2-android, tag libreoffice-7.6.3.2-android
// [LibreOffice.git] / comphelper / source / misc / syntaxhighlight.cxx
// blob 3ce8086e64d52f75cd273f02bbe7bff64acccbae
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
#include <sal/config.h>

#include <cassert>
#include <cstdlib>
#include <cstring>
#include <string_view>

#include <rtl/character.hxx>
#include <unicode/uchar.h>
#include <comphelper/syntaxhighlight.hxx>
#include <o3tl/typed_flags_set.hxx>
namespace {

// Flags for character properties. Each enumerator is a distinct bit so that
// several properties can be recorded for the same character (a digit, for
// example, is flagged as identifier, number and hex-number character).
enum class CharFlags {
    StartIdentifier   = 0x0001,  // may begin an identifier
    InIdentifier      = 0x0002,  // may continue an identifier
    StartNumber       = 0x0004,  // may begin a numeric literal
    InNumber          = 0x0008,  // may continue a decimal literal
    InHexNumber       = 0x0010,  // hexadecimal digit
    InOctNumber       = 0x0020,  // octal digit
    StartString       = 0x0040,  // opens a quoted string / bracketed name
    Operator          = 0x0080,  // operator or punctuation character
    Space             = 0x0100,  // blank or tab
    EOL               = 0x0200   // carriage return or line feed
};

}
47 namespace o3tl {
48 template<> struct typed_flags<CharFlags> : is_typed_flags<CharFlags, 0x03ff> {};
// ##########################################################################
// ATTENTION: all these words need to be in lower case
// ##########################################################################
// The table is searched with bsearch() and a strcmp()-based comparator, so
// the entries must also stay sorted in strcmp() order.
// NOTE(review): the entries containing a blank ("end if", "line input", ...)
// look unreachable, because the tokenizer stops an identifier at the first
// blank before doing the lookup -- confirm before relying on them.
static const char* strListBasicKeyWords[] = {
    "access",
    "alias",
    "and",
    "any",
    "append",
    "as",
    "attribute",
    "base",
    "binary",
    "boolean",
    "byref",
    "byte",
    "byval",
    "call",
    "case",
    "cdecl",
    "classmodule",
    "close",
    "compare",
    "compatible",
    "const",
    "currency",
    "date",
    "declare",
    "defbool",
    "defcur",
    "defdate",
    "defdbl",
    "deferr",
    "defint",
    "deflng",
    "defobj",
    "defsng",
    "defstr",
    "defvar",
    "dim",
    "do",
    "doevents",
    "double",
    "each",
    "else",
    "elseif",
    "end",
    "end enum",
    "end function",
    "end if",
    "end property",
    "end select",
    "end sub",
    "end type",
    "endif",
    "enum",
    "eqv",
    "erase",
    "error",
    "exit",
    "explicit",
    "for",
    "function",
    "get",
    "global",
    "gosub",
    "goto",
    "if",
    "imp",
    "implements",
    "in",
    "input",
    "integer",
    "is",
    "let",
    "lib",
    "like",
    "line",
    "line input",
    "local",
    "lock",
    "long",
    "loop",
    "lprint",
    "lset",
    "mod",
    "name",
    "new",
    "next",
    "not",
    "object",
    "on",
    "open",
    "option",
    "optional",
    "or",
    "output",
    "paramarray",
    "preserve",
    "print",
    "private",
    "property",
    "public",
    "random",
    "read",
    "redim",
    "rem",
    "resume",
    "return",
    "rset",
    "select",
    "set",
    "shared",
    "single",
    "static",
    "step",
    "stop",
    "string",
    "sub",
    "system",
    "text",
    "then",
    "to",
    "type",
    "typeof",
    "until",
    "variant",
    "vbasupport",
    "wend",
    "while",
    "with",
    "withevents",
    "write",
    "xor"
};
// SQL keywords recognised by the highlighter. As the table is searched with
// bsearch() after the token has been lower-cased, every entry has to be
// lower case and the list has to stay sorted in strcmp() order.
static const char* strListSqlKeyWords[] = {
    "all",
    "and",
    "any",
    "as",
    "asc",
    "avg",
    "between",
    "by",
    "cast",
    "corresponding",
    "count",
    "create",
    "cross",
    "delete",
    "desc",
    "distinct",
    "drop",
    "escape",
    "except",
    "exists",
    "false",
    "from",
    "full",
    "global",
    "group",
    "having",
    "in",
    "inner",
    "insert",
    "intersect",
    "into",
    "is",
    "join",
    "left",
    "like",
    "limit",
    "local",
    "match",
    "max",
    "min",
    "natural",
    "not",
    "null",
    "on",
    "or",
    "order",
    "outer",
    "right",
    "select",
    "set",
    "some",
    "sum",
    "table",
    "temporary",
    "true",
    "union",
    "unique",
    "unknown",
    "update",
    "using",
    "values",
    "where"
};
extern "C" {

// bsearch() comparator for the keyword tables.
// arg1 is the search key, a NUL-terminated C string; arg2 points at one
// element of the table, i.e. at a `const char*`. Returns the strcmp() result
// of the key against that element.
static int compare_strings( const void *arg1, const void *arg2 )
{
    const char* pKey = static_cast<const char*>(arg1);
    const char* pEntry = *static_cast<const char* const*>(arg2);
    return std::strcmp( pKey, pEntry );
}

}
263 namespace
265 bool isAlpha(sal_Unicode c)
267 if (rtl::isAsciiAlpha(c))
268 return true;
269 return u_isalpha(c);
273 class SyntaxHighlighter::Tokenizer
275 // Character information tables
276 CharFlags aCharTypeTab[256] = {};
278 // Auxiliary function: testing of the character flags
279 bool testCharFlags(sal_Unicode c, CharFlags nTestFlags) const;
281 // Get new token, EmptyString == nothing more over there
282 bool getNextToken(std::u16string_view::const_iterator& pos, std::u16string_view::const_iterator end, /*out*/TokenType& reType,
283 /*out*/std::u16string_view::const_iterator& rpStartPos, /*out*/std::u16string_view::const_iterator& rpEndPos) const;
285 const char** ppListKeyWords;
286 sal_uInt16 nKeyWordCount;
288 public:
289 HighlighterLanguage const aLanguage;
291 explicit Tokenizer( HighlighterLanguage aLang );
293 void getHighlightPortions(std::u16string_view rLine,
294 /*out*/std::vector<HighlightPortion>& portions) const;
295 void setKeyWords( const char** ppKeyWords, sal_uInt16 nCount );
298 // Helper function: test character flag
299 bool SyntaxHighlighter::Tokenizer::testCharFlags(sal_Unicode c, CharFlags nTestFlags) const
301 bool bRet = false;
302 if( c != 0 && c <= 255 )
304 bRet = bool(aCharTypeTab[c] & nTestFlags);
306 else if( c > 255 )
308 bRet = (( CharFlags::StartIdentifier | CharFlags::InIdentifier ) & nTestFlags)
309 && isAlpha(c);
311 return bRet;
314 void SyntaxHighlighter::Tokenizer::setKeyWords( const char** ppKeyWords, sal_uInt16 nCount )
316 ppListKeyWords = ppKeyWords;
317 nKeyWordCount = nCount;
320 bool SyntaxHighlighter::Tokenizer::getNextToken(std::u16string_view::const_iterator& pos, std::u16string_view::const_iterator end,
321 /*out*/TokenType& reType,
322 /*out*/std::u16string_view::const_iterator& rpStartPos, /*out*/std::u16string_view::const_iterator& rpEndPos) const
324 reType = TokenType::Unknown;
326 rpStartPos = pos;
328 if( pos == end )
329 return false;
331 sal_Unicode c = *pos;
332 ++pos;
334 //*** Go through all possibilities ***
335 // Space?
336 if ( testCharFlags( c, CharFlags::Space ) )
338 while( pos != end && testCharFlags( *pos, CharFlags::Space ) )
339 ++pos;
341 reType = TokenType::Whitespace;
344 // Identifier?
345 else if ( testCharFlags( c, CharFlags::StartIdentifier ) )
347 bool bIdentifierChar;
350 if (pos == end)
351 break;
352 // Fetch next character
353 c = *pos;
354 bIdentifierChar = testCharFlags( c, CharFlags::InIdentifier );
355 if( bIdentifierChar )
356 ++pos;
358 while( bIdentifierChar );
360 reType = TokenType::Identifier;
362 // Keyword table
363 if (ppListKeyWords != nullptr)
365 int nCount = pos - rpStartPos;
367 // No keyword if string contains char > 255
368 bool bCanBeKeyword = true;
369 for( int i = 0 ; i < nCount ; i++ )
371 if( rpStartPos[i] > 255 )
373 bCanBeKeyword = false;
374 break;
378 if( bCanBeKeyword )
380 std::u16string_view aKWString(&*rpStartPos, nCount);
381 OString aByteStr = OUStringToOString(aKWString,
382 RTL_TEXTENCODING_ASCII_US).toAsciiLowerCase();
383 if ( bsearch( aByteStr.getStr(), ppListKeyWords, nKeyWordCount, sizeof( char* ),
384 compare_strings ) )
386 reType = TokenType::Keywords;
388 if( aByteStr == "rem" )
390 // Remove all characters until end of line or EOF
391 for (;;)
393 if (pos == end)
394 break;
395 sal_Unicode cPeek = *pos;
396 if ( testCharFlags( cPeek, CharFlags::EOL ) )
397 break;
398 ++pos;
401 reType = TokenType::Comment;
408 // Operator?
409 // only for BASIC '\'' should be a comment, otherwise it is a normal string and handled there
410 else if ( testCharFlags( c, CharFlags::Operator ) || ( (c == '\'') && (aLanguage==HighlighterLanguage::Basic)) )
412 // parameters for SQL view
413 if (((c==':') || (c=='?')) && (aLanguage == HighlighterLanguage::SQL))
415 if (c!='?')
417 bool bIdentifierChar;
420 // Get next character
421 if (pos == end)
422 break;
423 c = *pos;
424 bIdentifierChar = isAlpha(c);
425 if( bIdentifierChar )
426 ++pos;
428 while( bIdentifierChar );
430 reType = TokenType::Parameter;
432 else if ((c=='-') && (aLanguage == HighlighterLanguage::SQL))
434 if (pos != end && *pos=='-')
436 // Remove all characters until end of line or EOF
437 while( pos != end && !testCharFlags( *pos, CharFlags::EOL ) )
439 ++pos;
441 reType = TokenType::Comment;
443 else
444 reType = TokenType::Operator;
446 else if ((c=='/') && (aLanguage == HighlighterLanguage::SQL))
448 if (pos != end && *pos=='/')
450 // Remove all characters until end of line or EOF
451 while( pos != end && !testCharFlags( *pos, CharFlags::EOL ) )
453 ++pos;
455 reType = TokenType::Comment;
457 else
458 reType = TokenType::Operator;
460 else
462 // Apostrophe is Basic comment
463 if (( c == '\'') && (aLanguage == HighlighterLanguage::Basic))
465 // Skip all characters until end of input or end of line:
466 for (;;) {
467 if (pos == end)
468 break;
469 c = *pos;
470 if (testCharFlags(c, CharFlags::EOL)) {
471 break;
473 ++pos;
476 reType = TokenType::Comment;
479 // The real operator; can be easily used since not the actual
480 // operator (e.g. +=) is concerned, but the fact that it is one
481 if( reType != TokenType::Comment )
483 reType = TokenType::Operator;
489 // Object separator? Must be handled before Number
490 else if( c == '.' && ( pos == end || *pos < '0' || *pos > '9' ) )
492 reType = TokenType::Operator;
495 // Number?
496 else if( testCharFlags( c, CharFlags::StartNumber ) )
498 reType = TokenType::Number;
500 // Number system, 10 = normal, it is changed for Oct/Hex
501 int nRadix = 10;
503 // Is it an Oct or a Hex number?
504 if( c == '&' )
506 // Octal?
507 if( pos != end && (*pos == 'o' || *pos == 'O' ))
509 // remove o
510 ++pos;
511 nRadix = 8; // Octal base
513 // Read all numbers
514 while( pos != end && testCharFlags( *pos, CharFlags::InOctNumber ) )
515 ++pos;
517 // Hexadecimal?
518 else if( pos != end && (*pos == 'h' || *pos == 'H' ))
520 // remove x
521 ++pos;
522 nRadix = 16; // Hexadecimal base
524 // Read all numbers
525 while( pos != end && testCharFlags( *pos, CharFlags::InHexNumber ) )
526 ++pos;
528 else
530 reType = TokenType::Operator;
534 // When it is not Oct or Hex, then it is double
535 if( reType == TokenType::Number && nRadix == 10 )
537 // Flag if the last character is an exponent
538 bool bAfterExpChar = false;
540 // Read all numbers
541 while( pos != end && (testCharFlags( *pos, CharFlags::InNumber ) ||
542 (bAfterExpChar && *pos == '+' ) ||
543 (bAfterExpChar && *pos == '-' ) ))
544 // After exponent +/- are OK, too
546 c = *pos++;
547 bAfterExpChar = ( c == 'e' || c == 'E' );
552 // String?
553 else if( testCharFlags( c, CharFlags::StartString ) )
555 // Remember which character has opened the string
556 sal_Unicode cEndString = c;
557 if( c == '[' )
558 cEndString = ']';
560 // Read all characters
561 while( pos == end || *pos != cEndString )
563 // Detect EOF before reading next char, so we do not lose EOF
564 if( pos == end )
566 // ERROR: unterminated string literal
567 reType = TokenType::Error;
568 break;
570 c = *pos++;
571 if( testCharFlags( c, CharFlags::EOL ) )
573 // ERROR: unterminated string literal
574 reType = TokenType::Error;
575 break;
579 if( reType != TokenType::Error )
581 ++pos;
582 if( cEndString == ']' )
583 reType = TokenType::Identifier;
584 else
585 reType = TokenType::String;
589 // End of line?
590 else if( testCharFlags( c, CharFlags::EOL ) )
592 // If another EOL character comes, read it
593 if (pos != end)
595 sal_Unicode cNext = *pos;
596 if( cNext != c && testCharFlags( cNext, CharFlags::EOL ) )
597 ++pos;
600 reType = TokenType::EOL;
603 // All other will remain TokenType::Unknown
605 // Save end position
606 rpEndPos = pos;
607 return true;
610 SyntaxHighlighter::Tokenizer::Tokenizer( HighlighterLanguage aLang ): aLanguage(aLang)
612 // Fill character table
613 sal_uInt16 i;
615 // Allowed characters for identifiers
616 CharFlags nHelpMask = CharFlags::StartIdentifier | CharFlags::InIdentifier;
617 for( i = 'a' ; i <= 'z' ; i++ )
618 aCharTypeTab[i] |= nHelpMask;
619 for( i = 'A' ; i <= 'Z' ; i++ )
620 aCharTypeTab[i] |= nHelpMask;
621 aCharTypeTab[int('_')] |= nHelpMask;
622 aCharTypeTab[int('$')] |= nHelpMask;
624 // Digit (can be identifier and number)
625 nHelpMask = CharFlags::InIdentifier | CharFlags::StartNumber |
626 CharFlags::InNumber | CharFlags::InHexNumber;
627 for( i = '0' ; i <= '9' ; i++ )
628 aCharTypeTab[i] |= nHelpMask;
630 // Add e, E, . and & here manually
631 aCharTypeTab[int('e')] |= CharFlags::InNumber;
632 aCharTypeTab[int('E')] |= CharFlags::InNumber;
633 aCharTypeTab[int('.')] |= CharFlags::InNumber | CharFlags::StartNumber;
634 aCharTypeTab[int('&')] |= CharFlags::StartNumber;
636 // Hexadecimal digit
637 for( i = 'a' ; i <= 'f' ; i++ )
638 aCharTypeTab[i] |= CharFlags::InHexNumber;
639 for( i = 'A' ; i <= 'F' ; i++ )
640 aCharTypeTab[i] |= CharFlags::InHexNumber;
642 // Octal digit
643 for( i = '0' ; i <= '7' ; i++ )
644 aCharTypeTab[i] |= CharFlags::InOctNumber;
646 // String literal start/end characters
647 aCharTypeTab[int('\'')] |= CharFlags::StartString;
648 aCharTypeTab[int('\"')] |= CharFlags::StartString;
649 aCharTypeTab[int('[')] |= CharFlags::StartString;
650 aCharTypeTab[int('`')] |= CharFlags::StartString;
652 // Operator characters
653 aCharTypeTab[int('!')] |= CharFlags::Operator;
654 aCharTypeTab[int('%')] |= CharFlags::Operator;
655 // aCharTypeTab[(int)'&'] |= CharFlags::Operator; Removed because of #i14140
656 aCharTypeTab[int('(')] |= CharFlags::Operator;
657 aCharTypeTab[int(')')] |= CharFlags::Operator;
658 aCharTypeTab[int('*')] |= CharFlags::Operator;
659 aCharTypeTab[int('+')] |= CharFlags::Operator;
660 aCharTypeTab[int(',')] |= CharFlags::Operator;
661 aCharTypeTab[int('-')] |= CharFlags::Operator;
662 aCharTypeTab[int('/')] |= CharFlags::Operator;
663 aCharTypeTab[int(':')] |= CharFlags::Operator;
664 aCharTypeTab[int('<')] |= CharFlags::Operator;
665 aCharTypeTab[int('=')] |= CharFlags::Operator;
666 aCharTypeTab[int('>')] |= CharFlags::Operator;
667 aCharTypeTab[int('?')] |= CharFlags::Operator;
668 aCharTypeTab[int('^')] |= CharFlags::Operator;
669 aCharTypeTab[int('|')] |= CharFlags::Operator;
670 aCharTypeTab[int('~')] |= CharFlags::Operator;
671 aCharTypeTab[int('{')] |= CharFlags::Operator;
672 aCharTypeTab[int('}')] |= CharFlags::Operator;
673 // aCharTypeTab[(int)'['] |= CharFlags::Operator; Removed because of #i17826
674 aCharTypeTab[int(']')] |= CharFlags::Operator;
675 aCharTypeTab[int(';')] |= CharFlags::Operator;
677 // Space
678 aCharTypeTab[int(' ') ] |= CharFlags::Space;
679 aCharTypeTab[int('\t')] |= CharFlags::Space;
681 // End of line characters
682 aCharTypeTab[int('\r')] |= CharFlags::EOL;
683 aCharTypeTab[int('\n')] |= CharFlags::EOL;
685 ppListKeyWords = nullptr;
686 nKeyWordCount = 0;
689 void SyntaxHighlighter::Tokenizer::getHighlightPortions(std::u16string_view rLine,
690 /*out*/std::vector<HighlightPortion>& portions) const
692 // Set the position to the beginning of the source string
693 auto pos = rLine.begin();
695 // Variables for the out parameter
696 TokenType eType;
697 std::u16string_view::const_iterator pStartPos;
698 std::u16string_view::const_iterator pEndPos;
700 // Loop over all the tokens
701 while( getNextToken( pos, rLine.end(), eType, pStartPos, pEndPos ) )
703 portions.emplace_back(
704 pStartPos - rLine.begin(), pEndPos - rLine.begin(), eType);
709 SyntaxHighlighter::SyntaxHighlighter(HighlighterLanguage language):
710 m_tokenizer(new SyntaxHighlighter::Tokenizer(language))
712 switch (language)
714 case HighlighterLanguage::Basic:
715 m_tokenizer->setKeyWords( strListBasicKeyWords,
716 std::size( strListBasicKeyWords ));
717 break;
718 case HighlighterLanguage::SQL:
719 m_tokenizer->setKeyWords( strListSqlKeyWords,
720 std::size( strListSqlKeyWords ));
721 break;
722 default:
723 assert(false); // this cannot happen
727 SyntaxHighlighter::~SyntaxHighlighter() {}
729 void SyntaxHighlighter::getHighlightPortions(std::u16string_view rLine,
730 /*out*/std::vector<HighlightPortion>& portions) const
732 m_tokenizer->getHighlightPortions( rLine, portions );
735 HighlighterLanguage SyntaxHighlighter::GetLanguage() const
737 return m_tokenizer->aLanguage;
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */