remove assert looking for new compatibilityMode DOCX
[LibreOffice.git] / comphelper / source / misc / syntaxhighlight.cxx
blob89dcb73752e4aae9f97bd8fefbd8b961cb01bea4
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
22 #include <cassert>
24 #include <rtl/character.hxx>
25 #include <rtl/ustring.hxx>
26 #include <unicode/uchar.h>
27 #include <comphelper/syntaxhighlight.hxx>
28 #include <o3tl/typed_flags_set.hxx>
namespace {

// Bit flags classifying a single character for the tokenizer.
// One bit per property, so several properties can be combined per character.
enum class CharFlags {
    StartIdentifier = 1 << 0,   // may begin an identifier
    InIdentifier    = 1 << 1,   // may continue an identifier
    StartNumber     = 1 << 2,   // may begin a number
    InNumber        = 1 << 3,   // may continue a decimal number
    InHexNumber     = 1 << 4,   // may continue a hexadecimal number
    InOctNumber     = 1 << 5,   // may continue an octal number
    StartString     = 1 << 6,   // opens a string literal
    Operator        = 1 << 7,   // operator character
    Space           = 1 << 8,   // blank or tab
    EOL             = 1 << 9    // end-of-line character
};

}
48 namespace o3tl {
49 template<> struct typed_flags<CharFlags> : is_typed_flags<CharFlags, 0x03ff> {};
// ##########################################################################
// ATTENTION: every entry has to be lower case, and the table has to stay in
// strictly ascending strcmp() order because it is searched with bsearch().
// ##########################################################################
static const char* strListBasicKeyWords[] = {
    // a
    "access", "alias", "and", "any", "append", "as", "attribute",
    // b
    "base", "binary", "boolean", "byref", "byte", "byval",
    // c
    "call", "case", "cdecl", "classmodule", "close", "compare", "compatible",
    "const", "currency",
    // d
    "date", "declare", "defbool", "defcur", "defdate", "defdbl", "deferr",
    "defint", "deflng", "defobj", "defsng", "defstr", "defvar", "dim", "do",
    "doevents", "double",
    // e
    "each", "else", "elseif", "end", "end enum", "end function", "end if",
    "end property", "end select", "end sub", "end type", "endif", "enum",
    "eqv", "erase", "error", "exit", "explicit",
    // f
    "for", "function",
    // g
    "get", "global", "gosub", "goto",
    // i
    "if", "imp", "implements", "in", "input", "integer", "is",
    // l
    "let", "lib", "like", "line", "line input", "local", "lock", "long",
    "loop", "lprint", "lset",
    // m
    "mod",
    // n
    "name", "new", "next", "not",
    // o
    "object", "on", "open", "option", "optional", "or", "output",
    // p
    "paramarray", "preserve", "print", "private", "property", "public",
    // r
    "random", "read", "redim", "rem", "resume", "return", "rset",
    // s
    "select", "set", "shared", "single", "static", "step", "stop", "string",
    "sub", "system",
    // t
    "text", "then", "to", "type", "typeof",
    // u
    "until",
    // v
    "variant", "vbasupport",
    // w
    "wend", "while", "with", "withevents", "write",
    // x
    "xor"
};
// SQL keyword table. Same rules as for the Basic table: lower case only and
// strictly ascending strcmp() order, since it is searched with bsearch().
static const char* strListSqlKeyWords[] = {
    // a
    "all", "and", "any", "as", "asc", "avg",
    // b
    "between", "by",
    // c
    "cast", "corresponding", "count", "create", "cross",
    // d
    "delete", "desc", "distinct", "drop",
    // e
    "escape", "except", "exists",
    // f
    "false", "from", "full",
    // g
    "global", "group",
    // h
    "having",
    // i
    "in", "inner", "insert", "intersect", "into", "is",
    // j
    "join",
    // l
    "left", "like", "limit", "local",
    // m
    "match", "max", "min",
    // n
    "natural", "not", "null",
    // o
    "on", "or", "order", "outer",
    // r
    "right",
    // s
    "select", "set", "some", "sum",
    // t
    "table", "temporary", "true",
    // u
    "union", "unique", "unknown", "update", "using",
    // v
    "values",
    // w
    "where"
};
extern "C" {

// Comparison callback for bsearch() over a keyword table.
// arg1 is the search key (a NUL-terminated string); arg2 points at one table
// element, i.e. at a pointer to a NUL-terminated string.
static int compare_strings( const void *arg1, const void *arg2 )
{
    const char* const pKey = static_cast<char const *>(arg1);
    const char* const pEntry = *static_cast<char * const *>(arg2);
    return strcmp( pKey, pEntry );
}

}
264 namespace
266 bool isAlpha(sal_Unicode c)
268 if (rtl::isAsciiAlpha(c))
269 return true;
270 return u_isalpha(c);
274 class SyntaxHighlighter::Tokenizer
276 // Character information tables
277 CharFlags aCharTypeTab[256] = {};
279 // Auxiliary function: testing of the character flags
280 bool testCharFlags(sal_Unicode c, CharFlags nTestFlags) const;
282 // Get new token, EmptyString == nothing more over there
283 bool getNextToken(std::u16string_view::const_iterator& pos, std::u16string_view::const_iterator end, /*out*/TokenType& reType,
284 /*out*/std::u16string_view::const_iterator& rpStartPos, /*out*/std::u16string_view::const_iterator& rpEndPos) const;
286 const char** ppListKeyWords;
287 sal_uInt16 nKeyWordCount;
289 public:
290 HighlighterLanguage const aLanguage;
292 explicit Tokenizer( HighlighterLanguage aLang );
294 void getHighlightPortions(std::u16string_view rLine,
295 /*out*/std::vector<HighlightPortion>& portions) const;
296 void setKeyWords( const char** ppKeyWords, sal_uInt16 nCount );
299 // Helper function: test character flag
300 bool SyntaxHighlighter::Tokenizer::testCharFlags(sal_Unicode c, CharFlags nTestFlags) const
302 bool bRet = false;
303 if( c != 0 && c <= 255 )
305 bRet = bool(aCharTypeTab[c] & nTestFlags);
307 else if( c > 255 )
309 bRet = (( CharFlags::StartIdentifier | CharFlags::InIdentifier ) & nTestFlags)
310 && isAlpha(c);
312 return bRet;
315 void SyntaxHighlighter::Tokenizer::setKeyWords( const char** ppKeyWords, sal_uInt16 nCount )
317 ppListKeyWords = ppKeyWords;
318 nKeyWordCount = nCount;
321 bool SyntaxHighlighter::Tokenizer::getNextToken(std::u16string_view::const_iterator& pos, std::u16string_view::const_iterator end,
322 /*out*/TokenType& reType,
323 /*out*/std::u16string_view::const_iterator& rpStartPos, /*out*/std::u16string_view::const_iterator& rpEndPos) const
325 reType = TokenType::Unknown;
327 rpStartPos = pos;
329 if( pos == end )
330 return false;
332 sal_Unicode c = *pos;
333 ++pos;
335 //*** Go through all possibilities ***
336 // Space?
337 if ( testCharFlags( c, CharFlags::Space ) )
339 while( pos != end && testCharFlags( *pos, CharFlags::Space ) )
340 ++pos;
342 reType = TokenType::Whitespace;
345 // Identifier?
346 else if ( testCharFlags( c, CharFlags::StartIdentifier ) )
348 bool bIdentifierChar;
351 if (pos == end)
352 break;
353 // Fetch next character
354 c = *pos;
355 bIdentifierChar = testCharFlags( c, CharFlags::InIdentifier );
356 if( bIdentifierChar )
357 ++pos;
359 while( bIdentifierChar );
361 reType = TokenType::Identifier;
363 // Keyword table
364 if (ppListKeyWords != nullptr)
366 int nCount = pos - rpStartPos;
368 // No keyword if string contains char > 255
369 bool bCanBeKeyword = true;
370 for( int i = 0 ; i < nCount ; i++ )
372 if( rpStartPos[i] > 255 )
374 bCanBeKeyword = false;
375 break;
379 if( bCanBeKeyword )
381 std::u16string_view aKWString(&*rpStartPos, nCount);
382 OString aByteStr = OUStringToOString(aKWString,
383 RTL_TEXTENCODING_ASCII_US).toAsciiLowerCase();
384 if ( bsearch( aByteStr.getStr(), ppListKeyWords, nKeyWordCount, sizeof( char* ),
385 compare_strings ) )
387 reType = TokenType::Keywords;
389 if( aByteStr == "rem" )
391 // Remove all characters until end of line or EOF
392 for (;;)
394 if (pos == end)
395 break;
396 sal_Unicode cPeek = *pos;
397 if ( testCharFlags( cPeek, CharFlags::EOL ) )
398 break;
399 ++pos;
402 reType = TokenType::Comment;
409 // Operator?
410 // only for BASIC '\'' should be a comment, otherwise it is a normal string and handled there
411 else if ( testCharFlags( c, CharFlags::Operator ) || ( (c == '\'') && (aLanguage==HighlighterLanguage::Basic)) )
413 // parameters for SQL view
414 if (((c==':') || (c=='?')) && (aLanguage == HighlighterLanguage::SQL))
416 if (c!='?')
418 bool bIdentifierChar;
421 // Get next character
422 if (pos == end)
423 break;
424 c = *pos;
425 bIdentifierChar = isAlpha(c);
426 if( bIdentifierChar )
427 ++pos;
429 while( bIdentifierChar );
431 reType = TokenType::Parameter;
433 else if ((c=='-') && (aLanguage == HighlighterLanguage::SQL))
435 if (pos != end && *pos=='-')
437 // Remove all characters until end of line or EOF
438 while( pos != end && !testCharFlags( *pos, CharFlags::EOL ) )
440 ++pos;
442 reType = TokenType::Comment;
444 else
445 reType = TokenType::Operator;
447 else if ((c=='/') && (aLanguage == HighlighterLanguage::SQL))
449 if (pos != end && *pos=='/')
451 // Remove all characters until end of line or EOF
452 while( pos != end && !testCharFlags( *pos, CharFlags::EOL ) )
454 ++pos;
456 reType = TokenType::Comment;
458 else
459 reType = TokenType::Operator;
461 else
463 // Apostrophe is Basic comment
464 if (( c == '\'') && (aLanguage == HighlighterLanguage::Basic))
466 // Skip all characters until end of input or end of line:
467 for (;;) {
468 if (pos == end)
469 break;
470 c = *pos;
471 if (testCharFlags(c, CharFlags::EOL)) {
472 break;
474 ++pos;
477 reType = TokenType::Comment;
480 // The real operator; can be easily used since not the actual
481 // operator (e.g. +=) is concerned, but the fact that it is one
482 if( reType != TokenType::Comment )
484 reType = TokenType::Operator;
490 // Object separator? Must be handled before Number
491 else if( c == '.' && ( pos == end || *pos < '0' || *pos > '9' ) )
493 reType = TokenType::Operator;
496 // Number?
497 else if( testCharFlags( c, CharFlags::StartNumber ) )
499 reType = TokenType::Number;
501 // Number system, 10 = normal, it is changed for Oct/Hex
502 int nRadix = 10;
504 // Is it an Oct or a Hex number?
505 if( c == '&' )
507 // Octal?
508 if( pos != end && (*pos == 'o' || *pos == 'O' ))
510 // remove o
511 ++pos;
512 nRadix = 8; // Octal base
514 // Read all numbers
515 while( pos != end && testCharFlags( *pos, CharFlags::InOctNumber ) )
516 ++pos;
518 // Hexadecimal?
519 else if( pos != end && (*pos == 'h' || *pos == 'H' ))
521 // remove x
522 ++pos;
523 nRadix = 16; // Hexadecimal base
525 // Read all numbers
526 while( pos != end && testCharFlags( *pos, CharFlags::InHexNumber ) )
527 ++pos;
529 else
531 reType = TokenType::Operator;
535 // When it is not Oct or Hex, then it is double
536 if( reType == TokenType::Number && nRadix == 10 )
538 // Flag if the last character is an exponent
539 bool bAfterExpChar = false;
541 // Read all numbers
542 while( pos != end && (testCharFlags( *pos, CharFlags::InNumber ) ||
543 (bAfterExpChar && *pos == '+' ) ||
544 (bAfterExpChar && *pos == '-' ) ))
545 // After exponent +/- are OK, too
547 c = *pos++;
548 bAfterExpChar = ( c == 'e' || c == 'E' );
553 // String?
554 else if( testCharFlags( c, CharFlags::StartString ) )
556 // Remember which character has opened the string
557 sal_Unicode cEndString = c;
558 if( c == '[' )
559 cEndString = ']';
561 // Read all characters
562 while( pos == end || *pos != cEndString )
564 // Detect EOF before reading next char, so we do not lose EOF
565 if( pos == end )
567 // ERROR: unterminated string literal
568 reType = TokenType::Error;
569 break;
571 c = *pos++;
572 if( testCharFlags( c, CharFlags::EOL ) )
574 // ERROR: unterminated string literal
575 reType = TokenType::Error;
576 break;
580 if( reType != TokenType::Error )
582 ++pos;
583 if( cEndString == ']' )
584 reType = TokenType::Identifier;
585 else
586 reType = TokenType::String;
590 // End of line?
591 else if( testCharFlags( c, CharFlags::EOL ) )
593 // If another EOL character comes, read it
594 if (pos != end)
596 sal_Unicode cNext = *pos;
597 if( cNext != c && testCharFlags( cNext, CharFlags::EOL ) )
598 ++pos;
601 reType = TokenType::EOL;
604 // All other will remain TokenType::Unknown
606 // Save end position
607 rpEndPos = pos;
608 return true;
611 SyntaxHighlighter::Tokenizer::Tokenizer( HighlighterLanguage aLang ): aLanguage(aLang)
613 // Fill character table
614 sal_uInt16 i;
616 // Allowed characters for identifiers
617 CharFlags nHelpMask = CharFlags::StartIdentifier | CharFlags::InIdentifier;
618 for( i = 'a' ; i <= 'z' ; i++ )
619 aCharTypeTab[i] |= nHelpMask;
620 for( i = 'A' ; i <= 'Z' ; i++ )
621 aCharTypeTab[i] |= nHelpMask;
622 aCharTypeTab[int('_')] |= nHelpMask;
623 aCharTypeTab[int('$')] |= nHelpMask;
625 // Digit (can be identifier and number)
626 nHelpMask = CharFlags::InIdentifier | CharFlags::StartNumber |
627 CharFlags::InNumber | CharFlags::InHexNumber;
628 for( i = '0' ; i <= '9' ; i++ )
629 aCharTypeTab[i] |= nHelpMask;
631 // Add e, E, . and & here manually
632 aCharTypeTab[int('e')] |= CharFlags::InNumber;
633 aCharTypeTab[int('E')] |= CharFlags::InNumber;
634 aCharTypeTab[int('.')] |= CharFlags::InNumber | CharFlags::StartNumber;
635 aCharTypeTab[int('&')] |= CharFlags::StartNumber;
637 // Hexadecimal digit
638 for( i = 'a' ; i <= 'f' ; i++ )
639 aCharTypeTab[i] |= CharFlags::InHexNumber;
640 for( i = 'A' ; i <= 'F' ; i++ )
641 aCharTypeTab[i] |= CharFlags::InHexNumber;
643 // Octal digit
644 for( i = '0' ; i <= '7' ; i++ )
645 aCharTypeTab[i] |= CharFlags::InOctNumber;
647 // String literal start/end characters
648 aCharTypeTab[int('\'')] |= CharFlags::StartString;
649 aCharTypeTab[int('\"')] |= CharFlags::StartString;
650 aCharTypeTab[int('[')] |= CharFlags::StartString;
651 aCharTypeTab[int('`')] |= CharFlags::StartString;
653 // Operator characters
654 aCharTypeTab[int('!')] |= CharFlags::Operator;
655 aCharTypeTab[int('%')] |= CharFlags::Operator;
656 // aCharTypeTab[(int)'&'] |= CharFlags::Operator; Removed because of #i14140
657 aCharTypeTab[int('(')] |= CharFlags::Operator;
658 aCharTypeTab[int(')')] |= CharFlags::Operator;
659 aCharTypeTab[int('*')] |= CharFlags::Operator;
660 aCharTypeTab[int('+')] |= CharFlags::Operator;
661 aCharTypeTab[int(',')] |= CharFlags::Operator;
662 aCharTypeTab[int('-')] |= CharFlags::Operator;
663 aCharTypeTab[int('/')] |= CharFlags::Operator;
664 aCharTypeTab[int(':')] |= CharFlags::Operator;
665 aCharTypeTab[int('<')] |= CharFlags::Operator;
666 aCharTypeTab[int('=')] |= CharFlags::Operator;
667 aCharTypeTab[int('>')] |= CharFlags::Operator;
668 aCharTypeTab[int('?')] |= CharFlags::Operator;
669 aCharTypeTab[int('^')] |= CharFlags::Operator;
670 aCharTypeTab[int('|')] |= CharFlags::Operator;
671 aCharTypeTab[int('~')] |= CharFlags::Operator;
672 aCharTypeTab[int('{')] |= CharFlags::Operator;
673 aCharTypeTab[int('}')] |= CharFlags::Operator;
674 // aCharTypeTab[(int)'['] |= CharFlags::Operator; Removed because of #i17826
675 aCharTypeTab[int(']')] |= CharFlags::Operator;
676 aCharTypeTab[int(';')] |= CharFlags::Operator;
678 // Space
679 aCharTypeTab[int(' ') ] |= CharFlags::Space;
680 aCharTypeTab[int('\t')] |= CharFlags::Space;
682 // End of line characters
683 aCharTypeTab[int('\r')] |= CharFlags::EOL;
684 aCharTypeTab[int('\n')] |= CharFlags::EOL;
686 ppListKeyWords = nullptr;
687 nKeyWordCount = 0;
690 void SyntaxHighlighter::Tokenizer::getHighlightPortions(std::u16string_view rLine,
691 /*out*/std::vector<HighlightPortion>& portions) const
693 // Set the position to the beginning of the source string
694 auto pos = rLine.begin();
696 // Variables for the out parameter
697 TokenType eType;
698 std::u16string_view::const_iterator pStartPos;
699 std::u16string_view::const_iterator pEndPos;
701 // Loop over all the tokens
702 while( getNextToken( pos, rLine.end(), eType, pStartPos, pEndPos ) )
704 portions.emplace_back(
705 pStartPos - rLine.begin(), pEndPos - rLine.begin(), eType);
710 SyntaxHighlighter::SyntaxHighlighter(HighlighterLanguage language):
711 m_tokenizer(new SyntaxHighlighter::Tokenizer(language))
713 switch (language)
715 case HighlighterLanguage::Basic:
716 m_tokenizer->setKeyWords( strListBasicKeyWords,
717 std::size( strListBasicKeyWords ));
718 break;
719 case HighlighterLanguage::SQL:
720 m_tokenizer->setKeyWords( strListSqlKeyWords,
721 std::size( strListSqlKeyWords ));
722 break;
723 default:
724 assert(false); // this cannot happen
728 SyntaxHighlighter::~SyntaxHighlighter() {}
730 void SyntaxHighlighter::getHighlightPortions(std::u16string_view rLine,
731 /*out*/std::vector<HighlightPortion>& portions) const
733 m_tokenizer->getHighlightPortions( rLine, portions );
736 HighlighterLanguage SyntaxHighlighter::GetLanguage() const
738 return m_tokenizer->aLanguage;
741 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */