bump product version to 5.0.4.1
[LibreOffice.git] / comphelper / source / misc / syntaxhighlight.cxx
blobecd4ced68ca3b3fc85acfa22eb195126996dc3ef
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
20 #include <sal/config.h>
22 #include <cassert>
24 #include <unicode/uchar.h>
25 #include <comphelper/syntaxhighlight.hxx>
26 #include <comphelper/string.hxx>
// Bit flags describing how a single character may be used by the tokenizer.
// Several flags can be combined for one character; they are stored per
// character code in Tokenizer::aCharTypeTab.
#define CHAR_START_IDENTIFIER   0x0001  // may begin an identifier
#define CHAR_IN_IDENTIFIER      0x0002  // may continue an identifier
#define CHAR_START_NUMBER       0x0004  // may begin a numeric literal
#define CHAR_IN_NUMBER          0x0008  // may continue a decimal literal
#define CHAR_IN_HEX_NUMBER      0x0010  // valid digit of a hexadecimal literal
#define CHAR_IN_OCT_NUMBER      0x0020  // valid digit of an octal literal
#define CHAR_START_STRING       0x0040  // opens a quoted string / bracketed name
#define CHAR_OPERATOR           0x0080  // operator or punctuation character
#define CHAR_SPACE              0x0100  // horizontal whitespace
#define CHAR_EOL                0x0200  // end-of-line character
// ##########################################################################
// ATTENTION: all these words need to be in lower case
// ##########################################################################
// Keywords recognised when highlighting BASIC source. The table is searched
// with bsearch() using strcmp() ordering, so the entries must stay sorted in
// ascending byte order (note that "end enum" sorts before "endif" because a
// space compares lower than any letter).
static const char* strListBasicKeyWords[] = {
    "access", "alias", "and", "any", "append", "as", "attribute",
    "base", "binary", "boolean", "byref", "byte", "byval",
    "call", "case", "cdecl", "classmodule", "close", "compare", "compatible",
    "const", "currency",
    "date", "declare", "defbool", "defcur", "defdate", "defdbl", "deferr",
    "defint", "deflng", "defobj", "defsng", "defstr", "defvar", "dim", "do",
    "doevents", "double",
    "each", "else", "elseif", "end", "end enum", "end function", "end if",
    "end property", "end select", "end sub", "end type", "endif", "enum",
    "eqv", "erase", "error", "exit", "explicit",
    "for", "function",
    "get", "global", "gosub", "goto",
    "if", "imp", "implements", "in", "input", "integer", "is",
    "let", "lib", "like", "line", "line input", "local", "lock", "long",
    "loop", "lprint", "lset",
    "mod",
    "name", "new", "next", "not",
    "object", "on", "open", "option", "optional", "or", "output",
    "paramarray", "preserve", "print", "private", "property", "public",
    "random", "read", "redim", "rem", "resume", "return", "rset",
    "select", "set", "shared", "single", "static", "step", "stop", "string",
    "sub", "system",
    "text", "then", "to", "type", "typeof",
    "until",
    "variant", "vbasupport",
    "wend", "while", "with", "withevents", "write",
    "xor"
};
// Keywords recognised when highlighting SQL statements. All entries are lower
// case and sorted in ascending strcmp() order, which the bsearch() lookup in
// the tokenizer relies on.
static const char* strListSqlKeyWords[] = {
    "all", "and", "any", "as", "asc", "avg",
    "between", "by",
    "cast", "corresponding", "count", "create", "cross",
    "delete", "desc", "distinct", "drop",
    "escape", "except", "exists",
    "false", "from", "full",
    "global", "group",
    "having",
    "in", "inner", "insert", "intersect", "into", "is",
    "join",
    "left", "like", "limit", "local",
    "match", "max", "min",
    "natural", "not", "null",
    "on", "or", "order", "outer",
    "right",
    "select", "set", "some", "sum",
    "table", "temporary", "true",
    "union", "unique", "unknown", "update", "using",
    "values",
    "where"
};
// bsearch() comparator for the keyword tables.
// arg1 is the search key itself (a NUL-terminated string); arg2 points at an
// element of the keyword array, i.e. at a pointer to a NUL-terminated string.
// Returns the strcmp() ordering of key versus table entry.
extern "C" int compare_strings( const void *arg1, const void *arg2 )
{
    char const * pKey = static_cast<char const *>(arg1);
    char * const * ppEntry = static_cast<char * const *>(arg2);
    return strcmp( pKey, *ppEntry );
}
249 namespace
251 bool isAlpha(sal_Unicode c)
253 if (comphelper::string::isalphaAscii(c))
254 return true;
255 return u_isalpha(c);
259 class SyntaxHighlighter::Tokenizer
261 // Character information tables
262 sal_uInt16 aCharTypeTab[256];
264 // Auxiliary function: testing of the character flags
265 bool testCharFlags(sal_Unicode c, sal_uInt16 nTestFlags) const;
267 // Get new token, EmptyString == nothing more over there
268 bool getNextToken(const sal_Unicode*& pos, /*out*/TokenTypes& reType,
269 /*out*/const sal_Unicode*& rpStartPos, /*out*/const sal_Unicode*& rpEndPos) const;
271 const char** ppListKeyWords;
272 sal_uInt16 nKeyWordCount;
274 public:
275 HighlighterLanguage const aLanguage;
277 Tokenizer( HighlighterLanguage aLang );
278 ~Tokenizer();
280 void getHighlightPortions(const OUString& rLine,
281 /*out*/std::vector<HighlightPortion>& portions) const;
282 void setKeyWords( const char** ppKeyWords, sal_uInt16 nCount );
285 // Helper function: test character flag
286 bool SyntaxHighlighter::Tokenizer::testCharFlags(sal_Unicode c, sal_uInt16 nTestFlags) const
288 bool bRet = false;
289 if( c != 0 && c <= 255 )
291 bRet = ( (aCharTypeTab[c] & nTestFlags) != 0 );
293 else if( c > 255 )
295 bRet = (( CHAR_START_IDENTIFIER | CHAR_IN_IDENTIFIER ) & nTestFlags) != 0
296 && isAlpha(c);
298 return bRet;
301 void SyntaxHighlighter::Tokenizer::setKeyWords( const char** ppKeyWords, sal_uInt16 nCount )
303 ppListKeyWords = ppKeyWords;
304 nKeyWordCount = nCount;
307 bool SyntaxHighlighter::Tokenizer::getNextToken(const sal_Unicode*& pos, /*out*/TokenTypes& reType,
308 /*out*/const sal_Unicode*& rpStartPos, /*out*/const sal_Unicode*& rpEndPos) const
310 reType = TT_UNKNOWN;
312 rpStartPos = pos;
314 sal_Unicode c = *pos;
315 if( c == 0 )
316 return false;
318 ++pos;
320 //*** Go through all possibilities ***
321 // Space?
322 if ( testCharFlags( c, CHAR_SPACE ) )
324 while( testCharFlags( *pos, CHAR_SPACE ) )
325 ++pos;
327 reType = TT_WHITESPACE;
330 // Identifier?
331 else if ( testCharFlags( c, CHAR_START_IDENTIFIER ) )
333 bool bIdentifierChar;
336 // Naechstes Zeichen holen
337 c = *pos;
338 bIdentifierChar = testCharFlags( c, CHAR_IN_IDENTIFIER );
339 if( bIdentifierChar )
340 ++pos;
342 while( bIdentifierChar );
344 reType = TT_IDENTIFIER;
346 // Keyword table
347 if (ppListKeyWords != NULL)
349 int nCount = pos - rpStartPos;
351 // No keyword if string contains char > 255
352 bool bCanBeKeyword = true;
353 for( int i = 0 ; i < nCount ; i++ )
355 if( rpStartPos[i] > 255 )
357 bCanBeKeyword = false;
358 break;
362 if( bCanBeKeyword )
364 OUString aKWString(rpStartPos, nCount);
365 OString aByteStr = OUStringToOString(aKWString,
366 RTL_TEXTENCODING_ASCII_US).toAsciiLowerCase();
367 if ( bsearch( aByteStr.getStr(), ppListKeyWords, nKeyWordCount, sizeof( char* ),
368 compare_strings ) )
370 reType = TT_KEYWORDS;
372 if( aByteStr == "rem" )
374 // Remove all characters until end of line or EOF
375 sal_Unicode cPeek = *pos;
376 while( cPeek != 0 && !testCharFlags( cPeek, CHAR_EOL ) )
378 c = *pos++;
379 cPeek = *pos;
382 reType = TT_COMMENT;
389 // Operator?
390 // only for BASIC '\'' should be a comment, otherwise it is a normal string and handled there
391 else if ( testCharFlags( c, CHAR_OPERATOR ) || ( (c == '\'') && (aLanguage==HIGHLIGHT_BASIC)) )
393 // parameters for SQL view
394 if ( (c==':') || (c=='?'))
396 if (c!='?')
398 bool bIdentifierChar;
401 // Get next character
402 c = *pos;
403 bIdentifierChar = isAlpha(c);
404 if( bIdentifierChar )
405 ++pos;
407 while( bIdentifierChar );
409 reType = TT_PARAMETER;
411 else if (c=='-')
413 sal_Unicode cPeekNext = *pos;
414 if (cPeekNext=='-')
416 // Remove all characters until end of line or EOF
417 while( cPeekNext != 0 && !testCharFlags( cPeekNext, CHAR_EOL ) )
419 ++pos;
420 cPeekNext = *pos;
422 reType = TT_COMMENT;
425 else if (c=='/')
427 sal_Unicode cPeekNext = *pos;
428 if (cPeekNext=='/')
430 // Remove all characters until end of line or EOF
431 while( cPeekNext != 0 && !testCharFlags( cPeekNext, CHAR_EOL ) )
433 ++pos;
434 cPeekNext = *pos;
436 reType = TT_COMMENT;
439 else
441 // Comment?
442 if ( c == '\'' )
444 // Skip all characters until end of input or end of line:
445 for (;;) {
446 c = *pos;
447 if (c == 0 || testCharFlags(c, CHAR_EOL)) {
448 break;
450 ++pos;
453 reType = TT_COMMENT;
456 // The real operator; can be easily used since not the actual
457 // operator (e.g. +=) is concerned, but the fact that it is one
458 if( reType != TT_COMMENT )
460 reType = TT_OPERATOR;
466 // Object separator? Must be handled before Number
467 else if( c == '.' && ( *pos < '0' || *pos > '9' ) )
469 reType = TT_OPERATOR;
472 // Number?
473 else if( testCharFlags( c, CHAR_START_NUMBER ) )
475 reType = TT_NUMBER;
477 // Number system, 10 = normal, it is changed for Oct/Hex
478 int nRadix = 10;
480 // Is it an Oct or a Hex number?
481 if( c == '&' )
483 // Octal?
484 if( *pos == 'o' || *pos == 'O' )
486 // remove o
487 ++pos;
488 nRadix = 8; // Octal base
490 // Read all numbers
491 while( testCharFlags( *pos, CHAR_IN_OCT_NUMBER ) )
492 ++pos;
494 // Hexadecimal?
495 else if( *pos == 'h' || *pos == 'H' )
497 // remove x
498 ++pos;
499 nRadix = 16; // Hexadecimal base
501 // Read all numbers
502 while( testCharFlags( *pos, CHAR_IN_HEX_NUMBER ) )
503 ++pos;
505 else
507 reType = TT_OPERATOR;
511 // When it is not Oct or Hex, then it is double
512 if( reType == TT_NUMBER && nRadix == 10 )
514 // Flag if the last character is an exponent
515 bool bAfterExpChar = false;
517 // Read all numbers
518 while( testCharFlags( *pos, CHAR_IN_NUMBER ) ||
519 (bAfterExpChar && *pos == '+' ) ||
520 (bAfterExpChar && *pos == '-' ) )
521 // After exponent +/- are OK, too
523 c = *pos++;
524 bAfterExpChar = ( c == 'e' || c == 'E' );
529 // String?
530 else if( testCharFlags( c, CHAR_START_STRING ) )
532 // Remember which character has opened the string
533 sal_Unicode cEndString = c;
534 if( c == '[' )
535 cEndString = ']';
537 // Read all characters
538 while( *pos != cEndString )
540 // Detect EOF before reading next char, so we do not lose EOF
541 if( *pos == 0 )
543 // ERROR: unterminated string literal
544 reType = TT_ERROR;
545 break;
547 c = *pos++;
548 if( testCharFlags( c, CHAR_EOL ) )
550 // ERROR: unterminated string literal
551 reType = TT_ERROR;
552 break;
556 if( reType != TT_ERROR )
558 ++pos;
559 if( cEndString == ']' )
560 reType = TT_IDENTIFIER;
561 else
562 reType = TT_STRING;
566 // End of line?
567 else if( testCharFlags( c, CHAR_EOL ) )
569 // If another EOL character comes, read it
570 sal_Unicode cNext = *pos;
571 if( cNext != c && testCharFlags( cNext, CHAR_EOL ) )
572 ++pos;
574 reType = TT_EOL;
577 // All other will remain TT_UNKNOWN
579 // Save end position
580 rpEndPos = pos;
581 return true;
584 SyntaxHighlighter::Tokenizer::Tokenizer( HighlighterLanguage aLang ): aLanguage(aLang)
586 memset( aCharTypeTab, 0, sizeof( aCharTypeTab ) );
588 // Fill character table
589 sal_uInt16 i;
591 // Allowed characters for identifiers
592 sal_uInt16 nHelpMask = (sal_uInt16)( CHAR_START_IDENTIFIER | CHAR_IN_IDENTIFIER );
593 for( i = 'a' ; i <= 'z' ; i++ )
594 aCharTypeTab[i] |= nHelpMask;
595 for( i = 'A' ; i <= 'Z' ; i++ )
596 aCharTypeTab[i] |= nHelpMask;
597 aCharTypeTab[(int)'_'] |= nHelpMask;
598 aCharTypeTab[(int)'$'] |= nHelpMask;
600 // Digit (can be identifier and number)
601 nHelpMask = (sal_uInt16)( CHAR_IN_IDENTIFIER | CHAR_START_NUMBER |
602 CHAR_IN_NUMBER | CHAR_IN_HEX_NUMBER );
603 for( i = '0' ; i <= '9' ; i++ )
604 aCharTypeTab[i] |= nHelpMask;
606 // Add e, E, . and & here manually
607 aCharTypeTab[(int)'e'] |= CHAR_IN_NUMBER;
608 aCharTypeTab[(int)'E'] |= CHAR_IN_NUMBER;
609 aCharTypeTab[(int)'.'] |= (sal_uInt16)( CHAR_IN_NUMBER | CHAR_START_NUMBER );
610 aCharTypeTab[(int)'&'] |= CHAR_START_NUMBER;
612 // Hexadecimal digit
613 for( i = 'a' ; i <= 'f' ; i++ )
614 aCharTypeTab[i] |= CHAR_IN_HEX_NUMBER;
615 for( i = 'A' ; i <= 'F' ; i++ )
616 aCharTypeTab[i] |= CHAR_IN_HEX_NUMBER;
618 // Octal digit
619 for( i = '0' ; i <= '7' ; i++ )
620 aCharTypeTab[i] |= CHAR_IN_OCT_NUMBER;
622 // String literal start/end characters
623 aCharTypeTab[(int)'\''] |= CHAR_START_STRING;
624 aCharTypeTab[(int)'\"'] |= CHAR_START_STRING;
625 aCharTypeTab[(int)'['] |= CHAR_START_STRING;
626 aCharTypeTab[(int)'`'] |= CHAR_START_STRING;
628 // Operator characters
629 aCharTypeTab[(int)'!'] |= CHAR_OPERATOR;
630 aCharTypeTab[(int)'%'] |= CHAR_OPERATOR;
631 // aCharTypeTab[(int)'&'] |= CHAR_OPERATOR; Removed because of #i14140
632 aCharTypeTab[(int)'('] |= CHAR_OPERATOR;
633 aCharTypeTab[(int)')'] |= CHAR_OPERATOR;
634 aCharTypeTab[(int)'*'] |= CHAR_OPERATOR;
635 aCharTypeTab[(int)'+'] |= CHAR_OPERATOR;
636 aCharTypeTab[(int)','] |= CHAR_OPERATOR;
637 aCharTypeTab[(int)'-'] |= CHAR_OPERATOR;
638 aCharTypeTab[(int)'/'] |= CHAR_OPERATOR;
639 aCharTypeTab[(int)':'] |= CHAR_OPERATOR;
640 aCharTypeTab[(int)'<'] |= CHAR_OPERATOR;
641 aCharTypeTab[(int)'='] |= CHAR_OPERATOR;
642 aCharTypeTab[(int)'>'] |= CHAR_OPERATOR;
643 aCharTypeTab[(int)'?'] |= CHAR_OPERATOR;
644 aCharTypeTab[(int)'^'] |= CHAR_OPERATOR;
645 aCharTypeTab[(int)'|'] |= CHAR_OPERATOR;
646 aCharTypeTab[(int)'~'] |= CHAR_OPERATOR;
647 aCharTypeTab[(int)'{'] |= CHAR_OPERATOR;
648 aCharTypeTab[(int)'}'] |= CHAR_OPERATOR;
649 // aCharTypeTab[(int)'['] |= CHAR_OPERATOR; Removed because of #i17826
650 aCharTypeTab[(int)']'] |= CHAR_OPERATOR;
651 aCharTypeTab[(int)';'] |= CHAR_OPERATOR;
653 // Space
654 aCharTypeTab[(int)' ' ] |= CHAR_SPACE;
655 aCharTypeTab[(int)'\t'] |= CHAR_SPACE;
657 // End of line characters
658 aCharTypeTab[(int)'\r'] |= CHAR_EOL;
659 aCharTypeTab[(int)'\n'] |= CHAR_EOL;
661 ppListKeyWords = NULL;
662 nKeyWordCount = 0;
665 SyntaxHighlighter::Tokenizer::~Tokenizer()
669 void SyntaxHighlighter::Tokenizer::getHighlightPortions(const OUString& rLine,
670 /*out*/std::vector<HighlightPortion>& portions) const
672 // Set the position to the beginning of the source string
673 const sal_Unicode* pos = rLine.getStr();
675 // Variables for the out parameter
676 TokenTypes eType;
677 const sal_Unicode* pStartPos;
678 const sal_Unicode* pEndPos;
680 // Loop over all the tokens
681 while( getNextToken( pos, eType, pStartPos, pEndPos ) )
683 portions.push_back(
684 HighlightPortion(
685 pStartPos - rLine.getStr(), pEndPos - rLine.getStr(), eType));
690 SyntaxHighlighter::SyntaxHighlighter(HighlighterLanguage language):
691 eLanguage(language), m_tokenizer(new SyntaxHighlighter::Tokenizer(language))
693 switch (eLanguage)
695 case HIGHLIGHT_BASIC:
696 m_tokenizer->setKeyWords( strListBasicKeyWords,
697 sizeof( strListBasicKeyWords ) / sizeof( char* ));
698 break;
699 case HIGHLIGHT_SQL:
700 m_tokenizer->setKeyWords( strListSqlKeyWords,
701 sizeof( strListSqlKeyWords ) / sizeof( char* ));
702 break;
703 default:
704 assert(false); // this cannot happen
708 SyntaxHighlighter::~SyntaxHighlighter() {}
710 void SyntaxHighlighter::getHighlightPortions(const OUString& rLine,
711 /*out*/std::vector<HighlightPortion>& portions) const
713 m_tokenizer->getHighlightPortions( rLine, portions );
716 HighlighterLanguage SyntaxHighlighter::GetLanguage()
718 return m_tokenizer->aLanguage;
721 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */