1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
24 #include <unicode/uchar.h>
25 #include <comphelper/syntaxhighlight.hxx>
26 #include <comphelper/string.hxx>
28 // Flags for character properties
// Each flag is one bit. A character's entry in Tokenizer::aCharTypeTab is the
// OR of every flag that applies to it, and testCharFlags() checks an entry
// against a mask built from these values.
// May begin an identifier (set for a-z, A-Z, '_' and '$')
29 #define CHAR_START_IDENTIFIER 0x0001
// May continue an identifier (the identifier start characters plus 0-9)
30 #define CHAR_IN_IDENTIFIER 0x0002
// May begin a number (set for 0-9, '.' and '&')
31 #define CHAR_START_NUMBER 0x0004
// May continue a decimal number (set for 0-9, 'e', 'E' and '.')
32 #define CHAR_IN_NUMBER 0x0008
// Hexadecimal digit (set for 0-9, a-f and A-F)
33 #define CHAR_IN_HEX_NUMBER 0x0010
// Octal digit (set for 0-7)
34 #define CHAR_IN_OCT_NUMBER 0x0020
// Opens a quoted token (set for single quote, double quote, '[' and '`')
35 #define CHAR_START_STRING 0x0040
// Operator or punctuation character
36 #define CHAR_OPERATOR 0x0080
// Horizontal whitespace (set for space and tab)
37 #define CHAR_SPACE 0x0100
// Line terminator (set for '\r' and '\n')
38 #define CHAR_EOL 0x0200
40 // ##########################################################################
41 // ATTENTION: all these words need to be in lower case
42 // ##########################################################################
// Reason: getNextToken() converts an identifier to lower case before it looks
// the identifier up in the active table with bsearch().
// NOTE(review): bsearch() also requires the table to be sorted. The
// initializer lists are not visible in this view, so confirm the ordering.
// Keyword table installed by the SyntaxHighlighter constructor for
// HIGHLIGHT_BASIC.
43 static const char* strListBasicKeyWords
[] = {
// Second keyword table, installed by the SyntaxHighlighter constructor for
// its other handled language case (presumably SQL, going by the name).
177 static const char* strListSqlKeyWords
[] = {
// Comparison callback in the form taken by bsearch(); presumably the one
// passed to the bsearch() call in getNextToken() (that argument is not
// visible in this view).
// The two arguments are treated differently: arg1 is used directly as a
// C string (the search key), whereas arg2 is treated as a pointer to a table
// slot that holds a C string and is dereferenced once. The two strings are
// then compared with strcmp().
243 extern "C" int compare_strings( const void *arg1
, const void *arg2
)
245 return strcmp( static_cast<char const *>(arg1
), *static_cast<char * const *>(arg2
) );
// Reports whether a character is alphabetic.
// ASCII letters are tested first through comphelper::string::isalphaAscii().
// NOTE(review): the remainder of the body is not visible in this view, so the
// handling of non-ASCII characters must be confirmed against the full file.
251 bool isAlpha(sal_Unicode c
)
253 if (comphelper::string::isalphaAscii(c
))
// Splits a line of source text into typed tokens for the language chosen at
// construction. Owned and driven by SyntaxHighlighter.
259 class SyntaxHighlighter::Tokenizer
261 // Character information tables
// One entry of CHAR_* flag bits for each character code 0..255.
262 sal_uInt16 aCharTypeTab
[256];
264 // Auxiliary function: testing of the character flags
265 bool testCharFlags(sal_Unicode c
, sal_uInt16 nTestFlags
) const;
267 // Get the next token. getHighlightPortions() keeps calling this for as long
// as it returns true. pos is passed by reference; the remaining parameters
// are outputs.
268 bool getNextToken(const sal_Unicode
*& pos
, /*out*/TokenTypes
& reType
,
269 /*out*/const sal_Unicode
*& rpStartPos
, /*out*/const sal_Unicode
*& rpEndPos
) const;
// Active keyword table and its entry count. The pointer is null after
// construction and is replaced by setKeyWords().
271 const char** ppListKeyWords
;
272 sal_uInt16 nKeyWordCount
;
// Language fixed at construction; read by SyntaxHighlighter::GetLanguage().
275 HighlighterLanguage
const aLanguage
;
277 Tokenizer( HighlighterLanguage aLang
);
// Tokenizes rLine and reports the tokens through portions.
280 void getHighlightPortions(const OUString
& rLine
,
281 /*out*/std::vector
<HighlightPortion
>& portions
) const;
// Installs the keyword table used to recognise keywords among identifiers.
282 void setKeyWords( const char** ppKeyWords
, sal_uInt16 nCount
);
285 // Helper function: test character flag
// For a character code in 1..255 the result is whether any bit of nTestFlags
// is set in that character's aCharTypeTab entry.
// NOTE(review): the branch taken for other characters is only partly visible
// in this view. What is visible checks whether nTestFlags asks for one of the
// two identifier flags; confirm the rest against the full file.
286 bool SyntaxHighlighter::Tokenizer::testCharFlags(sal_Unicode c
, sal_uInt16 nTestFlags
) const
289 if( c
!= 0 && c
<= 255 )
291 bRet
= ( (aCharTypeTab
[c
] & nTestFlags
) != 0 );
295 bRet
= (( CHAR_START_IDENTIFIER
| CHAR_IN_IDENTIFIER
) & nTestFlags
) != 0
// Installs the keyword table consulted by getNextToken().
// Only the pointer and the entry count are stored; the strings are not
// copied, so the table has to stay alive for as long as it is installed.
301 void SyntaxHighlighter::Tokenizer::setKeyWords( const char** ppKeyWords
, sal_uInt16 nCount
)
303 ppListKeyWords
= ppKeyWords
;
304 nKeyWordCount
= nCount
;
// Scans one token from the text at pos and reports its type and extent
// through reType, rpStartPos and rpEndPos. The first character selects the
// branch, and the branches are tried in this order: whitespace, identifier or
// keyword, operator or comment, object separator '.', number, quoted token,
// end of line. Anything else keeps the type TT_UNKNOWN.
// NOTE(review): several statements of this function are not visible in this
// view (initialisation, the pointer advances and the return statements), so
// the comments below describe only what the visible lines establish.
307 bool SyntaxHighlighter::Tokenizer::getNextToken(const sal_Unicode
*& pos
, /*out*/TokenTypes
& reType
,
308 /*out*/const sal_Unicode
*& rpStartPos
, /*out*/const sal_Unicode
*& rpEndPos
) const
314 sal_Unicode c
= *pos
;
320 //*** Go through all possibilities ***
// Whitespace: a run of space/tab characters forms one token
322 if ( testCharFlags( c
, CHAR_SPACE
) )
324 while( testCharFlags( *pos
, CHAR_SPACE
) )
327 reType
= TT_WHITESPACE
;
// Identifier, which may turn out to be a keyword
331 else if ( testCharFlags( c
, CHAR_START_IDENTIFIER
) )
333 bool bIdentifierChar
;
336 // Get next character
338 bIdentifierChar
= testCharFlags( c
, CHAR_IN_IDENTIFIER
);
339 if( bIdentifierChar
)
342 while( bIdentifierChar
);
344 reType
= TT_IDENTIFIER
;
// A keyword lookup is only possible once a keyword table has been installed
347 if (ppListKeyWords
!= NULL
)
349 int nCount
= pos
- rpStartPos
;
351 // No keyword if string contains char > 255
352 bool bCanBeKeyword
= true;
353 for( int i
= 0 ; i
< nCount
; i
++ )
355 if( rpStartPos
[i
] > 255 )
357 bCanBeKeyword
= false;
// Convert the candidate to a lower-case byte string before searching, so the
// lookup is case-insensitive against the lower-case keyword table
364 OUString
aKWString(rpStartPos
, nCount
);
365 OString aByteStr
= OUStringToOString(aKWString
,
366 RTL_TEXTENCODING_ASCII_US
).toAsciiLowerCase();
367 if ( bsearch( aByteStr
.getStr(), ppListKeyWords
, nKeyWordCount
, sizeof( char* ),
370 reType
= TT_KEYWORDS
;
// The keyword "rem" is special-cased: the rest of the line belongs to it
372 if( aByteStr
== "rem" )
374 // Remove all characters until end of line or EOF
375 sal_Unicode cPeek
= *pos
;
376 while( cPeek
!= 0 && !testCharFlags( cPeek
, CHAR_EOL
) )
390 // only for BASIC '\'' should be a comment, otherwise it is a normal string and handled there
391 else if ( testCharFlags( c
, CHAR_OPERATOR
) || ( (c
== '\'') && (aLanguage
==HIGHLIGHT_BASIC
)) )
393 // parameters for SQL view
394 if ( (c
==':') || (c
=='?'))
398 bool bIdentifierChar
;
401 // Get next character
403 bIdentifierChar
= isAlpha(c
);
404 if( bIdentifierChar
)
407 while( bIdentifierChar
);
409 reType
= TT_PARAMETER
;
413 sal_Unicode cPeekNext
= *pos
;
416 // Remove all characters until end of line or EOF
417 while( cPeekNext
!= 0 && !testCharFlags( cPeekNext
, CHAR_EOL
) )
427 sal_Unicode cPeekNext
= *pos
;
430 // Remove all characters until end of line or EOF
431 while( cPeekNext
!= 0 && !testCharFlags( cPeekNext
, CHAR_EOL
) )
444 // Skip all characters until end of input or end of line:
447 if (c
== 0 || testCharFlags(c
, CHAR_EOL
)) {
456 // The real operator; can be easily used since not the actual
457 // operator (e.g. +=) is concerned, but the fact that it is one
458 if( reType
!= TT_COMMENT
)
460 reType
= TT_OPERATOR
;
466 // Object separator? Must be handled before Number
// ('.' also carries CHAR_START_NUMBER, so a '.' that is not followed by a
// digit has to be claimed here first)
467 else if( c
== '.' && ( *pos
< '0' || *pos
> '9' ) )
469 reType
= TT_OPERATOR
;
473 else if( testCharFlags( c
, CHAR_START_NUMBER
) )
477 // Number system, 10 = normal, it is changed for Oct/Hex
480 // Is it an Oct or a Hex number?
484 if( *pos
== 'o' || *pos
== 'O' )
488 nRadix
= 8; // Octal base
491 while( testCharFlags( *pos
, CHAR_IN_OCT_NUMBER
) )
495 else if( *pos
== 'h' || *pos
== 'H' )
499 nRadix
= 16; // Hexadecimal base
502 while( testCharFlags( *pos
, CHAR_IN_HEX_NUMBER
) )
507 reType
= TT_OPERATOR
;
511 // When it is not Oct or Hex, then it is double
512 if( reType
== TT_NUMBER
&& nRadix
== 10 )
514 // Flag if the last character is an exponent
515 bool bAfterExpChar
= false;
518 while( testCharFlags( *pos
, CHAR_IN_NUMBER
) ||
519 (bAfterExpChar
&& *pos
== '+' ) ||
520 (bAfterExpChar
&& *pos
== '-' ) )
521 // After exponent +/- are OK, too
524 bAfterExpChar
= ( c
== 'e' || c
== 'E' );
// Quoted token
530 else if( testCharFlags( c
, CHAR_START_STRING
) )
532 // Remember which character has opened the string
533 sal_Unicode cEndString
= c
;
537 // Read all characters
538 while( *pos
!= cEndString
)
540 // Detect EOF before reading next char, so we do not lose EOF
543 // ERROR: unterminated string literal
548 if( testCharFlags( c
, CHAR_EOL
) )
550 // ERROR: unterminated string literal
556 if( reType
!= TT_ERROR
)
// A token terminated by ']' is classified as an identifier
559 if( cEndString
== ']' )
560 reType
= TT_IDENTIFIER
;
// End of line
567 else if( testCharFlags( c
, CHAR_EOL
) )
569 // If another EOL character comes, read it
// (only when it differs from the first one, so a "\r\n" pair is handled
// together while two identical terminators are not)
570 sal_Unicode cNext
= *pos
;
571 if( cNext
!= c
&& testCharFlags( cNext
, CHAR_EOL
) )
577 // All other will remain TT_UNKNOWN
// Records the language and builds the character classification table.
// All 256 entries start at zero; each section below ORs flags into the
// entries of the characters it names, so one character can end up with
// several flags. No keyword table is installed here: ppListKeyWords stays
// null until setKeyWords() is called.
584 SyntaxHighlighter::Tokenizer::Tokenizer( HighlighterLanguage aLang
): aLanguage(aLang
)
586 memset( aCharTypeTab
, 0, sizeof( aCharTypeTab
) );
588 // Fill character table
591 // Allowed characters for identifiers
592 sal_uInt16 nHelpMask
= (sal_uInt16
)( CHAR_START_IDENTIFIER
| CHAR_IN_IDENTIFIER
);
593 for( i
= 'a' ; i
<= 'z' ; i
++ )
594 aCharTypeTab
[i
] |= nHelpMask
;
595 for( i
= 'A' ; i
<= 'Z' ; i
++ )
596 aCharTypeTab
[i
] |= nHelpMask
;
597 aCharTypeTab
[(int)'_'] |= nHelpMask
;
598 aCharTypeTab
[(int)'$'] |= nHelpMask
;
600 // Digit (can be identifier and number)
// Digits may continue an identifier but do not get CHAR_START_IDENTIFIER
601 nHelpMask
= (sal_uInt16
)( CHAR_IN_IDENTIFIER
| CHAR_START_NUMBER
|
602 CHAR_IN_NUMBER
| CHAR_IN_HEX_NUMBER
);
603 for( i
= '0' ; i
<= '9' ; i
++ )
604 aCharTypeTab
[i
] |= nHelpMask
;
606 // Add e, E, . and & here manually
// 'e'/'E' may appear inside a number (exponent), '.' may start or continue
// one, and '&' may only start one
607 aCharTypeTab
[(int)'e'] |= CHAR_IN_NUMBER
;
608 aCharTypeTab
[(int)'E'] |= CHAR_IN_NUMBER
;
609 aCharTypeTab
[(int)'.'] |= (sal_uInt16
)( CHAR_IN_NUMBER
| CHAR_START_NUMBER
);
610 aCharTypeTab
[(int)'&'] |= CHAR_START_NUMBER
;
// Letters that are hexadecimal digits (the decimal digits were flagged above)
613 for( i
= 'a' ; i
<= 'f' ; i
++ )
614 aCharTypeTab
[i
] |= CHAR_IN_HEX_NUMBER
;
615 for( i
= 'A' ; i
<= 'F' ; i
++ )
616 aCharTypeTab
[i
] |= CHAR_IN_HEX_NUMBER
;
// Octal digits
619 for( i
= '0' ; i
<= '7' ; i
++ )
620 aCharTypeTab
[i
] |= CHAR_IN_OCT_NUMBER
;
622 // String literal start/end characters
623 aCharTypeTab
[(int)'\''] |= CHAR_START_STRING
;
624 aCharTypeTab
[(int)'\"'] |= CHAR_START_STRING
;
625 aCharTypeTab
[(int)'['] |= CHAR_START_STRING
;
626 aCharTypeTab
[(int)'`'] |= CHAR_START_STRING
;
628 // Operator characters
629 aCharTypeTab
[(int)'!'] |= CHAR_OPERATOR
;
630 aCharTypeTab
[(int)'%'] |= CHAR_OPERATOR
;
631 // aCharTypeTab[(int)'&'] |= CHAR_OPERATOR; Removed because of #i14140
632 aCharTypeTab
[(int)'('] |= CHAR_OPERATOR
;
633 aCharTypeTab
[(int)')'] |= CHAR_OPERATOR
;
634 aCharTypeTab
[(int)'*'] |= CHAR_OPERATOR
;
635 aCharTypeTab
[(int)'+'] |= CHAR_OPERATOR
;
636 aCharTypeTab
[(int)','] |= CHAR_OPERATOR
;
637 aCharTypeTab
[(int)'-'] |= CHAR_OPERATOR
;
638 aCharTypeTab
[(int)'/'] |= CHAR_OPERATOR
;
639 aCharTypeTab
[(int)':'] |= CHAR_OPERATOR
;
640 aCharTypeTab
[(int)'<'] |= CHAR_OPERATOR
;
641 aCharTypeTab
[(int)'='] |= CHAR_OPERATOR
;
642 aCharTypeTab
[(int)'>'] |= CHAR_OPERATOR
;
643 aCharTypeTab
[(int)'?'] |= CHAR_OPERATOR
;
644 aCharTypeTab
[(int)'^'] |= CHAR_OPERATOR
;
645 aCharTypeTab
[(int)'|'] |= CHAR_OPERATOR
;
646 aCharTypeTab
[(int)'~'] |= CHAR_OPERATOR
;
647 aCharTypeTab
[(int)'{'] |= CHAR_OPERATOR
;
648 aCharTypeTab
[(int)'}'] |= CHAR_OPERATOR
;
649 // aCharTypeTab[(int)'['] |= CHAR_OPERATOR; Removed because of #i17826
// ('[' is registered above as a quoted-token opener instead; ']' remains an
// operator)
650 aCharTypeTab
[(int)']'] |= CHAR_OPERATOR
;
651 aCharTypeTab
[(int)';'] |= CHAR_OPERATOR
;
// Whitespace characters
654 aCharTypeTab
[(int)' ' ] |= CHAR_SPACE
;
655 aCharTypeTab
[(int)'\t'] |= CHAR_SPACE
;
657 // End of line characters
658 aCharTypeTab
[(int)'\r'] |= CHAR_EOL
;
659 aCharTypeTab
[(int)'\n'] |= CHAR_EOL
;
// No keyword table yet; getNextToken() skips the keyword lookup while this
// pointer is null
661 ppListKeyWords
= NULL
;
// Destructor of the tokenizer.
// NOTE(review): the body is not visible in this view.
665 SyntaxHighlighter::Tokenizer::~Tokenizer()
// Tokenizes rLine by calling getNextToken() repeatedly, starting at the first
// character of the string, until it returns false.
// NOTE(review): the statement that stores each result is only partly visible
// in this view. Its visible arguments are the token start and end, expressed
// as offsets from the beginning of rLine, followed by the token type;
// presumably they build a HighlightPortion appended to portions.
669 void SyntaxHighlighter::Tokenizer::getHighlightPortions(const OUString
& rLine
,
670 /*out*/std::vector
<HighlightPortion
>& portions
) const
672 // Set the position to the beginning of the source string
673 const sal_Unicode
* pos
= rLine
.getStr();
675 // Variables for the out parameter
677 const sal_Unicode
* pStartPos
;
678 const sal_Unicode
* pEndPos
;
680 // Loop over all the tokens
681 while( getNextToken( pos
, eType
, pStartPos
, pEndPos
) )
// Pointer differences against the start of the line give character offsets
685 pStartPos
- rLine
.getStr(), pEndPos
- rLine
.getStr(), eType
));
// Stores the language, creates a Tokenizer for it and installs the keyword
// table that belongs to the language. The entry count handed to setKeyWords()
// is computed from the array size with sizeof.
// NOTE(review): the switch header and the label of the second case are not
// visible in this view; the second case installs strListSqlKeyWords.
690 SyntaxHighlighter::SyntaxHighlighter(HighlighterLanguage language
):
691 eLanguage(language
), m_tokenizer(new SyntaxHighlighter::Tokenizer(language
))
695 case HIGHLIGHT_BASIC
:
696 m_tokenizer
->setKeyWords( strListBasicKeyWords
,
697 sizeof( strListBasicKeyWords
) / sizeof( char* ));
700 m_tokenizer
->setKeyWords( strListSqlKeyWords
,
701 sizeof( strListSqlKeyWords
) / sizeof( char* ));
// Any other language value is treated as a programming error
704 assert(false); // this cannot happen
// Destructor with an empty body.
// NOTE(review): m_tokenizer is created with new in the constructor and is not
// released here explicitly; presumably the member is a smart pointer that
// frees it. Confirm against the class declaration in the header.
708 SyntaxHighlighter::~SyntaxHighlighter() {}
// Public entry point for tokenizing one line: forwards rLine and portions
// unchanged to the tokenizer's getHighlightPortions().
710 void SyntaxHighlighter::getHighlightPortions(const OUString
& rLine
,
711 /*out*/std::vector
<HighlightPortion
>& portions
) const
713 m_tokenizer
->getHighlightPortions( rLine
, portions
);
// Returns the language this highlighter was created for. The value is read
// from the tokenizer's aLanguage member, which the constructor initialises
// from the same argument as eLanguage.
716 HighlighterLanguage
SyntaxHighlighter::GetLanguage()
718 return m_tokenizer
->aLanguage
;
721 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */