1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
24 #include <rtl/character.hxx>
25 #include <unicode/uchar.h>
26 #include <comphelper/syntaxhighlight.hxx>
27 #include <o3tl/typed_flags_set.hxx>
31 // Flags for character properties
// Each enumerator is a separate bit so that one table entry can carry several
// properties at once; the flags are combined and tested with | and &.
32 enum class CharFlags
{
// Character may begin an identifier.
33 StartIdentifier
= 0x0001,
// Character may appear inside an identifier after its first character.
34 InIdentifier
= 0x0002,
// Registers CharFlags with the typed-flags helper from
// <o3tl/typed_flags_set.hxx>; the tokenizer relies on |, & and |= being
// usable on this scoped enum.
// NOTE(review): 0x03ff is presumably the mask of all valid flag bits — confirm
// it still covers every enumerator whenever a flag is added.
48 template<> struct typed_flags
<CharFlags
> : is_typed_flags
<CharFlags
, 0x03ff> {};
51 // ##########################################################################
52 // ATTENTION: all these words need to be in lower case
53 // ##########################################################################
// Keyword table installed for HighlighterLanguage::Basic by the
// SyntaxHighlighter constructor. The tokenizer lower-cases an identifier
// before looking it up, hence the lower-case requirement above. The lookup
// uses bsearch(), so the entries must also be kept in sorted order.
54 static const char* strListBasicKeyWords
[] = {
// Keyword table installed for HighlighterLanguage::SQL by the
// SyntaxHighlighter constructor. It is searched with bsearch() after the
// candidate identifier has been lower-cased, so entries must be lower case
// and kept in sorted order.
188 static const char* strListSqlKeyWords
[] = {
// qsort/bsearch-style comparison callback with asymmetric arguments:
//   arg1 is used directly as a C string (the search key),
//   arg2 is a pointer to a table element, i.e. a pointer to a char pointer,
//        and is dereferenced once before comparing.
// Returns the strcmp() ordering of the two strings.
// NOTE(review): presumably this is the comparator handed to the bsearch()
// call in getNextToken — confirm at the call site.
256 static int compare_strings( const void *arg1
, const void *arg2
)
258 return strcmp( static_cast<char const *>(arg1
), *static_cast<char * const *>(arg2
) );
// Tells whether c is an alphabetic character. ASCII letters are recognised
// first through rtl::isAsciiAlpha().
// NOTE(review): the handling of non-ASCII characters is not visible here;
// <unicode/uchar.h> is included, so presumably ICU decides — confirm.
265 bool isAlpha(sal_Unicode c
)
267 if (rtl::isAsciiAlpha(c
))
// Splits a line of source text into typed tokens for one highlighter
// language. Character classification is table driven (aCharTypeTab), and
// keyword recognition uses an externally supplied keyword table.
273 class SyntaxHighlighter::Tokenizer
275 // Character information tables
// One entry per character code 0..255; value-initialised here and filled in
// by the constructor.
276 CharFlags aCharTypeTab
[256] = {};
278 // Auxiliary function: testing of the character flags
279 bool testCharFlags(sal_Unicode c
, CharFlags nTestFlags
) const;
281 // Get new token, EmptyString == nothing more over there
// The bool result is what terminates the token loop in getHighlightPortions().
282 bool getNextToken(std::u16string_view::const_iterator
& pos
, std::u16string_view::const_iterator end
, /*out*/TokenType
& reType
,
283 /*out*/std::u16string_view::const_iterator
& rpStartPos
, /*out*/std::u16string_view::const_iterator
& rpEndPos
) const;
// Keyword table and its element count as passed to setKeyWords(); the
// pointer is nullptr until setKeyWords() has been called.
285 const char** ppListKeyWords
;
286 sal_uInt16 nKeyWordCount
;
// Language this tokenizer was created for; fixed at construction and read
// directly by SyntaxHighlighter::GetLanguage().
289 HighlighterLanguage
const aLanguage
;
291 explicit Tokenizer( HighlighterLanguage aLang
);
// Tokenizes rLine and adds one HighlightPortion per token to portions.
293 void getHighlightPortions(std::u16string_view rLine
,
294 /*out*/std::vector
<HighlightPortion
>& portions
) const;
// Installs the keyword table used for keyword recognition.
295 void setKeyWords( const char** ppKeyWords
, sal_uInt16 nCount
);
298 // Helper function: test character flag
// Reports whether character c carries at least one of the flags in
// nTestFlags. Characters in the range 1..255 are answered from aCharTypeTab.
// NOTE(review): the other visible branch only looks at the identifier flags;
// its guarding condition and the remainder of its expression are not visible
// here — confirm how characters above 255 are classified.
299 bool SyntaxHighlighter::Tokenizer::testCharFlags(sal_Unicode c
, CharFlags nTestFlags
) const
// c == 0 is deliberately excluded from the table lookup.
302 if( c
!= 0 && c
<= 255 )
304 bRet
= bool(aCharTypeTab
[c
] & nTestFlags
);
308 bRet
= (( CharFlags::StartIdentifier
| CharFlags::InIdentifier
) & nTestFlags
)
// Installs the keyword table used by getNextToken().
//   ppKeyWords  array of C strings; only the pointer is stored, so the array
//               must outlive this tokenizer. It is searched with bsearch()
//               against lower-cased input, so it must be sorted and lower case.
//   nCount      number of entries in ppKeyWords.
314 void SyntaxHighlighter::Tokenizer::setKeyWords( const char** ppKeyWords
, sal_uInt16 nCount
)
316 ppListKeyWords
= ppKeyWords
;
317 nKeyWordCount
= nCount
;
// Scans one token beginning at pos and classifies it.
//   pos         current scan position (passed by reference)
//   end         end of the text being scanned
//   reType      out: token classification, preset to TokenType::Unknown
//   rpStartPos  out: token start
//   rpEndPos    out: token end
// The bool result drives the token loop in getHighlightPortions().
// NOTE(review): the statements that advance pos, assign rpStartPos/rpEndPos
// and return are not visible here; the parameter notes above are inferred
// from the names and the /*out*/ markers — confirm against the full body.
320 bool SyntaxHighlighter::Tokenizer::getNextToken(std::u16string_view::const_iterator
& pos
, std::u16string_view::const_iterator end
,
321 /*out*/TokenType
& reType
,
322 /*out*/std::u16string_view::const_iterator
& rpStartPos
, /*out*/std::u16string_view::const_iterator
& rpEndPos
) const
324 reType
= TokenType::Unknown
;
// The first character of the token selects the branch that handles it.
331 sal_Unicode c
= *pos
;
334 //*** Go through all possibilities ***
// --- Whitespace: a run of characters flagged as Space ---
336 if ( testCharFlags( c
, CharFlags::Space
) )
338 while( pos
!= end
&& testCharFlags( *pos
, CharFlags::Space
) )
341 reType
= TokenType::Whitespace
;
// --- Identifier, possibly upgraded to keyword or comment below ---
345 else if ( testCharFlags( c
, CharFlags::StartIdentifier
) )
347 bool bIdentifierChar
;
352 // Fetch next character
354 bIdentifierChar
= testCharFlags( c
, CharFlags::InIdentifier
);
355 if( bIdentifierChar
)
358 while( bIdentifierChar
);
360 reType
= TokenType::Identifier
;
// Keyword lookup is only attempted once a keyword table has been installed.
363 if (ppListKeyWords
!= nullptr)
// Length of the identifier just scanned.
365 int nCount
= pos
- rpStartPos
;
367 // No keyword if string contains char > 255
368 bool bCanBeKeyword
= true;
369 for( int i
= 0 ; i
< nCount
; i
++ )
371 if( rpStartPos
[i
] > 255 )
373 bCanBeKeyword
= false;
// Convert to a lower-case byte string so that the lookup is case-insensitive
// against the lower-case keyword table.
380 std::u16string_view
aKWString(&*rpStartPos
, nCount
);
381 OString aByteStr
= OUStringToOString(aKWString
,
382 RTL_TEXTENCODING_ASCII_US
).toAsciiLowerCase();
383 if ( bsearch( aByteStr
.getStr(), ppListKeyWords
, nKeyWordCount
, sizeof( char* ),
386 reType
= TokenType::Keywords
;
// The keyword "rem" turns the rest of the line into a comment.
388 if( aByteStr
== "rem" )
390 // Remove all characters until end of line or EOF
395 sal_Unicode cPeek
= *pos
;
396 if ( testCharFlags( cPeek
, CharFlags::EOL
) )
401 reType
= TokenType::Comment
;
409 // only for BASIC '\'' should be a comment, otherwise it is a normal string and handled there
// --- Operators, SQL parameters and comments introduced by an operator char ---
410 else if ( testCharFlags( c
, CharFlags::Operator
) || ( (c
== '\'') && (aLanguage
==HighlighterLanguage::Basic
)) )
412 // parameters for SQL view
413 if (((c
==':') || (c
=='?')) && (aLanguage
== HighlighterLanguage::SQL
))
417 bool bIdentifierChar
;
420 // Get next character
// Unlike identifiers, the parameter name is scanned with isAlpha() rather
// than with the character table.
424 bIdentifierChar
= isAlpha(c
);
425 if( bIdentifierChar
)
428 while( bIdentifierChar
);
430 reType
= TokenType::Parameter
;
// SQL: '-' followed by another '-' starts a comment up to the end of the
// line; otherwise it is an operator.
432 else if ((c
=='-') && (aLanguage
== HighlighterLanguage::SQL
))
434 if (pos
!= end
&& *pos
=='-')
436 // Remove all characters until end of line or EOF
437 while( pos
!= end
&& !testCharFlags( *pos
, CharFlags::EOL
) )
441 reType
= TokenType::Comment
;
444 reType
= TokenType::Operator
;
// SQL: '/' followed by another '/' is handled the same way as "--".
446 else if ((c
=='/') && (aLanguage
== HighlighterLanguage::SQL
))
448 if (pos
!= end
&& *pos
=='/')
450 // Remove all characters until end of line or EOF
451 while( pos
!= end
&& !testCharFlags( *pos
, CharFlags::EOL
) )
455 reType
= TokenType::Comment
;
458 reType
= TokenType::Operator
;
462 // Apostrophe is Basic comment
463 if (( c
== '\'') && (aLanguage
== HighlighterLanguage::Basic
))
465 // Skip all characters until end of input or end of line:
470 if (testCharFlags(c
, CharFlags::EOL
)) {
476 reType
= TokenType::Comment
;
479 // The real operator; can be easily used since not the actual
480 // operator (e.g. +=) is concerned, but the fact that it is one
481 if( reType
!= TokenType::Comment
)
483 reType
= TokenType::Operator
;
489 // Object separator? Must be handled before Number
// A '.' that is not followed by a decimal digit is an operator, not the
// start of a number.
490 else if( c
== '.' && ( pos
== end
|| *pos
< '0' || *pos
> '9' ) )
492 reType
= TokenType::Operator
;
// --- Number literal ---
496 else if( testCharFlags( c
, CharFlags::StartNumber
) )
498 reType
= TokenType::Number
;
500 // Number system, 10 = normal, it is changed for Oct/Hex
503 // Is it an Oct or a Hex number?
// 'o' / 'O' selects octal digits.
507 if( pos
!= end
&& (*pos
== 'o' || *pos
== 'O' ))
511 nRadix
= 8; // Octal base
514 while( pos
!= end
&& testCharFlags( *pos
, CharFlags::InOctNumber
) )
// 'h' / 'H' selects hexadecimal digits.
518 else if( pos
!= end
&& (*pos
== 'h' || *pos
== 'H' ))
522 nRadix
= 16; // Hexadecimal base
525 while( pos
!= end
&& testCharFlags( *pos
, CharFlags::InHexNumber
) )
// Neither an octal nor a hexadecimal marker followed: the token is
// reclassified as an operator.
530 reType
= TokenType::Operator
;
534 // When it is not Oct or Hex, then it is double
535 if( reType
== TokenType::Number
&& nRadix
== 10 )
537 // Flag if the last character is an exponent
538 bool bAfterExpChar
= false;
541 while( pos
!= end
&& (testCharFlags( *pos
, CharFlags::InNumber
) ||
542 (bAfterExpChar
&& *pos
== '+' ) ||
543 (bAfterExpChar
&& *pos
== '-' ) ))
544 // After exponent +/- are OK, too
547 bAfterExpChar
= ( c
== 'e' || c
== 'E' );
// --- String literal (or a bracketed name, see below) ---
553 else if( testCharFlags( c
, CharFlags::StartString
) )
555 // Remember which character has opened the string
556 sal_Unicode cEndString
= c
;
560 // Read all characters
// "pos == end" keeps the loop running at end of input so that the
// end-of-input check inside the loop can flag the literal as an error.
561 while( pos
== end
|| *pos
!= cEndString
)
563 // Detect EOF before reading next char, so we do not lose EOF
566 // ERROR: unterminated string literal
567 reType
= TokenType::Error
;
// A line end inside the literal is an error as well.
571 if( testCharFlags( c
, CharFlags::EOL
) )
573 // ERROR: unterminated string literal
574 reType
= TokenType::Error
;
579 if( reType
!= TokenType::Error
)
// A literal terminated by ']' is reported as an identifier rather than as
// a string.
582 if( cEndString
== ']' )
583 reType
= TokenType::Identifier
;
585 reType
= TokenType::String
;
// --- Line end ---
590 else if( testCharFlags( c
, CharFlags::EOL
) )
592 // If another EOL character comes, read it
// Only a *different* EOL character is considered, so a CR/LF pair is
// treated together while two identical line ends are not.
595 sal_Unicode cNext
= *pos
;
596 if( cNext
!= c
&& testCharFlags( cNext
, CharFlags::EOL
) )
600 reType
= TokenType::EOL
;
603 // All other will remain TokenType::Unknown
// Creates a tokenizer for language aLang and fills the character
// classification table aCharTypeTab. No keyword table is installed yet:
// ppListKeyWords starts as nullptr until setKeyWords() is called.
610 SyntaxHighlighter::Tokenizer::Tokenizer( HighlighterLanguage aLang
): aLanguage(aLang
)
612 // Fill character table
615 // Allowed characters for identifiers
// ASCII letters, '_' and '$' may both start and continue an identifier.
616 CharFlags nHelpMask
= CharFlags::StartIdentifier
| CharFlags::InIdentifier
;
617 for( i
= 'a' ; i
<= 'z' ; i
++ )
618 aCharTypeTab
[i
] |= nHelpMask
;
619 for( i
= 'A' ; i
<= 'Z' ; i
++ )
620 aCharTypeTab
[i
] |= nHelpMask
;
621 aCharTypeTab
[int('_')] |= nHelpMask
;
622 aCharTypeTab
[int('$')] |= nHelpMask
;
624 // Digit (can be identifier and number)
// Digits may continue, but not start, an identifier.
625 nHelpMask
= CharFlags::InIdentifier
| CharFlags::StartNumber
|
626 CharFlags::InNumber
| CharFlags::InHexNumber
;
627 for( i
= '0' ; i
<= '9' ; i
++ )
628 aCharTypeTab
[i
] |= nHelpMask
;
630 // Add e, E, . and & here manually
// 'e'/'E' serve as exponent markers inside a number, '.' may start or
// continue one, and '&' is the prefix checked for octal/hex notation in
// getNextToken().
631 aCharTypeTab
[int('e')] |= CharFlags::InNumber
;
632 aCharTypeTab
[int('E')] |= CharFlags::InNumber
;
633 aCharTypeTab
[int('.')] |= CharFlags::InNumber
| CharFlags::StartNumber
;
634 aCharTypeTab
[int('&')] |= CharFlags::StartNumber
;
// Letters a-f / A-F additionally count as hexadecimal digits.
637 for( i
= 'a' ; i
<= 'f' ; i
++ )
638 aCharTypeTab
[i
] |= CharFlags::InHexNumber
;
639 for( i
= 'A' ; i
<= 'F' ; i
++ )
640 aCharTypeTab
[i
] |= CharFlags::InHexNumber
;
// Digits 0-7 additionally count as octal digits.
643 for( i
= '0' ; i
<= '7' ; i
++ )
644 aCharTypeTab
[i
] |= CharFlags::InOctNumber
;
646 // String literal start/end characters
// For Basic, getNextToken() treats '\'' as a comment start before the string
// branch is reached.
647 aCharTypeTab
[int('\'')] |= CharFlags::StartString
;
648 aCharTypeTab
[int('\"')] |= CharFlags::StartString
;
649 aCharTypeTab
[int('[')] |= CharFlags::StartString
;
650 aCharTypeTab
[int('`')] |= CharFlags::StartString
;
652 // Operator characters
653 aCharTypeTab
[int('!')] |= CharFlags::Operator
;
654 aCharTypeTab
[int('%')] |= CharFlags::Operator
;
655 // aCharTypeTab[(int)'&'] |= CharFlags::Operator; Removed because of #i14140
656 aCharTypeTab
[int('(')] |= CharFlags::Operator
;
657 aCharTypeTab
[int(')')] |= CharFlags::Operator
;
658 aCharTypeTab
[int('*')] |= CharFlags::Operator
;
659 aCharTypeTab
[int('+')] |= CharFlags::Operator
;
660 aCharTypeTab
[int(',')] |= CharFlags::Operator
;
661 aCharTypeTab
[int('-')] |= CharFlags::Operator
;
662 aCharTypeTab
[int('/')] |= CharFlags::Operator
;
663 aCharTypeTab
[int(':')] |= CharFlags::Operator
;
664 aCharTypeTab
[int('<')] |= CharFlags::Operator
;
665 aCharTypeTab
[int('=')] |= CharFlags::Operator
;
666 aCharTypeTab
[int('>')] |= CharFlags::Operator
;
667 aCharTypeTab
[int('?')] |= CharFlags::Operator
;
668 aCharTypeTab
[int('^')] |= CharFlags::Operator
;
669 aCharTypeTab
[int('|')] |= CharFlags::Operator
;
670 aCharTypeTab
[int('~')] |= CharFlags::Operator
;
671 aCharTypeTab
[int('{')] |= CharFlags::Operator
;
672 aCharTypeTab
[int('}')] |= CharFlags::Operator
;
673 // aCharTypeTab[(int)'['] |= CharFlags::Operator; Removed because of #i17826
674 aCharTypeTab
[int(']')] |= CharFlags::Operator
;
675 aCharTypeTab
[int(';')] |= CharFlags::Operator
;
// Blank and tab are the only characters classified as whitespace.
678 aCharTypeTab
[int(' ') ] |= CharFlags::Space
;
679 aCharTypeTab
[int('\t')] |= CharFlags::Space
;
681 // End of line characters
682 aCharTypeTab
[int('\r')] |= CharFlags::EOL
;
683 aCharTypeTab
[int('\n')] |= CharFlags::EOL
;
// No keyword table yet; getNextToken() skips keyword lookup while this is
// nullptr.
685 ppListKeyWords
= nullptr;
// Tokenizes rLine from its beginning and adds one HighlightPortion per token
// to portions. Each portion is constructed from the token's start offset, its
// end offset (both relative to rLine.begin()) and the token type.
689 void SyntaxHighlighter::Tokenizer::getHighlightPortions(std::u16string_view rLine
,
690 /*out*/std::vector
<HighlightPortion
>& portions
) const
692 // Set the position to the beginning of the source string
693 auto pos
= rLine
.begin();
695 // Variables for the out parameter
697 std::u16string_view::const_iterator pStartPos
;
698 std::u16string_view::const_iterator pEndPos
;
700 // Loop over all the tokens
// The loop ends as soon as getNextToken() returns false.
701 while( getNextToken( pos
, rLine
.end(), eType
, pStartPos
, pEndPos
) )
// Convert the iterators into offsets from the start of the line.
703 portions
.emplace_back(
704 pStartPos
- rLine
.begin(), pEndPos
- rLine
.begin(), eType
);
// Creates the highlighter for the given language: allocates the tokenizer
// and installs the keyword table that belongs to the language.
709 SyntaxHighlighter::SyntaxHighlighter(HighlighterLanguage language
):
710 m_tokenizer(new SyntaxHighlighter::Tokenizer(language
))
// std::size() yields the element count of the static keyword array.
714 case HighlighterLanguage::Basic
:
715 m_tokenizer
->setKeyWords( strListBasicKeyWords
,
716 std::size( strListBasicKeyWords
));
718 case HighlighterLanguage::SQL
:
719 m_tokenizer
->setKeyWords( strListSqlKeyWords
,
720 std::size( strListSqlKeyWords
));
// Any other language value is treated as a programming error.
723 assert(false); // this cannot happen
// Empty destructor, defined in this file where Tokenizer is a complete type.
// NOTE(review): presumably m_tokenizer is a smart pointer that needs the
// complete type to delete the tokenizer — confirm in the header.
727 SyntaxHighlighter::~SyntaxHighlighter() {}
// Public entry point: forwards to the tokenizer, which splits rLine into
// tokens and adds one HighlightPortion per token to portions.
729 void SyntaxHighlighter::getHighlightPortions(std::u16string_view rLine
,
730 /*out*/std::vector
<HighlightPortion
>& portions
) const
732 m_tokenizer
->getHighlightPortions( rLine
, portions
);
// Returns the language this highlighter was constructed for, as stored in
// the tokenizer.
735 HighlighterLanguage
SyntaxHighlighter::GetLanguage() const
737 return m_tokenizer
->aLanguage
;
740 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */