1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
24 #include <rtl/character.hxx>
25 #include <rtl/ustring.hxx>
26 #include <unicode/uchar.h>
27 #include <comphelper/syntaxhighlight.hxx>
28 #include <o3tl/typed_flags_set.hxx>
32 // Flags for character properties
33 enum class CharFlags
{
34 StartIdentifier
= 0x0001,
35 InIdentifier
= 0x0002,
49 template<> struct typed_flags
<CharFlags
> : is_typed_flags
<CharFlags
, 0x03ff> {};
52 // ##########################################################################
53 // ATTENTION: all these words need to be in lower case
54 // ##########################################################################
55 static const char* strListBasicKeyWords
[] = {
189 static const char* strListSqlKeyWords
[] = {
// Comparison callback with the signature bsearch() expects.
//
// arg1 is the search key itself and is read as a NUL-terminated C string.
// arg2 points at one element of the searched array, i.e. at a pointer to a
// C string, and therefore needs one more dereference than the key.
//
// Returns the strcmp() ordering of the key relative to that element:
// negative, zero or positive.
static int compare_strings( const void *arg1, const void *arg2 )
{
    char const * const pKey = static_cast<char const *>(arg1);
    // The elements are never written through, so keep them const all the way
    char const * const pEntry = *static_cast<char const * const *>(arg2);
    return strcmp( pKey, pEntry );
}
266 bool isAlpha(sal_Unicode c
)
268 if (rtl::isAsciiAlpha(c
))
274 class SyntaxHighlighter::Tokenizer
276 // Character information tables
277 CharFlags aCharTypeTab
[256] = {};
279 // Auxiliary function: testing of the character flags
280 bool testCharFlags(sal_Unicode c
, CharFlags nTestFlags
) const;
282 // Get new token, EmptyString == nothing more over there
283 bool getNextToken(std::u16string_view::const_iterator
& pos
, std::u16string_view::const_iterator end
, /*out*/TokenType
& reType
,
284 /*out*/std::u16string_view::const_iterator
& rpStartPos
, /*out*/std::u16string_view::const_iterator
& rpEndPos
) const;
286 const char** ppListKeyWords
;
287 sal_uInt16 nKeyWordCount
;
290 HighlighterLanguage
const aLanguage
;
292 explicit Tokenizer( HighlighterLanguage aLang
);
294 void getHighlightPortions(std::u16string_view rLine
,
295 /*out*/std::vector
<HighlightPortion
>& portions
) const;
296 void setKeyWords( const char** ppKeyWords
, sal_uInt16 nCount
);
299 // Helper function: test character flag
300 bool SyntaxHighlighter::Tokenizer::testCharFlags(sal_Unicode c
, CharFlags nTestFlags
) const
303 if( c
!= 0 && c
<= 255 )
305 bRet
= bool(aCharTypeTab
[c
] & nTestFlags
);
309 bRet
= (( CharFlags::StartIdentifier
| CharFlags::InIdentifier
) & nTestFlags
)
315 void SyntaxHighlighter::Tokenizer::setKeyWords( const char** ppKeyWords
, sal_uInt16 nCount
)
317 ppListKeyWords
= ppKeyWords
;
318 nKeyWordCount
= nCount
;
321 bool SyntaxHighlighter::Tokenizer::getNextToken(std::u16string_view::const_iterator
& pos
, std::u16string_view::const_iterator end
,
322 /*out*/TokenType
& reType
,
323 /*out*/std::u16string_view::const_iterator
& rpStartPos
, /*out*/std::u16string_view::const_iterator
& rpEndPos
) const
325 reType
= TokenType::Unknown
;
332 sal_Unicode c
= *pos
;
335 //*** Go through all possibilities ***
337 if ( testCharFlags( c
, CharFlags::Space
) )
339 while( pos
!= end
&& testCharFlags( *pos
, CharFlags::Space
) )
342 reType
= TokenType::Whitespace
;
346 else if ( testCharFlags( c
, CharFlags::StartIdentifier
) )
348 bool bIdentifierChar
;
353 // Fetch next character
355 bIdentifierChar
= testCharFlags( c
, CharFlags::InIdentifier
);
356 if( bIdentifierChar
)
359 while( bIdentifierChar
);
361 reType
= TokenType::Identifier
;
364 if (ppListKeyWords
!= nullptr)
366 int nCount
= pos
- rpStartPos
;
368 // No keyword if string contains char > 255
369 bool bCanBeKeyword
= true;
370 for( int i
= 0 ; i
< nCount
; i
++ )
372 if( rpStartPos
[i
] > 255 )
374 bCanBeKeyword
= false;
381 std::u16string_view
aKWString(&*rpStartPos
, nCount
);
382 OString aByteStr
= OUStringToOString(aKWString
,
383 RTL_TEXTENCODING_ASCII_US
).toAsciiLowerCase();
384 if ( bsearch( aByteStr
.getStr(), ppListKeyWords
, nKeyWordCount
, sizeof( char* ),
387 reType
= TokenType::Keywords
;
389 if( aByteStr
== "rem" )
391 // Remove all characters until end of line or EOF
396 sal_Unicode cPeek
= *pos
;
397 if ( testCharFlags( cPeek
, CharFlags::EOL
) )
402 reType
= TokenType::Comment
;
410 // Only in BASIC does '\'' start a comment; otherwise it opens a normal string, which is handled in the string branch
411 else if ( testCharFlags( c
, CharFlags::Operator
) || ( (c
== '\'') && (aLanguage
==HighlighterLanguage::Basic
)) )
413 // parameters for SQL view
414 if (((c
==':') || (c
=='?')) && (aLanguage
== HighlighterLanguage::SQL
))
418 bool bIdentifierChar
;
421 // Get next character
425 bIdentifierChar
= isAlpha(c
);
426 if( bIdentifierChar
)
429 while( bIdentifierChar
);
431 reType
= TokenType::Parameter
;
433 else if ((c
=='-') && (aLanguage
== HighlighterLanguage::SQL
))
435 if (pos
!= end
&& *pos
=='-')
437 // Remove all characters until end of line or EOF
438 while( pos
!= end
&& !testCharFlags( *pos
, CharFlags::EOL
) )
442 reType
= TokenType::Comment
;
445 reType
= TokenType::Operator
;
447 else if ((c
=='/') && (aLanguage
== HighlighterLanguage::SQL
))
449 if (pos
!= end
&& *pos
=='/')
451 // Remove all characters until end of line or EOF
452 while( pos
!= end
&& !testCharFlags( *pos
, CharFlags::EOL
) )
456 reType
= TokenType::Comment
;
459 reType
= TokenType::Operator
;
463 // Apostrophe is Basic comment
464 if (( c
== '\'') && (aLanguage
== HighlighterLanguage::Basic
))
466 // Skip all characters until end of input or end of line:
471 if (testCharFlags(c
, CharFlags::EOL
)) {
477 reType
= TokenType::Comment
;
480 // Anything else is a real operator. It can be handled this simply because
481 // only the fact that it is an operator matters, not which one (e.g. +=)
482 if( reType
!= TokenType::Comment
)
484 reType
= TokenType::Operator
;
490 // Object separator? Must be handled before Number
491 else if( c
== '.' && ( pos
== end
|| *pos
< '0' || *pos
> '9' ) )
493 reType
= TokenType::Operator
;
497 else if( testCharFlags( c
, CharFlags::StartNumber
) )
499 reType
= TokenType::Number
;
501 // Number system, 10 = normal, it is changed for Oct/Hex
504 // Is it an Oct or a Hex number?
508 if( pos
!= end
&& (*pos
== 'o' || *pos
== 'O' ))
512 nRadix
= 8; // Octal base
515 while( pos
!= end
&& testCharFlags( *pos
, CharFlags::InOctNumber
) )
519 else if( pos
!= end
&& (*pos
== 'h' || *pos
== 'H' ))
523 nRadix
= 16; // Hexadecimal base
526 while( pos
!= end
&& testCharFlags( *pos
, CharFlags::InHexNumber
) )
531 reType
= TokenType::Operator
;
535 // When it is not Oct or Hex, then it is double
536 if( reType
== TokenType::Number
&& nRadix
== 10 )
538 // Flag if the last character is an exponent
539 bool bAfterExpChar
= false;
542 while( pos
!= end
&& (testCharFlags( *pos
, CharFlags::InNumber
) ||
543 (bAfterExpChar
&& *pos
== '+' ) ||
544 (bAfterExpChar
&& *pos
== '-' ) ))
545 // After exponent +/- are OK, too
548 bAfterExpChar
= ( c
== 'e' || c
== 'E' );
554 else if( testCharFlags( c
, CharFlags::StartString
) )
556 // Remember which character has opened the string
557 sal_Unicode cEndString
= c
;
561 // Read all characters
562 while( pos
== end
|| *pos
!= cEndString
)
564 // Detect EOF before reading next char, so we do not lose EOF
567 // ERROR: unterminated string literal
568 reType
= TokenType::Error
;
572 if( testCharFlags( c
, CharFlags::EOL
) )
574 // ERROR: unterminated string literal
575 reType
= TokenType::Error
;
580 if( reType
!= TokenType::Error
)
583 if( cEndString
== ']' )
584 reType
= TokenType::Identifier
;
586 reType
= TokenType::String
;
591 else if( testCharFlags( c
, CharFlags::EOL
) )
593 // If another EOL character comes, read it
596 sal_Unicode cNext
= *pos
;
597 if( cNext
!= c
&& testCharFlags( cNext
, CharFlags::EOL
) )
601 reType
= TokenType::EOL
;
604 // All other will remain TokenType::Unknown
611 SyntaxHighlighter::Tokenizer::Tokenizer( HighlighterLanguage aLang
): aLanguage(aLang
)
613 // Fill character table
616 // Allowed characters for identifiers
617 CharFlags nHelpMask
= CharFlags::StartIdentifier
| CharFlags::InIdentifier
;
618 for( i
= 'a' ; i
<= 'z' ; i
++ )
619 aCharTypeTab
[i
] |= nHelpMask
;
620 for( i
= 'A' ; i
<= 'Z' ; i
++ )
621 aCharTypeTab
[i
] |= nHelpMask
;
622 aCharTypeTab
[int('_')] |= nHelpMask
;
623 aCharTypeTab
[int('$')] |= nHelpMask
;
625 // Digit (can be identifier and number)
626 nHelpMask
= CharFlags::InIdentifier
| CharFlags::StartNumber
|
627 CharFlags::InNumber
| CharFlags::InHexNumber
;
628 for( i
= '0' ; i
<= '9' ; i
++ )
629 aCharTypeTab
[i
] |= nHelpMask
;
631 // Add e, E, . and & here manually
632 aCharTypeTab
[int('e')] |= CharFlags::InNumber
;
633 aCharTypeTab
[int('E')] |= CharFlags::InNumber
;
634 aCharTypeTab
[int('.')] |= CharFlags::InNumber
| CharFlags::StartNumber
;
635 aCharTypeTab
[int('&')] |= CharFlags::StartNumber
;
638 for( i
= 'a' ; i
<= 'f' ; i
++ )
639 aCharTypeTab
[i
] |= CharFlags::InHexNumber
;
640 for( i
= 'A' ; i
<= 'F' ; i
++ )
641 aCharTypeTab
[i
] |= CharFlags::InHexNumber
;
644 for( i
= '0' ; i
<= '7' ; i
++ )
645 aCharTypeTab
[i
] |= CharFlags::InOctNumber
;
647 // String literal start/end characters
648 aCharTypeTab
[int('\'')] |= CharFlags::StartString
;
649 aCharTypeTab
[int('\"')] |= CharFlags::StartString
;
650 aCharTypeTab
[int('[')] |= CharFlags::StartString
;
651 aCharTypeTab
[int('`')] |= CharFlags::StartString
;
653 // Operator characters
654 aCharTypeTab
[int('!')] |= CharFlags::Operator
;
655 aCharTypeTab
[int('%')] |= CharFlags::Operator
;
656 // aCharTypeTab[(int)'&'] |= CharFlags::Operator; Removed because of #i14140
657 aCharTypeTab
[int('(')] |= CharFlags::Operator
;
658 aCharTypeTab
[int(')')] |= CharFlags::Operator
;
659 aCharTypeTab
[int('*')] |= CharFlags::Operator
;
660 aCharTypeTab
[int('+')] |= CharFlags::Operator
;
661 aCharTypeTab
[int(',')] |= CharFlags::Operator
;
662 aCharTypeTab
[int('-')] |= CharFlags::Operator
;
663 aCharTypeTab
[int('/')] |= CharFlags::Operator
;
664 aCharTypeTab
[int(':')] |= CharFlags::Operator
;
665 aCharTypeTab
[int('<')] |= CharFlags::Operator
;
666 aCharTypeTab
[int('=')] |= CharFlags::Operator
;
667 aCharTypeTab
[int('>')] |= CharFlags::Operator
;
668 aCharTypeTab
[int('?')] |= CharFlags::Operator
;
669 aCharTypeTab
[int('^')] |= CharFlags::Operator
;
670 aCharTypeTab
[int('|')] |= CharFlags::Operator
;
671 aCharTypeTab
[int('~')] |= CharFlags::Operator
;
672 aCharTypeTab
[int('{')] |= CharFlags::Operator
;
673 aCharTypeTab
[int('}')] |= CharFlags::Operator
;
674 // aCharTypeTab[(int)'['] |= CharFlags::Operator; Removed because of #i17826
675 aCharTypeTab
[int(']')] |= CharFlags::Operator
;
676 aCharTypeTab
[int(';')] |= CharFlags::Operator
;
679 aCharTypeTab
[int(' ') ] |= CharFlags::Space
;
680 aCharTypeTab
[int('\t')] |= CharFlags::Space
;
682 // End of line characters
683 aCharTypeTab
[int('\r')] |= CharFlags::EOL
;
684 aCharTypeTab
[int('\n')] |= CharFlags::EOL
;
686 ppListKeyWords
= nullptr;
690 void SyntaxHighlighter::Tokenizer::getHighlightPortions(std::u16string_view rLine
,
691 /*out*/std::vector
<HighlightPortion
>& portions
) const
693 // Set the position to the beginning of the source string
694 auto pos
= rLine
.begin();
696 // Variables for the out parameter
698 std::u16string_view::const_iterator pStartPos
;
699 std::u16string_view::const_iterator pEndPos
;
701 // Loop over all the tokens
702 while( getNextToken( pos
, rLine
.end(), eType
, pStartPos
, pEndPos
) )
704 portions
.emplace_back(
705 pStartPos
- rLine
.begin(), pEndPos
- rLine
.begin(), eType
);
710 SyntaxHighlighter::SyntaxHighlighter(HighlighterLanguage language
):
711 m_tokenizer(new SyntaxHighlighter::Tokenizer(language
))
715 case HighlighterLanguage::Basic
:
716 m_tokenizer
->setKeyWords( strListBasicKeyWords
,
717 std::size( strListBasicKeyWords
));
719 case HighlighterLanguage::SQL
:
720 m_tokenizer
->setKeyWords( strListSqlKeyWords
,
721 std::size( strListSqlKeyWords
));
724 assert(false); // this cannot happen
728 SyntaxHighlighter::~SyntaxHighlighter() {}
730 void SyntaxHighlighter::getHighlightPortions(std::u16string_view rLine
,
731 /*out*/std::vector
<HighlightPortion
>& portions
) const
733 m_tokenizer
->getHighlightPortions( rLine
, portions
);
736 HighlighterLanguage
SyntaxHighlighter::GetLanguage() const
738 return m_tokenizer
->aLanguage
;
741 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */