2 * Copyright 2006-2014 Haiku, Inc. All Rights Reserved.
3 * Distributed under the terms of the MIT License.
6 * Stephan Aßmus <superstippi@gmx.de>
7 * Rene Gollent <rene@gollent.com>
8 * John Scipione <jscipione@gmail.com>
9 * Ingo Weinhold <bonefish@cs.tu-berlin.de>
13 #include "CLanguageTokenizer.h"
20 using CLanguage::ParseException
;
21 using CLanguage::Token
;
22 using CLanguage::Tokenizer
;
25 // #pragma mark - Token
// Copy constructor: builds this token from other.
// Of the member initializers only position(other.position) is visible in
// this excerpt; the remaining members are presumably copied the same way.
38 Token::Token(const Token
& other
)
43 position(other
.position
)
// Constructs a token from raw text.
//   string, length - passed together to the initializer of the string member
//   position       - position argument of the token
//   type           - token type argument
// NOTE(review): the initializers that consume position and type are not
// visible in this excerpt - confirm against the full file.
48 Token::Token(const char* string
, int32 length
, int32 position
, int32 type
)
50 string(string
, length
),
// Copy assignment: takes over the members of other.
// Visible here are the copies of string and position; any further member
// copies and the return statement are not part of this excerpt.
59 Token::operator=(const Token
& other
)
61 string
= other
.string
;
64 position
= other
.position
;
69 // #pragma mark - Tokenizer
// Default constructor; takes no arguments.
// NOTE(review): the initializer list and body are not visible in this excerpt.
72 Tokenizer::Tokenizer()
// Prepares the tokenizer for a new input string.
// The read cursor fCurrentChar is set to the start of the buffer of fString
// and fCurrentToken is reset to a default-constructed Token.
// NOTE(review): the statement that stores the string argument in fString is
// not visible in this excerpt; presumably it precedes the lines below.
83 Tokenizer::SetTo(const char* string
)
86 fCurrentChar
= fString
.String();
87 fCurrentToken
= Token();
// Scans the next token beginning at fCurrentChar, stores it in fCurrentToken
// and returns fCurrentToken.
//
// The visible branches handle, in this order: numeric constants (with a
// hand-off to _ParseHexOperand() for a leading 0x), identifiers, quoted
// literals, operators (via _ParseOperator()) and single punctuation
// characters.
//
// Throws ParseException for a malformed numeric constant and for a character
// that starts no known token.
//
// NOTE(review): isspace(), isdigit() and isalpha() are called with a plain
// char taken from the input. Passing a negative value other than EOF to these
// functions is undefined behavior, so bytes above 0x7f may be a problem on
// platforms where char is signed - confirm whether such input can occur.
// NOTE(review): several original lines (braces, loop bodies, case labels,
// some conditions) are missing from this excerpt; comments below only
// describe what the remaining lines show.
93 Tokenizer::NextToken()
// Special case for a current token of type TOKEN_END_OF_LINE; the return
// below follows this test (lines in between are not visible here).
95 if (fCurrentToken
.type
== TOKEN_END_OF_LINE
)
100 return fCurrentToken
;
// Loop over leading whitespace, stopping at the terminating NUL.
// The loop body is not visible; presumably it advances fCurrentChar.
103 while (*fCurrentChar
!= 0 && isspace(*fCurrentChar
))
// End of input: the current token becomes an empty token located at the
// current position (the type argument is on a line not visible here).
106 if (*fCurrentChar
== 0) {
107 return fCurrentToken
= Token("", 0, _CurrentPos(),
// Remember whether the token starts with a period; needed further down to
// tell a fractional constant from the member access operator.
111 bool decimal
= *fCurrentChar
== '.';
// --- numeric constant ---
113 if (decimal
|| isdigit(*fCurrentChar
)) {
// Hexadecimal constants are parsed separately. Only a lowercase x is
// recognized as prefix here.
114 if (*fCurrentChar
== '0' && fCurrentChar
[1] == 'x')
115 return _ParseHexOperand();
119 const char* begin
= fCurrentChar
;
121 // optional digits before the decimal point
// Each digit is appended to temp (declared on a line not visible here).
122 while (isdigit(*fCurrentChar
)) {
123 temp
<< *fCurrentChar
;
127 // optional post decimal part
128 // (required if there are no digits before the decimal)
129 if (*fCurrentChar
== '.') {
134 // optional post decimal digits
135 while (isdigit(*fCurrentChar
)) {
136 temp
<< *fCurrentChar
;
// Number of input characters consumed for this constant.
141 int32 length
= fCurrentChar
- begin
;
// A lone period is not a number: rewind to begin and retry it as an
// operator.
142 if (length
== 1 && decimal
) {
143 // check for . operator
144 fCurrentChar
= begin
;
145 if (!_ParseOperator())
146 throw ParseException("unexpected character", _CurrentPos());
148 return fCurrentToken
;
// Validate the collected text by scanning it as a double that must be
// followed by an ampersand and a trailing string.
// NOTE(review): the conversion for the trailing string has no field width,
// so t must be large enough for whatever follows the ampersand in test.
// The declarations of test, value and t are not visible here - confirm.
155 int32 matches
= sscanf(test
.String(), "%lf&%s", &value
, t
);
// Raised for a malformed constant; the condition on matches that guards this
// throw is not visible in this excerpt. The reported position is the start
// of the constant.
157 throw ParseException("error in constant", _CurrentPos() - length
);
// Build the constant token from the consumed input range.
159 fCurrentToken
= Token(begin
, length
, _CurrentPos() - length
,
// The token value is stored either as the scanned double ...
162 fCurrentToken
.value
.SetTo(value
);
// ... or as an int64 converted from the decimal digits collected in temp.
// The condition that selects between the two is not visible here.
164 fCurrentToken
.value
.SetTo((int64
)strtoll(temp
.String(), NULL
, 10));
// --- identifier: a letter or underscore followed by letters, digits and
// underscores ---
165 } else if (isalpha(*fCurrentChar
) || *fCurrentChar
== '_') {
166 const char* begin
= fCurrentChar
;
167 while (*fCurrentChar
!= 0 && (isalpha(*fCurrentChar
)
168 || isdigit(*fCurrentChar
) || *fCurrentChar
== '_')) {
171 int32 length
= fCurrentChar
- begin
;
172 fCurrentToken
= Token(begin
, length
, _CurrentPos() - length
,
// --- quoted literal, opened by a double or a single quote ---
174 } else if (*fCurrentChar
== '"' || *fCurrentChar
== '\'') {
175 bool terminatorFound
= false;
// begin stays on the opening quote; the cursor moves to the first character
// after it.
176 const char* begin
= fCurrentChar
++;
177 while (*fCurrentChar
!= 0) {
// Backslash escape handling.
// NOTE(review): the post-increment yields the backslash itself, so the test
// below is always true. If the statement it guards (not visible here)
// advances the cursor once more, a backslash that is the last character of
// the input would move fCurrentChar past the terminating NUL. A
// pre-increment was probably intended - confirm against the full file.
178 if (*fCurrentChar
== '\\') {
179 if (*(fCurrentChar
++) != 0)
// A character equal to the opening quote terminates the literal.
181 } else if (*(fCurrentChar
++) == *begin
) {
182 terminatorFound
= true;
186 int32 tokenType
= TOKEN_STRING_LITERAL
;
// Without a terminator the token degrades to a bare quote token and the
// cursor is reset to just behind the opening quote.
187 if (!terminatorFound
) {
188 tokenType
= *begin
== '"' ? TOKEN_DOUBLE_QUOTE
189 : TOKEN_SINGLE_QUOTE
;
190 fCurrentChar
= begin
+ 1;
193 int32 length
= fCurrentChar
- begin
;
194 fCurrentToken
= Token(begin
, length
, _CurrentPos() - length
,
// --- everything else: try an operator first, then single punctuation
// characters. The case labels of the switch are not visible in this
// excerpt, only the token types they assign. ---
197 if (!_ParseOperator()) {
198 int32 type
= TOKEN_NONE
;
199 switch (*fCurrentChar
) {
201 type
= TOKEN_END_OF_LINE
;
205 type
= TOKEN_OPENING_PAREN
;
208 type
= TOKEN_CLOSING_PAREN
;
212 type
= TOKEN_OPENING_SQUARE_BRACKET
;
215 type
= TOKEN_CLOSING_SQUARE_BRACKET
;
219 type
= TOKEN_OPENING_CURLY_BRACE
;
222 type
= TOKEN_CLOSING_CURLY_BRACE
;
226 type
= TOKEN_BACKSLASH
;
234 type
= TOKEN_SEMICOLON
;
// Raised for a character that matched none of the cases (the case or
// default label is not visible here).
250 throw ParseException("unexpected character",
// Single-character token at the current position.
253 fCurrentToken
= Token(fCurrentChar
, 1, _CurrentPos(),
259 return fCurrentToken
;
// Tries to read an operator token at fCurrentChar.
//
// The token type is selected by a switch on the current character; operators
// made of two characters are recognized by looking at the following
// character with _Peek(). At the end fCurrentToken is set to a token of the
// selected type covering length characters, and fCurrentChar is advanced by
// length.
//
// Callers use the result in a boolean test (see NextToken()).
// NOTE(review): the case labels, the declaration and updates of length, the
// handling of type == TOKEN_NONE and the return statements are not visible
// in this excerpt, so the mapping from characters to token types cannot be
// documented from here.
264 Tokenizer::_ParseOperator()
266 int32 type
= TOKEN_NONE
;
268 switch (*fCurrentChar
) {
// Followed by a closing angle bracket: member access through a pointer.
275 if (_Peek() == '>') {
276 type
= TOKEN_MEMBER_PTR
;
// Comment delimiters are reported as tokens of their own.
287 type
= TOKEN_END_COMMENT_BLOCK
;
300 type
= TOKEN_BEGIN_COMMENT_BLOCK
;
304 type
= TOKEN_INLINE_COMMENT
;
320 type
= TOKEN_BITWISE_XOR
;
// A doubled ampersand is the logical operator, otherwise the bitwise one.
325 if (_Peek() == '&') {
326 type
= TOKEN_LOGICAL_AND
;
329 type
= TOKEN_BITWISE_AND
;
// Same scheme for the pipe character.
335 if (_Peek() == '|') {
336 type
= TOKEN_LOGICAL_OR
;
339 type
= TOKEN_BITWISE_OR
;
// Operators that may be followed by an equals sign; the token types chosen
// for the two-character forms are on lines not visible here.
345 if (_Peek() == '=') {
349 type
= TOKEN_LOGICAL_NOT
;
355 if (_Peek() == '=') {
365 if (_Peek() == '=') {
375 if (_Peek() == '=') {
385 type
= TOKEN_BITWISE_NOT
;
391 type
= TOKEN_CONDITION
;
// TOKEN_MEMBER_PTR is assigned a second time here, for a different case than
// the one near the top of the switch.
396 type
= TOKEN_MEMBER_PTR
;
// Publish the operator token and consume its characters.
407 fCurrentToken
= Token(fCurrentChar
, length
, _CurrentPos(), type
);
408 fCurrentChar
+= length
;
// NOTE(review): only the signature is visible in this excerpt. Judging by the
// name this presumably moves the read cursor back to the start of the
// current token - confirm against the full file.
415 Tokenizer::RewindToken()
// Returns the character following the current one without moving the cursor.
// The lookahead is only performed while the current position is before the
// end of fString.
// NOTE(review): the value returned when the cursor is already at the end is
// on a line not visible in this excerpt.
422 Tokenizer::_Peek() const
424 if (_CurrentPos() < fString
.Length())
425 return *(fCurrentChar
+ 1);
// Returns whether c is a hexadecimal digit: a decimal digit according to
// isdigit(), or a letter in the range a-f or A-F.
// NOTE(review): c is a plain char; a negative value passed to isdigit() is
// undefined behavior, so input bytes above 0x7f may need a cast to
// unsigned char - confirm whether such input can reach this function.
432 Tokenizer::_IsHexDigit(char c
)
434 return isdigit(c
) || (c
>= 'a' && c
<= 'f') || (c
>= 'A' && c
<= 'F');
// Parses a hexadecimal constant, stores it in fCurrentToken and returns
// fCurrentToken. NextToken() calls this when the input at fCurrentChar
// starts with 0x.
//
// Throws ParseException when no hex digit is found where one is required.
//
// NOTE(review): the statements that step over the 0x prefix, the body of the
// digit loop, the type argument of the Token and the condition choosing
// between the 32-bit and the 64-bit conversion are not visible in this
// excerpt.
439 Tokenizer::_ParseHexOperand()
// begin is taken before anything is consumed, so the token text measured
// from it presumably still contains the 0x prefix.
441 const char* begin
= fCurrentChar
;
// At least one hex digit is mandatory.
445 if (!_IsHexDigit(*fCurrentChar
))
446 throw ParseException("expected hex digit", _CurrentPos());
// Loop over the remaining hex digits (loop body not visible here).
449 while (_IsHexDigit(*fCurrentChar
))
452 int32 length
= fCurrentChar
- begin
;
453 fCurrentToken
= Token(begin
, length
, _CurrentPos() - length
,
457 // including the leading 0x, a 32-bit constant will be at most
458 // 10 characters. Anything larger, and 64 is necessary.
// Both conversions read the token text with base 16; strtoul() and
// strtoull() accept an optional 0x prefix for that base.
459 fCurrentToken
.value
.SetTo((uint32
)strtoul(
460 fCurrentToken
.string
.String(), NULL
, 16));
462 fCurrentToken
.value
.SetTo((uint64
)strtoull(
463 fCurrentToken
.string
.String(), NULL
, 16));
465 return fCurrentToken
;
// Returns the offset of the read cursor fCurrentChar from the start of the
// buffer of fString, computed as a pointer difference.
470 Tokenizer::_CurrentPos() const
472 return fCurrentChar
- fString
.String();