headers/bsd: Add sys/queue.h.
[haiku.git] / src / kits / debugger / source_language / c_family / CLanguageTokenizer.cpp
blob18eb1415deafeb0c94c933c89bee0e8a5efbe5bc
1 /*
2 * Copyright 2006-2014 Haiku, Inc. All Rights Reserved.
3 * Distributed under the terms of the MIT License.
5 * Authors:
6 * Stephan Aßmus <superstippi@gmx.de>
7 * Rene Gollent <rene@gollent.com>
8 * John Scipione <jscipione@gmail.com>
9 * Ingo Weinhold <bonefish@cs.tu-berlin.de>
13 #include "CLanguageTokenizer.h"
15 #include <ctype.h>
16 #include <stdio.h>
17 #include <stdlib.h>
20 using CLanguage::ParseException;
21 using CLanguage::Token;
22 using CLanguage::Tokenizer;
25 // #pragma mark - Token
28 Token::Token()
30 string(""),
31 type(TOKEN_NONE),
32 value(0L),
33 position(0)
38 Token::Token(const Token& other)
40 string(other.string),
41 type(other.type),
42 value(other.value),
43 position(other.position)
48 Token::Token(const char* string, int32 length, int32 position, int32 type)
50 string(string, length),
51 type(type),
52 value(),
53 position(position)
58 Token&
59 Token::operator=(const Token& other)
61 string = other.string;
62 type = other.type;
63 value = other.value;
64 position = other.position;
65 return *this;
69 // #pragma mark - Tokenizer
72 Tokenizer::Tokenizer()
74 fString(""),
75 fCurrentChar(NULL),
76 fCurrentToken(),
77 fReuseToken(false)
82 void
83 Tokenizer::SetTo(const char* string)
85 fString = string;
86 fCurrentChar = fString.String();
87 fCurrentToken = Token();
88 fReuseToken = false;
92 const Token&
93 Tokenizer::NextToken()
95 if (fCurrentToken.type == TOKEN_END_OF_LINE)
96 return fCurrentToken;
98 if (fReuseToken) {
99 fReuseToken = false;
100 return fCurrentToken;
103 while (*fCurrentChar != 0 && isspace(*fCurrentChar))
104 fCurrentChar++;
106 if (*fCurrentChar == 0) {
107 return fCurrentToken = Token("", 0, _CurrentPos(),
108 TOKEN_END_OF_LINE);
111 bool decimal = *fCurrentChar == '.';
113 if (decimal || isdigit(*fCurrentChar)) {
114 if (*fCurrentChar == '0' && fCurrentChar[1] == 'x')
115 return _ParseHexOperand();
117 BString temp;
119 const char* begin = fCurrentChar;
121 // optional digits before the comma
122 while (isdigit(*fCurrentChar)) {
123 temp << *fCurrentChar;
124 fCurrentChar++;
127 // optional post decimal part
128 // (required if there are no digits before the decimal)
129 if (*fCurrentChar == '.') {
130 decimal = true;
131 temp << '.';
132 fCurrentChar++;
134 // optional post decimal digits
135 while (isdigit(*fCurrentChar)) {
136 temp << *fCurrentChar;
137 fCurrentChar++;
141 int32 length = fCurrentChar - begin;
142 if (length == 1 && decimal) {
143 // check for . operator
144 fCurrentChar = begin;
145 if (!_ParseOperator())
146 throw ParseException("unexpected character", _CurrentPos());
148 return fCurrentToken;
151 BString test = temp;
152 test << "&_";
153 double value;
154 char t[2];
155 int32 matches = sscanf(test.String(), "%lf&%s", &value, t);
156 if (matches != 2)
157 throw ParseException("error in constant", _CurrentPos() - length);
159 fCurrentToken = Token(begin, length, _CurrentPos() - length,
160 TOKEN_CONSTANT);
161 if (decimal)
162 fCurrentToken.value.SetTo(value);
163 else
164 fCurrentToken.value.SetTo((int64)strtoll(temp.String(), NULL, 10));
165 } else if (isalpha(*fCurrentChar) || *fCurrentChar == '_') {
166 const char* begin = fCurrentChar;
167 while (*fCurrentChar != 0 && (isalpha(*fCurrentChar)
168 || isdigit(*fCurrentChar) || *fCurrentChar == '_')) {
169 fCurrentChar++;
171 int32 length = fCurrentChar - begin;
172 fCurrentToken = Token(begin, length, _CurrentPos() - length,
173 TOKEN_IDENTIFIER);
174 } else if (*fCurrentChar == '"' || *fCurrentChar == '\'') {
175 bool terminatorFound = false;
176 const char* begin = fCurrentChar++;
177 while (*fCurrentChar != 0) {
178 if (*fCurrentChar == '\\') {
179 if (*(fCurrentChar++) != 0)
180 fCurrentChar++;
181 } else if (*(fCurrentChar++) == *begin) {
182 terminatorFound = true;
183 break;
186 int32 tokenType = TOKEN_STRING_LITERAL;
187 if (!terminatorFound) {
188 tokenType = *begin == '"' ? TOKEN_DOUBLE_QUOTE
189 : TOKEN_SINGLE_QUOTE;
190 fCurrentChar = begin + 1;
193 int32 length = fCurrentChar - begin;
194 fCurrentToken = Token(begin, length, _CurrentPos() - length,
195 tokenType);
196 } else {
197 if (!_ParseOperator()) {
198 int32 type = TOKEN_NONE;
199 switch (*fCurrentChar) {
200 case '\n':
201 type = TOKEN_END_OF_LINE;
202 break;
204 case '(':
205 type = TOKEN_OPENING_PAREN;
206 break;
207 case ')':
208 type = TOKEN_CLOSING_PAREN;
209 break;
211 case '[':
212 type = TOKEN_OPENING_SQUARE_BRACKET;
213 break;
214 case ']':
215 type = TOKEN_CLOSING_SQUARE_BRACKET;
216 break;
218 case '{':
219 type = TOKEN_OPENING_CURLY_BRACE;
220 break;
221 case '}':
222 type = TOKEN_CLOSING_CURLY_BRACE;
223 break;
225 case '\\':
226 type = TOKEN_BACKSLASH;
227 break;
229 case ':':
230 type = TOKEN_COLON;
231 break;
233 case ';':
234 type = TOKEN_SEMICOLON;
235 break;
237 case ',':
238 type = TOKEN_COMMA;
239 break;
241 case '.':
242 type = TOKEN_PERIOD;
243 break;
245 case '#':
246 type = TOKEN_POUND;
247 break;
249 default:
250 throw ParseException("unexpected character",
251 _CurrentPos());
253 fCurrentToken = Token(fCurrentChar, 1, _CurrentPos(),
254 type);
255 fCurrentChar++;
259 return fCurrentToken;
263 bool
264 Tokenizer::_ParseOperator()
266 int32 type = TOKEN_NONE;
267 int32 length = 0;
268 switch (*fCurrentChar) {
269 case '+':
270 type = TOKEN_PLUS;
271 length = 1;
272 break;
274 case '-':
275 if (_Peek() == '>') {
276 type = TOKEN_MEMBER_PTR;
277 length = 2;
278 } else {
279 type = TOKEN_MINUS;
280 length = 1;
282 break;
284 case '*':
285 switch (_Peek()) {
286 case '/':
287 type = TOKEN_END_COMMENT_BLOCK;
288 length = 2;
289 break;
290 default:
291 type = TOKEN_STAR;
292 length = 1;
293 break;
295 break;
297 case '/':
298 switch (_Peek()) {
299 case '*':
300 type = TOKEN_BEGIN_COMMENT_BLOCK;
301 length = 2;
302 break;
303 case '/':
304 type = TOKEN_INLINE_COMMENT;
305 length = 2;
306 break;
307 default:
308 type = TOKEN_SLASH;
309 length = 1;
310 break;
312 break;
314 case '%':
315 type = TOKEN_MODULO;
316 length = 1;
317 break;
319 case '^':
320 type = TOKEN_BITWISE_XOR;
321 length = 1;
322 break;
324 case '&':
325 if (_Peek() == '&') {
326 type = TOKEN_LOGICAL_AND;
327 length = 2;
328 } else {
329 type = TOKEN_BITWISE_AND;
330 length = 1;
332 break;
334 case '|':
335 if (_Peek() == '|') {
336 type = TOKEN_LOGICAL_OR;
337 length = 2;
338 } else {
339 type = TOKEN_BITWISE_OR;
340 length = 1;
342 break;
344 case '!':
345 if (_Peek() == '=') {
346 type = TOKEN_NE;
347 length = 2;
348 } else {
349 type = TOKEN_LOGICAL_NOT;
350 length = 1;
352 break;
354 case '=':
355 if (_Peek() == '=') {
356 type = TOKEN_EQ;
357 length = 2;
358 } else {
359 type = TOKEN_ASSIGN;
360 length = 1;
362 break;
364 case '>':
365 if (_Peek() == '=') {
366 type = TOKEN_GE;
367 length = 2;
368 } else {
369 type = TOKEN_GT;
370 length = 1;
372 break;
374 case '<':
375 if (_Peek() == '=') {
376 type = TOKEN_LE;
377 length = 2;
378 } else {
379 type = TOKEN_LT;
380 length = 1;
382 break;
384 case '~':
385 type = TOKEN_BITWISE_NOT;
386 length = 1;
387 break;
390 case '?':
391 type = TOKEN_CONDITION;
392 length = 1;
393 break;
395 case '.':
396 type = TOKEN_MEMBER_PTR;
397 length = 1;
398 break;
400 default:
401 break;
404 if (length == 0)
405 return false;
407 fCurrentToken = Token(fCurrentChar, length, _CurrentPos(), type);
408 fCurrentChar += length;
410 return true;
414 void
415 Tokenizer::RewindToken()
417 fReuseToken = true;
421 char
422 Tokenizer::_Peek() const
424 if (_CurrentPos() < fString.Length())
425 return *(fCurrentChar + 1);
427 return '\0';
431 /*static*/ bool
432 Tokenizer::_IsHexDigit(char c)
434 return isdigit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
438 Token&
439 Tokenizer::_ParseHexOperand()
441 const char* begin = fCurrentChar;
442 fCurrentChar += 2;
443 // skip "0x"
445 if (!_IsHexDigit(*fCurrentChar))
446 throw ParseException("expected hex digit", _CurrentPos());
448 fCurrentChar++;
449 while (_IsHexDigit(*fCurrentChar))
450 fCurrentChar++;
452 int32 length = fCurrentChar - begin;
453 fCurrentToken = Token(begin, length, _CurrentPos() - length,
454 TOKEN_CONSTANT);
456 if (length <= 10) {
457 // including the leading 0x, a 32-bit constant will be at most
458 // 10 characters. Anything larger, and 64 is necessary.
459 fCurrentToken.value.SetTo((uint32)strtoul(
460 fCurrentToken.string.String(), NULL, 16));
461 } else {
462 fCurrentToken.value.SetTo((uint64)strtoull(
463 fCurrentToken.string.String(), NULL, 16));
465 return fCurrentToken;
469 int32
470 Tokenizer::_CurrentPos() const
472 return fCurrentChar - fString.String();