2 * Copyright (C) 2012 Arth Patel
3 * Copyright (C) 2012 Robert Ancell
5 * This program is free software: you can redistribute it and/or modify it under
6 * the terms of the GNU General Public License as published by the Free Software
7 * Foundation, either version 2 of the License, or (at your option) any later
8 * version. See http://www.gnu.org/copyleft/gpl.html the full text of the
12 /* Enum for tokens generated by pre-lexer and lexer. */
13 public enum LexerTokenType
15 UNKNOWN
, /* Unknown */
17 /* These are all Pre-Lexer tokens, returned by pre-lexer */
18 PL_DECIMAL
, /* Decimal separator */
19 PL_DIGIT
, /* Decimal digit */
20 PL_HEX
, /* A-F of Hex digits */
21 PL_SUPER_DIGIT
, /* Super digits */
22 PL_SUPER_MINUS
, /* Super minus */
23 PL_SUB_DIGIT
, /* Sub digits */
24 PL_FRACTION
, /* Fractions */
25 PL_DEGREE
, /* Degree */
26 PL_MINUTE
, /* Minutes */
27 PL_SECOND
, /* Seconds */
28 PL_LETTER
, /* Alphabets */
29 PL_EOS
, /* End of stream */
30 PL_SKIP
, /* Skip this symbol (whitespace or newline). */
32 /* These are all tokens, returned by Lexer. */
35 MULTIPLY
, /* Multiply */
38 L_FLOOR
, /* Floor ( Left ) */
39 R_FLOOR
, /* Floor ( Right ) */
40 L_CEILING
, /* Ceiling ( Left ) */
41 R_CEILING
, /* Ceiling ( Right ) */
42 ROOT
, /* Square root */
43 ROOT_3
, /* Cube root */
44 ROOT_4
, /* Fourth root */
45 NOT
, /* Bitwise NOT */
46 AND
, /* Bitwise AND */
48 XOR
, /* Bitwise XOR */
49 IN
, /* IN ( for converter ) */
51 SUP_NUMBER
, /* Super Number */
52 NSUP_NUMBER
, /* Negative Super Number */
53 SUB_NUMBER
, /* Sub Number */
54 FUNCTION
, /* Function */
55 VARIABLE
, /* Variable name */
69 // FIXME: Merge into lexer
72 public string stream
; /* String being scanned */
73 public int index
; /* Current character index */
74 public int mark_index
; /* Location, last marked. Useful for getting substrings as part of highlighting */
75 private bool eos
= false;
77 public PreLexer (string input
)
84 /* Roll back last scanned unichar. */
85 public void roll_back ()
93 stream
.get_prev_char (ref index
, out c
);
96 /* Set marker index. To be used for highlighting and error reporting. */
97 public void set_marker ()
/* Return the text lying between the last marker and the current scan
 * position. Used when reporting errors. */
public string get_marked_substring ()
{
    var marked_length = index - mark_index;
    return stream.substring (mark_index, marked_length);
}
108 /* Pre-Lexer tokenizer. To be called only by Lexer. */
109 public LexerTokenType
get_next_token ()
112 if (!stream
.get_next_char (ref index
, out c
))
114 // We have to flag if we ran out of chars, as roll_back from PL_EOS should have no effect
116 return LexerTokenType
.PL_EOS
;
120 if (c
== ',' || c
== '.')
121 return LexerTokenType
.PL_DECIMAL
;
124 return LexerTokenType
.PL_DIGIT
;
126 if ((c
>= 'a' && c
<= 'f') || (c
>= 'A' && c
<= 'F'))
127 return LexerTokenType
.PL_HEX
;
129 if (c
== '⁰' || c
== '¹' || c
== '²' || c
== '³' || c
== '⁴' || c
== '⁵' || c
== '⁶' || c
== '⁷' || c
== '⁸' || c
== '⁹')
130 return LexerTokenType
.PL_SUPER_DIGIT
;
133 return LexerTokenType
.PL_SUPER_MINUS
;
135 if (c
== '₀' || c
== '₁' || c
== '₂' || c
== '₃' || c
== '₄' || c
== '₅' || c
== '₆' || c
== '₇' || c
== '₈' || c
== '₉')
136 return LexerTokenType
.PL_SUB_DIGIT
;
138 if (c
== '½' || c
== '⅓' || c
== '⅔' || c
== '¼' || c
== '¾' || c
== '⅕' || c
== '⅖' || c
== '⅗' || c
== '⅘' || c
== '⅙' || c
== '⅚' || c
== '⅛' || c
== '⅜' || c
== '⅝' || c
== '⅞')
139 return LexerTokenType
.PL_FRACTION
;
142 return LexerTokenType
.PL_DEGREE
;
145 return LexerTokenType
.PL_MINUTE
;
148 return LexerTokenType
.PL_SECOND
;
150 if (c
.isalpha () || c
== '_')
151 return LexerTokenType
.PL_LETTER
;
154 return LexerTokenType
.AND
;
157 return LexerTokenType
.OR
;
159 if (c
== '⊻' || c
== '⊕')
160 return LexerTokenType
.XOR
;
162 if (c
== '¬' || c
== '~')
163 return LexerTokenType
.NOT
;
166 return LexerTokenType
.ADD
;
168 if (c
== '-' || c
== '−' || c
== '–')
169 return LexerTokenType
.SUBTRACT
;
171 if (c
== '*' || c
== '×')
172 return LexerTokenType
.MULTIPLY
;
174 if (c
== '/' || c
== '∕' || c
== '÷')
175 return LexerTokenType
.DIVIDE
;
178 return LexerTokenType
.L_FLOOR
;
181 return LexerTokenType
.R_FLOOR
;
184 return LexerTokenType
.L_CEILING
;
187 return LexerTokenType
.R_CEILING
;
190 return LexerTokenType
.ROOT
;
193 return LexerTokenType
.ROOT_3
;
196 return LexerTokenType
.ROOT_4
;
199 return LexerTokenType
.ASSIGN
;
202 return LexerTokenType
.L_R_BRACKET
;
205 return LexerTokenType
.R_R_BRACKET
;
208 return LexerTokenType
.L_S_BRACKET
;
211 return LexerTokenType
.R_S_BRACKET
;
214 return LexerTokenType
.L_C_BRACKET
;
217 return LexerTokenType
.R_C_BRACKET
;
220 return LexerTokenType
.ABS
;
223 return LexerTokenType
.POWER
;
226 return LexerTokenType
.FACTORIAL
;
229 return LexerTokenType
.PERCENTAGE
;
231 if (c
== ' ' || c
== '\r' || c
== '\t' || c
== '\n')
232 return LexerTokenType
.PL_SKIP
;
234 return LexerTokenType
.UNKNOWN
;
/* One token produced by the lexer, together with where it was found. */
public class LexerToken
{
    /* Copy of token string. */
    public string text;

    /* Start index in original stream. */
    public uint start_index;

    /* End index in original stream. */
    public uint end_index;

    /* Type of token. */
    public LexerTokenType type;
}
247 /* Structure to hold lexer state and all the tokens. */
250 private Parser parser
; /* Pointer to the parser. */
251 private PreLexer prelexer
; /* Pre-lexer; it is part of the lexer. */
252 public List
<LexerToken
> tokens
; /* Pointer to the dynamic array of LexerTokens. */
253 private uint next_token
; /* Index of next, to be sent, token. */
254 private int number_base
;
256 public Lexer (string input
, Parser parser
, int number_base
= 10)
258 prelexer
= new
PreLexer (input
);
259 tokens
= new List
<LexerToken
> ();
261 this
.parser
= parser
;
262 this
.number_base
= number_base
;
269 var token
= insert_next_token ();
270 tokens
.append (token
);
271 if (token
.type
== LexerTokenType
.PL_EOS
)
276 /* Get next token interface. Will be called by parser to get pointer to next token in token stream. */
277 public LexerToken
get_next_token ()
279 var token
= tokens
.nth_data (next_token
);
281 if (next_token
>= tokens
.length ())
282 next_token
= tokens
.length ();
287 /* Roll back one lexer token. */
288 public void roll_back ()
294 private bool check_if_function ()
296 var name
= prelexer
.get_marked_substring ();
298 if (parser
.function_is_defined (name
))
304 private bool check_if_number ()
307 var text
= prelexer
.get_marked_substring ();
309 var tmp
= mp_set_from_string (text
, number_base
);
314 /* Try to rollback several characters to see, if that yields any number. */
317 tmp
= mp_set_from_string (text
, number_base
);
321 prelexer
.roll_back ();
322 text
= prelexer
.get_marked_substring ();
325 /* Undo all rollbacks. */
327 prelexer
.get_next_token ();
333 /* Insert generated token to the lexer */
334 private LexerToken
insert_token (LexerTokenType type
)
336 var token
= new
LexerToken ();
337 token
.text
= prelexer
.get_marked_substring ();
338 token
.start_index
= prelexer
.mark_index
;
339 token
.end_index
= prelexer
.index
;
/* Read the next complete token from the pre-lexer stream and build it with
 * insert_token (). On an unexpected symbol an error is reported to the parser
 * and an UNKNOWN token is returned. */
private LexerToken insert_next_token ()
{
    /* Skip whitespace. The marker is re-set before every read so that it ends
     * up on the first symbol that is not skipped. */
    LexerTokenType kind;
    do
    {
        prelexer.set_marker ();
        kind = prelexer.get_next_token ();
    }
    while (kind == LexerTokenType.PL_SKIP);

    switch (kind)
    {
    /* Operators and brackets: the pre-lexer type is used unchanged. */
    case LexerTokenType.AND:
    case LexerTokenType.OR:
    case LexerTokenType.XOR:
    case LexerTokenType.NOT:
    case LexerTokenType.ADD:
    case LexerTokenType.SUBTRACT:
    case LexerTokenType.MULTIPLY:
    case LexerTokenType.DIVIDE:
    case LexerTokenType.L_FLOOR:
    case LexerTokenType.R_FLOOR:
    case LexerTokenType.L_CEILING:
    case LexerTokenType.R_CEILING:
    case LexerTokenType.ROOT:
    case LexerTokenType.ROOT_3:
    case LexerTokenType.ROOT_4:
    case LexerTokenType.ASSIGN:
    case LexerTokenType.L_R_BRACKET:
    case LexerTokenType.R_R_BRACKET:
    case LexerTokenType.L_S_BRACKET:
    case LexerTokenType.R_S_BRACKET:
    case LexerTokenType.L_C_BRACKET:
    case LexerTokenType.R_C_BRACKET:
    case LexerTokenType.ABS:
    case LexerTokenType.POWER:
    case LexerTokenType.FACTORIAL:
    case LexerTokenType.PERCENTAGE:
        return insert_token (kind);

    /* [LexerTokenType.PL_SUPER_MINUS][LexerTokenType.PL_SUPER_DIGIT]+ */
    case LexerTokenType.PL_SUPER_MINUS:
        kind = prelexer.get_next_token ();
        if (kind != LexerTokenType.PL_SUPER_DIGIT)
        {
            /* ERROR: expected LexerTokenType.PL_SUPER_DIGIT */
            parser.set_error (ErrorCode.MP, prelexer.get_marked_substring (), prelexer.mark_index, prelexer.index);
            return insert_token (LexerTokenType.UNKNOWN);
        }

        /* Take the remaining super digits, then give back the symbol that ended the run. */
        do
        {
            kind = prelexer.get_next_token ();
        }
        while (kind == LexerTokenType.PL_SUPER_DIGIT);
        prelexer.roll_back ();

        return insert_token (LexerTokenType.NSUP_NUMBER);

    /* [LexerTokenType.PL_SUPER_DIGIT]+ */
    case LexerTokenType.PL_SUPER_DIGIT:
        do
        {
            kind = prelexer.get_next_token ();
        }
        while (kind == LexerTokenType.PL_SUPER_DIGIT);
        prelexer.roll_back ();

        return insert_token (LexerTokenType.SUP_NUMBER);

    /* [LexerTokenType.PL_SUB_DIGIT]+ */
    case LexerTokenType.PL_SUB_DIGIT:
        do
        {
            kind = prelexer.get_next_token ();
        }
        while (kind == LexerTokenType.PL_SUB_DIGIT);
        prelexer.roll_back ();

        return insert_token (LexerTokenType.SUB_NUMBER);

    /* [LexerTokenType.PL_FRACTION] */
    case LexerTokenType.PL_FRACTION:
        return insert_token (LexerTokenType.NUMBER);

    case LexerTokenType.PL_DIGIT:
        return insert_digit ();

    case LexerTokenType.PL_DECIMAL:
        return insert_decimal ();

    case LexerTokenType.PL_HEX:
        return insert_hex ();

    case LexerTokenType.PL_LETTER:
        return insert_letter ();

    case LexerTokenType.PL_EOS:
        return insert_token (LexerTokenType.PL_EOS);

    default:
        break;
    }

    /* ERROR: Unexpected token */
    parser.set_error (ErrorCode.INVALID, prelexer.get_marked_substring (), prelexer.mark_index, prelexer.index);

    return insert_token (LexerTokenType.UNKNOWN);
}
422 private LexerToken
insert_digit ()
424 var type
= prelexer
.get_next_token ();
425 while (type
== LexerTokenType
.PL_DIGIT
)
426 type
= prelexer
.get_next_token ();
428 if (type
== LexerTokenType
.PL_FRACTION
)
429 return insert_token (LexerTokenType
.NUMBER
);
430 else if (type
== LexerTokenType
.PL_SUB_DIGIT
)
432 while (prelexer
.get_next_token () == LexerTokenType
.PL_SUB_DIGIT
);
433 prelexer
.roll_back ();
434 return insert_token (LexerTokenType
.NUMBER
);
436 else if (type
== LexerTokenType
.PL_DEGREE
)
438 type
= prelexer
.get_next_token ();
439 if (type
== LexerTokenType
.PL_DIGIT
)
441 while ((type
= prelexer
.get_next_token ()) == LexerTokenType
.PL_DIGIT
);
442 if (type
== LexerTokenType
.PL_DECIMAL
)
443 return insert_angle_num_dm ();
445 else if (type
== LexerTokenType
.PL_MINUTE
)
447 type
= prelexer
.get_next_token ();
448 if (type
== LexerTokenType
.PL_DIGIT
)
450 while ((type
= prelexer
.get_next_token ()) == LexerTokenType
.PL_DIGIT
);
451 if (type
== LexerTokenType
.PL_DECIMAL
)
452 return insert_angle_num_dms ();
453 else if (type
== LexerTokenType
.PL_SECOND
)
454 return insert_token (LexerTokenType
.NUMBER
);
457 /* ERROR: expected LexerTokenType.PL_SECOND */
458 parser
.set_error (ErrorCode
.MP
, prelexer
.get_marked_substring (), prelexer
.mark_index
, prelexer
.index
);
459 return insert_token (LexerTokenType
.UNKNOWN
);
462 else if (type
== LexerTokenType
.PL_DECIMAL
)
463 return insert_angle_num_dms ();
466 prelexer
.roll_back ();
467 return insert_token (LexerTokenType
.NUMBER
);
472 /* ERROR: expected LexerTokenType.PL_MINUTE | LexerTokenType.PL_DIGIT */
473 parser
.set_error (ErrorCode
.MP
, prelexer
.get_marked_substring (), prelexer
.mark_index
, prelexer
.index
);
474 return insert_token (LexerTokenType
.UNKNOWN
);
477 else if (type
== LexerTokenType
.PL_DECIMAL
)
478 return insert_angle_num_dm ();
480 return insert_token (LexerTokenType
.NUMBER
);
482 else if (type
== LexerTokenType
.PL_DECIMAL
)
483 return insert_decimal ();
484 else if (type
== LexerTokenType
.PL_HEX
)
485 return insert_hex_dec ();
488 prelexer
.roll_back ();
489 return insert_token (LexerTokenType
.NUMBER
);
/* Lex the remainder of an angle written in degrees and decimal minutes.
 *
 * insert_digit () calls this once a decimal separator has been read in the
 * part that follows the degree sign. What must follow is one or more digits
 * and then a minute sign.
 *
 * Returns a NUMBER token on success. Otherwise an ErrorCode.MP error is
 * reported to the parser and an UNKNOWN token is returned. */
private LexerToken insert_angle_num_dm ()
{
    var type = prelexer.get_next_token ();
    if (type != LexerTokenType.PL_DIGIT)
    {
        /* ERROR: expected LexerTokenType.PL_DIGIT */
        parser.set_error (ErrorCode.MP, prelexer.get_marked_substring (), prelexer.mark_index, prelexer.index);
        return insert_token (LexerTokenType.UNKNOWN);
    }

    /* Consume the rest of the digit run. The read has to be the loop body:
     * type is known to be PL_DIGIT here, so a loop with an empty body would
     * never terminate. */
    while (type == LexerTokenType.PL_DIGIT)
        type = prelexer.get_next_token ();

    if (type == LexerTokenType.PL_MINUTE)
        return insert_token (LexerTokenType.NUMBER);
    else
    {
        /* ERROR: expected LexerTokenType.PL_MINUTE */
        parser.set_error (ErrorCode.MP, prelexer.get_marked_substring (), prelexer.mark_index, prelexer.index);
        return insert_token (LexerTokenType.UNKNOWN);
    }
}
/* Lex the remainder of an angle written in degrees, minutes and decimal
 * seconds: one or more digits followed by a seconds sign.
 *
 * Returns a NUMBER token on success. Otherwise an ErrorCode.MP error is
 * reported to the parser and an UNKNOWN token is returned. */
private LexerToken insert_angle_num_dms ()
{
    var kind = prelexer.get_next_token ();
    if (kind == LexerTokenType.PL_DIGIT)
    {
        /* Run through the remaining digits; kind ends up as the symbol after them. */
        do
        {
            kind = prelexer.get_next_token ();
        }
        while (kind == LexerTokenType.PL_DIGIT);

        if (kind == LexerTokenType.PL_SECOND)
            return insert_token (LexerTokenType.NUMBER);
    }

    /* ERROR: expected LexerTokenType.PL_DIGIT first, then LexerTokenType.PL_SECOND */
    parser.set_error (ErrorCode.MP, prelexer.get_marked_substring (), prelexer.mark_index, prelexer.index);
    return insert_token (LexerTokenType.UNKNOWN);
}
/* Lex what follows a decimal separator.
 *
 * A hex letter hands over to insert_decimal_hex (). A run of digits yields a
 * NUMBER token, optionally ended by a degree sign or a run of sub digits.
 * Anything else directly after the separator is reported as an ErrorCode.MP
 * error and produces an UNKNOWN token. */
private LexerToken insert_decimal ()
{
    var kind = prelexer.get_next_token ();

    if (kind == LexerTokenType.PL_HEX)
        return insert_decimal_hex ();

    if (kind != LexerTokenType.PL_DIGIT)
    {
        /* ERROR: expected LexerTokenType.PL_DIGIT | LexerTokenType.PL_HEX */
        parser.set_error (ErrorCode.MP, prelexer.get_marked_substring (), prelexer.mark_index, prelexer.index);
        return insert_token (LexerTokenType.UNKNOWN);
    }

    /* Take the remaining digits; kind ends up as the symbol after them. */
    do
    {
        kind = prelexer.get_next_token ();
    }
    while (kind == LexerTokenType.PL_DIGIT);

    switch (kind)
    {
    case LexerTokenType.PL_DEGREE:
        /* The degree sign stays part of the number. */
        return insert_token (LexerTokenType.NUMBER);

    case LexerTokenType.PL_HEX:
        return insert_decimal_hex ();

    case LexerTokenType.PL_SUB_DIGIT:
        /* Take the whole sub digit suffix. */
        do
        {
            kind = prelexer.get_next_token ();
        }
        while (kind == LexerTokenType.PL_SUB_DIGIT);
        break;

    default:
        break;
    }

    /* Give back the symbol that ended the number. */
    prelexer.roll_back ();
    return insert_token (LexerTokenType.NUMBER);
}
/* Lex a token that starts with a hex letter (a-f / A-F).
 *
 * After the run of hex letters, a digit continues in insert_hex_dec (), a
 * decimal separator in insert_decimal_hex () and any other letter in
 * insert_letter (). Otherwise the text read so far, including an optional
 * sub digit suffix, is classified as NUMBER, FUNCTION or VARIABLE, in that
 * order of preference. */
private LexerToken insert_hex ()
{
    LexerTokenType kind;
    do
    {
        kind = prelexer.get_next_token ();
    }
    while (kind == LexerTokenType.PL_HEX);

    switch (kind)
    {
    case LexerTokenType.PL_DIGIT:
        return insert_hex_dec ();

    case LexerTokenType.PL_DECIMAL:
        return insert_decimal_hex ();

    case LexerTokenType.PL_LETTER:
        return insert_letter ();

    case LexerTokenType.PL_SUB_DIGIT:
        /* Take the whole sub digit suffix. */
        do
        {
            kind = prelexer.get_next_token ();
        }
        while (kind == LexerTokenType.PL_SUB_DIGIT);
        break;

    default:
        break;
    }

    /* Give back the symbol that ended the token before classifying it. */
    prelexer.roll_back ();

    if (check_if_number ())
        return insert_token (LexerTokenType.NUMBER);

    var word_type = check_if_function () ? LexerTokenType.FUNCTION : LexerTokenType.VARIABLE;
    return insert_token (word_type);
}
/* Lex a number made of mixed decimal digits and hex letters.
 *
 * A decimal separator after the run continues in insert_decimal_hex (). A
 * sub digit suffix completes a NUMBER token. Otherwise the text is accepted
 * as a NUMBER only if check_if_number () agrees; if not, an ErrorCode.MP
 * error is reported to the parser and an UNKNOWN token is returned. */
private LexerToken insert_hex_dec ()
{
    LexerTokenType kind;
    do
    {
        kind = prelexer.get_next_token ();
    }
    while (kind == LexerTokenType.PL_DIGIT || kind == LexerTokenType.PL_HEX);

    if (kind == LexerTokenType.PL_DECIMAL)
        return insert_decimal_hex ();

    if (kind == LexerTokenType.PL_SUB_DIGIT)
    {
        /* Take the whole sub digit suffix, then give back the symbol after it. */
        do
        {
            kind = prelexer.get_next_token ();
        }
        while (kind == LexerTokenType.PL_SUB_DIGIT);
        prelexer.roll_back ();
        return insert_token (LexerTokenType.NUMBER);
    }

    if (check_if_number ())
        return insert_token (LexerTokenType.NUMBER);

    /* ERROR: expected LexerTokenType.PL_DECIMAL | LexerTokenType.PL_DIGIT | LexerTokenType.PL_HEX */
    parser.set_error (ErrorCode.MP, prelexer.get_marked_substring (), prelexer.mark_index, prelexer.index);
    return insert_token (LexerTokenType.UNKNOWN);
}
/* Lex the part of a number that follows a decimal separator when hex letters
 * are allowed: any mix of digits and hex letters, then an optional sub digit
 * suffix. Always yields a NUMBER token. */
private LexerToken insert_decimal_hex ()
{
    /* Made up of digits and hexadecimal characters */
    LexerTokenType kind;
    do
    {
        kind = prelexer.get_next_token ();
    }
    while (kind == LexerTokenType.PL_DIGIT || kind == LexerTokenType.PL_HEX);

    /* Allow a subdigit suffix */
    while (kind == LexerTokenType.PL_SUB_DIGIT)
    {
        kind = prelexer.get_next_token ();
    }

    /* Give back the symbol that ended the number. */
    prelexer.roll_back ();

    return insert_token (LexerTokenType.NUMBER);
}
650 private LexerToken
insert_letter ()
652 /* Get string of letters */
653 var type
= prelexer
.get_next_token ();
654 while (type
== LexerTokenType
.PL_LETTER
|| type
== LexerTokenType
.PL_HEX
)
655 type
= prelexer
.get_next_token ();
657 /* Allow a subdigit suffix */
658 while (type
== LexerTokenType
.PL_SUB_DIGIT
)
659 type
= prelexer
.get_next_token ();
661 prelexer
.roll_back ();
663 var name
= prelexer
.get_marked_substring ().down ();
665 return insert_token (LexerTokenType
.MOD
);
667 return insert_token (LexerTokenType
.AND
);
669 return insert_token (LexerTokenType
.OR
);
671 return insert_token (LexerTokenType
.XOR
);
673 return insert_token (LexerTokenType
.NOT
);
675 return insert_token (LexerTokenType
.IN
);
676 if (check_if_function ())
677 return insert_token (LexerTokenType
.FUNCTION
);
679 return insert_token (LexerTokenType
.VARIABLE
);