17 # Action to buffer a character: store fc at array[pos], then advance pos by one.
18 action bufChar { array[pos] = fc; pos = pos + 1; }
20 # Action to clear the buffer by resetting pos to 0.
21 action clearBuf { pos = 0; }
23 # Functions to dump tokens as they are matched.
79 # Alphanumeric characters or underscore.
82 # Alpha characters or underscore.
85 # Symbols: punctuation other than underscore and the two quote characters.
86 # Upon entering clear the buffer. On all transitions buffer a character. Upon leaving dump the symbol.
87 symbol = ( punct - [_'"] ) >clearBuf $bufChar %symbol;
89 # Identifier: one alphau followed by zero or more alnumu. Upon entering clear the
90 # buffer. On all transitions buffer a character. Upon leaving, dump the identifier.
91 ident = (alphau . alnumu*) >clearBuf $bufChar %ident;
93 # Match single characters inside literal strings. Or match
94 # an escape sequence. Buffers the character matched.
96 ( extend - ['\\] ) @bufChar |
97 ( '\\' . extend @bufChar );
99 ( extend - ["\\] ) @bufChar |
100 ( '\\' . extend @bufChar );
102 # Single quote and double quote literals. On the opening quote clear
103 # the buffer. Upon leaving dump the literal.
104 sliteral = ('\'' @clearBuf . sliteralChar* . '\'' ) %literal;
105 dliteral = ('"' @clearBuf . dliteralChar* . '"' ) %literal;
106 literal = sliteral | dliteral;
108 # Whitespace is standard ws, newlines and control codes: any character outside the range 33 .. 126.
109 whitespace = any - 33 .. 126;
111 # Describe both c style comments and c++ style comments. The
112 # priority bump on the terminator of the comments brings us
113 # out of the extend* which matches everything. Note that the c style comment is delimited by '/!' and '!/' here.
114 ccComment = '//' . extend* $0 . '\n' @1;
115 cComment = '/!' . extend* $0 . '!/' @1;
117 # Match an integer (one or more digits). We don't bother clearing the buf or filling it.
118 # The float machine overlaps with int and it will do it.
119 integer = digit+ %integer;
121 # Match a float (digits, a '.', then digits). Upon entering the machine clear the buf, buffer
122 # characters on every trans and dump the float upon leaving.
123 float = ( digit+ . '.' . digit+ ) >clearBuf $bufChar %float;
125 # Match a hex. Upon entering the hex part, clear the buf, buffer characters
126 # on every trans and dump the hex on leaving transitions. The actions bind to xdigit+ only, so the '0x' prefix is not buffered.
127 hex = '0x' . xdigit+ >clearBuf $bufChar %hex;
129 # Or together all the language elements.
140 # Star the language elements. It is critical in this type of application
141 # that we decrease the priority of out transitions before doing so. This
142 # is so that when we see 'aa' we stay in the fin machine to match an ident
143 # of length two and not wrap around to the front to match two idents of length one. Here $1 sets priority 1 on all transitions of fin and %0 sets priority 0 on its leaving transitions.
145 clang_main = ( fin $1 %0 )*;
147 # This machine matches everything, taking note of newlines by incrementing line on each '\n'.
148 newline = ( any | '\n' @{ line = line + 1; } )*;
150 # The final fsm is the lexer intersected with the newline machine which
151 # will count lines for us. Since the newline machine accepts everything,
152 # the set of strings accepted is governed by the clang_main machine, onto which
153 # the newline machine overlays line counting.
154 main := clang_main & newline;
157 "999 0xaAFF99 99.99 /!\n!/ 'lksdj' //\n\"\n\nliteral\n\n\n\"0x00aba foobardd.ddsf 0x0.9\n"
158 "wordwithnum00asdf\n000wordfollowsnum,makes new symbol\n\nfinishing early /! unfinished ...\n"
179 ident(1,17): wordwithnum00asdf
181 ident(2,14): wordfollowsnum
186 ident(4,9): finishing