3 * A mini C-like language scanner.
12 char[] result = new char[2];
15 return result[0 .. 1];
28 /* State machine operation data. */
34 # Function to buffer a character.
39 # Function to clear the buffer.
45 # Functions to dump tokens as they are matched.
47 printf("ident(%i): %.*s\n", curLine, identBuf);
50 printf("literal(%i): %.*s\n", curLine, identBuf);
53 printf("float(%i): %.*s\n", curLine, identBuf);
56 printf("int(%i): %.*s\n", curLine, identBuf);
59 printf("hex(%i): 0x%.*s\n", curLine, identBuf);
62 printf("symbol(%i): %.*s\n", curLine, identBuf);
65 # Alphanumeric characters or underscore.
68 # Alpha characters or underscore.
71 # Symbols. Upon entering clear the buffer. On all transitions
72 # buffer a character. Upon leaving dump the symbol.
73 symbol = ( punct - [_'"] ) >clearBuf $bufChar %symbol;
75 # Identifier. Upon entering clear the buffer. On all transitions
76 # buffer a character. Upon leaving, dump the identifier.
77 ident = (alphau . alnumu*) >clearBuf $bufChar %ident;
79 # Match single characters inside literal strings. Or match
80 # an escape sequence. Buffers the character matched.
82 ( extend - ['\\] ) @bufChar |
83 ( '\\' . extend @bufChar );
85 ( extend - ["\\] ) @bufChar |
86 ( '\\' . extend @bufChar );
88 # Single quote and double quote literals. At the start clear
89 # the buffer. Upon leaving dump the literal.
90 sliteral = ('\'' @clearBuf . sliteralChar* . '\'' ) %literal;
91 dliteral = ('"' @clearBuf . dliteralChar* . '"' ) %literal;
92 literal = sliteral | dliteral;
94 # Whitespace is standard ws, newlines and control codes.
95 whitespace = any - 0x21..0x7e;
97 # Describe both c style comments and c++ style comments. The
98 # priority bump on the terminator of the comments brings us
99 # out of the extend* which matches everything.
100 ccComment = '//' . extend* $0 . '\n' @1;
101 cComment = '/*' . extend* $0 . '*/' @1;
103 # Match an integer. We don't bother clearing the buf or filling it.
104 # The float machine overlaps with int and it will do it.
107 # Match a float. Upon entering the machine clear the buf, buffer
108 # characters on every trans and dump the float upon leaving.
109 float = ( digit+ . '.' . digit+ ) >clearBuf $bufChar %float;
111 # Match a hex. Upon entering the hex part, clear the buf, buffer characters
112 # on every trans and dump the hex on leaving transitions.
113 hex = '0x' . xdigit+ >clearBuf $bufChar %hex;
115 # Or together all the language elements.
126 # Star the language elements. It is critical in this type of application
127 # that we decrease the priority of out transitions before doing so. This
128 # is so that when we see 'aa' we stay in the fin machine to match an ident
129 # of length two and not wrap around to the front to match two idents of
131 clang_main = ( fin $1 %0 )*;
133 # This machine matches everything, taking note of newlines.
134 newline = ( any | '\n' @{ curLine++; } )*;
136 # The final fsm is the lexer intersected with the newline machine which
137 # will count lines for us. Since the newline machine accepts everything,
138 # the strings accepted are governed by the clang_main machine, onto which
139 # the newline machine overlays line counting.
140 main := clang_main & newline;
143 %% write data noprefix;
145 // Initialize the machine. Invokes any init statement blocks. Returns 0
146 // if the machine begins in a non-accepting state and 1 if the machine
147 // begins in an accepting state.
154 // Execute the machine on a block of data. Returns -1 if after processing
155 // the data, the machine is in the error state and can never accept, 0 if
156 // the machine is in a non-accepting state and 1 if the machine is in an
158 void execute( char* _data, int _len )
161 char *pe = _data + _len;
166 // Indicate that there is no more data. Returns -1 if the machine finishes
167 // in the error state and does not accept, 0 if the machine finishes
168 // in any other non-accepting state and 1 if the machine finishes in an
174 if ( cs >= first_final )
180 static const int BUFSIZE = 1024;
182 void test( char buf[] )
184 CLang scanner = new CLang();
186 scanner.execute( buf.ptr, buf.length );
187 if ( scanner.finish() > 0 )
198 "999 0xaAFF99 99.99 /*\n"
205 "\"0x00aba foobardd.ddsf 0x0.9\n" );
208 "wordwithnum00asdf\n"
209 "000wordfollowsnum,makes new symbol\n"
211 "finishing early /* unfinished ...\n" );
221 "#define _AAPL_RESIZE_H\n"
223 "#include <assert.h>\n"
225 "#ifdef AAPL_NAMESPACE\n"
228 "#define LIN_DEFAULT_STEP 256\n"
229 "#define EXPN_UP( existing, needed ) \\\n"
230 " need > eng ? (ned<<1) : eing\n"
235 "#ifdef AAPL_NAMESPACE\n"
236 "#endif /* _AAPL_RESIZE_H */\n" );
259 ident(1): wordwithnum00asdf
261 ident(2): wordfollowsnum
271 ident(8): _AAPL_RESIZE_H
281 ident(12): AAPL_NAMESPACE
289 ident(15): LIN_DEFAULT_STEP
317 ident(22): AAPL_NAMESPACE