9 static char *keywords
[] = {
15 static char *punctuation
[] = {
16 #define PUNCT(x, str) str,
21 void token_dup(struct token
*src
, struct token
*dest
)
25 dest
->tok_str
= estrdup(src
->tok_str
);
29 void token_free(struct token
*token
)
34 void lex_error(struct lexer
*lex
, const char *fmt
, ...)
38 fprintf(stderr
, "%s:%d: error: ", lex
->tok
.tok_sloc
.name
, lex
->tok
.tok_sloc
.line
);
39 vfprintf(stderr
, fmt
, ap
);
44 // return token number for a keyword
45 static int find_keyword(const char *str
)
55 try_str
= keywords
[try - (TOK_FIRSTK
+ 1)];
56 compare
= strcmp(str
, try_str
);
59 } else if (compare
< 0){
61 } else if (compare
> 0){
68 void lex_create(struct lexer
*lex
)
73 lex
->tok
.tok_str
= NULL
;
74 lex
->tok
.tok_str_len
= 0;
76 lex
->next_tok
.tok
= 0;
77 lex
->next_tok
.tok_str
= NULL
;
78 lex
->next_tok
.tok_str_len
= 0;
80 lex
->next_ident_tok
= TOK_IDENT
;
81 memset(lex
->ident_hashtab
, 0, IDENT_HASH_SIZE
* sizeof(struct ident
*));
85 void lex_delete(struct lexer
*lex
)
88 struct ident
*id
, *id_prev
;
89 for (i
=0; i
<IDENT_HASH_SIZE
; i
++){
90 id
= lex
->ident_hashtab
[i
];
92 id_prev
= id
->hash_prev
;
97 free(lex
->tok
.tok_str
);
98 free(lex
->next_tok
.tok_str
);
99 cpp_delete(&lex
->cpp
);
102 // generate a (fairly simple) hash for a string
103 static int hash_str(const char *str
, int hash_size
)
110 return hash_value
% hash_size
;
113 struct ident
*lex_get_ident_hashed(struct lexer
*lex
, const char *str
, int hash
)
116 ident
= lex
->ident_hashtab
[hash
];
117 while (ident
&& strcmp(ident
->str
, str
)){
118 ident
= ident
->hash_prev
;
123 // get, or create, a 'struct ident'
124 struct ident
*lex_get_ident(struct lexer
*lex
, const char *str
)
126 int hash
= hash_str(str
, IDENT_HASH_SIZE
);
127 struct ident
*ident
= lex_get_ident_hashed(lex
, str
, hash
);
130 ident
= emalloc(sizeof(struct ident
) + strlen(str
));
131 ident
->hash_prev
= lex
->ident_hashtab
[hash
];
132 lex
->ident_hashtab
[hash
] = ident
;
133 ident
->tok
= lex
->next_ident_tok
++;
134 strcpy(ident
->str
, str
);
139 // get a 'struct ident', but don't create it
140 struct ident
*lex_get_ident_nocreate(struct lexer
*lex
, const char *str
)
142 int hash
= hash_str(str
, IDENT_HASH_SIZE
);
143 return lex_get_ident_hashed(lex
, str
, hash
);
146 void lex_getline(struct lexer
*lex
)
149 cpp_read_line(&lex
->cpp
);
150 if (lex
->cpp
.line_buf
){
151 cpp_process_line(&lex
->cpp
);
152 if (lex
->cpp
.line_buf
){
153 lex
->pch
= lex
->cpp
.line_buf
;
166 void lex_start(struct lexer
*lex
)
172 void lex_white(struct lexer
*lex
)
174 lex
->pch
+= strspn(lex
->pch
, " \t\n");
177 void lex_unget_tok(struct lexer
*lex
, struct token
*token
)
179 free(lex
->next_tok
.tok_str
);
180 lex
->next_tok
= lex
->tok
;
184 void lex_next(struct lexer
*lex
)
186 if (lex
->next_tok
.tok
!= 0){
187 // token stored with lex_unget_tok
188 free(lex
->tok
.tok_str
);
189 lex
->tok
= lex
->next_tok
;
190 memset(&lex
->next_tok
, 0, sizeof lex
->next_tok
);
194 if (!lex
->pch
|| !*lex
->pch
){
195 while (!lex
->pch
|| !*lex
->pch
){
202 lex
->tok
.tok_sloc
= lex
->cpp
.line_loc
;
209 lex
->tok
.tok_sloc
= lex
->cpp
.line_loc
;
210 if (isalpha(lex
->pch
[0]) || lex
->pch
[0] == '_'){
211 // identifier or keyword
212 char *p_start
= lex
->pch
, *id_str
= NULL
;
215 while (isalnum(lex
->pch
[0]) || lex
->pch
[0] == '_'){
218 strdncpy(&id_str
, p_start
, lex
->pch
- p_start
);
219 tok
= find_keyword(id_str
);
224 ident
= lex_get_ident(lex
, id_str
);
225 lex
->tok
.tok
= ident
->tok
;
228 } else if (lex
->pch
[0] == '"' || lex
->pch
[0] == '\''){
229 // string or character literal
230 char quote
= lex
->pch
[0], **str_data
= &lex
->tok
.tok_str
;
231 int *pstr_data_len
= &lex
->tok
.tok_str_len
;
233 while (lex
->pch
[0] && lex
->pch
[0] != quote
){
234 strldcatc(str_data
, pstr_data_len
, lex
->pch
[0]);
237 if (lex
->pch
[0] == quote
){
240 lex_error(lex
, "unterminated string literal");
243 lex
->tok
.tok
= TOK_STR
;
245 lex
->tok
.tok
= TOK_CHARSTR
;
247 } else if (lex
->pch
[0] >= '0' && lex
->pch
[0] <= '9'){
249 char **num_str
= &lex
->tok
.tok_str
;
250 // we can use cpp_lex_number - it does what we want :)
251 cpp_lex_number(NULL
, &lex
->pch
, num_str
);
252 lex
->tok
.tok
= TOK_NUMBER
;
254 // scan punctuation table
255 // HOT code! optimize!
256 int i
, longest_match
= 0, longest_match_len
= 0, pch_len
= strlen(lex
->pch
), punct_len
;
257 for (i
=TOK_FIRST_PUNCT
+1; i
<TOK_INVAL
; i
++){
258 punct_len
= strlen(punctuation
[i
- (TOK_FIRST_PUNCT
+ 1)]);
259 if (punct_len
> pch_len
|| punct_len
< longest_match_len
){
262 if (!strncmp(lex
->pch
, punctuation
[i
- (TOK_FIRST_PUNCT
+ 1)], punct_len
)){
263 assert(punct_len
> longest_match_len
);
265 longest_match_len
= punct_len
;
269 lex
->pch
+= longest_match_len
;
270 lex
->tok
.tok
= longest_match
;
271 } else // single-character token?
272 if (strchr("><=!-&|+*/%^.;:~(){}[],", lex
->pch
[0])){
273 lex
->tok
.tok
= lex
->pch
[0];
276 lex_error(lex
, "invalid character in input file: %c", lex
->pch
[0]);
282 // return a string for a token
283 // 'tok_str' may be null, but you won't get the contents of
284 // strings. The return value is a static string. Don't call lex_get_tok_str
285 // or lex_delete etc. until you've finished with the return value!
286 char *lex_get_tok_str(struct lexer
*lex
, tok_t tok
, char *tok_str
)
291 } else if (tok
<= 255){
292 sprintf(buf
, "%c", tok
);
294 } else if (tok
> TOK_FIRSTK
&& tok
< TOK_LASTK
){
295 return keywords
[tok
- (TOK_FIRSTK
+ 1)];
296 } else if (tok
> TOK_FIRST_PUNCT
&& tok
< TOK_LAST_PUNCT
){
297 return punctuation
[tok
- (TOK_FIRST_PUNCT
+ 1)];
298 } else if (tok
>= TOK_IDENT
){
299 // this is difficult, because they're all in a hash table
300 // thankfully, we won't have to do this much
302 for (i
=0; i
<IDENT_HASH_SIZE
; i
++){
304 ident
= lex
->ident_hashtab
[i
];
305 while (ident
&& ident
->tok
!= tok
){
306 ident
= ident
->hash_prev
;
313 } else { // TODO: strings and punctuation-like tokens
318 bool lex_is_ident(struct lexer
*lex
, tok_t tok
)
320 return (tok
>= TOK_IDENT
&& tok
< lex
->next_ident_tok
);