Released version 3-2015061300
[notion.git] / libtu / tokenizer.c
blobcf929424c0591d125554436efbffe4da4b62fdd0
1 /*
2 * libtu/tokenizer.c
4 * Copyright (c) Tuomo Valkonen 1999-2002.
6 * You may distribute and modify this library under the terms of either
7 * the Clarified Artistic License or the GNU LGPL, version 2.1 or later.
8 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <limits.h>
#include <assert.h>
#include <math.h>
#include <string.h>

#include "tokenizer.h"
#include "misc.h"
#include "output.h"
#include "private.h"
24 static const char *errors[]={
25 DUMMY_TR("(no error)"),
26 DUMMY_TR("Unexpected end of file"), /* E_TOKZ_UNEXPECTED_EOF */
27 DUMMY_TR("Unexpected end of line"), /* E_TOKZ_UNEXPECTED_EOL */
28 DUMMY_TR("End of line expected"), /* E_TOKZ_EOL_EXPECTED */
29 DUMMY_TR("Invalid character"), /* E_TOKZ_INVALID_CHAR*/
30 DUMMY_TR("Numeric constant too big"), /* E_TOKZ_TOOBIG */
31 DUMMY_TR("Invalid numberic format"), /* E_TOKZ_NUMFMT */
32 DUMMY_TR("Junk after numeric constant"), /* E_TOKZ_NUM_JUNK */
33 DUMMY_TR("Not an integer"), /* E_TOKZ_NOTINT */
34 DUMMY_TR("Numeric constant out of range"), /* E_TOKZ_RANGE */
35 DUMMY_TR("Multi-character character constant"), /* E_TOKZ_MULTICHAR */
36 DUMMY_TR("Token/statement limit reached"), /* E_TOKZ_TOKEN_LIMIT */
37 DUMMY_TR("Unknown option"), /* E_TOKZ_UNKONWN_OPTION */
38 DUMMY_TR("Syntax error"), /* E_TOKZ_SYNTAX */
39 DUMMY_TR("Invalid argument"), /* E_TOKZ_INVALID_ARGUMENT */
40 DUMMY_TR("End of statement expected"), /* E_TOKZ_EOS_EXPECTED */
41 DUMMY_TR("Too few arguments"), /* E_TOKZ_TOO_FEW_ARGS */
42 DUMMY_TR("Too many arguments"), /* E_TOKZ_TOO_MANY_ARGS */
43 DUMMY_TR("Maximum section nestin level exceeded"), /* E_TOK_Z_MAX_NEST */
44 DUMMY_TR("Identifier expected"), /* E_TOKZ_IDENTIFIER_EXPECTED */
45 DUMMY_TR("Starting brace ('{') expected"), /* E_TOKZ_LBRACE_EXPECTED */
49 /* */
/* Helpers for building a heap string one character at a time.
 *
 * Characters are staged in a small on-stack buffer (X_tmp, STRBLEN bytes)
 * and moved to the heap string X by _string_append()/_string_finish().
 * All of the macros rely on the local variable `err` that STRING_DECL or
 * STRING_DECL_P declares.
 */
#define STRBLEN 32

/* Declare an empty string builder named X. */
#define STRING_DECL(X) int err=0; char* X=NULL; char X##_tmp[STRBLEN]; int X##_tmpl=0

/* Declare a string builder named X pre-filled with the string literal P. */
#define STRING_DECL_P(X, P) int err=0; char* X=NULL; char X##_tmp[STRBLEN]=P; int X##_tmpl=sizeof(P)-1

/* Append character C to X. A failure is recorded in `err` as -ENOMEM and
 * reported later by STRING_FINISH.
 */
#define STRING_APPEND(X, C) \
    do{ \
        if(!_string_append(&X, X##_tmp, &X##_tmpl, (C))) \
            err=-ENOMEM; \
    }while(0)

/* Release whatever has been accumulated in X so far. */
#define STRING_FREE(X) \
    do{ \
        if(X!=NULL){ \
            free(X); \
            X=NULL; \
        } \
    }while(0)

/* Flush the staged characters into X. If an earlier append or the final
 * flush failed, the partial string is released and the enclosing function
 * returns the (negative) error code, so callers never see a NULL string
 * together with a success return.
 */
#define STRING_FINISH(X) \
    do{ \
        if(err==0 && !_string_finish(&X, X##_tmp, X##_tmpl)) \
            err=-ENOMEM; \
        if(err!=0){ \
            STRING_FREE(X); \
            return err; \
        } \
    }while(0)
60 static bool _string_append(char **p, char *tmp, int *tmplen, char c)
62 char *tmp2;
64 if(*tmplen==STRBLEN-1){
65 tmp[STRBLEN-1]='\0';
66 if(*p!=NULL){
67 tmp2=scat(*p, tmp);
68 free(*p);
69 *p=tmp2;
70 }else{
71 *p=scopy(tmp);
73 *tmplen=1;
74 tmp[0]=c;
75 return *p!=NULL;
76 }else{
77 tmp[(*tmplen)++]=c;
78 return TRUE;
83 static bool _string_finish(char **p, char *tmp, int tmplen)
85 char *tmp2;
87 if(tmplen==0){
88 if(*p==NULL)
89 *p=scopy("");
90 }else{
91 tmp[tmplen]='\0';
92 if(*p!=NULL){
93 tmp2=scat(*p, tmp);
94 free(*p);
95 *p=tmp2;
96 }else{
97 *p=scopy(tmp);
100 return *p!=NULL;
104 /* */
/* Shorthands used by the scanners below. They all operate on a variable
 * named `tokz` that must be in scope at the point of use.
 */
#define INC_LINE()  (tokz->line++)
#define GETCH()     _getch(tokz)
#define UNGETCH(C)  _ungetch(tokz, (C))
111 static int _getch(Tokenizer *tokz)
113 int c;
115 if(tokz->ungetc!=-1){
116 c=tokz->ungetc;
117 tokz->ungetc=-1;
118 }else if (tokz->flags&TOKZ_READ_FROM_BUFFER) {
119 assert(tokz->buffer.data!=NULL);
120 if (tokz->buffer.pos==tokz->buffer.len)
121 c=EOF;
122 else
123 c=tokz->buffer.data[tokz->buffer.pos++];
124 }else{
125 c=getc(tokz->file);
128 return c;
132 static void _ungetch(Tokenizer *tokz, int c)
134 tokz->ungetc=c;
138 /* */
141 static int scan_line_comment(Token *tok, Tokenizer *tokz)
143 STRING_DECL_P(s, "#");
144 int c;
146 c=GETCH();
148 while(c!='\n' && c!=EOF){
149 STRING_APPEND(s, c);
150 c=GETCH();
153 UNGETCH(c);
155 STRING_FINISH(s);
157 TOK_SET_COMMENT(tok, s);
159 return 0;
163 static int skip_line_comment(Tokenizer *tokz)
165 int c;
168 c=GETCH();
169 }while(c!='\n' && c!=EOF);
171 UNGETCH(c);
173 return 0;
177 /* */
180 static int scan_c_comment(Token *tok, Tokenizer *tokz)
182 STRING_DECL_P(s, "/*");
183 int c;
184 int st=0;
186 while(1){
187 c=GETCH();
189 if(c==EOF){
190 STRING_FREE(s);
191 return E_TOKZ_UNEXPECTED_EOF;
194 STRING_APPEND(s, c);
196 if(c=='\n'){
197 INC_LINE();
198 }else if(st==0 && c=='*'){
199 st=1;
200 }else if(st==1){
201 if(c=='/')
202 break;
203 st=0;
207 STRING_FINISH(s);
209 TOK_SET_COMMENT(tok, s);
211 return 0;
215 static int skip_c_comment(Tokenizer *tokz)
217 int c;
218 int st=0;
220 while(1){
221 c=GETCH();
223 if(c==EOF)
224 return E_TOKZ_UNEXPECTED_EOF;
226 if(c=='\n')
227 INC_LINE();
228 else if(st==0 && c=='*')
229 st=1;
230 else if(st==1){
231 if(c=='/')
232 break;
233 st=0;
237 return 0;
241 /* */
244 static int scan_char_escape(Tokenizer *tokz)
246 static char* special_chars="nrtbae";
247 static char* specials="\n\r\t\b\a\033";
248 int base, max;
249 int i ,c;
251 c=GETCH();
253 for(i=0;special_chars[i];i++){
254 if(special_chars[i]==c)
255 return specials[c];
258 if(c=='x' || c=='X'){
259 base=16;max=2;i=0;
260 }else if(c=='d' || c=='D'){
261 base=10;max=3;i=0;
262 }else if(c=='8' || c=='9'){
263 base=10;max=2;i=c-'0';
264 }else if('0'<=c && c<='7'){
265 base=8;max=2;i=c-'0';
266 }else if(c=='\n'){
267 UNGETCH(c);
268 return -2;
269 }else{
270 return c;
274 while(--max>=0){
275 c=GETCH();
277 if(c==EOF)
278 return EOF;
280 if(c=='\n'){
281 UNGETCH(c);
282 return -2;
285 if(base==16){
286 if(!isxdigit(c))
287 break;
289 i<<=4;
291 if(isdigit(c))
292 i+=c-'0';
293 else if(i>='a')
294 i+=0xa+c-'a';
295 else
296 i+=0xa+c-'a';
298 }else if(base==10){
299 if(!isdigit(c))
300 break;
301 i*=10;
302 i+=c-'0';
303 }else{
304 if(c<'0' || c>'7')
305 break;
306 i<<=3;
307 i+=c-'0';
311 if(max>=0)
312 UNGETCH(c);
314 return i;
318 /* */
321 static int scan_string(Token *tok, Tokenizer *tokz, bool escapes)
323 STRING_DECL(s);
324 int c;
326 while(1){
327 c=GETCH();
329 if(c=='"')
330 break;
332 if(c=='\n'){
333 UNGETCH(c);
334 STRING_FREE(s);
335 return E_TOKZ_UNEXPECTED_EOL;
338 if(c=='\\' && escapes){
339 c=scan_char_escape(tokz);
340 if(c==-2){
341 STRING_FREE(s);
342 return E_TOKZ_UNEXPECTED_EOL;
346 if(c==EOF){
347 STRING_FREE(s);
348 return E_TOKZ_UNEXPECTED_EOF;
351 STRING_APPEND(s, c);
354 STRING_FINISH(s);
356 TOK_SET_STRING(tok, s);
358 return 0;
362 /* */
365 static int scan_char(Token *tok, Tokenizer *tokz)
367 int c, c2;
369 c=GETCH();
371 if(c==EOF)
372 return E_TOKZ_UNEXPECTED_EOF;
374 if(c=='\n')
375 return E_TOKZ_UNEXPECTED_EOL;
377 if(c=='\\'){
378 c=scan_char_escape(tokz);
380 if(c==EOF)
381 return E_TOKZ_UNEXPECTED_EOF;
383 if(c==-2)
384 return E_TOKZ_UNEXPECTED_EOL;
387 c2=GETCH();
389 if(c2!='\'')
390 return E_TOKZ_MULTICHAR;
392 TOK_SET_CHAR(tok, c);
394 return 0;
398 /* */
/* True if X can start an identifier: a letter, '_' or '$'.
 * X is evaluated more than once, so it must not have side effects.
 */
#define START_IDENT(X) (isalpha(X) || (X)=='_' || (X)=='$')
404 static int scan_identifier(Token *tok, Tokenizer *tokz, int c)
406 STRING_DECL(s);
409 STRING_APPEND(s, c);
410 c=GETCH();
411 }while(isalnum(c) || c=='_' || c=='$');
413 UNGETCH(c);
415 STRING_FINISH(s);
417 TOK_SET_IDENT(tok, s);
419 return 0;
422 #define NP_SIMPLE_IMPL
423 #include "np/numparser2.h"
424 #include "np/np-conv.h"
427 static int scan_number(Token *tok, Tokenizer *tokz, int c)
429 NPNum num=NUM_INIT;
430 int e;
432 if((e=parse_number(&num, tokz, c)))
433 return e;
435 if(num.type==NPNUM_INT){
436 long l;
437 if((e=num_to_long(&l, &num, TRUE)))
438 return e;
440 TOK_SET_LONG(tok, l);
441 }else if(num.type==NPNUM_FLOAT){
442 double d;
443 if((e=num_to_double(&d, &num)))
444 return e;
446 TOK_SET_DOUBLE(tok, d);
447 }else{
448 return E_TOKZ_NUMFMT;
451 return 0;
455 /* */
458 static uchar op_map[]={
459 0x00, /* ________ 0-7 */
460 0x00, /* ________ 8-15 */
461 0x00, /* ________ 16-23 */
462 0x00, /* ________ 24-31 */
463 0x62, /* _!___%&_ 32-39 */
464 0xff, /* ()*+,-./ 40-47 */
465 0x00, /* ________ 48-55 */
466 0xfc, /* __:;<=>? 56-63 */
467 0x01, /* @_______ 64-71 */
468 0x00, /* ________ 72-79 */
469 0x00, /* ________ 80-87 */
470 0x78, /* ___[_]^_ 88-95 */
471 0x00, /* ________ 96-103 */
472 0x00, /* ________ 104-111 */
473 0x00, /* ________ 112-119 */
474 0x38 /* ___{|}__ 120-127 */
478 static bool map_isset(uchar *map, uint ch)
480 if(ch>127)
481 return FALSE;
483 return map[ch>>3]&(1<<(ch&7));
487 static bool is_opch(uint ch)
489 return map_isset(op_map, ch);
493 static int scan_op(Token *tok, Tokenizer *tokz, int c)
495 int c2;
496 int op=-1;
498 /* Quickly check it is an operator character */
499 if(!is_opch(c))
500 return E_TOKZ_INVALID_CHAR;
502 switch(c){
503 case '+':
504 case '-':
505 case '*':
506 /* case '/': Checked elsewhere */
507 case '%':
508 case '^':
509 case '!':
510 case '=':
511 case '<':
512 case '>':
513 c2=GETCH();
514 if(c2=='='){
515 op=c|(c2<<8);
516 }else if(c2==c && (c2!='%' && c2!='!' && c2!='*')){
517 if(c=='<' || c=='>'){
518 int c3=GETCH();
519 if(c3=='='){
520 op=c|(c2<<8)|(c3<<16);
521 }else{
522 UNGETCH(c3);
523 op=c|(c2<<8);
525 }else{
526 op=c|(c2<<8);
528 }else{
529 UNGETCH(c2);
530 op=c;
532 break;
534 /* It is already known that it is a operator so these are not needed
535 case ':':
536 case '~':
537 case '?':
538 case '.':
539 case ';';
540 case '{':
541 case '}':
542 case '@':
543 case '|':
544 case '&':
546 default:
547 op=c;
550 TOK_SET_OP(tok, op);
552 return 0;
556 /* */
559 void tokz_warn(const Tokenizer *tokz, int line, const char *fmt, ...)
561 va_list args;
563 va_start(args, fmt);
565 if(tokz!=NULL)
566 warn_obj_line_v(tokz->name, line, fmt, args);
567 else
568 warn(fmt, args);
570 va_end(args);
574 void tokz_warn_error(const Tokenizer *tokz, int line, int e)
576 if(e==E_TOKZ_UNEXPECTED_EOF)
577 line=0;
579 if(e<0)
580 tokz_warn(tokz, line, "%s", strerror(-e));
581 else
582 tokz_warn(tokz, line, "%s", TR(errors[e]));
586 bool tokz_get_token(Tokenizer *tokz, Token *tok)
588 int c, c2, e;
590 if (!(tokz->flags&TOKZ_READ_FROM_BUFFER))
591 assert(tokz->file!=NULL);
593 tok_free(tok);
595 if(!TOK_IS_INVALID(&(tokz->ungettok))){
596 *tok=tokz->ungettok;
597 tokz->ungettok.type=TOK_INVALID;
598 return TRUE;
601 while(1){
603 e=0;
606 c=GETCH();
607 }while(c!='\n' && c!=EOF && isspace(c));
609 tok->line=tokz->line;
611 switch(c){
612 case EOF:
613 TOK_SET_OP(tok, OP_EOF);
614 return TRUE;
616 case '\n':
617 INC_LINE();
619 if(tokz->flags&TOKZ_IGNORE_NEXTLINE)
620 continue;
622 TOK_SET_OP(tok, OP_NEXTLINE);
624 return TRUE;
626 case '\\':
628 c=GETCH();
629 if(c==EOF){
630 TOK_SET_OP(tok, OP_EOF);
631 return FALSE;
633 if(!isspace(c) && e==0){
634 e=E_TOKZ_EOL_EXPECTED;
635 tokz_warn_error(tokz, tokz->line, e);
636 if(!(tokz->flags&TOKZ_ERROR_TOLERANT))
637 return FALSE;
639 }while(c!='\n');
641 INC_LINE();
642 continue;
644 case '#':
645 if(tokz->flags&TOKZ_READ_COMMENTS){
646 e=scan_line_comment(tok, tokz);
647 break;
648 }else if((e=skip_line_comment(tokz))){
649 break;
652 continue;
654 case '/':
655 c2=GETCH();
657 if(c2=='='){
658 TOK_SET_OP(tok, OP_AS_DIV);
659 return TRUE;
662 if(c2!='*'){
663 UNGETCH(c2);
664 TOK_SET_OP(tok, OP_DIV);
665 return TRUE;
668 if(tokz->flags&TOKZ_READ_COMMENTS){
669 e=scan_c_comment(tok, tokz);
670 break;
671 }else if((e=skip_c_comment(tokz))){
672 break;
675 continue;
677 case '\"':
678 e=scan_string(tok, tokz, TRUE);
679 break;
681 case '\'':
682 e=scan_char(tok, tokz);
683 break;
685 default:
686 if(('0'<=c && c<='9') || c=='-' || c=='+'){
687 e=scan_number(tok, tokz, c);
688 break;
691 if(START_IDENT(c))
692 e=scan_identifier(tok, tokz, c);
693 else
694 e=scan_op(tok, tokz, c);
697 if(!e)
698 return TRUE;
700 tokz_warn_error(tokz, tokz->line, e);
701 return FALSE;
706 void tokz_unget_token(Tokenizer *tokz, Token *tok)
708 tok_free(&(tokz->ungettok));
709 tokz->ungettok=*tok;
710 tok->type=TOK_INVALID;
/*
 * File open
 */
718 static bool do_tokz_pushf(Tokenizer *tokz)
720 Tokenizer_FInfo *finfo;
722 finfo=REALLOC_N(tokz->filestack, Tokenizer_FInfo,
723 tokz->filestack_n, tokz->filestack_n+1);
725 if(finfo==NULL)
726 return FALSE;
728 tokz->filestack=finfo;
729 finfo=&(finfo[tokz->filestack_n++]);
731 finfo->file=tokz->file;
732 finfo->name=tokz->name;
733 finfo->line=tokz->line;
734 finfo->ungetc=tokz->ungetc;
735 finfo->ungettok=tokz->ungettok;
737 return TRUE;
741 bool tokz_pushf_file(Tokenizer *tokz, FILE *file, const char *fname)
743 char *fname_copy=NULL;
745 if(file==NULL)
746 return FALSE;
748 if(fname!=NULL){
749 fname_copy=scopy(fname);
750 if(fname_copy==NULL){
751 warn_err();
752 return FALSE;
756 if(tokz->file!=NULL){
757 if(!do_tokz_pushf(tokz)){
758 warn_err();
759 if(fname_copy!=NULL)
760 free(fname_copy);
761 return FALSE;
765 tokz->file=file;
766 tokz->name=fname_copy;
767 tokz->line=1;
768 tokz->ungetc=-1;
769 tokz->ungettok.type=TOK_INVALID;
771 return TRUE;
775 bool tokz_pushf(Tokenizer *tokz, const char *fname)
777 FILE *file;
779 file=fopen(fname, "r");
781 if(file==NULL){
782 warn_err_obj(fname);
783 return FALSE;
786 if(!tokz_pushf_file(tokz, file, fname)){
787 fclose(file);
788 return FALSE;
791 return TRUE;
796 static Tokenizer *tokz_create()
798 Tokenizer*tokz;
800 tokz=ALLOC(Tokenizer);
802 if(tokz==NULL){
803 warn_err();
804 return NULL;
807 tokz->file=NULL;
808 tokz->name=NULL;
809 tokz->line=1;
810 tokz->ungetc=-1;
811 tokz->ungettok.type=TOK_INVALID;
812 tokz->flags=0;
813 tokz->optstack=NULL;
814 tokz->nest_lvl=0;
815 tokz->filestack_n=0;
816 tokz->filestack=NULL;
817 tokz->buffer.data=0;
818 tokz->buffer.len=0;
819 tokz->buffer.pos=0;
821 return tokz;
825 Tokenizer *tokz_open(const char *fname)
827 Tokenizer *tokz;
829 tokz=tokz_create();
831 if(!tokz_pushf(tokz, fname)){
832 free(tokz);
833 return NULL;
836 return tokz;
840 Tokenizer *tokz_open_file(FILE *file, const char *fname)
842 Tokenizer *tokz;
844 tokz=tokz_create();
846 if(!tokz_pushf_file(tokz, file, fname)){
847 free(tokz);
848 return NULL;
851 return tokz;
854 Tokenizer *tokz_prepare_buffer(char *buffer, int len)
856 Tokenizer *tokz;
857 char old=0;
859 tokz=tokz_create();
860 if(len>0){
861 old=buffer[len-1];
862 buffer[len-1]='\0';
865 tokz->flags|=TOKZ_READ_FROM_BUFFER;
866 tokz->buffer.data=scopy(buffer);
867 tokz->buffer.len=(len>0 ? (uint)len : strlen(tokz->buffer.data));
868 tokz->buffer.pos=0;
870 if(old>0)
871 buffer[len-1]=old;
873 return tokz;
/*
 * File close
 */
880 static bool do_tokz_popf(Tokenizer *tokz, bool shrink)
882 Tokenizer_FInfo *finfo;
884 if(tokz->filestack_n<=0)
885 return FALSE;
887 if(tokz->file!=NULL)
888 fclose(tokz->file);
889 if(tokz->name!=NULL)
890 free(tokz->name);
892 finfo=&(tokz->filestack[--tokz->filestack_n]);
894 tokz->file=finfo->file;
895 tokz->name=finfo->name;
896 tokz->line=finfo->line;
897 tokz->ungetc=finfo->ungetc;
898 tokz->ungettok=finfo->ungettok;
900 if(tokz->filestack_n==0){
901 free(tokz->filestack);
902 tokz->filestack=NULL;
903 }else if(shrink){
904 finfo=REALLOC_N(tokz->filestack, Tokenizer_FInfo,
905 tokz->filestack_n+1, tokz->filestack_n);
906 if(finfo==NULL)
907 warn_err();
908 else
909 tokz->filestack=finfo;
912 return TRUE;
916 bool tokz_popf(Tokenizer *tokz)
918 return do_tokz_popf(tokz, TRUE);
922 void tokz_close(Tokenizer *tokz)
924 while(tokz->filestack_n>0)
925 do_tokz_popf(tokz, FALSE);
927 if(tokz->file!=NULL)
928 fclose(tokz->file);
929 if(tokz->name!=NULL)
930 free(tokz->name);
931 tok_free(&(tokz->ungettok));
933 free(tokz);
938 /* */
941 void tok_free(Token *tok)
943 if(TOK_IS_STRING(tok) || TOK_IS_IDENT(tok) || TOK_IS_COMMENT(tok)){
944 if(TOK_STRING_VAL(tok)!=NULL)
945 free(TOK_STRING_VAL(tok));
948 tok->type=TOK_INVALID;
952 void tok_init(Token *tok)
954 static Token dummy=TOK_INIT;
956 memcpy(tok, &dummy, sizeof(*tok));