/* Repository-viewer header (not part of the original source file):
   commit message: "Improved some error messages for command line processing."
   path: [python/dscho.git] / Parser / tokenizer.c
   blob: c2cb1514d1a4ebd2dbfef9ce7feceab4672290d1 */
/***********************************************************
Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
The Netherlands.

                        All Rights Reserved

Permission to use, copy, modify, and distribute this software and its
documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice appear in all copies and that
both that copyright notice and this permission notice appear in
supporting documentation, and that the names of Stichting Mathematisch
Centrum or CWI or Corporation for National Research Initiatives or
CNRI not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

While CWI is the initial source for this software, a modified version
is made available by the Corporation for National Research Initiatives
(CNRI) at the Internet address ftp://ftp.python.org.

STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.

******************************************************************/
32 /* Tokenizer implementation */
34 #include "pgenheaders.h"
36 #include <ctype.h>
38 #include "tokenizer.h"
39 #include "errcode.h"
41 extern char *PyOS_Readline Py_PROTO((char *));
42 /* Return malloc'ed string including trailing \n;
43 empty malloc'ed string for EOF;
44 NULL if interrupted */
46 /* Don't ever change this -- it would break the portability of Python code */
47 #define TABSIZE 8
/* Convert a possibly signed character to a nonnegative int, so the
   result can be compared against character constants and passed to
   the <ctype.h> classification functions. */
/* XXX This assumes characters are 8 bits wide */
#ifdef __CHAR_UNSIGNED__
#define Py_CHARMASK(c)		(c)
#else
#define Py_CHARMASK(c)		((c) & 0xff)
#endif
57 /* Forward */
58 static struct tok_state *tok_new Py_PROTO((void));
59 static int tok_nextc Py_PROTO((struct tok_state *tok));
60 static void tok_backup Py_PROTO((struct tok_state *tok, int c));
/* Token names, indexed by token type number.
   The order of the entries is significant: see the note before "OP". */

char *_PyParser_TokenNames[] = {
	"ENDMARKER",
	"NAME",
	"NUMBER",
	"STRING",
	"NEWLINE",
	"INDENT",
	"DEDENT",
	"LPAR",
	"RPAR",
	"LSQB",
	"RSQB",
	"COLON",
	"COMMA",
	"SEMI",
	"PLUS",
	"MINUS",
	"STAR",
	"SLASH",
	"VBAR",
	"AMPER",
	"LESS",
	"GREATER",
	"EQUAL",
	"DOT",
	"PERCENT",
	"BACKQUOTE",
	"LBRACE",
	"RBRACE",
	"EQEQUAL",
	"NOTEQUAL",
	"LESSEQUAL",
	"GREATEREQUAL",
	"TILDE",
	"CIRCUMFLEX",
	"LEFTSHIFT",
	"RIGHTSHIFT",
	"DOUBLESTAR",
	/* This table must match the #defines in token.h! */
	"OP",
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};
109 /* Create and initialize a new tok_state structure */
111 static struct tok_state *
112 tok_new()
114 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
115 if (tok == NULL)
116 return NULL;
117 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
118 tok->done = E_OK;
119 tok->fp = NULL;
120 tok->tabsize = TABSIZE;
121 tok->indent = 0;
122 tok->indstack[0] = 0;
123 tok->atbol = 1;
124 tok->pendin = 0;
125 tok->prompt = tok->nextprompt = NULL;
126 tok->lineno = 0;
127 tok->level = 0;
128 tok->filename = NULL;
129 tok->altwarning = 0;
130 tok->alterror = 0;
131 tok->alttabsize = 1;
132 tok->altindstack[0] = 0;
133 return tok;
137 /* Set up tokenizer for string */
139 struct tok_state *
140 PyTokenizer_FromString(str)
141 char *str;
143 struct tok_state *tok = tok_new();
144 if (tok == NULL)
145 return NULL;
146 tok->buf = tok->cur = tok->end = tok->inp = str;
147 return tok;
151 /* Set up tokenizer for file */
153 struct tok_state *
154 PyTokenizer_FromFile(fp, ps1, ps2)
155 FILE *fp;
156 char *ps1, *ps2;
158 struct tok_state *tok = tok_new();
159 if (tok == NULL)
160 return NULL;
161 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
162 PyMem_DEL(tok);
163 return NULL;
165 tok->cur = tok->inp = tok->buf;
166 tok->end = tok->buf + BUFSIZ;
167 tok->fp = fp;
168 tok->prompt = ps1;
169 tok->nextprompt = ps2;
170 return tok;
174 /* Free a tok_state structure */
176 void
177 PyTokenizer_Free(tok)
178 struct tok_state *tok;
180 if (tok->fp != NULL && tok->buf != NULL)
181 PyMem_DEL(tok->buf);
182 PyMem_DEL(tok);
186 /* Get next char, updating state; error code goes into tok->done */
188 static int
189 tok_nextc(tok)
190 register struct tok_state *tok;
192 for (;;) {
193 if (tok->cur != tok->inp) {
194 return Py_CHARMASK(*tok->cur++); /* Fast path */
196 if (tok->done != E_OK)
197 return EOF;
198 if (tok->fp == NULL) {
199 char *end = strchr(tok->inp, '\n');
200 if (end != NULL)
201 end++;
202 else {
203 end = strchr(tok->inp, '\0');
204 if (end == tok->inp) {
205 tok->done = E_EOF;
206 return EOF;
209 if (tok->start == NULL)
210 tok->buf = tok->cur;
211 tok->lineno++;
212 tok->inp = end;
213 return Py_CHARMASK(*tok->cur++);
215 if (tok->prompt != NULL) {
216 char *new = PyOS_Readline(tok->prompt);
217 if (tok->nextprompt != NULL)
218 tok->prompt = tok->nextprompt;
219 if (new == NULL)
220 tok->done = E_INTR;
221 else if (*new == '\0') {
222 free(new);
223 tok->done = E_EOF;
225 else if (tok->start != NULL) {
226 int start = tok->start - tok->buf;
227 int oldlen = tok->cur - tok->buf;
228 int newlen = oldlen + strlen(new);
229 char *buf = realloc(tok->buf, newlen+1);
230 tok->lineno++;
231 if (buf == NULL) {
232 free(tok->buf);
233 tok->buf = NULL;
234 free(new);
235 tok->done = E_NOMEM;
236 return EOF;
238 tok->buf = buf;
239 tok->cur = tok->buf + oldlen;
240 strcpy(tok->buf + oldlen, new);
241 free(new);
242 tok->inp = tok->buf + newlen;
243 tok->end = tok->inp + 1;
244 tok->start = tok->buf + start;
246 else {
247 tok->lineno++;
248 if (tok->buf != NULL)
249 free(tok->buf);
250 tok->buf = new;
251 tok->cur = tok->buf;
252 tok->inp = strchr(tok->buf, '\0');
253 tok->end = tok->inp + 1;
256 else {
257 int done = 0;
258 int cur = 0;
259 char *pt;
260 if (tok->start == NULL) {
261 if (tok->buf == NULL) {
262 tok->buf = PyMem_NEW(char, BUFSIZ);
263 if (tok->buf == NULL) {
264 tok->done = E_NOMEM;
265 return EOF;
267 tok->end = tok->buf + BUFSIZ;
269 if (fgets(tok->buf, (int)(tok->end - tok->buf),
270 tok->fp) == NULL) {
271 tok->done = E_EOF;
272 done = 1;
274 else {
275 tok->done = E_OK;
276 tok->inp = strchr(tok->buf, '\0');
277 done = tok->inp[-1] == '\n';
280 else {
281 cur = tok->cur - tok->buf;
282 if (feof(tok->fp)) {
283 tok->done = E_EOF;
284 done = 1;
286 else
287 tok->done = E_OK;
289 tok->lineno++;
290 /* Read until '\n' or EOF */
291 while (!done) {
292 int curstart = tok->start == NULL ? -1 :
293 tok->start - tok->buf;
294 int curvalid = tok->inp - tok->buf;
295 int newsize = curvalid + BUFSIZ;
296 char *newbuf = tok->buf;
297 PyMem_RESIZE(newbuf, char, newsize);
298 if (newbuf == NULL) {
299 tok->done = E_NOMEM;
300 tok->cur = tok->inp;
301 return EOF;
303 tok->buf = newbuf;
304 tok->inp = tok->buf + curvalid;
305 tok->end = tok->buf + newsize;
306 tok->start = curstart < 0 ? NULL :
307 tok->buf + curstart;
308 if (fgets(tok->inp,
309 (int)(tok->end - tok->inp),
310 tok->fp) == NULL) {
311 /* Last line does not end in \n,
312 fake one */
313 strcpy(tok->inp, "\n");
315 tok->inp = strchr(tok->inp, '\0');
316 done = tok->inp[-1] == '\n';
318 tok->cur = tok->buf + cur;
319 #ifndef macintosh
320 /* replace "\r\n" with "\n" */
321 /* For Mac we leave the \r, giving a syntax error */
322 pt = tok->inp - 2;
323 if (pt >= tok->buf && *pt == '\r') {
324 *pt++ = '\n';
325 *pt = '\0';
326 tok->inp = pt;
328 #endif
330 if (tok->done != E_OK) {
331 if (tok->prompt != NULL)
332 fprintf(stderr, "\n");
333 tok->cur = tok->inp;
334 return EOF;
337 /*NOTREACHED*/
341 /* Back-up one character */
343 static void
344 tok_backup(tok, c)
345 register struct tok_state *tok;
346 register int c;
348 if (c != EOF) {
349 if (--tok->cur < tok->buf)
350 Py_FatalError("tok_backup: begin of buffer");
351 if (*tok->cur != c)
352 *tok->cur = c;
357 /* Return the token corresponding to a single character */
360 PyToken_OneChar(c)
361 int c;
363 switch (c) {
364 case '(': return LPAR;
365 case ')': return RPAR;
366 case '[': return LSQB;
367 case ']': return RSQB;
368 case ':': return COLON;
369 case ',': return COMMA;
370 case ';': return SEMI;
371 case '+': return PLUS;
372 case '-': return MINUS;
373 case '*': return STAR;
374 case '/': return SLASH;
375 case '|': return VBAR;
376 case '&': return AMPER;
377 case '<': return LESS;
378 case '>': return GREATER;
379 case '=': return EQUAL;
380 case '.': return DOT;
381 case '%': return PERCENT;
382 case '`': return BACKQUOTE;
383 case '{': return LBRACE;
384 case '}': return RBRACE;
385 case '^': return CIRCUMFLEX;
386 case '~': return TILDE;
387 default: return OP;
393 PyToken_TwoChars(c1, c2)
394 int c1, c2;
396 switch (c1) {
397 case '=':
398 switch (c2) {
399 case '=': return EQEQUAL;
401 break;
402 case '!':
403 switch (c2) {
404 case '=': return NOTEQUAL;
406 break;
407 case '<':
408 switch (c2) {
409 case '>': return NOTEQUAL;
410 case '=': return LESSEQUAL;
411 case '<': return LEFTSHIFT;
413 break;
414 case '>':
415 switch (c2) {
416 case '=': return GREATEREQUAL;
417 case '>': return RIGHTSHIFT;
419 break;
420 case '*':
421 switch (c2) {
422 case '*': return DOUBLESTAR;
424 break;
426 return OP;
430 static int
431 indenterror(tok)
432 struct tok_state *tok;
434 if (tok->alterror) {
435 tok->done = E_INDENT;
436 tok->cur = tok->inp;
437 return 1;
439 if (tok->altwarning) {
440 fprintf(stderr, "%s: inconsistent tab/space usage\n",
441 tok->filename);
442 tok->altwarning = 0;
444 return 0;
448 /* Get next token, after space stripping etc. */
451 PyTokenizer_Get(tok, p_start, p_end)
452 register struct tok_state *tok; /* In/out: tokenizer state */
453 char **p_start, **p_end; /* Out: point to start/end of token */
455 register int c;
456 int blankline;
458 *p_start = *p_end = NULL;
459 nextline:
460 tok->start = NULL;
461 blankline = 0;
463 /* Get indentation level */
464 if (tok->atbol) {
465 register int col = 0;
466 register int altcol = 0;
467 tok->atbol = 0;
468 for (;;) {
469 c = tok_nextc(tok);
470 if (c == ' ')
471 col++, altcol++;
472 else if (c == '\t') {
473 col = (col/tok->tabsize + 1) * tok->tabsize;
474 altcol = (altcol/tok->alttabsize + 1)
475 * tok->alttabsize;
477 else if (c == '\014') /* Control-L (formfeed) */
478 col = altcol = 0; /* For Emacs users */
479 else
480 break;
482 tok_backup(tok, c);
483 if (c == '#' || c == '\n') {
484 /* Lines with only whitespace and/or comments
485 shouldn't affect the indentation and are
486 not passed to the parser as NEWLINE tokens,
487 except *totally* empty lines in interactive
488 mode, which signal the end of a command group. */
489 if (col == 0 && c == '\n' && tok->prompt != NULL)
490 blankline = 0; /* Let it through */
491 else
492 blankline = 1; /* Ignore completely */
493 /* We can't jump back right here since we still
494 may need to skip to the end of a comment */
496 if (!blankline && tok->level == 0) {
497 if (col == tok->indstack[tok->indent]) {
498 /* No change */
499 if (altcol != tok->altindstack[tok->indent]) {
500 if (indenterror(tok))
501 return ERRORTOKEN;
504 else if (col > tok->indstack[tok->indent]) {
505 /* Indent -- always one */
506 if (tok->indent+1 >= MAXINDENT) {
507 fprintf(stderr, "excessive indent\n");
508 tok->done = E_TOKEN;
509 tok->cur = tok->inp;
510 return ERRORTOKEN;
512 if (altcol <= tok->altindstack[tok->indent]) {
513 if (indenterror(tok))
514 return ERRORTOKEN;
516 tok->pendin++;
517 tok->indstack[++tok->indent] = col;
518 tok->altindstack[tok->indent] = altcol;
520 else /* col < tok->indstack[tok->indent] */ {
521 /* Dedent -- any number, must be consistent */
522 while (tok->indent > 0 &&
523 col < tok->indstack[tok->indent]) {
524 tok->pendin--;
525 tok->indent--;
527 if (col != tok->indstack[tok->indent]) {
528 fprintf(stderr,
529 "inconsistent dedent\n");
530 tok->done = E_TOKEN;
531 tok->cur = tok->inp;
532 return ERRORTOKEN;
534 if (altcol != tok->altindstack[tok->indent]) {
535 if (indenterror(tok))
536 return ERRORTOKEN;
542 tok->start = tok->cur;
544 /* Return pending indents/dedents */
545 if (tok->pendin != 0) {
546 if (tok->pendin < 0) {
547 tok->pendin++;
548 return DEDENT;
550 else {
551 tok->pendin--;
552 return INDENT;
556 again:
557 tok->start = NULL;
558 /* Skip spaces */
559 do {
560 c = tok_nextc(tok);
561 } while (c == ' ' || c == '\t' || c == '\014');
563 /* Set start of current token */
564 tok->start = tok->cur - 1;
566 /* Skip comment */
567 if (c == '#') {
568 /* Hack to allow overriding the tabsize in the file.
569 This is also recognized by vi, when it occurs near the
570 beginning or end of the file. (Will vi never die...?)
571 For Python it must be at the beginning of the file! */
572 /* XXX The real vi syntax is actually different :-( */
573 /* XXX Should recognize Emacs syntax, too */
574 int x;
575 if (sscanf(tok->cur,
576 " vi:set tabsize=%d:", &x) == 1 &&
577 x >= 1 && x <= 40) {
578 /* fprintf(stderr, "# vi:set tabsize=%d:\n", x); */
579 tok->tabsize = x;
581 do {
582 c = tok_nextc(tok);
583 } while (c != EOF && c != '\n');
586 /* Check for EOF and errors now */
587 if (c == EOF) {
588 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
591 /* Identifier (most frequent token!) */
592 if (isalpha(c) || c == '_') {
593 switch (c) {
594 case 'r':
595 case 'R':
596 c = tok_nextc(tok);
597 if (c == '"' || c == '\'')
598 goto letter_quote;
600 while (isalnum(c) || c == '_') {
601 c = tok_nextc(tok);
603 tok_backup(tok, c);
604 *p_start = tok->start;
605 *p_end = tok->cur;
606 return NAME;
609 /* Newline */
610 if (c == '\n') {
611 tok->atbol = 1;
612 if (blankline || tok->level > 0)
613 goto nextline;
614 *p_start = tok->start;
615 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
616 return NEWLINE;
619 #ifdef macintosh
620 if (c == '\r') {
621 fprintf(stderr,
622 "File contains \\r characters (incorrect line endings?)\n");
623 tok->done = E_TOKEN;
624 tok->cur = tok->inp;
625 return ERRORTOKEN;
627 #endif
628 /* Period or number starting with period? */
629 if (c == '.') {
630 c = tok_nextc(tok);
631 if (isdigit(c)) {
632 goto fraction;
634 else {
635 tok_backup(tok, c);
636 *p_start = tok->start;
637 *p_end = tok->cur;
638 return DOT;
642 /* Number */
643 if (isdigit(c)) {
644 if (c == '0') {
645 /* Hex or octal */
646 c = tok_nextc(tok);
647 if (c == '.')
648 goto fraction;
649 #ifndef WITHOUT_COMPLEX
650 if (c == 'j' || c == 'J')
651 goto imaginary;
652 #endif
653 if (c == 'x' || c == 'X') {
654 /* Hex */
655 do {
656 c = tok_nextc(tok);
657 } while (isxdigit(c));
659 else {
660 /* XXX This is broken! E.g.,
661 09.9 should be accepted as float! */
662 /* Octal; c is first char of it */
663 /* There's no 'isoctdigit' macro, sigh */
664 while ('0' <= c && c < '8') {
665 c = tok_nextc(tok);
668 if (c == 'l' || c == 'L')
669 c = tok_nextc(tok);
671 else {
672 /* Decimal */
673 do {
674 c = tok_nextc(tok);
675 } while (isdigit(c));
676 if (c == 'l' || c == 'L')
677 c = tok_nextc(tok);
678 else {
679 /* Accept floating point numbers.
680 XXX This accepts incomplete things like
681 XXX 12e or 1e+; worry run-time */
682 if (c == '.') {
683 fraction:
684 /* Fraction */
685 do {
686 c = tok_nextc(tok);
687 } while (isdigit(c));
689 if (c == 'e' || c == 'E') {
690 /* Exponent part */
691 c = tok_nextc(tok);
692 if (c == '+' || c == '-')
693 c = tok_nextc(tok);
694 while (isdigit(c)) {
695 c = tok_nextc(tok);
698 #ifndef WITHOUT_COMPLEX
699 if (c == 'j' || c == 'J')
700 /* Imaginary part */
701 imaginary:
702 c = tok_nextc(tok);
703 #endif
706 tok_backup(tok, c);
707 *p_start = tok->start;
708 *p_end = tok->cur;
709 return NUMBER;
712 letter_quote:
713 /* String */
714 if (c == '\'' || c == '"') {
715 int quote2 = tok->cur - tok->start + 1;
716 int quote = c;
717 int triple = 0;
718 int tripcount = 0;
719 for (;;) {
720 c = tok_nextc(tok);
721 if (c == '\n') {
722 if (!triple) {
723 tok->done = E_TOKEN;
724 tok_backup(tok, c);
725 return ERRORTOKEN;
727 tripcount = 0;
729 else if (c == EOF) {
730 tok->done = E_TOKEN;
731 tok->cur = tok->inp;
732 return ERRORTOKEN;
734 else if (c == quote) {
735 tripcount++;
736 if (tok->cur - tok->start == quote2) {
737 c = tok_nextc(tok);
738 if (c == quote) {
739 triple = 1;
740 tripcount = 0;
741 continue;
743 tok_backup(tok, c);
745 if (!triple || tripcount == 3)
746 break;
748 else if (c == '\\') {
749 tripcount = 0;
750 c = tok_nextc(tok);
751 if (c == EOF) {
752 tok->done = E_TOKEN;
753 tok->cur = tok->inp;
754 return ERRORTOKEN;
757 else
758 tripcount = 0;
760 *p_start = tok->start;
761 *p_end = tok->cur;
762 return STRING;
765 /* Line continuation */
766 if (c == '\\') {
767 c = tok_nextc(tok);
768 if (c != '\n') {
769 tok->done = E_TOKEN;
770 tok->cur = tok->inp;
771 return ERRORTOKEN;
773 goto again; /* Read next line */
776 /* Check for two-character token */
778 int c2 = tok_nextc(tok);
779 int token = PyToken_TwoChars(c, c2);
780 if (token != OP) {
781 *p_start = tok->start;
782 *p_end = tok->cur;
783 return token;
785 tok_backup(tok, c2);
788 /* Keep track of parentheses nesting level */
789 switch (c) {
790 case '(':
791 case '[':
792 case '{':
793 tok->level++;
794 break;
795 case ')':
796 case ']':
797 case '}':
798 tok->level--;
799 break;
802 /* Punctuation character */
803 *p_start = tok->start;
804 *p_end = tok->cur;
805 return PyToken_OneChar(c);
#ifdef Py_DEBUG

/* Debugging aid: print the name of token type `type` to stdout,
   followed by the token text in parentheses for NAME, NUMBER, STRING
   and OP tokens.  start and end delimit the token text and are only
   used for those four types.  No newline is printed. */

void
tok_dump(type, start, end)
	int type;
	char *start, *end;
{
	printf("%s", _PyParser_TokenNames[type]);
	if (type == NAME || type == NUMBER || type == STRING || type == OP)
		printf("(%.*s)", (int)(end - start), start);
}

#endif