Remove a ?? in the description of Mac OS support.
[python/dscho.git] / Parser / tokenizer.c
blob6ae5084c11f62b7256fdc57ecb44e429f84fefc8
2 /* Tokenizer implementation */
4 #include "pgenheaders.h"
6 #include <ctype.h>
8 #include "tokenizer.h"
9 #include "errcode.h"
/* Line reader, defined outside this file.
   Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */
extern char *PyOS_Readline(char *);

/* Initial tab stop width given to every new tokenizer state.
   Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Convert a possibly signed character to a nonnegative int */
/* XXX This assumes characters are 8 bits wide */
#if defined(__CHAR_UNSIGNED__)
#define Py_CHARMASK(c)		(c)
#else
#define Py_CHARMASK(c)		((c) & 0xff)
#endif

/* Forward declarations of the file-local helpers */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
/* Token names, indexed by token number */

char *_PyParser_TokenNames[] = {
	/* Structural tokens and literals */
	"ENDMARKER", "NAME", "NUMBER", "STRING",
	"NEWLINE", "INDENT", "DEDENT",

	/* Single-character punctuation */
	"LPAR", "RPAR", "LSQB", "RSQB",
	"COLON", "COMMA", "SEMI",
	"PLUS", "MINUS", "STAR", "SLASH",
	"VBAR", "AMPER", "LESS", "GREATER",
	"EQUAL", "DOT", "PERCENT", "BACKQUOTE",
	"LBRACE", "RBRACE",

	/* Comparison operators */
	"EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

	/* Remaining unary/binary operators */
	"TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",

	/* Augmented assignment operators */
	"PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
	"PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
	"LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

	/* This table must match the #defines in token.h! */
	"OP",
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};
90 /* Create and initialize a new tok_state structure */
92 static struct tok_state *
93 tok_new(void)
95 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
96 if (tok == NULL)
97 return NULL;
98 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
99 tok->done = E_OK;
100 tok->fp = NULL;
101 tok->tabsize = TABSIZE;
102 tok->indent = 0;
103 tok->indstack[0] = 0;
104 tok->atbol = 1;
105 tok->pendin = 0;
106 tok->prompt = tok->nextprompt = NULL;
107 tok->lineno = 0;
108 tok->level = 0;
109 tok->filename = NULL;
110 tok->altwarning = 0;
111 tok->alterror = 0;
112 tok->alttabsize = 1;
113 tok->altindstack[0] = 0;
114 return tok;
118 /* Set up tokenizer for string */
120 struct tok_state *
121 PyTokenizer_FromString(char *str)
123 struct tok_state *tok = tok_new();
124 if (tok == NULL)
125 return NULL;
126 tok->buf = tok->cur = tok->end = tok->inp = str;
127 return tok;
131 /* Set up tokenizer for file */
133 struct tok_state *
134 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
136 struct tok_state *tok = tok_new();
137 if (tok == NULL)
138 return NULL;
139 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
140 PyMem_DEL(tok);
141 return NULL;
143 tok->cur = tok->inp = tok->buf;
144 tok->end = tok->buf + BUFSIZ;
145 tok->fp = fp;
146 tok->prompt = ps1;
147 tok->nextprompt = ps2;
148 return tok;
152 /* Free a tok_state structure */
154 void
155 PyTokenizer_Free(struct tok_state *tok)
157 if (tok->fp != NULL && tok->buf != NULL)
158 PyMem_DEL(tok->buf);
159 PyMem_DEL(tok);
163 /* Get next char, updating state; error code goes into tok->done */
165 static int
166 tok_nextc(register struct tok_state *tok)
168 for (;;) {
169 if (tok->cur != tok->inp) {
170 return Py_CHARMASK(*tok->cur++); /* Fast path */
172 if (tok->done != E_OK)
173 return EOF;
174 if (tok->fp == NULL) {
175 char *end = strchr(tok->inp, '\n');
176 if (end != NULL)
177 end++;
178 else {
179 end = strchr(tok->inp, '\0');
180 if (end == tok->inp) {
181 tok->done = E_EOF;
182 return EOF;
185 if (tok->start == NULL)
186 tok->buf = tok->cur;
187 tok->lineno++;
188 tok->inp = end;
189 return Py_CHARMASK(*tok->cur++);
191 if (tok->prompt != NULL) {
192 char *new = PyOS_Readline(tok->prompt);
193 if (tok->nextprompt != NULL)
194 tok->prompt = tok->nextprompt;
195 if (new == NULL)
196 tok->done = E_INTR;
197 else if (*new == '\0') {
198 PyMem_FREE(new);
199 tok->done = E_EOF;
201 else if (tok->start != NULL) {
202 size_t start = tok->start - tok->buf;
203 size_t oldlen = tok->cur - tok->buf;
204 size_t newlen = oldlen + strlen(new);
205 char *buf = tok->buf;
206 PyMem_RESIZE(buf, char, newlen+1);
207 tok->lineno++;
208 if (buf == NULL) {
209 PyMem_DEL(tok->buf);
210 tok->buf = NULL;
211 PyMem_FREE(new);
212 tok->done = E_NOMEM;
213 return EOF;
215 tok->buf = buf;
216 tok->cur = tok->buf + oldlen;
217 strcpy(tok->buf + oldlen, new);
218 PyMem_FREE(new);
219 tok->inp = tok->buf + newlen;
220 tok->end = tok->inp + 1;
221 tok->start = tok->buf + start;
223 else {
224 tok->lineno++;
225 if (tok->buf != NULL)
226 PyMem_DEL(tok->buf);
227 tok->buf = new;
228 tok->cur = tok->buf;
229 tok->inp = strchr(tok->buf, '\0');
230 tok->end = tok->inp + 1;
233 else {
234 int done = 0;
235 int cur = 0;
236 char *pt;
237 if (tok->start == NULL) {
238 if (tok->buf == NULL) {
239 tok->buf = PyMem_NEW(char, BUFSIZ);
240 if (tok->buf == NULL) {
241 tok->done = E_NOMEM;
242 return EOF;
244 tok->end = tok->buf + BUFSIZ;
246 if (fgets(tok->buf, (int)(tok->end - tok->buf),
247 tok->fp) == NULL) {
248 tok->done = E_EOF;
249 done = 1;
251 else {
252 tok->done = E_OK;
253 tok->inp = strchr(tok->buf, '\0');
254 done = tok->inp[-1] == '\n';
257 else {
258 cur = tok->cur - tok->buf;
259 if (feof(tok->fp)) {
260 tok->done = E_EOF;
261 done = 1;
263 else
264 tok->done = E_OK;
266 tok->lineno++;
267 /* Read until '\n' or EOF */
268 while (!done) {
269 int curstart = tok->start == NULL ? -1 :
270 tok->start - tok->buf;
271 int curvalid = tok->inp - tok->buf;
272 int newsize = curvalid + BUFSIZ;
273 char *newbuf = tok->buf;
274 PyMem_RESIZE(newbuf, char, newsize);
275 if (newbuf == NULL) {
276 tok->done = E_NOMEM;
277 tok->cur = tok->inp;
278 return EOF;
280 tok->buf = newbuf;
281 tok->inp = tok->buf + curvalid;
282 tok->end = tok->buf + newsize;
283 tok->start = curstart < 0 ? NULL :
284 tok->buf + curstart;
285 if (fgets(tok->inp,
286 (int)(tok->end - tok->inp),
287 tok->fp) == NULL) {
288 /* Last line does not end in \n,
289 fake one */
290 strcpy(tok->inp, "\n");
292 tok->inp = strchr(tok->inp, '\0');
293 done = tok->inp[-1] == '\n';
295 tok->cur = tok->buf + cur;
296 #ifndef macintosh
297 /* replace "\r\n" with "\n" */
298 /* For Mac we leave the \r, giving a syntax error */
299 pt = tok->inp - 2;
300 if (pt >= tok->buf && *pt == '\r') {
301 *pt++ = '\n';
302 *pt = '\0';
303 tok->inp = pt;
305 #endif
307 if (tok->done != E_OK) {
308 if (tok->prompt != NULL)
309 PySys_WriteStderr("\n");
310 tok->cur = tok->inp;
311 return EOF;
314 /*NOTREACHED*/
318 /* Back-up one character */
320 static void
321 tok_backup(register struct tok_state *tok, register int c)
323 if (c != EOF) {
324 if (--tok->cur < tok->buf)
325 Py_FatalError("tok_backup: begin of buffer");
326 if (*tok->cur != c)
327 *tok->cur = c;
332 /* Return the token corresponding to a single character */
335 PyToken_OneChar(int c)
337 switch (c) {
338 case '(': return LPAR;
339 case ')': return RPAR;
340 case '[': return LSQB;
341 case ']': return RSQB;
342 case ':': return COLON;
343 case ',': return COMMA;
344 case ';': return SEMI;
345 case '+': return PLUS;
346 case '-': return MINUS;
347 case '*': return STAR;
348 case '/': return SLASH;
349 case '|': return VBAR;
350 case '&': return AMPER;
351 case '<': return LESS;
352 case '>': return GREATER;
353 case '=': return EQUAL;
354 case '.': return DOT;
355 case '%': return PERCENT;
356 case '`': return BACKQUOTE;
357 case '{': return LBRACE;
358 case '}': return RBRACE;
359 case '^': return CIRCUMFLEX;
360 case '~': return TILDE;
361 default: return OP;
367 PyToken_TwoChars(int c1, int c2)
369 switch (c1) {
370 case '=':
371 switch (c2) {
372 case '=': return EQEQUAL;
374 break;
375 case '!':
376 switch (c2) {
377 case '=': return NOTEQUAL;
379 break;
380 case '<':
381 switch (c2) {
382 case '>': return NOTEQUAL;
383 case '=': return LESSEQUAL;
384 case '<': return LEFTSHIFT;
386 break;
387 case '>':
388 switch (c2) {
389 case '=': return GREATEREQUAL;
390 case '>': return RIGHTSHIFT;
392 break;
393 case '+':
394 switch (c2) {
395 case '=': return PLUSEQUAL;
397 break;
398 case '-':
399 switch (c2) {
400 case '=': return MINEQUAL;
402 break;
403 case '*':
404 switch (c2) {
405 case '*': return DOUBLESTAR;
406 case '=': return STAREQUAL;
408 break;
409 case '/':
410 switch (c2) {
411 case '=': return SLASHEQUAL;
413 break;
414 case '|':
415 switch (c2) {
416 case '=': return VBAREQUAL;
418 break;
419 case '%':
420 switch (c2) {
421 case '=': return PERCENTEQUAL;
423 break;
424 case '&':
425 switch (c2) {
426 case '=': return AMPEREQUAL;
428 break;
429 case '^':
430 switch (c2) {
431 case '=': return CIRCUMFLEXEQUAL;
433 break;
435 return OP;
439 PyToken_ThreeChars(int c1, int c2, int c3)
441 switch (c1) {
442 case '<':
443 switch (c2) {
444 case '<':
445 switch (c3) {
446 case '=':
447 return LEFTSHIFTEQUAL;
448 break;
450 break;
452 break;
453 case '>':
454 switch (c2) {
455 case '>':
456 switch (c3) {
457 case '=':
458 return RIGHTSHIFTEQUAL;
459 break;
461 break;
463 break;
464 case '*':
465 switch (c2) {
466 case '*':
467 switch (c3) {
468 case '=':
469 return DOUBLESTAREQUAL;
470 break;
472 break;
474 break;
476 return OP;
479 static int
480 indenterror(struct tok_state *tok)
482 if (tok->alterror) {
483 tok->done = E_TABSPACE;
484 tok->cur = tok->inp;
485 return 1;
487 if (tok->altwarning) {
488 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
489 "in indentation\n", tok->filename);
490 tok->altwarning = 0;
492 return 0;
496 /* Get next token, after space stripping etc. */
499 PyTokenizer_Get(register struct tok_state *tok, char **p_start,
500 char **p_end)
502 register int c;
503 int blankline;
505 *p_start = *p_end = NULL;
506 nextline:
507 tok->start = NULL;
508 blankline = 0;
510 /* Get indentation level */
511 if (tok->atbol) {
512 register int col = 0;
513 register int altcol = 0;
514 tok->atbol = 0;
515 for (;;) {
516 c = tok_nextc(tok);
517 if (c == ' ')
518 col++, altcol++;
519 else if (c == '\t') {
520 col = (col/tok->tabsize + 1) * tok->tabsize;
521 altcol = (altcol/tok->alttabsize + 1)
522 * tok->alttabsize;
524 else if (c == '\014') /* Control-L (formfeed) */
525 col = altcol = 0; /* For Emacs users */
526 else
527 break;
529 tok_backup(tok, c);
530 if (c == '#' || c == '\n') {
531 /* Lines with only whitespace and/or comments
532 shouldn't affect the indentation and are
533 not passed to the parser as NEWLINE tokens,
534 except *totally* empty lines in interactive
535 mode, which signal the end of a command group. */
536 if (col == 0 && c == '\n' && tok->prompt != NULL)
537 blankline = 0; /* Let it through */
538 else
539 blankline = 1; /* Ignore completely */
540 /* We can't jump back right here since we still
541 may need to skip to the end of a comment */
543 if (!blankline && tok->level == 0) {
544 if (col == tok->indstack[tok->indent]) {
545 /* No change */
546 if (altcol != tok->altindstack[tok->indent]) {
547 if (indenterror(tok))
548 return ERRORTOKEN;
551 else if (col > tok->indstack[tok->indent]) {
552 /* Indent -- always one */
553 if (tok->indent+1 >= MAXINDENT) {
554 tok->done = E_TOODEEP;
555 tok->cur = tok->inp;
556 return ERRORTOKEN;
558 if (altcol <= tok->altindstack[tok->indent]) {
559 if (indenterror(tok))
560 return ERRORTOKEN;
562 tok->pendin++;
563 tok->indstack[++tok->indent] = col;
564 tok->altindstack[tok->indent] = altcol;
566 else /* col < tok->indstack[tok->indent] */ {
567 /* Dedent -- any number, must be consistent */
568 while (tok->indent > 0 &&
569 col < tok->indstack[tok->indent]) {
570 tok->pendin--;
571 tok->indent--;
573 if (col != tok->indstack[tok->indent]) {
574 tok->done = E_DEDENT;
575 tok->cur = tok->inp;
576 return ERRORTOKEN;
578 if (altcol != tok->altindstack[tok->indent]) {
579 if (indenterror(tok))
580 return ERRORTOKEN;
586 tok->start = tok->cur;
588 /* Return pending indents/dedents */
589 if (tok->pendin != 0) {
590 if (tok->pendin < 0) {
591 tok->pendin++;
592 return DEDENT;
594 else {
595 tok->pendin--;
596 return INDENT;
600 again:
601 tok->start = NULL;
602 /* Skip spaces */
603 do {
604 c = tok_nextc(tok);
605 } while (c == ' ' || c == '\t' || c == '\014');
607 /* Set start of current token */
608 tok->start = tok->cur - 1;
610 /* Skip comment, while looking for tab-setting magic */
611 if (c == '#') {
612 static char *tabforms[] = {
613 "tab-width:", /* Emacs */
614 ":tabstop=", /* vim, full form */
615 ":ts=", /* vim, abbreviated form */
616 "set tabsize=", /* will vi never die? */
617 /* more templates can be added here to support other editors */
619 char cbuf[80];
620 char *tp, **cp;
621 tp = cbuf;
622 do {
623 *tp++ = c = tok_nextc(tok);
624 } while (c != EOF && c != '\n' &&
625 tp - cbuf + 1 < sizeof(cbuf));
626 *tp = '\0';
627 for (cp = tabforms;
628 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
629 cp++) {
630 if ((tp = strstr(cbuf, *cp))) {
631 int newsize = atoi(tp + strlen(*cp));
633 if (newsize >= 1 && newsize <= 40) {
634 tok->tabsize = newsize;
635 if (Py_VerboseFlag)
636 PySys_WriteStderr(
637 "Tab size set to %d\n",
638 newsize);
642 while (c != EOF && c != '\n')
643 c = tok_nextc(tok);
646 /* Check for EOF and errors now */
647 if (c == EOF) {
648 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
651 /* Identifier (most frequent token!) */
652 if (isalpha(c) || c == '_') {
653 /* Process r"", u"" and ur"" */
654 switch (c) {
655 case 'r':
656 case 'R':
657 c = tok_nextc(tok);
658 if (c == '"' || c == '\'')
659 goto letter_quote;
660 break;
661 case 'u':
662 case 'U':
663 c = tok_nextc(tok);
664 if (c == 'r' || c == 'R')
665 c = tok_nextc(tok);
666 if (c == '"' || c == '\'')
667 goto letter_quote;
668 break;
670 while (isalnum(c) || c == '_') {
671 c = tok_nextc(tok);
673 tok_backup(tok, c);
674 *p_start = tok->start;
675 *p_end = tok->cur;
676 return NAME;
679 /* Newline */
680 if (c == '\n') {
681 tok->atbol = 1;
682 if (blankline || tok->level > 0)
683 goto nextline;
684 *p_start = tok->start;
685 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
686 return NEWLINE;
689 #ifdef macintosh
690 if (c == '\r') {
691 PySys_WriteStderr(
692 "File contains \\r characters (incorrect line endings?)\n");
693 tok->done = E_TOKEN;
694 tok->cur = tok->inp;
695 return ERRORTOKEN;
697 #endif
698 /* Period or number starting with period? */
699 if (c == '.') {
700 c = tok_nextc(tok);
701 if (isdigit(c)) {
702 goto fraction;
704 else {
705 tok_backup(tok, c);
706 *p_start = tok->start;
707 *p_end = tok->cur;
708 return DOT;
712 /* Number */
713 if (isdigit(c)) {
714 if (c == '0') {
715 /* Hex or octal */
716 c = tok_nextc(tok);
717 if (c == '.')
718 goto fraction;
719 #ifndef WITHOUT_COMPLEX
720 if (c == 'j' || c == 'J')
721 goto imaginary;
722 #endif
723 if (c == 'x' || c == 'X') {
724 /* Hex */
725 do {
726 c = tok_nextc(tok);
727 } while (isxdigit(c));
729 else {
730 /* XXX This is broken! E.g.,
731 09.9 should be accepted as float! */
732 /* Octal; c is first char of it */
733 /* There's no 'isoctdigit' macro, sigh */
734 while ('0' <= c && c < '8') {
735 c = tok_nextc(tok);
738 if (c == 'l' || c == 'L')
739 c = tok_nextc(tok);
741 else {
742 /* Decimal */
743 do {
744 c = tok_nextc(tok);
745 } while (isdigit(c));
746 if (c == 'l' || c == 'L')
747 c = tok_nextc(tok);
748 else {
749 /* Accept floating point numbers.
750 XXX This accepts incomplete things like
751 XXX 12e or 1e+; worry run-time */
752 if (c == '.') {
753 fraction:
754 /* Fraction */
755 do {
756 c = tok_nextc(tok);
757 } while (isdigit(c));
759 if (c == 'e' || c == 'E') {
760 /* Exponent part */
761 c = tok_nextc(tok);
762 if (c == '+' || c == '-')
763 c = tok_nextc(tok);
764 while (isdigit(c)) {
765 c = tok_nextc(tok);
768 #ifndef WITHOUT_COMPLEX
769 if (c == 'j' || c == 'J')
770 /* Imaginary part */
771 imaginary:
772 c = tok_nextc(tok);
773 #endif
776 tok_backup(tok, c);
777 *p_start = tok->start;
778 *p_end = tok->cur;
779 return NUMBER;
782 letter_quote:
783 /* String */
784 if (c == '\'' || c == '"') {
785 int quote2 = tok->cur - tok->start + 1;
786 int quote = c;
787 int triple = 0;
788 int tripcount = 0;
789 for (;;) {
790 c = tok_nextc(tok);
791 if (c == '\n') {
792 if (!triple) {
793 tok->done = E_TOKEN;
794 tok_backup(tok, c);
795 return ERRORTOKEN;
797 tripcount = 0;
799 else if (c == EOF) {
800 tok->done = E_TOKEN;
801 tok->cur = tok->inp;
802 return ERRORTOKEN;
804 else if (c == quote) {
805 tripcount++;
806 if (tok->cur - tok->start == quote2) {
807 c = tok_nextc(tok);
808 if (c == quote) {
809 triple = 1;
810 tripcount = 0;
811 continue;
813 tok_backup(tok, c);
815 if (!triple || tripcount == 3)
816 break;
818 else if (c == '\\') {
819 tripcount = 0;
820 c = tok_nextc(tok);
821 if (c == EOF) {
822 tok->done = E_TOKEN;
823 tok->cur = tok->inp;
824 return ERRORTOKEN;
827 else
828 tripcount = 0;
830 *p_start = tok->start;
831 *p_end = tok->cur;
832 return STRING;
835 /* Line continuation */
836 if (c == '\\') {
837 c = tok_nextc(tok);
838 if (c != '\n') {
839 tok->done = E_TOKEN;
840 tok->cur = tok->inp;
841 return ERRORTOKEN;
843 goto again; /* Read next line */
846 /* Check for two-character token */
848 int c2 = tok_nextc(tok);
849 int token = PyToken_TwoChars(c, c2);
850 if (token != OP) {
851 int c3 = tok_nextc(tok);
852 int token3 = PyToken_ThreeChars(c, c2, c3);
853 if (token3 != OP) {
854 token = token3;
855 } else {
856 tok_backup(tok, c3);
858 *p_start = tok->start;
859 *p_end = tok->cur;
860 return token;
862 tok_backup(tok, c2);
865 /* Keep track of parentheses nesting level */
866 switch (c) {
867 case '(':
868 case '[':
869 case '{':
870 tok->level++;
871 break;
872 case ')':
873 case ']':
874 case '}':
875 tok->level--;
876 break;
879 /* Punctuation character */
880 *p_start = tok->start;
881 *p_end = tok->cur;
882 return PyToken_OneChar(c);
#ifdef Py_DEBUG

/* Debugging aid: print the name of token `type' to stdout, followed by
   the token text start..end in parentheses for NAME, NUMBER, STRING
   and OP tokens. */

void
tok_dump(int type, char *start, char *end)
{
	int has_text = (type == NAME) || (type == NUMBER) ||
		       (type == STRING) || (type == OP);

	printf("%s", _PyParser_TokenNames[type]);
	if (has_text) {
		int length = (int)(end - start);
		printf("(%.*s)", length, start);
	}
}

#endif