Use full package paths in imports.
[python/dscho.git] / Parser / tokenizer.c
blobb4e0fbf7e58d9446e02d3f5af619b42b0191f2fa
2 /* Tokenizer implementation */
4 #include "Python.h"
5 #include "pgenheaders.h"
7 #include <ctype.h>
9 #include "tokenizer.h"
10 #include "errcode.h"
12 extern char *PyOS_Readline(char *);
13 /* Return malloc'ed string including trailing \n;
14 empty malloc'ed string for EOF;
15 NULL if interrupted */
/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Convert a possibly signed character to a nonnegative int.
   When plain char is unsigned the value is already nonnegative; otherwise
   the low eight bits are masked out.
   XXX This assumes characters are 8 bits wide. */
#ifdef __CHAR_UNSIGNED__
#define Py_CHARMASK(c)		(c)
#else
#define Py_CHARMASK(c)		((c) & 0xff)
#endif
28 /* Forward */
29 static struct tok_state *tok_new(void);
30 static int tok_nextc(struct tok_state *tok);
31 static void tok_backup(struct tok_state *tok, int c);
/* Token names.
   Printable name for each token type, indexed by the token's numeric
   value.  The order of the entries is significant. */

char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "BACKQUOTE",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
93 /* Create and initialize a new tok_state structure */
95 static struct tok_state *
96 tok_new(void)
98 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
99 if (tok == NULL)
100 return NULL;
101 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
102 tok->done = E_OK;
103 tok->fp = NULL;
104 tok->tabsize = TABSIZE;
105 tok->indent = 0;
106 tok->indstack[0] = 0;
107 tok->atbol = 1;
108 tok->pendin = 0;
109 tok->prompt = tok->nextprompt = NULL;
110 tok->lineno = 0;
111 tok->level = 0;
112 tok->filename = NULL;
113 tok->altwarning = 0;
114 tok->alterror = 0;
115 tok->alttabsize = 1;
116 tok->altindstack[0] = 0;
117 return tok;
121 /* Set up tokenizer for string */
123 struct tok_state *
124 PyTokenizer_FromString(char *str)
126 struct tok_state *tok = tok_new();
127 if (tok == NULL)
128 return NULL;
129 tok->buf = tok->cur = tok->end = tok->inp = str;
130 return tok;
134 /* Set up tokenizer for file */
136 struct tok_state *
137 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
139 struct tok_state *tok = tok_new();
140 if (tok == NULL)
141 return NULL;
142 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
143 PyMem_DEL(tok);
144 return NULL;
146 tok->cur = tok->inp = tok->buf;
147 tok->end = tok->buf + BUFSIZ;
148 tok->fp = fp;
149 tok->prompt = ps1;
150 tok->nextprompt = ps2;
151 return tok;
155 /* Free a tok_state structure */
157 void
158 PyTokenizer_Free(struct tok_state *tok)
160 if (tok->fp != NULL && tok->buf != NULL)
161 PyMem_DEL(tok->buf);
162 PyMem_DEL(tok);
166 /* Get next char, updating state; error code goes into tok->done */
168 static int
169 tok_nextc(register struct tok_state *tok)
171 for (;;) {
172 if (tok->cur != tok->inp) {
173 return Py_CHARMASK(*tok->cur++); /* Fast path */
175 if (tok->done != E_OK)
176 return EOF;
177 if (tok->fp == NULL) {
178 char *end = strchr(tok->inp, '\n');
179 if (end != NULL)
180 end++;
181 else {
182 end = strchr(tok->inp, '\0');
183 if (end == tok->inp) {
184 tok->done = E_EOF;
185 return EOF;
188 if (tok->start == NULL)
189 tok->buf = tok->cur;
190 tok->lineno++;
191 tok->inp = end;
192 return Py_CHARMASK(*tok->cur++);
194 if (tok->prompt != NULL) {
195 char *new = PyOS_Readline(tok->prompt);
196 if (tok->nextprompt != NULL)
197 tok->prompt = tok->nextprompt;
198 if (new == NULL)
199 tok->done = E_INTR;
200 else if (*new == '\0') {
201 PyMem_FREE(new);
202 tok->done = E_EOF;
204 else if (tok->start != NULL) {
205 size_t start = tok->start - tok->buf;
206 size_t oldlen = tok->cur - tok->buf;
207 size_t newlen = oldlen + strlen(new);
208 char *buf = tok->buf;
209 PyMem_RESIZE(buf, char, newlen+1);
210 tok->lineno++;
211 if (buf == NULL) {
212 PyMem_DEL(tok->buf);
213 tok->buf = NULL;
214 PyMem_FREE(new);
215 tok->done = E_NOMEM;
216 return EOF;
218 tok->buf = buf;
219 tok->cur = tok->buf + oldlen;
220 strcpy(tok->buf + oldlen, new);
221 PyMem_FREE(new);
222 tok->inp = tok->buf + newlen;
223 tok->end = tok->inp + 1;
224 tok->start = tok->buf + start;
226 else {
227 tok->lineno++;
228 if (tok->buf != NULL)
229 PyMem_DEL(tok->buf);
230 tok->buf = new;
231 tok->cur = tok->buf;
232 tok->inp = strchr(tok->buf, '\0');
233 tok->end = tok->inp + 1;
236 else {
237 int done = 0;
238 int cur = 0;
239 char *pt;
240 if (tok->start == NULL) {
241 if (tok->buf == NULL) {
242 tok->buf = PyMem_NEW(char, BUFSIZ);
243 if (tok->buf == NULL) {
244 tok->done = E_NOMEM;
245 return EOF;
247 tok->end = tok->buf + BUFSIZ;
249 if (Py_UniversalNewlineFgets(tok->buf, (int)(tok->end - tok->buf),
250 tok->fp, NULL) == NULL) {
251 tok->done = E_EOF;
252 done = 1;
254 else {
255 tok->done = E_OK;
256 tok->inp = strchr(tok->buf, '\0');
257 done = tok->inp[-1] == '\n';
260 else {
261 cur = tok->cur - tok->buf;
262 if (feof(tok->fp)) {
263 tok->done = E_EOF;
264 done = 1;
266 else
267 tok->done = E_OK;
269 tok->lineno++;
270 /* Read until '\n' or EOF */
271 while (!done) {
272 int curstart = tok->start == NULL ? -1 :
273 tok->start - tok->buf;
274 int curvalid = tok->inp - tok->buf;
275 int newsize = curvalid + BUFSIZ;
276 char *newbuf = tok->buf;
277 PyMem_RESIZE(newbuf, char, newsize);
278 if (newbuf == NULL) {
279 tok->done = E_NOMEM;
280 tok->cur = tok->inp;
281 return EOF;
283 tok->buf = newbuf;
284 tok->inp = tok->buf + curvalid;
285 tok->end = tok->buf + newsize;
286 tok->start = curstart < 0 ? NULL :
287 tok->buf + curstart;
288 if (Py_UniversalNewlineFgets(tok->inp,
289 (int)(tok->end - tok->inp),
290 tok->fp, NULL) == NULL) {
291 /* Last line does not end in \n,
292 fake one */
293 strcpy(tok->inp, "\n");
295 tok->inp = strchr(tok->inp, '\0');
296 done = tok->inp[-1] == '\n';
298 tok->cur = tok->buf + cur;
299 #ifndef macintosh
300 /* replace "\r\n" with "\n" */
301 /* For Mac we leave the \r, giving a syntax error */
302 pt = tok->inp - 2;
303 if (pt >= tok->buf && *pt == '\r') {
304 *pt++ = '\n';
305 *pt = '\0';
306 tok->inp = pt;
308 #endif
310 if (tok->done != E_OK) {
311 if (tok->prompt != NULL)
312 PySys_WriteStderr("\n");
313 tok->cur = tok->inp;
314 return EOF;
317 /*NOTREACHED*/
321 /* Back-up one character */
323 static void
324 tok_backup(register struct tok_state *tok, register int c)
326 if (c != EOF) {
327 if (--tok->cur < tok->buf)
328 Py_FatalError("tok_backup: begin of buffer");
329 if (*tok->cur != c)
330 *tok->cur = c;
335 /* Return the token corresponding to a single character */
338 PyToken_OneChar(int c)
340 switch (c) {
341 case '(': return LPAR;
342 case ')': return RPAR;
343 case '[': return LSQB;
344 case ']': return RSQB;
345 case ':': return COLON;
346 case ',': return COMMA;
347 case ';': return SEMI;
348 case '+': return PLUS;
349 case '-': return MINUS;
350 case '*': return STAR;
351 case '/': return SLASH;
352 case '|': return VBAR;
353 case '&': return AMPER;
354 case '<': return LESS;
355 case '>': return GREATER;
356 case '=': return EQUAL;
357 case '.': return DOT;
358 case '%': return PERCENT;
359 case '`': return BACKQUOTE;
360 case '{': return LBRACE;
361 case '}': return RBRACE;
362 case '^': return CIRCUMFLEX;
363 case '~': return TILDE;
364 default: return OP;
370 PyToken_TwoChars(int c1, int c2)
372 switch (c1) {
373 case '=':
374 switch (c2) {
375 case '=': return EQEQUAL;
377 break;
378 case '!':
379 switch (c2) {
380 case '=': return NOTEQUAL;
382 break;
383 case '<':
384 switch (c2) {
385 case '>': return NOTEQUAL;
386 case '=': return LESSEQUAL;
387 case '<': return LEFTSHIFT;
389 break;
390 case '>':
391 switch (c2) {
392 case '=': return GREATEREQUAL;
393 case '>': return RIGHTSHIFT;
395 break;
396 case '+':
397 switch (c2) {
398 case '=': return PLUSEQUAL;
400 break;
401 case '-':
402 switch (c2) {
403 case '=': return MINEQUAL;
405 break;
406 case '*':
407 switch (c2) {
408 case '*': return DOUBLESTAR;
409 case '=': return STAREQUAL;
411 break;
412 case '/':
413 switch (c2) {
414 case '/': return DOUBLESLASH;
415 case '=': return SLASHEQUAL;
417 break;
418 case '|':
419 switch (c2) {
420 case '=': return VBAREQUAL;
422 break;
423 case '%':
424 switch (c2) {
425 case '=': return PERCENTEQUAL;
427 break;
428 case '&':
429 switch (c2) {
430 case '=': return AMPEREQUAL;
432 break;
433 case '^':
434 switch (c2) {
435 case '=': return CIRCUMFLEXEQUAL;
437 break;
439 return OP;
443 PyToken_ThreeChars(int c1, int c2, int c3)
445 switch (c1) {
446 case '<':
447 switch (c2) {
448 case '<':
449 switch (c3) {
450 case '=':
451 return LEFTSHIFTEQUAL;
453 break;
455 break;
456 case '>':
457 switch (c2) {
458 case '>':
459 switch (c3) {
460 case '=':
461 return RIGHTSHIFTEQUAL;
463 break;
465 break;
466 case '*':
467 switch (c2) {
468 case '*':
469 switch (c3) {
470 case '=':
471 return DOUBLESTAREQUAL;
473 break;
475 break;
476 case '/':
477 switch (c2) {
478 case '/':
479 switch (c3) {
480 case '=':
481 return DOUBLESLASHEQUAL;
483 break;
485 break;
487 return OP;
490 static int
491 indenterror(struct tok_state *tok)
493 if (tok->alterror) {
494 tok->done = E_TABSPACE;
495 tok->cur = tok->inp;
496 return 1;
498 if (tok->altwarning) {
499 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
500 "in indentation\n", tok->filename);
501 tok->altwarning = 0;
503 return 0;
507 /* Get next token, after space stripping etc. */
510 PyTokenizer_Get(register struct tok_state *tok, char **p_start,
511 char **p_end)
513 register int c;
514 int blankline;
516 *p_start = *p_end = NULL;
517 nextline:
518 tok->start = NULL;
519 blankline = 0;
521 /* Get indentation level */
522 if (tok->atbol) {
523 register int col = 0;
524 register int altcol = 0;
525 tok->atbol = 0;
526 for (;;) {
527 c = tok_nextc(tok);
528 if (c == ' ')
529 col++, altcol++;
530 else if (c == '\t') {
531 col = (col/tok->tabsize + 1) * tok->tabsize;
532 altcol = (altcol/tok->alttabsize + 1)
533 * tok->alttabsize;
535 else if (c == '\014') /* Control-L (formfeed) */
536 col = altcol = 0; /* For Emacs users */
537 else
538 break;
540 tok_backup(tok, c);
541 if (c == '#' || c == '\n') {
542 /* Lines with only whitespace and/or comments
543 shouldn't affect the indentation and are
544 not passed to the parser as NEWLINE tokens,
545 except *totally* empty lines in interactive
546 mode, which signal the end of a command group. */
547 if (col == 0 && c == '\n' && tok->prompt != NULL)
548 blankline = 0; /* Let it through */
549 else
550 blankline = 1; /* Ignore completely */
551 /* We can't jump back right here since we still
552 may need to skip to the end of a comment */
554 if (!blankline && tok->level == 0) {
555 if (col == tok->indstack[tok->indent]) {
556 /* No change */
557 if (altcol != tok->altindstack[tok->indent]) {
558 if (indenterror(tok))
559 return ERRORTOKEN;
562 else if (col > tok->indstack[tok->indent]) {
563 /* Indent -- always one */
564 if (tok->indent+1 >= MAXINDENT) {
565 tok->done = E_TOODEEP;
566 tok->cur = tok->inp;
567 return ERRORTOKEN;
569 if (altcol <= tok->altindstack[tok->indent]) {
570 if (indenterror(tok))
571 return ERRORTOKEN;
573 tok->pendin++;
574 tok->indstack[++tok->indent] = col;
575 tok->altindstack[tok->indent] = altcol;
577 else /* col < tok->indstack[tok->indent] */ {
578 /* Dedent -- any number, must be consistent */
579 while (tok->indent > 0 &&
580 col < tok->indstack[tok->indent]) {
581 tok->pendin--;
582 tok->indent--;
584 if (col != tok->indstack[tok->indent]) {
585 tok->done = E_DEDENT;
586 tok->cur = tok->inp;
587 return ERRORTOKEN;
589 if (altcol != tok->altindstack[tok->indent]) {
590 if (indenterror(tok))
591 return ERRORTOKEN;
597 tok->start = tok->cur;
599 /* Return pending indents/dedents */
600 if (tok->pendin != 0) {
601 if (tok->pendin < 0) {
602 tok->pendin++;
603 return DEDENT;
605 else {
606 tok->pendin--;
607 return INDENT;
611 again:
612 tok->start = NULL;
613 /* Skip spaces */
614 do {
615 c = tok_nextc(tok);
616 } while (c == ' ' || c == '\t' || c == '\014');
618 /* Set start of current token */
619 tok->start = tok->cur - 1;
621 /* Skip comment, while looking for tab-setting magic */
622 if (c == '#') {
623 static char *tabforms[] = {
624 "tab-width:", /* Emacs */
625 ":tabstop=", /* vim, full form */
626 ":ts=", /* vim, abbreviated form */
627 "set tabsize=", /* will vi never die? */
628 /* more templates can be added here to support other editors */
630 char cbuf[80];
631 char *tp, **cp;
632 tp = cbuf;
633 do {
634 *tp++ = c = tok_nextc(tok);
635 } while (c != EOF && c != '\n' &&
636 tp - cbuf + 1 < sizeof(cbuf));
637 *tp = '\0';
638 for (cp = tabforms;
639 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
640 cp++) {
641 if ((tp = strstr(cbuf, *cp))) {
642 int newsize = atoi(tp + strlen(*cp));
644 if (newsize >= 1 && newsize <= 40) {
645 tok->tabsize = newsize;
646 if (Py_VerboseFlag)
647 PySys_WriteStderr(
648 "Tab size set to %d\n",
649 newsize);
653 while (c != EOF && c != '\n')
654 c = tok_nextc(tok);
657 /* Check for EOF and errors now */
658 if (c == EOF) {
659 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
662 /* Identifier (most frequent token!) */
663 if (isalpha(c) || c == '_') {
664 /* Process r"", u"" and ur"" */
665 switch (c) {
666 case 'r':
667 case 'R':
668 c = tok_nextc(tok);
669 if (c == '"' || c == '\'')
670 goto letter_quote;
671 break;
672 case 'u':
673 case 'U':
674 c = tok_nextc(tok);
675 if (c == 'r' || c == 'R')
676 c = tok_nextc(tok);
677 if (c == '"' || c == '\'')
678 goto letter_quote;
679 break;
681 while (isalnum(c) || c == '_') {
682 c = tok_nextc(tok);
684 tok_backup(tok, c);
685 *p_start = tok->start;
686 *p_end = tok->cur;
687 return NAME;
690 /* Newline */
691 if (c == '\n') {
692 tok->atbol = 1;
693 if (blankline || tok->level > 0)
694 goto nextline;
695 *p_start = tok->start;
696 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
697 return NEWLINE;
700 #ifdef macintosh
701 if (c == '\r') {
702 PySys_WriteStderr(
703 "File contains \\r characters (incorrect line endings?)\n");
704 tok->done = E_TOKEN;
705 tok->cur = tok->inp;
706 return ERRORTOKEN;
708 #endif
709 /* Period or number starting with period? */
710 if (c == '.') {
711 c = tok_nextc(tok);
712 if (isdigit(c)) {
713 goto fraction;
715 else {
716 tok_backup(tok, c);
717 *p_start = tok->start;
718 *p_end = tok->cur;
719 return DOT;
723 /* Number */
724 if (isdigit(c)) {
725 if (c == '0') {
726 /* Hex or octal -- maybe. */
727 c = tok_nextc(tok);
728 if (c == '.')
729 goto fraction;
730 #ifndef WITHOUT_COMPLEX
731 if (c == 'j' || c == 'J')
732 goto imaginary;
733 #endif
734 if (c == 'x' || c == 'X') {
735 /* Hex */
736 do {
737 c = tok_nextc(tok);
738 } while (isxdigit(c));
740 else {
741 int found_decimal = 0;
742 /* Octal; c is first char of it */
743 /* There's no 'isoctdigit' macro, sigh */
744 while ('0' <= c && c < '8') {
745 c = tok_nextc(tok);
747 if (isdigit(c)) {
748 found_decimal = 1;
749 do {
750 c = tok_nextc(tok);
751 } while (isdigit(c));
753 if (c == '.')
754 goto fraction;
755 else if (c == 'e' || c == 'E')
756 goto exponent;
757 #ifndef WITHOUT_COMPLEX
758 else if (c == 'j' || c == 'J')
759 goto imaginary;
760 #endif
761 else if (found_decimal) {
762 tok->done = E_TOKEN;
763 tok_backup(tok, c);
764 return ERRORTOKEN;
767 if (c == 'l' || c == 'L')
768 c = tok_nextc(tok);
770 else {
771 /* Decimal */
772 do {
773 c = tok_nextc(tok);
774 } while (isdigit(c));
775 if (c == 'l' || c == 'L')
776 c = tok_nextc(tok);
777 else {
778 /* Accept floating point numbers. */
779 if (c == '.') {
780 fraction:
781 /* Fraction */
782 do {
783 c = tok_nextc(tok);
784 } while (isdigit(c));
786 if (c == 'e' || c == 'E') {
787 exponent:
788 /* Exponent part */
789 c = tok_nextc(tok);
790 if (c == '+' || c == '-')
791 c = tok_nextc(tok);
792 if (!isdigit(c)) {
793 tok->done = E_TOKEN;
794 tok_backup(tok, c);
795 return ERRORTOKEN;
797 do {
798 c = tok_nextc(tok);
799 } while (isdigit(c));
801 #ifndef WITHOUT_COMPLEX
802 if (c == 'j' || c == 'J')
803 /* Imaginary part */
804 imaginary:
805 c = tok_nextc(tok);
806 #endif
809 tok_backup(tok, c);
810 *p_start = tok->start;
811 *p_end = tok->cur;
812 return NUMBER;
815 letter_quote:
816 /* String */
817 if (c == '\'' || c == '"') {
818 int quote2 = tok->cur - tok->start + 1;
819 int quote = c;
820 int triple = 0;
821 int tripcount = 0;
822 for (;;) {
823 c = tok_nextc(tok);
824 if (c == '\n') {
825 if (!triple) {
826 tok->done = E_TOKEN;
827 tok_backup(tok, c);
828 return ERRORTOKEN;
830 tripcount = 0;
832 else if (c == EOF) {
833 tok->done = E_TOKEN;
834 tok->cur = tok->inp;
835 return ERRORTOKEN;
837 else if (c == quote) {
838 tripcount++;
839 if (tok->cur - tok->start == quote2) {
840 c = tok_nextc(tok);
841 if (c == quote) {
842 triple = 1;
843 tripcount = 0;
844 continue;
846 tok_backup(tok, c);
848 if (!triple || tripcount == 3)
849 break;
851 else if (c == '\\') {
852 tripcount = 0;
853 c = tok_nextc(tok);
854 if (c == EOF) {
855 tok->done = E_TOKEN;
856 tok->cur = tok->inp;
857 return ERRORTOKEN;
860 else
861 tripcount = 0;
863 *p_start = tok->start;
864 *p_end = tok->cur;
865 return STRING;
868 /* Line continuation */
869 if (c == '\\') {
870 c = tok_nextc(tok);
871 if (c != '\n') {
872 tok->done = E_TOKEN;
873 tok->cur = tok->inp;
874 return ERRORTOKEN;
876 goto again; /* Read next line */
879 /* Check for two-character token */
881 int c2 = tok_nextc(tok);
882 int token = PyToken_TwoChars(c, c2);
883 if (token != OP) {
884 int c3 = tok_nextc(tok);
885 int token3 = PyToken_ThreeChars(c, c2, c3);
886 if (token3 != OP) {
887 token = token3;
888 } else {
889 tok_backup(tok, c3);
891 *p_start = tok->start;
892 *p_end = tok->cur;
893 return token;
895 tok_backup(tok, c2);
898 /* Keep track of parentheses nesting level */
899 switch (c) {
900 case '(':
901 case '[':
902 case '{':
903 tok->level++;
904 break;
905 case ')':
906 case ']':
907 case '}':
908 tok->level--;
909 break;
912 /* Punctuation character */
913 *p_start = tok->start;
914 *p_end = tok->cur;
915 return PyToken_OneChar(c);
#ifdef Py_DEBUG

/* Debugging aid: print the name of token `type` to stdout, followed by
   the token text in parentheses for NAME, NUMBER, STRING and OP tokens.
   `start` and `end` delimit the token text; no newline is written.
   `type` is used unchecked as an index into _PyParser_TokenNames. */

void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    if (type == NAME || type == NUMBER || type == STRING || type == OP)
        printf("(%.*s)", (int)(end - start), start);
}

#endif