Bump to 2.3.1 to pick up the missing file.
[python/dscho.git] / Parser / tokenizer.c
blob749a59b68c285b49d98484d159128080f3c4cdf5
2 /* Tokenizer implementation */
4 #include "Python.h"
5 #include "pgenheaders.h"
7 #include <ctype.h>
8 #include <assert.h>
10 #include "tokenizer.h"
11 #include "errcode.h"
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "stringobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #endif /* PGEN */
21 extern char *PyOS_Readline(char *);
22 /* Return malloc'ed string including trailing \n;
23 empty malloc'ed string for EOF;
24 NULL if interrupted */
26 /* Don't ever change this -- it would break the portability of Python code */
27 #define TABSIZE 8
29 /* Convert a possibly signed character to a nonnegative int */
30 /* XXX This assumes characters are 8 bits wide */
31 #ifdef __CHAR_UNSIGNED__
32 #define Py_CHARMASK(c) (c)
33 #else
34 #define Py_CHARMASK(c) ((c) & 0xff)
35 #endif
37 /* Forward */
38 static struct tok_state *tok_new(void);
39 static int tok_nextc(struct tok_state *tok);
40 static void tok_backup(struct tok_state *tok, int c);
42 /* Token names */
char *_PyParser_TokenNames[] = {
	/* Indexed by token number; order must match token.h exactly. */
	"ENDMARKER",
	"NAME",
	"NUMBER",
	"STRING",
	"NEWLINE",
	"INDENT",
	"DEDENT",
	"LPAR",
	"RPAR",
	"LSQB",
	"RSQB",
	"COLON",
	"COMMA",
	"SEMI",
	"PLUS",
	"MINUS",
	"STAR",
	"SLASH",
	"VBAR",
	"AMPER",
	"LESS",
	"GREATER",
	"EQUAL",
	"DOT",
	"PERCENT",
	"BACKQUOTE",
	"LBRACE",
	"RBRACE",
	"EQEQUAL",
	"NOTEQUAL",
	"LESSEQUAL",
	"GREATEREQUAL",
	"TILDE",
	"CIRCUMFLEX",
	"LEFTSHIFT",
	"RIGHTSHIFT",
	"DOUBLESTAR",
	"PLUSEQUAL",
	"MINEQUAL",
	"STAREQUAL",
	"SLASHEQUAL",
	"PERCENTEQUAL",
	"AMPEREQUAL",
	"VBAREQUAL",
	"CIRCUMFLEXEQUAL",
	"LEFTSHIFTEQUAL",
	"RIGHTSHIFTEQUAL",
	"DOUBLESTAREQUAL",
	"DOUBLESLASH",
	"DOUBLESLASHEQUAL",
	/* This table must match the #defines in token.h! */
	"OP",
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};
102 /* Create and initialize a new tok_state structure */
/* Returns NULL on allocation failure.  The caller owns the result and
   must release it with PyTokenizer_Free(). */
static struct tok_state *
tok_new(void)
{
	struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
	if (tok == NULL)
		return NULL;
	/* Buffer pointers are set up later by PyTokenizer_FromString /
	   PyTokenizer_FromFile. */
	tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
	tok->done = E_OK;
	tok->fp = NULL;
	tok->tabsize = TABSIZE;
	tok->indent = 0;		/* top of the indentation stack */
	tok->indstack[0] = 0;
	tok->atbol = 1;			/* start out at beginning of line */
	tok->pendin = 0;		/* no pending INDENT/DEDENT tokens */
	tok->prompt = tok->nextprompt = NULL;
	tok->lineno = 0;
	tok->level = 0;			/* () [] {} nesting level */
	tok->filename = NULL;
	/* Alternate tab-size bookkeeping, used by indenterror() to detect
	   inconsistent tab/space indentation. */
	tok->altwarning = 0;
	tok->alterror = 0;
	tok->alttabsize = 1;
	tok->altindstack[0] = 0;
	/* Source-encoding state: 0 = undetermined, 1 = raw read,
	   -1 = decode through a codec (see decoding_fgets/check_bom). */
	tok->decoding_state = 0;
	tok->decoding_erred = 0;
	tok->read_coding_spec = 0;
	tok->issued_encoding_warning = 0;
	tok->encoding = NULL;
	tok->cont_line = 0;
#ifndef PGEN
	tok->decoding_readline = NULL;
	tok->decoding_buffer = NULL;
#endif
	return tok;
}
#ifdef PGEN

/* When building the parser generator there is no Python runtime, so the
   encoding-aware readers reduce to their plain stdio equivalents. */

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
	return fgets(s, size, tok->fp);
}

static int
decoding_feof(struct tok_state *tok)
{
	return feof(tok->fp);
}

static const char *
decode_str(const char *str, struct tok_state *tok)
{
	/* No decoding: the input bytes are used as-is. */
	return str;
}

#else /* PGEN */
/* Record that decoding failed, release the input buffer (file-based
   input only; see PyTokenizer_Free for the ownership rule), and return
   NULL so callers can write "return error_ret(tok);" as if EOF. */
static char *
error_ret(struct tok_state *tok) /* XXX */
{
	tok->decoding_erred = 1;
	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
		PyMem_DEL(tok->buf);
	tok->buf = NULL;
	return NULL;		/* as if it were EOF */
}
171 static char *
172 new_string(const char *s, int len)
174 char* result = PyMem_NEW(char, len + 1);
175 if (result != NULL) {
176 memcpy(result, s, len);
177 result[len] = '\0';
179 return result;
/* Normalize an encoding name S (only the first 12 characters are
   examined; '_' becomes '-' and letters are lowercased).  Returns the
   static string "utf-8" or "iso-8859-1" for known spellings of those
   encodings, otherwise returns S itself unchanged. */
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
	char buf[13];
	int i;
	for (i = 0; i < 12; i++) {
		/* Go through unsigned char: passing a negative value
		   (a high-bit char where plain char is signed) to
		   tolower() is undefined behavior. */
		int c = (unsigned char)s[i];
		if (c == '\0')
			break;
		else if (c == '_')
			buf[i] = '-';
		else
			buf[i] = tolower(c);
	}
	buf[i] = '\0';
	if (strcmp(buf, "utf-8") == 0 ||
	    strncmp(buf, "utf-8-", 6) == 0)
		return "utf-8";
	else if (strcmp(buf, "latin-1") == 0 ||
		 strcmp(buf, "iso-8859-1") == 0 ||
		 strcmp(buf, "iso-latin-1") == 0 ||
		 strncmp(buf, "latin-1-", 8) == 0 ||
		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
		 strncmp(buf, "iso-latin-1-", 12) == 0)
		return "iso-8859-1";
	else
		return s;
}
205 /* Return the coding spec in S, or NULL if none is found. */
/* Return the coding spec found in S (a line of SIZE bytes), or NULL if
   none is found.  Per PEP 263, the spec must live in a comment that is
   the only content on the line.  The returned string is freshly
   allocated (free with PyMem_DEL); NULL is also returned if
   allocation fails. */
static char *
get_coding_spec(const char *s, int size)
{
	int i;
	/* Coding spec must be in a comment, and that comment must be
	 * the only statement on the source code line. */
	for (i = 0; i < size - 6; i++) {
		if (s[i] == '#')
			break;
		if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
			return NULL;
	}
	for (; i < size - 6; i++) { /* XXX inefficient search */
		const char* t = s + i;
		if (strncmp(t, "coding", 6) == 0) {
			const char* begin = NULL;
			t += 6;
			if (t[0] != ':' && t[0] != '=')
				continue;
			do {
				t++;
			} while (t[0] == '\x20' || t[0] == '\t');

			begin = t;
			/* Go through unsigned char: isalnum() on a
			   negative char value is undefined behavior. */
			while (isalnum((unsigned char)t[0]) ||
			       t[0] == '-' || t[0] == '_' || t[0] == '.')
				t++;

			if (begin < t) {
				char* r = new_string(begin, t - begin);
				char* q;
				if (r == NULL)
					return NULL; /* out of memory */
				q = get_normal_name(r);
				if (r != q) {
					PyMem_DEL(r);
					r = new_string(q, strlen(q));
				}
				return r;
			}
		}
	}
	return NULL;
}
249 /* Check whether the line contains a coding spec. If it does,
250 invoke the set_readline function for the new encoding.
251 This function receives the tok_state and the new encoding.
252 Return 1 on success, 0 on failure. */
static int
check_coding_spec(const char* line, int size, struct tok_state *tok,
		  int set_readline(struct tok_state *, const char *))
{
	char * cs;
	int r = 1;

	if (tok->cont_line)
		/* It's a continuation line, so it can't be a coding spec. */
		return 1;
	cs = get_coding_spec(line, size);
	if (cs != NULL) {
		tok->read_coding_spec = 1;
		if (tok->encoding == NULL) {
			assert(tok->decoding_state == 1); /* raw */
			if (strcmp(cs, "utf-8") == 0 ||
			    strcmp(cs, "iso-8859-1") == 0) {
				/* Bytes already usable as-is; just record
				   the name.  tok->encoding takes ownership
				   of cs. */
				tok->encoding = cs;
			} else {
#ifdef Py_USING_UNICODE
				r = set_readline(tok, cs);
				if (r) {
					tok->encoding = cs;
					tok->decoding_state = -1;
				}
				/* NOTE(review): if set_readline() fails, cs
				   is not freed on this path — possible leak;
				   confirm against later upstream fixes. */
#else
				/* Without Unicode support, we cannot
				   process the coding spec. Since there
				   won't be any Unicode literals, that
				   won't matter. */
#endif
			}
		} else {	/* then, compare cs with BOM */
			/* An encoding was already set from a BOM; the
			   declared spec must agree with it. */
			r = (strcmp(tok->encoding, cs) == 0);
			PyMem_DEL(cs);
		}
	}
	return r;
}
294 /* See whether the file starts with a BOM. If it does,
295 invoke the set_readline function with the new encoding.
296 Return 1 on success, 0 on failure. */
static int
check_bom(int get_char(struct tok_state *),
	  void unget_char(int, struct tok_state *),
	  int set_readline(struct tok_state *, const char *),
	  struct tok_state *tok)
{
	int ch = get_char(tok);
	tok->decoding_state = 1;	/* default: raw reads */
	if (ch == EOF) {
		return 1;
	} else if (ch == 0xEF) {
		/* Possible UTF-8 BOM: EF BB BF. */
		ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
		ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
#if 0
	/* Disable support for UTF-16 BOMs until a decision
	   is made whether this needs to be supported.  */
	} else if (ch == 0xFE) {
		ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
		if (!set_readline(tok, "utf-16-be")) return 0;
		tok->decoding_state = -1;
	} else if (ch == 0xFF) {
		ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
		if (!set_readline(tok, "utf-16-le")) return 0;
		tok->decoding_state = -1;
#endif
	} else {
		/* Not a BOM: push the byte back and proceed raw. */
		unget_char(ch, tok);
		return 1;
	}
	tok->encoding = new_string("utf-8", 5);	/* resulting is in utf-8 */
	return 1;
  NON_BOM:
	/* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
	unget_char(0xFF, tok);	/* XXX this will cause a syntax error */
	return 1;
}
335 /* Read a line of text from TOK into S, using the stream in TOK.
336 Return NULL on failure, else S. */
static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
	/* In a non-Unicode built, this should never be called. */
	Py_FatalError("fp_readl should not be called in this build.");
	return NULL; /* Keep compiler happy (not reachable) */
#else
	PyObject* utf8;
	PyObject* buf = tok->decoding_buffer;
	if (buf == NULL) {
		/* No buffered line: pull the next one from the
		   StreamReader's readline (set by fp_setreadl). */
		PyObject *args = PyTuple_New(0);
		if (args == NULL)
			return error_ret(tok);
		buf = PyObject_Call(tok->decoding_readline, args, NULL);
		Py_DECREF(args);
		if (buf == NULL)
			return error_ret(tok);
	} else {
		/* Consume the line buffered by decoding_feof(). */
		tok->decoding_buffer = NULL;
	}
	utf8 = PyUnicode_AsUTF8String(buf);
	Py_DECREF(buf);
	if (utf8 == NULL)
		return error_ret(tok);
	else {
		const char* str = PyString_AsString(utf8);
		/* NOTE(review): assumes the decoded line fits in SIZE
		   bytes; only asserted, not checked, in release builds. */
		assert(strlen(str) < (size_t)size); /* XXX */
		strcpy(s, str);
		Py_DECREF(utf8);
		if (s[0] == '\0') return NULL; /* EOF */
		return s;
	}
#endif
}
374 /* Set the readline function for TOK to a StreamReader's
375 readline function. The StreamReader is named ENC.
377 This function is called from check_bom and check_coding_spec.
379 ENC is usually identical to the future value of tok->encoding,
380 except for the (currently unsupported) case of UTF-16.
382 Return 1 on success, 0 on failure. */
static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
	PyObject *reader, *stream, *readline;

	/* Wrap the raw FILE* in a Python file object ("rb": the codec
	   must see the undecoded bytes). */
	stream = PyFile_FromFile(tok->fp, tok->filename, "rb", NULL);
	if (stream == NULL)
		return 0;

	reader = PyCodec_StreamReader(enc, stream, NULL);
	Py_DECREF(stream);
	if (reader == NULL)
		return 0;

	/* Keep only the bound readline method; fp_readl calls it. */
	readline = PyObject_GetAttrString(reader, "readline");
	Py_DECREF(reader);
	if (readline == NULL)
		return 0;

	tok->decoding_readline = readline;
	return 1;
}
407 /* Fetch the next byte from TOK. */
409 static int fp_getc(struct tok_state *tok) {
410 return getc(tok->fp);
413 /* Unfetch the last byte back into TOK. */
415 static void fp_ungetc(int c, struct tok_state *tok) {
416 ungetc(c, tok->fp);
/* Read a line of input from TOK. Determine encoding
   if necessary. */

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
	char *line = NULL;
	int warn = 0, badchar = 0;
	for (;;) {
		if (tok->decoding_state < 0) {
			/* We already have a codec associated with
			   this input. */
			line = fp_readl(s, size, tok);
			break;
		} else if (tok->decoding_state > 0) {
			/* We want a 'raw' read. */
			line = Py_UniversalNewlineFgets(s, size,
							tok->fp, NULL);
			warn = 1;
			break;
		} else {
			/* We have not yet determined the encoding.
			   If an encoding is found, use the file-pointer
			   reader functions from now on. */
			if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
				return error_ret(tok);
			assert(tok->decoding_state != 0);
		}
	}
	/* A coding spec can only appear on the first two lines. */
	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
		if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
			return error_ret(tok);
		}
	}
#ifndef PGEN
	/* Warn (once) about non-ASCII bytes read raw with no declared
	   encoding. */
	if (warn && line && !tok->issued_encoding_warning && !tok->encoding) {
		unsigned char *c;
		for (c = (unsigned char *)line; *c; c++)
			if (*c > 127) {
				badchar = *c;
				break;
			}
		if (badchar) {
			char buf[200];
			sprintf(buf, "Non-ASCII character '\\x%.2x', "
				"but no declared encoding", badchar);
			/* Need to add 1 to the line number, since this line
			   has not been counted, yet.  */
			PyErr_WarnExplicit(PyExc_DeprecationWarning,
					   buf, tok->filename, tok->lineno + 1,
					   NULL, NULL);
			tok->issued_encoding_warning = 1;
		}
	}
#endif
	return line;
}
/* Return non-zero when no further input is available from TOK. */
static int
decoding_feof(struct tok_state *tok)
{
	if (tok->decoding_state >= 0) {
		return feof(tok->fp);
	} else {
		/* Codec-based input: peek one line ahead via readline
		   and park it in tok->decoding_buffer for fp_readl. */
		PyObject* buf = tok->decoding_buffer;
		if (buf == NULL) {
			PyObject *args = PyTuple_New(0);
			if (args == NULL) {
				error_ret(tok);
				return 1;
			}
			buf = PyObject_Call(tok->decoding_readline,
					    args, NULL);
			Py_DECREF(args);
			if (buf == NULL) {
				error_ret(tok);
				return 1;
			} else {
				tok->decoding_buffer = buf;
			}
		}
		/* An empty line object means EOF. */
		return PyObject_Length(buf) == 0;
	}
}
504 /* Fetch a byte from TOK, using the string buffer. */
506 static int buf_getc(struct tok_state *tok) {
507 return *tok->str++;
510 /* Unfetch a byte from TOK, using the string buffer. */
512 static void buf_ungetc(int c, struct tok_state *tok) {
513 tok->str--;
514 assert(*tok->str == c); /* tok->cur may point to read-only segment */
/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding. */

static int buf_setreadl(struct tok_state *tok, const char* enc) {
	/* Only the pointer is stored, so ENC must outlive TOK.
	   Always succeeds. */
	tok->enc = enc;
	return 1;
}
/* Return a UTF-8 encoding Python string object from the
   C byte string STR, which is encoded with ENC. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
	PyObject *utf8;
	/* Decode with ENC, then re-encode as UTF-8.  Returns a new
	   reference, or NULL with a Python exception set. */
	PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
	if (buf == NULL)
		return NULL;
	utf8 = PyUnicode_AsUTF8String(buf);
	Py_DECREF(buf);
	return utf8;
}
#endif
/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK. */

static const char *
decode_str(const char *str, struct tok_state *tok)
{
	PyObject* utf8 = NULL;
	const char *s;
	int lineno = 0;
	tok->enc = NULL;
	tok->str = str;
	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
		return NULL;
	str = tok->str;		/* string after BOM if any */
	assert(str);
#ifdef Py_USING_UNICODE
	/* A BOM recorded an encoding: recode the whole string to UTF-8. */
	if (tok->enc != NULL) {
		utf8 = translate_into_utf8(str, tok->enc);
		if (utf8 == NULL)
			return NULL;
		str = PyString_AsString(utf8);
	}
#endif
	/* Find the end of the second line; a coding spec may only
	   appear within the first two lines. */
	for (s = str;; s++) {
		if (*s == '\0') break;
		else if (*s == '\n') {
			lineno++;
			if (lineno == 2) break;
		}
	}
	tok->enc = NULL;
	if (!check_coding_spec(str, s - str, tok, buf_setreadl))
		return NULL;
#ifdef Py_USING_UNICODE
	/* A coding spec was found: recode to UTF-8 now. */
	if (tok->enc != NULL) {
		assert(utf8 == NULL);
		utf8 = translate_into_utf8(str, tok->enc);
		if (utf8 == NULL)
			return NULL;
		str = PyString_AsString(utf8);
	}
#endif
	/* The returned pointer aliases utf8's internal buffer; parking
	   utf8 on tok keeps it alive until PyTokenizer_Free. */
	assert(tok->decoding_buffer == NULL);
	tok->decoding_buffer = utf8; /* CAUTION */
	return str;
}
589 #endif /* PGEN */
591 /* Set up tokenizer for string */
593 struct tok_state *
594 PyTokenizer_FromString(char *str)
596 struct tok_state *tok = tok_new();
597 if (tok == NULL)
598 return NULL;
599 str = (char *)decode_str(str, tok);
600 if (str == NULL)
601 return NULL;
602 tok->buf = tok->cur = tok->end = tok->inp = str;
603 return tok;
607 /* Set up tokenizer for file */
/* Create a tokenizer reading from FP.  PS1/PS2 are the interactive
   prompts (may be NULL for non-interactive input); returns NULL on
   allocation failure. */
struct tok_state *
PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
{
	struct tok_state *tok = tok_new();
	if (tok == NULL)
		return NULL;
	if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
		PyMem_DEL(tok);
		return NULL;
	}
	tok->cur = tok->inp = tok->buf;
	tok->end = tok->buf + BUFSIZ;
	tok->fp = fp;
	tok->prompt = ps1;
	tok->nextprompt = ps2;
	return tok;
}
628 /* Free a tok_state structure */
void
PyTokenizer_Free(struct tok_state *tok)
{
	if (tok->encoding != NULL)
		PyMem_DEL(tok->encoding);
#ifndef PGEN
	Py_XDECREF(tok->decoding_readline);
	Py_XDECREF(tok->decoding_buffer);
#endif
	/* tok owns the buffer only for file-based input; for string
	   input tok->buf aliases memory owned elsewhere (see
	   PyTokenizer_FromString / error_ret). */
	if (tok->fp != NULL && tok->buf != NULL)
		PyMem_DEL(tok->buf);
	PyMem_DEL(tok);
}
645 /* Get next char, updating state; error code goes into tok->done */
/* Return the next character (0..255) from TOK, or EOF.  Refills the
   buffer from the string, the interactive prompt, or the file as
   appropriate; error codes land in tok->done. */
static int
tok_nextc(register struct tok_state *tok)
{
	for (;;) {
		if (tok->cur != tok->inp) {
			return Py_CHARMASK(*tok->cur++); /* Fast path */
		}
		if (tok->done != E_OK)
			return EOF;
		if (tok->fp == NULL) {
			/* String input: advance tok->inp past the next
			   line (or to end of string). */
			char *end = strchr(tok->inp, '\n');
			if (end != NULL)
				end++;
			else {
				end = strchr(tok->inp, '\0');
				if (end == tok->inp) {
					tok->done = E_EOF;
					return EOF;
				}
			}
			if (tok->start == NULL)
				tok->buf = tok->cur;
			tok->lineno++;
			tok->inp = end;
			return Py_CHARMASK(*tok->cur++);
		}
		if (tok->prompt != NULL) {
			/* Interactive input: read one line via readline. */
			char *new = PyOS_Readline(tok->prompt);
			if (tok->nextprompt != NULL)
				tok->prompt = tok->nextprompt;
			if (new == NULL)
				tok->done = E_INTR;
			else if (*new == '\0') {
				PyMem_FREE(new);
				tok->done = E_EOF;
			}
			else if (tok->start != NULL) {
				/* A token is in progress: append the new
				   line to the existing buffer, keeping
				   tok->start/tok->cur valid across the
				   realloc. */
				size_t start = tok->start - tok->buf;
				size_t oldlen = tok->cur - tok->buf;
				size_t newlen = oldlen + strlen(new);
				char *buf = tok->buf;
				PyMem_RESIZE(buf, char, newlen+1);
				tok->lineno++;
				if (buf == NULL) {
					PyMem_DEL(tok->buf);
					tok->buf = NULL;
					PyMem_FREE(new);
					tok->done = E_NOMEM;
					return EOF;
				}
				tok->buf = buf;
				tok->cur = tok->buf + oldlen;
				strcpy(tok->buf + oldlen, new);
				PyMem_FREE(new);
				tok->inp = tok->buf + newlen;
				tok->end = tok->inp + 1;
				tok->start = tok->buf + start;
			}
			else {
				/* No token in progress: the readline
				   result becomes the whole buffer. */
				tok->lineno++;
				if (tok->buf != NULL)
					PyMem_DEL(tok->buf);
				tok->buf = new;
				tok->cur = tok->buf;
				tok->inp = strchr(tok->buf, '\0');
				tok->end = tok->inp + 1;
			}
		}
		else {
			/* Non-interactive file input. */
			int done = 0;
			int cur = 0;
			char *pt;
			if (tok->start == NULL) {
				if (tok->buf == NULL) {
					tok->buf = PyMem_NEW(char, BUFSIZ);
					if (tok->buf == NULL) {
						tok->done = E_NOMEM;
						return EOF;
					}
					tok->end = tok->buf + BUFSIZ;
				}
				if (decoding_fgets(tok->buf,
					       (int)(tok->end - tok->buf),
					       tok) == NULL) {
					tok->done = E_EOF;
					done = 1;
				}
				else {
					tok->done = E_OK;
					tok->inp = strchr(tok->buf, '\0');
					done = tok->inp[-1] == '\n';
				}
			}
			else {
				/* A token is in progress; remember the
				   read position so it can be restored
				   after the buffer grows. */
				cur = tok->cur - tok->buf;
				if (decoding_feof(tok)) {
					tok->done = E_EOF;
					done = 1;
				}
				else
					tok->done = E_OK;
			}
			tok->lineno++;
			/* Read until '\n' or EOF, growing the buffer by
			   BUFSIZ each round and re-basing all pointers. */
			while (!done) {
				int curstart = tok->start == NULL ? -1 :
					       tok->start - tok->buf;
				int curvalid = tok->inp - tok->buf;
				int newsize = curvalid + BUFSIZ;
				char *newbuf = tok->buf;
				PyMem_RESIZE(newbuf, char, newsize);
				if (newbuf == NULL) {
					tok->done = E_NOMEM;
					tok->cur = tok->inp;
					return EOF;
				}
				tok->buf = newbuf;
				tok->inp = tok->buf + curvalid;
				tok->end = tok->buf + newsize;
				tok->start = curstart < 0 ? NULL :
					     tok->buf + curstart;
				if (decoding_fgets(tok->inp,
					       (int)(tok->end - tok->inp),
					       tok) == NULL) {
					/* Last line does not end in \n,
					   fake one */
					strcpy(tok->inp, "\n");
				}
				tok->inp = strchr(tok->inp, '\0');
				done = tok->inp[-1] == '\n';
			}
			tok->cur = tok->buf + cur;
#ifndef macintosh
			/* replace "\r\n" with "\n" */
			/* For Mac we leave the \r, giving a syntax error */
			pt = tok->inp - 2;
			if (pt >= tok->buf && *pt == '\r') {
				*pt++ = '\n';
				*pt = '\0';
				tok->inp = pt;
			}
#endif
		}
		if (tok->done != E_OK) {
			if (tok->prompt != NULL)
				PySys_WriteStderr("\n");
			tok->cur = tok->inp;
			return EOF;
		}
	}
	/*NOTREACHED*/
}
800 /* Back-up one character */
/* Push character C (the most recent tok_nextc result) back onto TOK.
   Backing up EOF is a no-op. */
static void
tok_backup(register struct tok_state *tok, register int c)
{
	if (c != EOF) {
		if (--tok->cur < tok->buf)
			Py_FatalError("tok_backup: begin of buffer");
		/* Store only when the byte differs: for string input
		   the buffer may be read-only (see buf_ungetc). */
		if (*tok->cur != c)
			*tok->cur = c;
	}
}
814 /* Return the token corresponding to a single character */
int
PyToken_OneChar(int c)
{
	switch (c) {
	case '(':	return LPAR;
	case ')':	return RPAR;
	case '[':	return LSQB;
	case ']':	return RSQB;
	case ':':	return COLON;
	case ',':	return COMMA;
	case ';':	return SEMI;
	case '+':	return PLUS;
	case '-':	return MINUS;
	case '*':	return STAR;
	case '/':	return SLASH;
	case '|':	return VBAR;
	case '&':	return AMPER;
	case '<':	return LESS;
	case '>':	return GREATER;
	case '=':	return EQUAL;
	case '.':	return DOT;
	case '%':	return PERCENT;
	case '`':	return BACKQUOTE;
	case '{':	return LBRACE;
	case '}':	return RBRACE;
	case '^':	return CIRCUMFLEX;
	case '~':	return TILDE;
	/* Anything unrecognized comes back as the generic OP token. */
	default:	return OP;
	}
}
/* Return the token for the two-character operator C1 C2, or OP if the
   pair is not a recognized two-character token. */
int
PyToken_TwoChars(int c1, int c2)
{
	switch (c1) {
	case '=':
		switch (c2) {
		case '=':	return EQEQUAL;
		}
		break;
	case '!':
		switch (c2) {
		case '=':	return NOTEQUAL;
		}
		break;
	case '<':
		switch (c2) {
		case '>':	return NOTEQUAL;
		case '=':	return LESSEQUAL;
		case '<':	return LEFTSHIFT;
		}
		break;
	case '>':
		switch (c2) {
		case '=':	return GREATEREQUAL;
		case '>':	return RIGHTSHIFT;
		}
		break;
	case '+':
		switch (c2) {
		case '=':	return PLUSEQUAL;
		}
		break;
	case '-':
		switch (c2) {
		case '=':	return MINEQUAL;
		}
		break;
	case '*':
		switch (c2) {
		case '*':	return DOUBLESTAR;
		case '=':	return STAREQUAL;
		}
		break;
	case '/':
		switch (c2) {
		case '/':	return DOUBLESLASH;
		case '=':	return SLASHEQUAL;
		}
		break;
	case '|':
		switch (c2) {
		case '=':	return VBAREQUAL;
		}
		break;
	case '%':
		switch (c2) {
		case '=':	return PERCENTEQUAL;
		}
		break;
	case '&':
		switch (c2) {
		case '=':	return AMPEREQUAL;
		}
		break;
	case '^':
		switch (c2) {
		case '=':	return CIRCUMFLEXEQUAL;
		}
		break;
	}
	return OP;
}
/* Return the token for the three-character operator C1 C2 C3, or OP if
   the triple is not a recognized three-character token. */
int
PyToken_ThreeChars(int c1, int c2, int c3)
{
	switch (c1) {
	case '<':
		switch (c2) {
		case '<':
			switch (c3) {
			case '=':
				return LEFTSHIFTEQUAL;
			}
			break;
		}
		break;
	case '>':
		switch (c2) {
		case '>':
			switch (c3) {
			case '=':
				return RIGHTSHIFTEQUAL;
			}
			break;
		}
		break;
	case '*':
		switch (c2) {
		case '*':
			switch (c3) {
			case '=':
				return DOUBLESTAREQUAL;
			}
			break;
		}
		break;
	case '/':
		switch (c2) {
		case '/':
			switch (c3) {
			case '=':
				return DOUBLESLASHEQUAL;
			}
			break;
		}
		break;
	}
	return OP;
}
/* Handle inconsistent tab/space indentation.  If tok->alterror is set,
   record E_TABSPACE and return 1 (hard error); otherwise print at most
   one warning (while tok->altwarning is set) and return 0. */
static int
indenterror(struct tok_state *tok)
{
	if (tok->alterror) {
		tok->done = E_TABSPACE;
		tok->cur = tok->inp;
		return 1;
	}
	if (tok->altwarning) {
		PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
				  "in indentation\n", tok->filename);
		tok->altwarning = 0;
	}
	return 0;
}
986 /* Get next token, after space stripping etc. */
/* Return the next token's type, with *p_start/*p_end set to the token's
   text bounds inside tok's buffer (NULL for INDENT/DEDENT/ERRORTOKEN
   paths that return early).  Errors are reported by returning
   ERRORTOKEN with the reason in tok->done. */
static int
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
	register int c;
	int blankline;

	*p_start = *p_end = NULL;
  nextline:
	tok->start = NULL;
	blankline = 0;

	/* Get indentation level */
	if (tok->atbol) {
		register int col = 0;
		register int altcol = 0;
		tok->atbol = 0;
		for (;;) {
			c = tok_nextc(tok);
			if (c == ' ')
				col++, altcol++;
			else if (c == '\t') {
				col = (col/tok->tabsize + 1) * tok->tabsize;
				altcol = (altcol/tok->alttabsize + 1)
					* tok->alttabsize;
			}
			else if (c == '\014') /* Control-L (formfeed) */
				col = altcol = 0; /* For Emacs users */
			else
				break;
		}
		tok_backup(tok, c);
		if (c == '#' || c == '\n') {
			/* Lines with only whitespace and/or comments
			   shouldn't affect the indentation and are
			   not passed to the parser as NEWLINE tokens,
			   except *totally* empty lines in interactive
			   mode, which signal the end of a command group. */
			if (col == 0 && c == '\n' && tok->prompt != NULL)
				blankline = 0; /* Let it through */
			else
				blankline = 1; /* Ignore completely */
			/* We can't jump back right here since we still
			   may need to skip to the end of a comment */
		}
		if (!blankline && tok->level == 0) {
			if (col == tok->indstack[tok->indent]) {
				/* No change */
				if (altcol != tok->altindstack[tok->indent]) {
					if (indenterror(tok))
						return ERRORTOKEN;
				}
			}
			else if (col > tok->indstack[tok->indent]) {
				/* Indent -- always one */
				if (tok->indent+1 >= MAXINDENT) {
					tok->done = E_TOODEEP;
					tok->cur = tok->inp;
					return ERRORTOKEN;
				}
				if (altcol <= tok->altindstack[tok->indent]) {
					if (indenterror(tok))
						return ERRORTOKEN;
				}
				tok->pendin++;
				tok->indstack[++tok->indent] = col;
				tok->altindstack[tok->indent] = altcol;
			}
			else /* col < tok->indstack[tok->indent] */ {
				/* Dedent -- any number, must be consistent */
				while (tok->indent > 0 &&
					col < tok->indstack[tok->indent]) {
					tok->pendin--;
					tok->indent--;
				}
				if (col != tok->indstack[tok->indent]) {
					tok->done = E_DEDENT;
					tok->cur = tok->inp;
					return ERRORTOKEN;
				}
				if (altcol != tok->altindstack[tok->indent]) {
					if (indenterror(tok))
						return ERRORTOKEN;
				}
			}
		}
	}

	tok->start = tok->cur;

	/* Return pending indents/dedents */
	if (tok->pendin != 0) {
		if (tok->pendin < 0) {
			tok->pendin++;
			return DEDENT;
		}
		else {
			tok->pendin--;
			return INDENT;
		}
	}

 again:
	tok->start = NULL;
	/* Skip spaces */
	do {
		c = tok_nextc(tok);
	} while (c == ' ' || c == '\t' || c == '\014');

	/* Set start of current token */
	tok->start = tok->cur - 1;

	/* Skip comment, while looking for tab-setting magic */
	if (c == '#') {
		static char *tabforms[] = {
			"tab-width:",		/* Emacs */
			":tabstop=",		/* vim, full form */
			":ts=",			/* vim, abbreviated form */
			"set tabsize=",		/* will vi never die? */
		/* more templates can be added here to support other editors */
		};
		char cbuf[80];
		char *tp, **cp;
		tp = cbuf;
		/* Buffer the comment (up to 79 chars) to scan for a
		   tab-size directive. */
		do {
			*tp++ = c = tok_nextc(tok);
		} while (c != EOF && c != '\n' &&
			 tp - cbuf + 1 < sizeof(cbuf));
		*tp = '\0';
		for (cp = tabforms;
		     cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
		     cp++) {
			if ((tp = strstr(cbuf, *cp))) {
				int newsize = atoi(tp + strlen(*cp));

				if (newsize >= 1 && newsize <= 40) {
					tok->tabsize = newsize;
					if (Py_VerboseFlag)
					    PySys_WriteStderr(
						"Tab size set to %d\n",
						newsize);
				}
			}
		}
		while (c != EOF && c != '\n')
			c = tok_nextc(tok);
	}

	/* Check for EOF and errors now */
	if (c == EOF) {
		return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
	}

	/* Identifier (most frequent token!) */
	if (isalpha(c) || c == '_') {
		/* Process r"", u"" and ur"" */
		switch (c) {
		case 'r':
		case 'R':
			c = tok_nextc(tok);
			if (c == '"' || c == '\'')
				goto letter_quote;
			break;
		case 'u':
		case 'U':
			c = tok_nextc(tok);
			if (c == 'r' || c == 'R')
				c = tok_nextc(tok);
			if (c == '"' || c == '\'')
				goto letter_quote;
			break;
		}
		while (isalnum(c) || c == '_') {
			c = tok_nextc(tok);
		}
		tok_backup(tok, c);
		*p_start = tok->start;
		*p_end = tok->cur;
		return NAME;
	}

	/* Newline */
	if (c == '\n') {
		tok->atbol = 1;
		/* Inside brackets, or on an ignorable line, newlines
		   are not tokens. */
		if (blankline || tok->level > 0)
			goto nextline;
		*p_start = tok->start;
		*p_end = tok->cur - 1; /* Leave '\n' out of the string */
		tok->cont_line = 0;
		return NEWLINE;
	}

#ifdef macintosh
	if (c == '\r') {
		PySys_WriteStderr(
		  "File contains \\r characters (incorrect line endings?)\n");
		tok->done = E_TOKEN;
		tok->cur = tok->inp;
		return ERRORTOKEN;
	}
#endif
	/* Period or number starting with period? */
	if (c == '.') {
		c = tok_nextc(tok);
		if (isdigit(c)) {
			goto fraction;
		}
		else {
			tok_backup(tok, c);
			*p_start = tok->start;
			*p_end = tok->cur;
			return DOT;
		}
	}

	/* Number */
	if (isdigit(c)) {
		if (c == '0') {
			/* Hex or octal -- maybe. */
			c = tok_nextc(tok);
			if (c == '.')
				goto fraction;
#ifndef WITHOUT_COMPLEX
			if (c == 'j' || c == 'J')
				goto imaginary;
#endif
			if (c == 'x' || c == 'X') {
				/* Hex */
				do {
					c = tok_nextc(tok);
				} while (isxdigit(c));
			}
			else {
				int found_decimal = 0;
				/* Octal; c is first char of it */
				/* There's no 'isoctdigit' macro, sigh */
				while ('0' <= c && c < '8') {
					c = tok_nextc(tok);
				}
				if (isdigit(c)) {
					found_decimal = 1;
					do {
						c = tok_nextc(tok);
					} while (isdigit(c));
				}
				if (c == '.')
					goto fraction;
				else if (c == 'e' || c == 'E')
					goto exponent;
#ifndef WITHOUT_COMPLEX
				else if (c == 'j' || c == 'J')
					goto imaginary;
#endif
				else if (found_decimal) {
					/* Digits 8/9 in an octal literal
					   with no float suffix: error. */
					tok->done = E_TOKEN;
					tok_backup(tok, c);
					return ERRORTOKEN;
				}
			}
			if (c == 'l' || c == 'L')
				c = tok_nextc(tok);
		}
		else {
			/* Decimal */
			do {
				c = tok_nextc(tok);
			} while (isdigit(c));
			if (c == 'l' || c == 'L')
				c = tok_nextc(tok);
			else {
				/* Accept floating point numbers. */
				if (c == '.') {
		fraction:
					/* Fraction */
					do {
						c = tok_nextc(tok);
					} while (isdigit(c));
				}
				if (c == 'e' || c == 'E') {
		exponent:
					/* Exponent part */
					c = tok_nextc(tok);
					if (c == '+' || c == '-')
						c = tok_nextc(tok);
					if (!isdigit(c)) {
						tok->done = E_TOKEN;
						tok_backup(tok, c);
						return ERRORTOKEN;
					}
					do {
						c = tok_nextc(tok);
					} while (isdigit(c));
				}
#ifndef WITHOUT_COMPLEX
				if (c == 'j' || c == 'J')
					/* Imaginary part */
		imaginary:
					c = tok_nextc(tok);
#endif
			}
		}
		tok_backup(tok, c);
		*p_start = tok->start;
		*p_end = tok->cur;
		return NUMBER;
	}

  letter_quote:
	/* String */
	if (c == '\'' || c == '"') {
		/* quote2: offset of the char right after the opening
		   quote, used to spot a second quote immediately after
		   the first (empty string or triple-quote opener). */
		int quote2 = tok->cur - tok->start + 1;
		int quote = c;
		int triple = 0;
		int tripcount = 0;
		for (;;) {
			c = tok_nextc(tok);
			if (c == '\n') {
				if (!triple) {
					tok->done = E_EOLS;
					tok_backup(tok, c);
					return ERRORTOKEN;
				}
				tripcount = 0;
				tok->cont_line = 1; /* multiline string. */
			}
			else if (c == EOF) {
				if (triple)
					tok->done = E_EOFS;
				else
					tok->done = E_EOLS;
				tok->cur = tok->inp;
				return ERRORTOKEN;
			}
			else if (c == quote) {
				tripcount++;
				if (tok->cur - tok->start == quote2) {
					c = tok_nextc(tok);
					if (c == quote) {
						triple = 1;
						tripcount = 0;
						continue;
					}
					tok_backup(tok, c);
				}
				if (!triple || tripcount == 3)
					break;
			}
			else if (c == '\\') {
				/* Skip the escaped character. */
				tripcount = 0;
				c = tok_nextc(tok);
				if (c == EOF) {
					tok->done = E_EOLS;
					tok->cur = tok->inp;
					return ERRORTOKEN;
				}
			}
			else
				tripcount = 0;
		}
		*p_start = tok->start;
		*p_end = tok->cur;
		return STRING;
	}

	/* Line continuation */
	if (c == '\\') {
		c = tok_nextc(tok);
		if (c != '\n') {
			tok->done = E_TOKEN;
			tok->cur = tok->inp;
			return ERRORTOKEN;
		}
		tok->cont_line = 1;
		goto again; /* Read next line */
	}

	/* Check for two-character token */
	{
		int c2 = tok_nextc(tok);
		int token = PyToken_TwoChars(c, c2);
		if (token != OP) {
			/* A two-char match may extend to three chars. */
			int c3 = tok_nextc(tok);
			int token3 = PyToken_ThreeChars(c, c2, c3);
			if (token3 != OP) {
				token = token3;
			} else {
				tok_backup(tok, c3);
			}
			*p_start = tok->start;
			*p_end = tok->cur;
			return token;
		}
		tok_backup(tok, c2);
	}

	/* Keep track of parentheses nesting level */
	switch (c) {
	case '(':
	case '[':
	case '{':
		tok->level++;
		break;
	case ')':
	case ']':
	case '}':
		tok->level--;
		break;
	}

	/* Punctuation character */
	*p_start = tok->start;
	*p_end = tok->cur;
	return PyToken_OneChar(c);
}
/* Public entry point: like tok_get(), but a decoding error detected
   while reading turns the result into ERRORTOKEN / E_DECODE. */
int
PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
{
	int result = tok_get(tok, p_start, p_end);
	if (tok->decoding_erred) {
		result = ERRORTOKEN;
		tok->done = E_DECODE;
	}
	return result;
}
#ifdef Py_DEBUG

/* Debugging helper: print the name of token TYPE to stdout, plus its
   text (the START..END slice) for tokens that carry text.  No trailing
   newline. */
void
tok_dump(int type, char *start, char *end)
{
	printf("%s", _PyParser_TokenNames[type]);
	if (type == NAME || type == NUMBER || type == STRING || type == OP)
		printf("(%.*s)", (int)(end - start), start);
}

#endif