/* Tokenizer implementation */

#include "Python.h"
#include "pgenheaders.h"

#include <ctype.h>
#include <assert.h>

#include "tokenizer.h"
#include "errcode.h"

#ifndef PGEN
#include "unicodeobject.h"
#include "stringobject.h"
#include "fileobject.h"
#include "codecs.h"
#include "abstract.h"
#endif /* PGEN */

extern char *PyOS_Readline(FILE *, FILE *, char *);
/* Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Convert a possibly signed character to a nonnegative int */
/* XXX This assumes characters are 8 bits wide */
#ifdef __CHAR_UNSIGNED__
#define Py_CHARMASK(c) (c)
#else
#define Py_CHARMASK(c) ((c) & 0xff)
#endif
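/* For example, Py_CHARMASK((char)'\xff') yields 0xff rather than a
   negative int, whether or not plain char is signed. */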
/* Forward */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);

/* Token names */

char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "BACKQUOTE",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};

/* Create and initialize a new tok_state structure */

static struct tok_state *
tok_new(void)
{
    struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
    if (tok == NULL)
        return NULL;
    tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
    tok->done = E_OK;
    tok->fp = NULL;
    tok->tabsize = TABSIZE;
    tok->indent = 0;
    tok->indstack[0] = 0;
    tok->atbol = 1;
    tok->pendin = 0;
    tok->prompt = tok->nextprompt = NULL;
    tok->lineno = 0;
    tok->level = 0;
    tok->filename = NULL;
    tok->altwarning = 0;
    tok->alterror = 0;
    tok->alttabsize = 1;
    tok->altindstack[0] = 0;
    tok->decoding_state = 0;
    tok->decoding_erred = 0;
    tok->read_coding_spec = 0;
    tok->issued_encoding_warning = 0;
    tok->encoding = NULL;
    tok->cont_line = 0;
#ifndef PGEN
    tok->decoding_readline = NULL;
    tok->decoding_buffer = NULL;
#endif
    return tok;
}
#ifdef PGEN

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    return fgets(s, size, tok->fp);
}

static int
decoding_feof(struct tok_state *tok)
{
    return feof(tok->fp);
}

static const char *
decode_str(const char *str, struct tok_state *tok)
{
    return str;
}

#else /* PGEN */

static char *
error_ret(struct tok_state *tok) /* XXX */
{
    tok->decoding_erred = 1;
    if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
        PyMem_DEL(tok->buf);
    tok->buf = NULL;
    return NULL;                /* as if it were EOF */
}

static char *
new_string(const char *s, int len)
{
    char* result = PyMem_NEW(char, len + 1);
    if (result != NULL) {
        memcpy(result, s, len);
        result[len] = '\0';
    }
    return result;
}
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0') break;
        else if (c == '_') buf[i] = '-';
        else buf[i] = tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0) return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1";
    else return s;
}
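/* For example, "UTF_8" and any "utf-8-..." variant normalize to "utf-8",
   while "Latin_1" and "ISO-8859-1" normalize to "iso-8859-1"; anything
   else is returned unchanged. */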
/* Return the coding spec in S, or NULL if none is found. */

static char *
get_coding_spec(const char *s, int size)
{
    int i;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return NULL;
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char* t = s + i;
        if (strncmp(t, "coding", 6) == 0) {
            const char* begin = NULL;
            t += 6;
            if (t[0] != ':' && t[0] != '=')
                continue;
            do {
                t++;
            } while (t[0] == '\x20' || t[0] == '\t');

            begin = t;
            while (isalnum((int)t[0]) ||
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
                t++;

            if (begin < t) {
                char* r = new_string(begin, t - begin);
                char* q = get_normal_name(r);
                if (r != q) {
                    PyMem_DEL(r);
                    r = new_string(q, strlen(q));
                }
                return r;
            }
        }
    }
    return NULL;
}
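/* For example, both of these declaration lines yield "utf-8":

       # -*- coding: utf-8 -*-
       # vim: set fileencoding=utf-8 :

   since only the "coding[:=] <name>" portion is significant. */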
/* Check whether the line contains a coding spec. If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure. */

static int
check_coding_spec(const char* line, int size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char * cs;
    int r = 1;

    if (tok->cont_line)
        /* It's a continuation line, so it can't be a coding spec. */
        return 1;
    cs = get_coding_spec(line, size);
    if (cs != NULL) {
        tok->read_coding_spec = 1;
        if (tok->encoding == NULL) {
            assert(tok->decoding_state == 1); /* raw */
            if (strcmp(cs, "utf-8") == 0 ||
                strcmp(cs, "iso-8859-1") == 0) {
                tok->encoding = cs;
            } else {
#ifdef Py_USING_UNICODE
                r = set_readline(tok, cs);
                if (r) {
                    tok->encoding = cs;
                    tok->decoding_state = -1;
                }
#else
                /* Without Unicode support, we cannot
                   process the coding spec. Since there
                   won't be any Unicode literals, that
                   won't matter. */
#endif
            }
        } else {        /* then, compare cs with BOM */
            r = (strcmp(tok->encoding, cs) == 0);
            PyMem_DEL(cs);
        }
    }
    return r;
}
/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Return 1 on success, 0 on failure. */

static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch = get_char(tok);
    tok->decoding_state = 1;
    if (ch == EOF) {
        return 1;
    } else if (ch == 0xEF) {
        ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
        ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
#if 0
    /* Disable support for UTF-16 BOMs until a decision
       is made whether this needs to be supported. */
    } else if (ch == 0xFE) {
        ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
        if (!set_readline(tok, "utf-16-be")) return 0;
        tok->decoding_state = -1;
    } else if (ch == 0xFF) {
        ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
        if (!set_readline(tok, "utf-16-le")) return 0;
        tok->decoding_state = -1;
#endif
    } else {
        unget_char(ch, tok);
        return 1;
    }
    tok->encoding = new_string("utf-8", 5);     /* the resulting text is in utf-8 */
    return 1;
  NON_BOM:
    /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
    unget_char(0xFF, tok);      /* XXX this will cause a syntax error */
    return 1;
}
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S. */

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject* utf8;
    PyObject* buf = tok->decoding_buffer;
    if (buf == NULL) {
        /* Ask for one less byte so we can terminate it */
        PyObject *args = Py_BuildValue("(i)", size-1);
        if (args == NULL)
            return error_ret(tok);
        buf = PyObject_Call(tok->decoding_readline, args, NULL);
        Py_DECREF(args);
        if (buf == NULL)
            return error_ret(tok);
    } else {
        tok->decoding_buffer = NULL;
    }
    utf8 = PyUnicode_AsUTF8String(buf);
    Py_DECREF(buf);
    if (utf8 == NULL)
        return error_ret(tok);
    else {
        const char* str = PyString_AsString(utf8);
        assert(strlen(str) < (size_t)size); /* XXX */
        strcpy(s, str);
        Py_DECREF(utf8);
        if (s[0] == '\0') return NULL; /* EOF */
        return s;
    }
#endif
}
/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from check_bom and check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   Return 1 on success, 0 on failure. */

static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *reader, *stream, *readline;

    /* XXX: constify filename argument. */
    stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
    if (stream == NULL)
        return 0;

    reader = PyCodec_StreamReader(enc, stream, NULL);
    Py_DECREF(stream);
    if (reader == NULL)
        return 0;

    readline = PyObject_GetAttrString(reader, "readline");
    Py_DECREF(reader);
    if (readline == NULL)
        return 0;

    tok->decoding_readline = readline;
    return 1;
}

/* Fetch the next byte from TOK. */

static int fp_getc(struct tok_state *tok) {
    return getc(tok->fp);
}

/* Unfetch the last byte back into TOK. */

static void fp_ungetc(int c, struct tok_state *tok) {
    ungetc(c, tok->fp);
}
/* Read a line of input from TOK. Determine encoding
   if necessary. */

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    char *line = NULL;
    int warn = 0, badchar = 0;
    for (;;) {
        if (tok->decoding_state < 0) {
            /* We already have a codec associated with
               this input. */
            line = fp_readl(s, size, tok);
            break;
        } else if (tok->decoding_state > 0) {
            /* We want a 'raw' read. */
            line = Py_UniversalNewlineFgets(s, size,
                                            tok->fp, NULL);
            warn = 1;
            break;
        } else {
            /* We have not yet determined the encoding.
               If an encoding is found, use the file-pointer
               reader functions from now on. */
            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
                return error_ret(tok);
            assert(tok->decoding_state != 0);
        }
    }
    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
            return error_ret(tok);
        }
    }
#ifndef PGEN
    if (warn && line && !tok->issued_encoding_warning && !tok->encoding) {
        unsigned char *c;
        for (c = (unsigned char *)line; *c; c++)
            if (*c > 127) {
                badchar = *c;
                break;
            }
    }
    if (badchar) {
        char buf[500];
        /* Need to add 1 to the line number, since this line
           has not been counted, yet. */
        sprintf(buf,
            "Non-ASCII character '\\x%.2x' "
            "in file %.200s on line %i, "
            "but no encoding declared; "
            "see http://www.python.org/peps/pep-0263.html for details",
            badchar, tok->filename, tok->lineno + 1);
        /* We don't use PyErr_WarnExplicit() here because
           printing the line in question to e.g. a log file
           could result in sensitive information being
           exposed. */
        PyErr_Warn(PyExc_DeprecationWarning, buf);
        tok->issued_encoding_warning = 1;
    }
#endif
    return line;
}
static int
decoding_feof(struct tok_state *tok)
{
    if (tok->decoding_state >= 0) {
        return feof(tok->fp);
    } else {
        PyObject* buf = tok->decoding_buffer;
        if (buf == NULL) {
            PyObject *args = PyTuple_New(0);
            if (args == NULL) {
                error_ret(tok);
                return 1;
            }
            buf = PyObject_Call(tok->decoding_readline,
                                args, NULL);
            Py_DECREF(args);
            if (buf == NULL) {
                error_ret(tok);
                return 1;
            } else {
                tok->decoding_buffer = buf;
            }
        }
        return PyObject_Length(buf) == 0;
    }
}

/* Fetch a byte from TOK, using the string buffer. */

static int buf_getc(struct tok_state *tok) {
    return Py_CHARMASK(*tok->str++);
}

/* Unfetch a byte from TOK, using the string buffer. */

static void buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c);  /* tok->cur may point to read-only segment */
}

/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding. */

static int buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}
/* Return a UTF-8 encoded Python string object from the
   C byte string STR, which is encoded with ENC. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    PyObject *utf8;
    PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (buf == NULL)
        return NULL;
    utf8 = PyUnicode_AsUTF8String(buf);
    Py_DECREF(buf);
    return utf8;
}
#endif

/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK. */

static const char *
decode_str(const char *str, struct tok_state *tok)
{
    PyObject* utf8 = NULL;
    const char *s;
    int lineno = 0;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return NULL;
    str = tok->str;             /* string after BOM if any */
    assert(str);
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return NULL;
        str = PyString_AsString(utf8);
    }
#endif
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    if (!check_coding_spec(str, s - str, tok, buf_setreadl))
        return NULL;
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return NULL;
        str = PyString_AsString(utf8);
    }
#endif
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}

#endif /* PGEN */
/* Set up tokenizer for string */

struct tok_state *
PyTokenizer_FromString(const char *str)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    str = (char *)decode_str(str, tok);
    if (str == NULL)
        return NULL;
    /* XXX: constify members. */
    tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
    return tok;
}

/* Set up tokenizer for file */

struct tok_state *
PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
        PyMem_DEL(tok);
        return NULL;
    }
    tok->cur = tok->inp = tok->buf;
    tok->end = tok->buf + BUFSIZ;
    tok->fp = fp;
    tok->prompt = ps1;
    tok->nextprompt = ps2;
    return tok;
}

/* Free a tok_state structure */

void
PyTokenizer_Free(struct tok_state *tok)
{
    if (tok->encoding != NULL)
        PyMem_DEL(tok->encoding);
#ifndef PGEN
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
#endif
    if (tok->fp != NULL && tok->buf != NULL)
        PyMem_DEL(tok->buf);
    PyMem_DEL(tok);
}
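/* Illustrative usage sketch: a caller (e.g. a parser driver) typically
   creates a tok_state, pulls tokens until ENDMARKER or an error token,
   and then frees the state.  The example_tokenize function below is
   hypothetical and kept out of the build with #if 0. */
#if 0
static void
example_tokenize(const char *source)
{
    char *start, *end;
    struct tok_state *tok = PyTokenizer_FromString(source);
    if (tok == NULL)
        return;
    for (;;) {
        int type = PyTokenizer_Get(tok, &start, &end);
        if (type == ENDMARKER || type == ERRORTOKEN)
            break;
        /* [start, end) delimits the token text inside tok's buffer */
    }
    PyTokenizer_Free(tok);
}
#endif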
/* Get next char, updating state; error code goes into tok->done */

static int
tok_nextc(register struct tok_state *tok)
{
    for (;;) {
        if (tok->cur != tok->inp) {
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK)
            return EOF;
        if (tok->fp == NULL) {
            char *end = strchr(tok->inp, '\n');
            if (end != NULL)
                end++;
            else {
                end = strchr(tok->inp, '\0');
                if (end == tok->inp) {
                    tok->done = E_EOF;
                    return EOF;
                }
            }
            if (tok->start == NULL)
                tok->buf = tok->cur;
            tok->lineno++;
            tok->inp = end;
            return Py_CHARMASK(*tok->cur++);
        }
        if (tok->prompt != NULL) {
            char *new = PyOS_Readline(stdin, stdout, tok->prompt);
            if (tok->nextprompt != NULL)
                tok->prompt = tok->nextprompt;
            if (new == NULL)
                tok->done = E_INTR;
            else if (*new == '\0') {
                PyMem_FREE(new);
                tok->done = E_EOF;
            }
            else if (tok->start != NULL) {
                size_t start = tok->start - tok->buf;
                size_t oldlen = tok->cur - tok->buf;
                size_t newlen = oldlen + strlen(new);
                char *buf = tok->buf;
                PyMem_RESIZE(buf, char, newlen+1);
                tok->lineno++;
                if (buf == NULL) {
                    PyMem_DEL(tok->buf);
                    tok->buf = NULL;
                    PyMem_FREE(new);
                    tok->done = E_NOMEM;
                    return EOF;
                }
                tok->buf = buf;
                tok->cur = tok->buf + oldlen;
                strcpy(tok->buf + oldlen, new);
                PyMem_FREE(new);
                tok->inp = tok->buf + newlen;
                tok->end = tok->inp + 1;
                tok->start = tok->buf + start;
            }
            else {
                tok->lineno++;
                if (tok->buf != NULL)
                    PyMem_DEL(tok->buf);
                tok->buf = new;
                tok->cur = tok->buf;
                tok->inp = strchr(tok->buf, '\0');
                tok->end = tok->inp + 1;
            }
        }
        else {
            int done = 0;
            int cur = 0;
            char *pt;
            if (tok->start == NULL) {
                if (tok->buf == NULL) {
                    tok->buf = PyMem_NEW(char, BUFSIZ);
                    if (tok->buf == NULL) {
                        tok->done = E_NOMEM;
                        return EOF;
                    }
                    tok->end = tok->buf + BUFSIZ;
                }
                if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
                                   tok) == NULL) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else {
                    tok->done = E_OK;
                    tok->inp = strchr(tok->buf, '\0');
                    done = tok->inp[-1] == '\n';
                }
            }
            else {
                cur = tok->cur - tok->buf;
                if (decoding_feof(tok)) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else
                    tok->done = E_OK;
            }
            tok->lineno++;
            /* Read until '\n' or EOF */
            while (!done) {
                int curstart = tok->start == NULL ? -1 :
                               tok->start - tok->buf;
                int curvalid = tok->inp - tok->buf;
                int newsize = curvalid + BUFSIZ;
                char *newbuf = tok->buf;
                PyMem_RESIZE(newbuf, char, newsize);
                if (newbuf == NULL) {
                    tok->done = E_NOMEM;
                    tok->cur = tok->inp;
                    return EOF;
                }
                tok->buf = newbuf;
                tok->inp = tok->buf + curvalid;
                tok->end = tok->buf + newsize;
                tok->start = curstart < 0 ? NULL :
                             tok->buf + curstart;
                if (decoding_fgets(tok->inp,
                                   (int)(tok->end - tok->inp),
                                   tok) == NULL) {
                    /* Last line does not end in \n,
                       fake one */
                    strcpy(tok->inp, "\n");
                }
                tok->inp = strchr(tok->inp, '\0');
                done = tok->inp[-1] == '\n';
            }
            tok->cur = tok->buf + cur;
#ifndef macintosh
            /* replace "\r\n" with "\n" */
            /* For Mac we leave the \r, giving a syntax error */
            pt = tok->inp - 2;
            if (pt >= tok->buf && *pt == '\r') {
                *pt++ = '\n';
                *pt = '\0';
                tok->inp = pt;
            }
#endif
        }
        if (tok->done != E_OK) {
            if (tok->prompt != NULL)
                PySys_WriteStderr("\n");
            tok->cur = tok->inp;
            return EOF;
        }
    }
    /*NOTREACHED*/
}

/* Back-up one character */

static void
tok_backup(register struct tok_state *tok, register int c)
{
    if (c != EOF) {
        if (--tok->cur < tok->buf)
            Py_FatalError("tok_backup: begin of buffer");
        if (*tok->cur != c)
            *tok->cur = c;
    }
}
/* Return the token corresponding to a single character */

int
PyToken_OneChar(int c)
{
    switch (c) {
    case '(':   return LPAR;
    case ')':   return RPAR;
    case '[':   return LSQB;
    case ']':   return RSQB;
    case ':':   return COLON;
    case ',':   return COMMA;
    case ';':   return SEMI;
    case '+':   return PLUS;
    case '-':   return MINUS;
    case '*':   return STAR;
    case '/':   return SLASH;
    case '|':   return VBAR;
    case '&':   return AMPER;
    case '<':   return LESS;
    case '>':   return GREATER;
    case '=':   return EQUAL;
    case '.':   return DOT;
    case '%':   return PERCENT;
    case '`':   return BACKQUOTE;
    case '{':   return LBRACE;
    case '}':   return RBRACE;
    case '^':   return CIRCUMFLEX;
    case '~':   return TILDE;
    default:    return OP;
    }
}

int
PyToken_TwoChars(int c1, int c2)
{
    switch (c1) {
    case '=':
        switch (c2) {
        case '=':   return EQEQUAL;
        }
        break;
    case '!':
        switch (c2) {
        case '=':   return NOTEQUAL;
        }
        break;
    case '<':
        switch (c2) {
        case '>':   return NOTEQUAL;
        case '=':   return LESSEQUAL;
        case '<':   return LEFTSHIFT;
        }
        break;
    case '>':
        switch (c2) {
        case '=':   return GREATEREQUAL;
        case '>':   return RIGHTSHIFT;
        }
        break;
    case '+':
        switch (c2) {
        case '=':   return PLUSEQUAL;
        }
        break;
    case '-':
        switch (c2) {
        case '=':   return MINEQUAL;
        }
        break;
    case '*':
        switch (c2) {
        case '*':   return DOUBLESTAR;
        case '=':   return STAREQUAL;
        }
        break;
    case '/':
        switch (c2) {
        case '/':   return DOUBLESLASH;
        case '=':   return SLASHEQUAL;
        }
        break;
    case '|':
        switch (c2) {
        case '=':   return VBAREQUAL;
        }
        break;
    case '%':
        switch (c2) {
        case '=':   return PERCENTEQUAL;
        }
        break;
    case '&':
        switch (c2) {
        case '=':   return AMPEREQUAL;
        }
        break;
    case '^':
        switch (c2) {
        case '=':   return CIRCUMFLEXEQUAL;
        }
        break;
    }
    return OP;
}

int
PyToken_ThreeChars(int c1, int c2, int c3)
{
    switch (c1) {
    case '<':
        switch (c2) {
        case '<':
            switch (c3) {
            case '=':
                return LEFTSHIFTEQUAL;
            }
            break;
        }
        break;
    case '>':
        switch (c2) {
        case '>':
            switch (c3) {
            case '=':
                return RIGHTSHIFTEQUAL;
            }
            break;
        }
        break;
    case '*':
        switch (c2) {
        case '*':
            switch (c3) {
            case '=':
                return DOUBLESTAREQUAL;
            }
            break;
        }
        break;
    case '/':
        switch (c2) {
        case '/':
            switch (c3) {
            case '=':
                return DOUBLESLASHEQUAL;
            }
            break;
        }
        break;
    }
    return OP;
}

static int
indenterror(struct tok_state *tok)
{
    if (tok->alterror) {
        tok->done = E_TABSPACE;
        tok->cur = tok->inp;
        return 1;
    }
    if (tok->altwarning) {
        PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
                          "in indentation\n", tok->filename);
        tok->altwarning = 0;
    }
    return 0;
}
/* Get next token, after space stripping etc. */

static int
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
    register int c;
    int blankline;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        register int col = 0;
        register int altcol = 0;
        tok->atbol = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ')
                col++, altcol++;
            else if (c == '\t') {
                col = (col/tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol/tok->alttabsize + 1)
                         * tok->alttabsize;
            }
            else if (c == '\014') /* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            else
                break;
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL)
                blankline = 0; /* Let it through */
            else
                blankline = 1; /* Ignore completely */
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        if (!blankline && tok->level == 0) {
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                       col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }
 again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur - 1;

    /* Skip comment, while looking for tab-setting magic */
    if (c == '#') {
        static char *tabforms[] = {
            "tab-width:",               /* Emacs */
            ":tabstop=",                /* vim, full form */
            ":ts=",                     /* vim, abbreviated form */
            "set tabsize=",             /* will vi never die? */
        /* more templates can be added here to support other editors */
        };
        char cbuf[80];
        char *tp, **cp;
        tp = cbuf;
        do {
            *tp++ = c = tok_nextc(tok);
        } while (c != EOF && c != '\n' &&
                 tp - cbuf + 1 < sizeof(cbuf));
        *tp = '\0';
        for (cp = tabforms;
             cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
             cp++) {
            if ((tp = strstr(cbuf, *cp))) {
                int newsize = atoi(tp + strlen(*cp));

                if (newsize >= 1 && newsize <= 40) {
                    tok->tabsize = newsize;
                    if (Py_VerboseFlag)
                        PySys_WriteStderr(
                            "Tab size set to %d\n",
                            newsize);
                }
            }
        }
        while (c != EOF && c != '\n')
            c = tok_nextc(tok);
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    if (isalpha(c) || c == '_') {
        /* Process r"", u"" and ur"" */
        switch (c) {
        case 'r':
        case 'R':
            c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'u':
        case 'U':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        }
        while (isalnum(c) || c == '_') {
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        if (blankline || tok->level > 0)
            goto nextline;
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        return NEWLINE;
    }

#ifdef macintosh
    if (c == '\r') {
        PySys_WriteStderr(
            "File contains \\r characters (incorrect line endings?)\n");
        tok->done = E_TOKEN;
        tok->cur = tok->inp;
        return ERRORTOKEN;
    }
#endif
    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        }
        else {
            tok_backup(tok, c);
            *p_start = tok->start;
            *p_end = tok->cur;
            return DOT;
        }
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex or octal -- maybe. */
            c = tok_nextc(tok);
            if (c == '.')
                goto fraction;
#ifndef WITHOUT_COMPLEX
            if (c == 'j' || c == 'J')
                goto imaginary;
#endif
            if (c == 'x' || c == 'X') {
                /* Hex */
                do {
                    c = tok_nextc(tok);
                } while (isxdigit(c));
            }
            else {
                int found_decimal = 0;
                /* Octal; c is first char of it */
                /* There's no 'isoctdigit' macro, sigh */
                while ('0' <= c && c < '8') {
                    c = tok_nextc(tok);
                }
                if (isdigit(c)) {
                    found_decimal = 1;
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == '.')
                    goto fraction;
                else if (c == 'e' || c == 'E')
                    goto exponent;
#ifndef WITHOUT_COMPLEX
                else if (c == 'j' || c == 'J')
                    goto imaginary;
#endif
                else if (found_decimal) {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
            }
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
        }
        else {
            /* Decimal */
            do {
                c = tok_nextc(tok);
            } while (isdigit(c));
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
            else {
                /* Accept floating point numbers. */
                if (c == '.') {
        fraction:
                    /* Fraction */
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == 'e' || c == 'E') {
        exponent:
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-')
                        c = tok_nextc(tok);
                    if (!isdigit(c)) {
                        tok->done = E_TOKEN;
                        tok_backup(tok, c);
                        return ERRORTOKEN;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
#ifndef WITHOUT_COMPLEX
                if (c == 'j' || c == 'J')
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
#endif
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }
  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        int quote2 = tok->cur - tok->start + 1;
        int quote = c;
        int triple = 0;
        int tripcount = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == '\n') {
                if (!triple) {
                    tok->done = E_EOLS;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                tripcount = 0;
                tok->cont_line = 1; /* multiline string. */
            }
            else if (c == EOF) {
                if (triple)
                    tok->done = E_EOFS;
                else
                    tok->done = E_EOLS;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            else if (c == quote) {
                tripcount++;
                if (tok->cur - tok->start == quote2) {
                    c = tok_nextc(tok);
                    if (c == quote) {
                        triple = 1;
                        tripcount = 0;
                        continue;
                    }
                    tok_backup(tok, c);
                }
                if (!triple || tripcount == 3)
                    break;
            }
            else if (c == '\\') {
                tripcount = 0;
                c = tok_nextc(tok);
                if (c == EOF) {
                    tok->done = E_EOLS;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
            }
            else
                tripcount = 0;
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        c = tok_nextc(tok);
        if (c != '\n') {
            tok->done = E_TOKEN;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
        if (token != OP) {
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            } else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        tok->level--;
        break;
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return PyToken_OneChar(c);
}
int
PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
{
    int result = tok_get(tok, p_start, p_end);
    if (tok->decoding_erred) {
        result = ERRORTOKEN;
        tok->done = E_DECODE;
    }
    return result;
}

#ifdef Py_DEBUG

void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    if (type == NAME || type == NUMBER || type == STRING || type == OP)
        printf("(%.*s)", (int)(end - start), start);
}

#endif