Parser/tokenizer.c

   1
   2 /* Tokenizer implementation */
   3
   4 #include "Python.h"
   5 #include "pgenheaders.h"
   6
   7 #include <ctype.h>
   8 #include <assert.h>
   9
  10 #include "tokenizer.h"
  11 #include "errcode.h"
  12
  13 #ifndef PGEN
  14 #include "unicodeobject.h"
  15 #include "stringobject.h"
  16 #include "fileobject.h"
  17 #include "codecs.h"
  18 #include "abstract.h"
  19 #include "pydebug.h"
  20 #endif /* PGEN */
  21
  22 extern char *PyOS_Readline(FILE *, FILE *, char *);
  23 /* Return malloc'ed string including trailing \n;
  24    empty malloc'ed string for EOF;
  25    NULL if interrupted */
  26
  27 /* Don't ever change this -- it would break the portability of Python code */
  28 #define TABSIZE 8
  29
  30 /* Forward */
  31 static struct tok_state *tok_new(void);
  32 static int tok_nextc(struct tok_state *tok);
  33 static void tok_backup(struct tok_state *tok, int c);
  34
  35 /* Token names */
  36
  37 char *_PyParser_TokenNames[] = {
  38     "ENDMARKER",
  39     "NAME",
  40     "NUMBER",
  41     "STRING",
  42     "NEWLINE",
  43     "INDENT",
  44     "DEDENT",
  45     "LPAR",
  46     "RPAR",
  47     "LSQB",
  48     "RSQB",
  49     "COLON",
  50     "COMMA",
  51     "SEMI",
  52     "PLUS",
  53     "MINUS",
  54     "STAR",
  55     "SLASH",
  56     "VBAR",
  57     "AMPER",
  58     "LESS",
  59     "GREATER",
  60     "EQUAL",
  61     "DOT",
  62     "PERCENT",
  63     "BACKQUOTE",
  64     "LBRACE",
  65     "RBRACE",
  66     "EQEQUAL",
  67     "NOTEQUAL",
  68     "LESSEQUAL",
  69     "GREATEREQUAL",
  70     "TILDE",
  71     "CIRCUMFLEX",
  72     "LEFTSHIFT",
  73     "RIGHTSHIFT",
  74     "DOUBLESTAR",
  75     "PLUSEQUAL",
  76     "MINEQUAL",
  77     "STAREQUAL",
  78     "SLASHEQUAL",
  79     "PERCENTEQUAL",
  80     "AMPEREQUAL",
  81     "VBAREQUAL",
  82     "CIRCUMFLEXEQUAL",
  83     "LEFTSHIFTEQUAL",
  84     "RIGHTSHIFTEQUAL",
  85     "DOUBLESTAREQUAL",
  86     "DOUBLESLASH",
  87     "DOUBLESLASHEQUAL",
  88     "AT",
  89     /* This table must match the #defines in token.h! */
  90     "OP",
  91     "<ERRORTOKEN>",
  92     "<N_TOKENS>"
  93 };
  94
  95 /* Create and initialize a new tok_state structure */
  96
  97 static struct tok_state *
  98 tok_new(void)
  99 {
 100     struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
 101                                             sizeof(struct tok_state));
 102     if (tok == NULL)
 103         return NULL;
 104     tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
 105     tok->done = E_OK;
 106     tok->fp = NULL;
 107     tok->input = NULL;
 108     tok->tabsize = TABSIZE;
 109     tok->indent = 0;
 110     tok->indstack[0] = 0;
 111     tok->atbol = 1;
 112     tok->pendin = 0;
 113     tok->prompt = tok->nextprompt = NULL;
 114     tok->lineno = 0;
 115     tok->level = 0;
 116     tok->filename = NULL;
 117     tok->altwarning = 0;
 118     tok->alterror = 0;
 119     tok->alttabsize = 1;
 120     tok->altindstack[0] = 0;
 121     tok->decoding_state = 0;
 122     tok->decoding_erred = 0;
 123     tok->read_coding_spec = 0;
 124     tok->encoding = NULL;
 125     tok->cont_line = 0;
 126 #ifndef PGEN
 127     tok->decoding_readline = NULL;
 128     tok->decoding_buffer = NULL;
 129 #endif
 130     return tok;
 131 }
 132
 133 static char *
 134 new_string(const char *s, Py_ssize_t len)
 135 {
 136     char* result = (char *)PyMem_MALLOC(len + 1);
 137     if (result != NULL) {
 138         memcpy(result, s, len);
 139         result[len] = '\0';
 140     }
 141     return result;
 142 }
 143
 144 #ifdef PGEN
 145
 146 static char *
 147 decoding_fgets(char *s, int size, struct tok_state *tok)
 148 {
 149     return fgets(s, size, tok->fp);
 150 }
 151
 152 static int
 153 decoding_feof(struct tok_state *tok)
 154 {
 155     return feof(tok->fp);
 156 }
 157
 158 static char *
 159 decode_str(const char *str, int exec_input, struct tok_state *tok)
 160 {
 161     return new_string(str, strlen(str));
 162 }
 163
 164 #else /* PGEN */
 165
 166 static char *
 167 error_ret(struct tok_state *tok) /* XXX */
 168 {
 169     tok->decoding_erred = 1;
 170     if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
 171         PyMem_FREE(tok->buf);
 172     tok->buf = NULL;
 173     return NULL;                /* as if it were EOF */
 174 }
 175
 176
 177 static char *
 178 get_normal_name(char *s)        /* for utf-8 and latin-1 */
 179 {
 180     char buf[13];
 181     int i;
 182     for (i = 0; i < 12; i++) {
 183         int c = s[i];
 184         if (c == '\0')
 185             break;
 186         else if (c == '_')
 187             buf[i] = '-';
 188         else
 189             buf[i] = tolower(c);
 190     }
 191     buf[i] = '\0';
 192     if (strcmp(buf, "utf-8") == 0 ||
 193         strncmp(buf, "utf-8-", 6) == 0)
 194         return "utf-8";
 195     else if (strcmp(buf, "latin-1") == 0 ||
 196              strcmp(buf, "iso-8859-1") == 0 ||
 197              strcmp(buf, "iso-latin-1") == 0 ||
 198              strncmp(buf, "latin-1-", 8) == 0 ||
 199              strncmp(buf, "iso-8859-1-", 11) == 0 ||
 200              strncmp(buf, "iso-latin-1-", 12) == 0)
 201         return "iso-8859-1";
 202     else
 203         return s;
 204 }
 205
 206 /* Return the coding spec in S, or NULL if none is found.  */
 207
 208 static char *
 209 get_coding_spec(const char *s, Py_ssize_t size)
 210 {
 211     Py_ssize_t i;
 212     /* Coding spec must be in a comment, and that comment must be
 213      * the only statement on the source code line. */
 214     for (i = 0; i < size - 6; i++) {
 215         if (s[i] == '#')
 216             break;
 217         if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
 218             return NULL;
 219     }
 220     for (; i < size - 6; i++) { /* XXX inefficient search */
 221         const char* t = s + i;
 222         if (strncmp(t, "coding", 6) == 0) {
 223             const char* begin = NULL;
 224             t += 6;
 225             if (t[0] != ':' && t[0] != '=')
 226                 continue;
 227             do {
 228                 t++;
 229             } while (t[0] == '\x20' || t[0] == '\t');
 230
 231             begin = t;
 232             while (Py_ISALNUM(t[0]) ||
 233                    t[0] == '-' || t[0] == '_' || t[0] == '.')
 234                 t++;
 235
 236             if (begin < t) {
 237                 char* r = new_string(begin, t - begin);
 238                 char* q = get_normal_name(r);
 239                 if (r != q) {
 240                     PyMem_FREE(r);
 241                     r = new_string(q, strlen(q));
 242                 }
 243                 return r;
 244             }
 245         }
 246     }
 247     return NULL;
 248 }
 249
 250 /* Check whether the line contains a coding spec. If it does,
 251    invoke the set_readline function for the new encoding.
 252    This function receives the tok_state and the new encoding.
 253    Return 1 on success, 0 on failure.  */
 254
 255 static int
 256 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
 257                   int set_readline(struct tok_state *, const char *))
 258 {
 259     char * cs;
 260     int r = 1;
 261
 262     if (tok->cont_line)
 263         /* It's a continuation line, so it can't be a coding spec. */
 264         return 1;
 265     cs = get_coding_spec(line, size);
 266     if (cs != NULL) {
 267         tok->read_coding_spec = 1;
 268         if (tok->encoding == NULL) {
 269             assert(tok->decoding_state == 1); /* raw */
 270             if (strcmp(cs, "utf-8") == 0 ||
 271                 strcmp(cs, "iso-8859-1") == 0) {
 272                 tok->encoding = cs;
 273             } else {
 274 #ifdef Py_USING_UNICODE
 275                 r = set_readline(tok, cs);
 276                 if (r) {
 277                     tok->encoding = cs;
 278                     tok->decoding_state = -1;
 279                 }
 280                 else
 281                     PyMem_FREE(cs);
 282 #else
 283                 /* Without Unicode support, we cannot
 284                    process the coding spec. Since there
 285                    won't be any Unicode literals, that
 286                    won't matter. */
 287                 PyMem_FREE(cs);
 288 #endif
 289             }
 290         } else {                /* then, compare cs with BOM */
 291             r = (strcmp(tok->encoding, cs) == 0);
 292             PyMem_FREE(cs);
 293         }
 294     }
 295     if (!r) {
 296         cs = tok->encoding;
 297         if (!cs)
 298             cs = "with BOM";
 299         PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
 300     }
 301     return r;
 302 }
 303
 304 /* See whether the file starts with a BOM. If it does,
 305    invoke the set_readline function with the new encoding.
 306    Return 1 on success, 0 on failure.  */
 307
 308 static int
 309 check_bom(int get_char(struct tok_state *),
 310           void unget_char(int, struct tok_state *),
 311           int set_readline(struct tok_state *, const char *),
 312           struct tok_state *tok)
 313 {
 314     int ch1, ch2, ch3;
 315     ch1 = get_char(tok);
 316     tok->decoding_state = 1;
 317     if (ch1 == EOF) {
 318         return 1;
 319     } else if (ch1 == 0xEF) {
 320         ch2 = get_char(tok);
 321         if (ch2 != 0xBB) {
 322             unget_char(ch2, tok);
 323             unget_char(ch1, tok);
 324             return 1;
 325         }
 326         ch3 = get_char(tok);
 327         if (ch3 != 0xBF) {
 328             unget_char(ch3, tok);
 329             unget_char(ch2, tok);
 330             unget_char(ch1, tok);
 331             return 1;
 332         }
 333 #if 0
 334     /* Disable support for UTF-16 BOMs until a decision
 335        is made whether this needs to be supported.  */
 336     } else if (ch1 == 0xFE) {
 337         ch2 = get_char(tok);
 338         if (ch2 != 0xFF) {
 339             unget_char(ch2, tok);
 340             unget_char(ch1, tok);
 341             return 1;
 342         }
 343         if (!set_readline(tok, "utf-16-be"))
 344             return 0;
 345         tok->decoding_state = -1;
 346     } else if (ch1 == 0xFF) {
 347         ch2 = get_char(tok);
 348         if (ch2 != 0xFE) {
 349             unget_char(ch2, tok);
 350             unget_char(ch1, tok);
 351             return 1;
 352         }
 353         if (!set_readline(tok, "utf-16-le"))
 354             return 0;
 355         tok->decoding_state = -1;
 356 #endif
 357     } else {
 358         unget_char(ch1, tok);
 359         return 1;
 360     }
 361     if (tok->encoding != NULL)
 362         PyMem_FREE(tok->encoding);
 363     tok->encoding = new_string("utf-8", 5);     /* resulting is in utf-8 */
 364     return 1;
 365 }
 366
 367 /* Read a line of text from TOK into S, using the stream in TOK.
 368    Return NULL on failure, else S.
 369
 370    On entry, tok->decoding_buffer will be one of:
 371      1) NULL: need to call tok->decoding_readline to get a new line
 372      2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
 373        stored the result in tok->decoding_buffer
 374      3) PyStringObject *: previous call to fp_readl did not have enough room
 375        (in the s buffer) to copy entire contents of the line read
 376        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
 377        In this case, fp_readl is called in a loop (with an expanded buffer)
 378        until the buffer ends with a '\n' (or until the end of the file is
 379        reached): see tok_nextc and its calls to decoding_fgets.
 380 */
 381
 382 static char *
 383 fp_readl(char *s, int size, struct tok_state *tok)
 384 {
 385 #ifndef Py_USING_UNICODE
 386     /* In a non-Unicode built, this should never be called. */
 387     Py_FatalError("fp_readl should not be called in this build.");
 388     return NULL; /* Keep compiler happy (not reachable) */
 389 #else
 390     PyObject* utf8 = NULL;
 391     PyObject* buf = tok->decoding_buffer;
 392     char *str;
 393     Py_ssize_t utf8len;
 394
 395     /* Ask for one less byte so we can terminate it */
 396     assert(size > 0);
 397     size--;
 398
 399     if (buf == NULL) {
 400         buf = PyObject_CallObject(tok->decoding_readline, NULL);
 401         if (buf == NULL)
 402             return error_ret(tok);
 403     } else {
 404         tok->decoding_buffer = NULL;
 405         if (PyString_CheckExact(buf))
 406             utf8 = buf;
 407     }
 408     if (utf8 == NULL) {
 409         utf8 = PyUnicode_AsUTF8String(buf);
 410         Py_DECREF(buf);
 411         if (utf8 == NULL)
 412             return error_ret(tok);
 413     }
 414     str = PyString_AsString(utf8);
 415     utf8len = PyString_GET_SIZE(utf8);
 416     if (utf8len > size) {
 417         tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
 418         if (tok->decoding_buffer == NULL) {
 419             Py_DECREF(utf8);
 420             return error_ret(tok);
 421         }
 422         utf8len = size;
 423     }
 424     memcpy(s, str, utf8len);
 425     s[utf8len] = '\0';
 426     Py_DECREF(utf8);
 427     if (utf8len == 0)
 428         return NULL; /* EOF */
 429     return s;
 430 #endif
 431 }
 432
 433 /* Set the readline function for TOK to a StreamReader's
 434    readline function. The StreamReader is named ENC.
 435
 436    This function is called from check_bom and check_coding_spec.
 437
 438    ENC is usually identical to the future value of tok->encoding,
 439    except for the (currently unsupported) case of UTF-16.
 440
 441    Return 1 on success, 0 on failure. */
 442
 443 static int
 444 fp_setreadl(struct tok_state *tok, const char* enc)
 445 {
 446     PyObject *reader, *stream, *readline;
 447
 448     /* XXX: constify filename argument. */
 449     stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
 450     if (stream == NULL)
 451         return 0;
 452
 453     reader = PyCodec_StreamReader(enc, stream, NULL);
 454     Py_DECREF(stream);
 455     if (reader == NULL)
 456         return 0;
 457
 458     readline = PyObject_GetAttrString(reader, "readline");
 459     Py_DECREF(reader);
 460     if (readline == NULL)
 461         return 0;
 462
 463     tok->decoding_readline = readline;
 464     return 1;
 465 }
 466
 467 /* Fetch the next byte from TOK. */
 468
 469 static int fp_getc(struct tok_state *tok) {
 470     return getc(tok->fp);
 471 }
 472
 473 /* Unfetch the last byte back into TOK.  */
 474
 475 static void fp_ungetc(int c, struct tok_state *tok) {
 476     ungetc(c, tok->fp);
 477 }
 478
 479 /* Read a line of input from TOK. Determine encoding
 480    if necessary.  */
 481
 482 static char *
 483 decoding_fgets(char *s, int size, struct tok_state *tok)
 484 {
 485     char *line = NULL;
 486     int badchar = 0;
 487     for (;;) {
 488         if (tok->decoding_state < 0) {
 489             /* We already have a codec associated with
 490                this input. */
 491             line = fp_readl(s, size, tok);
 492             break;
 493         } else if (tok->decoding_state > 0) {
 494             /* We want a 'raw' read. */
 495             line = Py_UniversalNewlineFgets(s, size,
 496                                             tok->fp, NULL);
 497             break;
 498         } else {
 499             /* We have not yet determined the encoding.
 500                If an encoding is found, use the file-pointer
 501                reader functions from now on. */
 502             if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
 503                 return error_ret(tok);
 504             assert(tok->decoding_state != 0);
 505         }
 506     }
 507     if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
 508         if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
 509             return error_ret(tok);
 510         }
 511     }
 512 #ifndef PGEN
 513     /* The default encoding is ASCII, so make sure we don't have any
 514        non-ASCII bytes in it. */
 515     if (line && !tok->encoding) {
 516         unsigned char *c;
 517         for (c = (unsigned char *)line; *c; c++)
 518             if (*c > 127) {
 519                 badchar = *c;
 520                 break;
 521             }
 522     }
 523     if (badchar) {
 524         char buf[500];
 525         /* Need to add 1 to the line number, since this line
 526            has not been counted, yet.  */
 527         sprintf(buf,
 528             "Non-ASCII character '\\x%.2x' "
 529             "in file %.200s on line %i, "
 530             "but no encoding declared; "
 531             "see http://www.python.org/peps/pep-0263.html for details",
 532             badchar, tok->filename, tok->lineno + 1);
 533         PyErr_SetString(PyExc_SyntaxError, buf);
 534         return error_ret(tok);
 535     }
 536 #endif
 537     return line;
 538 }
 539
 540 static int
 541 decoding_feof(struct tok_state *tok)
 542 {
 543     if (tok->decoding_state >= 0) {
 544         return feof(tok->fp);
 545     } else {
 546         PyObject* buf = tok->decoding_buffer;
 547         if (buf == NULL) {
 548             buf = PyObject_CallObject(tok->decoding_readline, NULL);
 549             if (buf == NULL) {
 550                 error_ret(tok);
 551                 return 1;
 552             } else {
 553                 tok->decoding_buffer = buf;
 554             }
 555         }
 556         return PyObject_Length(buf) == 0;
 557     }
 558 }
 559
 560 /* Fetch a byte from TOK, using the string buffer. */
 561
 562 static int
 563 buf_getc(struct tok_state *tok) {
 564     return Py_CHARMASK(*tok->str++);
 565 }
 566
 567 /* Unfetch a byte from TOK, using the string buffer. */
 568
 569 static void
 570 buf_ungetc(int c, struct tok_state *tok) {
 571     tok->str--;
 572     assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
 573 }
 574
 575 /* Set the readline function for TOK to ENC. For the string-based
 576    tokenizer, this means to just record the encoding. */
 577
 578 static int
 579 buf_setreadl(struct tok_state *tok, const char* enc) {
 580     tok->enc = enc;
 581     return 1;
 582 }
 583
 584 /* Return a UTF-8 encoding Python string object from the
 585    C byte string STR, which is encoded with ENC. */
 586
 587 #ifdef Py_USING_UNICODE
 588 static PyObject *
 589 translate_into_utf8(const char* str, const char* enc) {
 590     PyObject *utf8;
 591     PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
 592     if (buf == NULL)
 593         return NULL;
 594     utf8 = PyUnicode_AsUTF8String(buf);
 595     Py_DECREF(buf);
 596     return utf8;
 597 }
 598 #endif
 599
 600
 601 static char *
 602 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
 603     int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
 604     char *buf, *current;
 605     char c = '\0';
 606     buf = PyMem_MALLOC(needed_length);
 607     if (buf == NULL) {
 608         tok->done = E_NOMEM;
 609         return NULL;
 610     }
 611     for (current = buf; *s; s++, current++) {
 612         c = *s;
 613         if (skip_next_lf) {
 614             skip_next_lf = 0;
 615             if (c == '\n') {
 616                 c = *++s;
 617                 if (!c)
 618                     break;
 619             }
 620         }
 621         if (c == '\r') {
 622             skip_next_lf = 1;
 623             c = '\n';
 624         }
 625         *current = c;
 626     }
 627     /* If this is exec input, add a newline to the end of the string if
 628        there isn't one already. */
 629     if (exec_input && c != '\n') {
 630         *current = '\n';
 631         current++;
 632     }
 633     *current = '\0';
 634     final_length = current - buf + 1;
 635     if (final_length < needed_length && final_length)
 636         /* should never fail */
 637         buf = PyMem_REALLOC(buf, final_length);
 638     return buf;
 639 }
 640
 641 /* Decode a byte string STR for use as the buffer of TOK.
 642    Look for encoding declarations inside STR, and record them
 643    inside TOK.  */
 644
 645 static const char *
 646 decode_str(const char *input, int single, struct tok_state *tok)
 647 {
 648     PyObject* utf8 = NULL;
 649     const char *str;
 650     const char *s;
 651     const char *newl[2] = {NULL, NULL};
 652     int lineno = 0;
 653     tok->input = str = translate_newlines(input, single, tok);
 654     if (str == NULL)
 655         return NULL;
 656     tok->enc = NULL;
 657     tok->str = str;
 658     if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
 659         return error_ret(tok);
 660     str = tok->str;             /* string after BOM if any */
 661     assert(str);
 662 #ifdef Py_USING_UNICODE
 663     if (tok->enc != NULL) {
 664         utf8 = translate_into_utf8(str, tok->enc);
 665         if (utf8 == NULL)
 666             return error_ret(tok);
 667         str = PyString_AsString(utf8);
 668     }
 669 #endif
 670     for (s = str;; s++) {
 671         if (*s == '\0') break;
 672         else if (*s == '\n') {
 673             assert(lineno < 2);
 674             newl[lineno] = s;
 675             lineno++;
 676             if (lineno == 2) break;
 677         }
 678     }
 679     tok->enc = NULL;
 680     /* need to check line 1 and 2 separately since check_coding_spec
 681        assumes a single line as input */
 682     if (newl[0]) {
 683         if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
 684             return error_ret(tok);
 685         if (tok->enc == NULL && newl[1]) {
 686             if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
 687                                    tok, buf_setreadl))
 688                 return error_ret(tok);
 689         }
 690     }
 691 #ifdef Py_USING_UNICODE
 692     if (tok->enc != NULL) {
 693         assert(utf8 == NULL);
 694         utf8 = translate_into_utf8(str, tok->enc);
 695         if (utf8 == NULL)
 696             return error_ret(tok);
 697         str = PyString_AsString(utf8);
 698     }
 699 #endif
 700     assert(tok->decoding_buffer == NULL);
 701     tok->decoding_buffer = utf8; /* CAUTION */
 702     return str;
 703 }
 704
 705 #endif /* PGEN */
 706
 707 /* Set up tokenizer for string */
 708
 709 struct tok_state *
 710 PyTokenizer_FromString(const char *str, int exec_input)
 711 {
 712     struct tok_state *tok = tok_new();
 713     if (tok == NULL)
 714         return NULL;
 715     str = (char *)decode_str(str, exec_input, tok);
 716     if (str == NULL) {
 717         PyTokenizer_Free(tok);
 718         return NULL;
 719     }
 720
 721     /* XXX: constify members. */
 722     tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
 723     return tok;
 724 }
 725
 726
 727 /* Set up tokenizer for file */
 728
 729 struct tok_state *
 730 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
 731 {
 732     struct tok_state *tok = tok_new();
 733     if (tok == NULL)
 734         return NULL;
 735     if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
 736         PyTokenizer_Free(tok);
 737         return NULL;
 738     }
 739     tok->cur = tok->inp = tok->buf;
 740     tok->end = tok->buf + BUFSIZ;
 741     tok->fp = fp;
 742     tok->prompt = ps1;
 743     tok->nextprompt = ps2;
 744     return tok;
 745 }
 746
 747
 748 /* Free a tok_state structure */
 749
 750 void
 751 PyTokenizer_Free(struct tok_state *tok)
 752 {
 753     if (tok->encoding != NULL)
 754         PyMem_FREE(tok->encoding);
 755 #ifndef PGEN
 756     Py_XDECREF(tok->decoding_readline);
 757     Py_XDECREF(tok->decoding_buffer);
 758 #endif
 759     if (tok->fp != NULL && tok->buf != NULL)
 760         PyMem_FREE(tok->buf);
 761     if (tok->input)
 762         PyMem_FREE((char *)tok->input);
 763     PyMem_FREE(tok);
 764 }
 765
 766 #if !defined(PGEN) && defined(Py_USING_UNICODE)
 767 static int
 768 tok_stdin_decode(struct tok_state *tok, char **inp)
 769 {
 770     PyObject *enc, *sysstdin, *decoded, *utf8;
 771     const char *encoding;
 772     char *converted;
 773
 774     if (PySys_GetFile((char *)"stdin", NULL) != stdin)
 775         return 0;
 776     sysstdin = PySys_GetObject("stdin");
 777     if (sysstdin == NULL || !PyFile_Check(sysstdin))
 778         return 0;
 779
 780     enc = ((PyFileObject *)sysstdin)->f_encoding;
 781     if (enc == NULL || !PyString_Check(enc))
 782         return 0;
 783     Py_INCREF(enc);
 784
 785     encoding = PyString_AsString(enc);
 786     decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
 787     if (decoded == NULL)
 788         goto error_clear;
 789
 790     utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
 791     Py_DECREF(decoded);
 792     if (utf8 == NULL)
 793         goto error_clear;
 794
 795     assert(PyString_Check(utf8));
 796     converted = new_string(PyString_AS_STRING(utf8),
 797                            PyString_GET_SIZE(utf8));
 798     Py_DECREF(utf8);
 799     if (converted == NULL)
 800         goto error_nomem;
 801
 802     PyMem_FREE(*inp);
 803     *inp = converted;
 804     if (tok->encoding != NULL)
 805         PyMem_FREE(tok->encoding);
 806     tok->encoding = new_string(encoding, strlen(encoding));
 807     if (tok->encoding == NULL)
 808         goto error_nomem;
 809
 810     Py_DECREF(enc);
 811     return 0;
 812
 813 error_nomem:
 814     Py_DECREF(enc);
 815     tok->done = E_NOMEM;
 816     return -1;
 817
 818 error_clear:
 819     Py_DECREF(enc);
 820     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
 821         tok->done = E_ERROR;
 822         return -1;
 823     }
 824     /* Fallback to iso-8859-1: for backward compatibility */
 825     PyErr_Clear();
 826     return 0;
 827 }
 828 #endif
 829
 830 /* Get next char, updating state; error code goes into tok->done */
 831
 832 static int
 833 tok_nextc(register struct tok_state *tok)
 834 {
 835     for (;;) {
 836         if (tok->cur != tok->inp) {
 837             return Py_CHARMASK(*tok->cur++); /* Fast path */
 838         }
 839         if (tok->done != E_OK)
 840             return EOF;
 841         if (tok->fp == NULL) {
 842             char *end = strchr(tok->inp, '\n');
 843             if (end != NULL)
 844                 end++;
 845             else {
 846                 end = strchr(tok->inp, '\0');
 847                 if (end == tok->inp) {
 848                     tok->done = E_EOF;
 849                     return EOF;
 850                 }
 851             }
 852             if (tok->start == NULL)
 853                 tok->buf = tok->cur;
 854             tok->line_start = tok->cur;
 855             tok->lineno++;
 856             tok->inp = end;
 857             return Py_CHARMASK(*tok->cur++);
 858         }
 859         if (tok->prompt != NULL) {
 860             char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
 861             if (tok->nextprompt != NULL)
 862                 tok->prompt = tok->nextprompt;
 863             if (newtok == NULL)
 864                 tok->done = E_INTR;
 865             else if (*newtok == '\0') {
 866                 PyMem_FREE(newtok);
 867                 tok->done = E_EOF;
 868             }
 869 #if !defined(PGEN) && defined(Py_USING_UNICODE)
 870             else if (tok_stdin_decode(tok, &newtok) != 0)
 871                 PyMem_FREE(newtok);
 872 #endif
 873             else if (tok->start != NULL) {
 874                 size_t start = tok->start - tok->buf;
 875                 size_t oldlen = tok->cur - tok->buf;
 876                 size_t newlen = oldlen + strlen(newtok);
 877                 char *buf = tok->buf;
 878                 buf = (char *)PyMem_REALLOC(buf, newlen+1);
 879                 tok->lineno++;
 880                 if (buf == NULL) {
 881                     PyMem_FREE(tok->buf);
 882                     tok->buf = NULL;
 883                     PyMem_FREE(newtok);
 884                     tok->done = E_NOMEM;
 885                     return EOF;
 886                 }
 887                 tok->buf = buf;
 888                 tok->cur = tok->buf + oldlen;
 889                 tok->line_start = tok->cur;
 890                 strcpy(tok->buf + oldlen, newtok);
 891                 PyMem_FREE(newtok);
 892                 tok->inp = tok->buf + newlen;
 893                 tok->end = tok->inp + 1;
 894                 tok->start = tok->buf + start;
 895             }
 896             else {
 897                 tok->lineno++;
 898                 if (tok->buf != NULL)
 899                     PyMem_FREE(tok->buf);
 900                 tok->buf = newtok;
 901                 tok->line_start = tok->buf;
 902                 tok->cur = tok->buf;
 903                 tok->line_start = tok->buf;
 904                 tok->inp = strchr(tok->buf, '\0');
 905                 tok->end = tok->inp + 1;
 906             }
 907         }
 908         else {
 909             int done = 0;
 910             Py_ssize_t cur = 0;
 911             char *pt;
 912             if (tok->start == NULL) {
 913                 if (tok->buf == NULL) {
 914                     tok->buf = (char *)
 915                         PyMem_MALLOC(BUFSIZ);
 916                     if (tok->buf == NULL) {
 917                         tok->done = E_NOMEM;
 918                         return EOF;
 919                     }
 920                     tok->end = tok->buf + BUFSIZ;
 921                 }
 922                 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
 923                           tok) == NULL) {
 924                     tok->done = E_EOF;
 925                     done = 1;
 926                 }
 927                 else {
 928                     tok->done = E_OK;
 929                     tok->inp = strchr(tok->buf, '\0');
 930                     done = tok->inp[-1] == '\n';
 931                 }
 932             }
 933             else {
 934                 cur = tok->cur - tok->buf;
 935                 if (decoding_feof(tok)) {
 936                     tok->done = E_EOF;
 937                     done = 1;
 938                 }
 939                 else
 940                     tok->done = E_OK;
 941             }
 942             tok->lineno++;
 943             /* Read until '\n' or EOF */
 944             while (!done) {
 945                 Py_ssize_t curstart = tok->start == NULL ? -1 :
 946                           tok->start - tok->buf;
 947                 Py_ssize_t curvalid = tok->inp - tok->buf;
 948                 Py_ssize_t newsize = curvalid + BUFSIZ;
 949                 char *newbuf = tok->buf;
 950                 newbuf = (char *)PyMem_REALLOC(newbuf,
 951                                                newsize);
 952                 if (newbuf == NULL) {
 953                     tok->done = E_NOMEM;
 954                     tok->cur = tok->inp;
 955                     return EOF;
 956                 }
 957                 tok->buf = newbuf;
 958                 tok->inp = tok->buf + curvalid;
 959                 tok->end = tok->buf + newsize;
 960                 tok->start = curstart < 0 ? NULL :
 961                          tok->buf + curstart;
 962                 if (decoding_fgets(tok->inp,
 963                                (int)(tok->end - tok->inp),
 964                                tok) == NULL) {
 965                     /* Break out early on decoding
 966                        errors, as tok->buf will be NULL
 967                      */
 968                     if (tok->decoding_erred)
 969                         return EOF;
 970                     /* Last line does not end in \n,
 971                        fake one */
 972                     strcpy(tok->inp, "\n");
 973                 }
 974                 tok->inp = strchr(tok->inp, '\0');
 975                 done = tok->inp[-1] == '\n';
 976             }
 977             if (tok->buf != NULL) {
 978                 tok->cur = tok->buf + cur;
 979                 tok->line_start = tok->cur;
 980                 /* replace "\r\n" with "\n" */
 981                 /* For Mac leave the \r, giving a syntax error */
 982                 pt = tok->inp - 2;
 983                 if (pt >= tok->buf && *pt == '\r') {
 984                     *pt++ = '\n';
 985                     *pt = '\0';
 986                     tok->inp = pt;
 987                 }
 988             }
 989         }
 990         if (tok->done != E_OK) {
 991             if (tok->prompt != NULL)
 992                 PySys_WriteStderr("\n");
 993             tok->cur = tok->inp;
 994             return EOF;
 995         }
 996     }
 997     /*NOTREACHED*/
 998 }
 999
1000
1001 /* Back-up one character */
1002
1003 static void
1004 tok_backup(register struct tok_state *tok, register int c)
1005 {
1006     if (c != EOF) {
1007         if (--tok->cur < tok->buf)
1008             Py_FatalError("tok_backup: beginning of buffer");
1009         if (*tok->cur != c)
1010             *tok->cur = c;
1011     }
1012 }
1013
1014
1015 /* Return the token corresponding to a single character */
1016
1017 int
1018 PyToken_OneChar(int c)
1019 {
1020     switch (c) {
1021     case '(':           return LPAR;
1022     case ')':           return RPAR;
1023     case '[':           return LSQB;
1024     case ']':           return RSQB;
1025     case ':':           return COLON;
1026     case ',':           return COMMA;
1027     case ';':           return SEMI;
1028     case '+':           return PLUS;
1029     case '-':           return MINUS;
1030     case '*':           return STAR;
1031     case '/':           return SLASH;
1032     case '|':           return VBAR;
1033     case '&':           return AMPER;
1034     case '<':           return LESS;
1035     case '>':           return GREATER;
1036     case '=':           return EQUAL;
1037     case '.':           return DOT;
1038     case '%':           return PERCENT;
1039     case '`':           return BACKQUOTE;
1040     case '{':           return LBRACE;
1041     case '}':           return RBRACE;
1042     case '^':           return CIRCUMFLEX;
1043     case '~':           return TILDE;
1044     case '@':       return AT;
1045     default:            return OP;
1046     }
1047 }
1048
1049
1050 int
1051 PyToken_TwoChars(int c1, int c2)
1052 {
1053     switch (c1) {
1054     case '=':
1055         switch (c2) {
1056         case '=':               return EQEQUAL;
1057         }
1058         break;
1059     case '!':
1060         switch (c2) {
1061         case '=':               return NOTEQUAL;
1062         }
1063         break;
1064     case '<':
1065         switch (c2) {
1066         case '>':               return NOTEQUAL;
1067         case '=':               return LESSEQUAL;
1068         case '<':               return LEFTSHIFT;
1069         }
1070         break;
1071     case '>':
1072         switch (c2) {
1073         case '=':               return GREATEREQUAL;
1074         case '>':               return RIGHTSHIFT;
1075         }
1076         break;
1077     case '+':
1078         switch (c2) {
1079         case '=':               return PLUSEQUAL;
1080         }
1081         break;
1082     case '-':
1083         switch (c2) {
1084         case '=':               return MINEQUAL;
1085         }
1086         break;
1087     case '*':
1088         switch (c2) {
1089         case '*':               return DOUBLESTAR;
1090         case '=':               return STAREQUAL;
1091         }
1092         break;
1093     case '/':
1094         switch (c2) {
1095         case '/':               return DOUBLESLASH;
1096         case '=':               return SLASHEQUAL;
1097         }
1098         break;
1099     case '|':
1100         switch (c2) {
1101         case '=':               return VBAREQUAL;
1102         }
1103         break;
1104     case '%':
1105         switch (c2) {
1106         case '=':               return PERCENTEQUAL;
1107         }
1108         break;
1109     case '&':
1110         switch (c2) {
1111         case '=':               return AMPEREQUAL;
1112         }
1113         break;
1114     case '^':
1115         switch (c2) {
1116         case '=':               return CIRCUMFLEXEQUAL;
1117         }
1118         break;
1119     }
1120     return OP;
1121 }
1122
1123 int
1124 PyToken_ThreeChars(int c1, int c2, int c3)
1125 {
1126     switch (c1) {
1127     case '<':
1128         switch (c2) {
1129         case '<':
1130             switch (c3) {
1131             case '=':
1132                 return LEFTSHIFTEQUAL;
1133             }
1134             break;
1135         }
1136         break;
1137     case '>':
1138         switch (c2) {
1139         case '>':
1140             switch (c3) {
1141             case '=':
1142                 return RIGHTSHIFTEQUAL;
1143             }
1144             break;
1145         }
1146         break;
1147     case '*':
1148         switch (c2) {
1149         case '*':
1150             switch (c3) {
1151             case '=':
1152                 return DOUBLESTAREQUAL;
1153             }
1154             break;
1155         }
1156         break;
1157     case '/':
1158         switch (c2) {
1159         case '/':
1160             switch (c3) {
1161             case '=':
1162                 return DOUBLESLASHEQUAL;
1163             }
1164             break;
1165         }
1166         break;
1167     }
1168     return OP;
1169 }
1170
1171 static int
1172 indenterror(struct tok_state *tok)
1173 {
1174     if (tok->alterror) {
1175         tok->done = E_TABSPACE;
1176         tok->cur = tok->inp;
1177         return 1;
1178     }
1179     if (tok->altwarning) {
1180         PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1181                           "in indentation\n", tok->filename);
1182         tok->altwarning = 0;
1183     }
1184     return 0;
1185 }
1186
1187 /* Get next token, after space stripping etc.

        Returns the token type.  For tokens that carry text (NAME, NUMBER,
        STRING, NEWLINE, DOT and the operator/punctuation tokens) *p_start
        and *p_end are set to delimit the token text inside the tokenizer
        buffer.  For INDENT, DEDENT, ENDMARKER and ERRORTOKEN they are left
        NULL.

        When tok->atbol is set, the leading whitespace of the line is
        measured first and compared with the indentation stack.  The
        resulting indents/dedents are counted in tok->pendin and handed
        out one per call.  Errors are reported by returning ERRORTOKEN;
        most error paths in this function also store an E_* code in
        tok->done. */
1188
1189 static int
1190 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1191 {
1192     register int c;
1193     int blankline;
1194
1195     *p_start = *p_end = NULL;
1196   nextline:
1197     tok->start = NULL;
1198     blankline = 0;
1199
1200     /* Get indentation level */
1201     if (tok->atbol) {
              /* col is computed with tok->tabsize, altcol with
                 tok->alttabsize; the two are compared below to detect
                 inconsistent use of tabs and spaces. */
1202         register int col = 0;
1203         register int altcol = 0;
1204         tok->atbol = 0;
1205         for (;;) {
1206             c = tok_nextc(tok);
1207             if (c == ' ')
1208                 col++, altcol++;
1209             else if (c == '\t') {
1210                 col = (col/tok->tabsize + 1) * tok->tabsize;
1211                 altcol = (altcol/tok->alttabsize + 1)
1212                     * tok->alttabsize;
1213             }
1214             else if (c == '\014') /* Control-L (formfeed) */
1215                 col = altcol = 0; /* For Emacs users */
1216             else
1217                 break;
1218         }
1219         tok_backup(tok, c);
1220         if (c == '#' || c == '\n') {
1221             /* Lines with only whitespace and/or comments
1222                shouldn't affect the indentation and are
1223                not passed to the parser as NEWLINE tokens,
1224                except *totally* empty lines in interactive
1225                mode, which signal the end of a command group. */
1226             if (col == 0 && c == '\n' && tok->prompt != NULL)
1227                 blankline = 0; /* Let it through */
1228             else
1229                 blankline = 1; /* Ignore completely */
1230             /* We can't jump back right here since we still
1231                may need to skip to the end of a comment */
1232         }
              /* Indentation is only tracked outside brackets
                 (tok->level == 0). */
1233         if (!blankline && tok->level == 0) {
1234             if (col == tok->indstack[tok->indent]) {
1235                 /* No change */
1236                 if (altcol != tok->altindstack[tok->indent]) {
1237                     if (indenterror(tok))
1238                         return ERRORTOKEN;
1239                 }
1240             }
1241             else if (col > tok->indstack[tok->indent]) {
1242                 /* Indent -- always one */
1243                 if (tok->indent+1 >= MAXINDENT) {
1244                     tok->done = E_TOODEEP;
1245                     tok->cur = tok->inp;
1246                     return ERRORTOKEN;
1247                 }
1248                 if (altcol <= tok->altindstack[tok->indent]) {
1249                     if (indenterror(tok))
1250                         return ERRORTOKEN;
1251                 }
1252                 tok->pendin++;
1253                 tok->indstack[++tok->indent] = col;
1254                 tok->altindstack[tok->indent] = altcol;
1255             }
1256             else /* col < tok->indstack[tok->indent] */ {
1257                 /* Dedent -- any number, must be consistent */
1258                 while (tok->indent > 0 &&
1259                     col < tok->indstack[tok->indent]) {
1260                     tok->pendin--;
1261                     tok->indent--;
1262                 }
1263                 if (col != tok->indstack[tok->indent]) {
1264                     tok->done = E_DEDENT;
1265                     tok->cur = tok->inp;
1266                     return ERRORTOKEN;
1267                 }
1268                 if (altcol != tok->altindstack[tok->indent]) {
1269                     if (indenterror(tok))
1270                         return ERRORTOKEN;
1271                 }
1272             }
1273         }
1274     }
1275
1276     tok->start = tok->cur;
1277
1278     /* Return pending indents/dedents */
          /* (one per call; negative pendin means dedents are owed) */
1279     if (tok->pendin != 0) {
1280         if (tok->pendin < 0) {
1281             tok->pendin++;
1282             return DEDENT;
1283         }
1284         else {
1285             tok->pendin--;
1286             return INDENT;
1287         }
1288     }
1289
1290  again:
1291     tok->start = NULL;
1292     /* Skip spaces */
1293     do {
1294         c = tok_nextc(tok);
1295     } while (c == ' ' || c == '\t' || c == '\014');
1296
1297     /* Set start of current token */
1298     tok->start = tok->cur - 1;
1299
1300     /* Skip comment, while looking for tab-setting magic */
1301     if (c == '#') {
1302         static char *tabforms[] = {
1303             "tab-width:",                       /* Emacs */
1304             ":tabstop=",                        /* vim, full form */
1305             ":ts=",                             /* vim, abbreviated form */
1306             "set tabsize=",                     /* will vi never die? */
1307         /* more templates can be added here to support other editors */
1308         };
1309         char cbuf[80];
1310         char *tp, **cp;
1311         tp = cbuf;
              /* Copy at most sizeof(cbuf)-1 characters of the comment
                 into cbuf for the strstr() search below. */
1312         do {
1313             *tp++ = c = tok_nextc(tok);
1314         } while (c != EOF && c != '\n' &&
1315                  (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1316         *tp = '\0';
1317         for (cp = tabforms;
1318              cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1319              cp++) {
1320             if ((tp = strstr(cbuf, *cp))) {
1321                 int newsize = atoi(tp + strlen(*cp));
1322
                      /* Only tab sizes from 1 to 40 are accepted. */
1323                 if (newsize >= 1 && newsize <= 40) {
1324                     tok->tabsize = newsize;
1325                     if (Py_VerboseFlag)
1326                         PySys_WriteStderr(
1327                         "Tab size set to %d\n",
1328                         newsize);
1329                 }
1330             }
1331         }
              /* Skip whatever is left of the comment. */
1332         while (c != EOF && c != '\n')
1333             c = tok_nextc(tok);
1334     }
1335
1336     /* Check for EOF and errors now */
1337     if (c == EOF) {
1338         return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1339     }
1340
1341     /* Identifier (most frequent token!) */
1342     if (Py_ISALPHA(c) || c == '_') {
1343         /* Process string prefixes: b"", br"", r"", u"" and ur""
                 (either case).  If no quote follows, the characters read
                 so far are treated as the start of an identifier. */
1344         switch (c) {
1345         case 'b':
1346         case 'B':
1347             c = tok_nextc(tok);
1348             if (c == 'r' || c == 'R')
1349                 c = tok_nextc(tok);
1350             if (c == '"' || c == '\'')
1351                 goto letter_quote;
1352             break;
1353         case 'r':
1354         case 'R':
1355             c = tok_nextc(tok);
1356             if (c == '"' || c == '\'')
1357                 goto letter_quote;
1358             break;
1359         case 'u':
1360         case 'U':
1361             c = tok_nextc(tok);
1362             if (c == 'r' || c == 'R')
1363                 c = tok_nextc(tok);
1364             if (c == '"' || c == '\'')
1365                 goto letter_quote;
1366             break;
1367         }
1368         while (c != EOF && (Py_ISALNUM(c) || c == '_')) {
1369             c = tok_nextc(tok);
1370         }
1371         tok_backup(tok, c);
1372         *p_start = tok->start;
1373         *p_end = tok->cur;
1374         return NAME;
1375     }
1376
1377     /* Newline */
1378     if (c == '\n') {
1379         tok->atbol = 1;
              /* Blank lines and newlines inside brackets produce no
                 token; start over on the next line. */
1380         if (blankline || tok->level > 0)
1381             goto nextline;
1382         *p_start = tok->start;
1383         *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1384         tok->cont_line = 0;
1385         return NEWLINE;
1386     }
1387
1388     /* Period or number starting with period? */
1389     if (c == '.') {
1390         c = tok_nextc(tok);
1391         if (isdigit(c)) {
1392             goto fraction;
1393         }
1394         else {
1395             tok_backup(tok, c);
1396             *p_start = tok->start;
1397             *p_end = tok->cur;
1398             return DOT;
1399         }
1400     }
1401
1402     /* Number */
1403     if (isdigit(c)) {
1404         if (c == '0') {
1405             /* Hex, octal or binary -- maybe. */
1406             c = tok_nextc(tok);
1407             if (c == '.')
1408                 goto fraction;
1409 #ifndef WITHOUT_COMPLEX
1410             if (c == 'j' || c == 'J')
1411                 goto imaginary;
1412 #endif
1413             if (c == 'x' || c == 'X') {
1414
1415                 /* Hex */
1416                 c = tok_nextc(tok);
1417                 if (!isxdigit(c)) {
1418                     tok->done = E_TOKEN;
1419                     tok_backup(tok, c);
1420                     return ERRORTOKEN;
1421                 }
1422                 do {
1423                     c = tok_nextc(tok);
1424                 } while (isxdigit(c));
1425             }
1426             else if (c == 'o' || c == 'O') {
1427                 /* Octal */
1428                 c = tok_nextc(tok);
1429                 if (c < '0' || c >= '8') {
1430                     tok->done = E_TOKEN;
1431                     tok_backup(tok, c);
1432                     return ERRORTOKEN;
1433                 }
1434                 do {
1435                     c = tok_nextc(tok);
1436                 } while ('0' <= c && c < '8');
1437             }
1438             else if (c == 'b' || c == 'B') {
1439                 /* Binary */
1440                 c = tok_nextc(tok);
1441                 if (c != '0' && c != '1') {
1442                     tok->done = E_TOKEN;
1443                     tok_backup(tok, c);
1444                     return ERRORTOKEN;
1445                 }
1446                 do {
1447                     c = tok_nextc(tok);
1448                 } while (c == '0' || c == '1');
1449             }
1450             else {
                      /* Set when a digit 8 or 9 follows the leading
                         zero; such a literal is only accepted if it
                         continues as a float or imaginary number. */
1451                 int found_decimal = 0;
1452                 /* Octal; c is first char of it */
1453                 /* There's no 'isoctdigit' macro, sigh */
1454                 while ('0' <= c && c < '8') {
1455                     c = tok_nextc(tok);
1456                 }
1457                 if (isdigit(c)) {
1458                     found_decimal = 1;
1459                     do {
1460                         c = tok_nextc(tok);
1461                     } while (isdigit(c));
1462                 }
1463                 if (c == '.')
1464                     goto fraction;
1465                 else if (c == 'e' || c == 'E')
1466                     goto exponent;
1467 #ifndef WITHOUT_COMPLEX
1468                 else if (c == 'j' || c == 'J')
1469                     goto imaginary;
1470 #endif
1471                 else if (found_decimal) {
1472                     tok->done = E_TOKEN;
1473                     tok_backup(tok, c);
1474                     return ERRORTOKEN;
1475                 }
1476             }
                  /* Optional long-integer suffix */
1477             if (c == 'l' || c == 'L')
1478                 c = tok_nextc(tok);
1479         }
1480         else {
1481             /* Decimal */
1482             do {
1483                 c = tok_nextc(tok);
1484             } while (isdigit(c));
1485             if (c == 'l' || c == 'L')
1486                 c = tok_nextc(tok);
1487             else {
                      /* The fraction, exponent and imaginary labels
                         below are also jumped to from the '.' and
                         leading-zero cases above. */
1488                 /* Accept floating point numbers. */
1489                 if (c == '.') {
1490         fraction:
1491                     /* Fraction */
1492                     do {
1493                         c = tok_nextc(tok);
1494                     } while (isdigit(c));
1495                 }
1496                 if (c == 'e' || c == 'E') {
1497         exponent:
1498                     /* Exponent part */
1499                     c = tok_nextc(tok);
1500                     if (c == '+' || c == '-')
1501                         c = tok_nextc(tok);
1502                     if (!isdigit(c)) {
1503                         tok->done = E_TOKEN;
1504                         tok_backup(tok, c);
1505                         return ERRORTOKEN;
1506                     }
1507                     do {
1508                         c = tok_nextc(tok);
1509                     } while (isdigit(c));
1510                 }
1511 #ifndef WITHOUT_COMPLEX
1512                 if (c == 'j' || c == 'J')
1513                     /* Imaginary part */
1514         imaginary:
1515                     c = tok_nextc(tok);
1516 #endif
1517             }
1518         }
1519         tok_backup(tok, c);
1520         *p_start = tok->start;
1521         *p_end = tok->cur;
1522         return NUMBER;
1523     }
1524
1525   letter_quote:
1526     /* String */
1527     if (c == '\'' || c == '"') {
              /* quote2 is the distance from tok->start at which a quote
                 character directly follows the opening quote; it is used
                 below to recognise an empty string or an opening triple
                 quote.  tripcount counts consecutive quote characters
                 seen, to find the closing triple quote. */
1528         Py_ssize_t quote2 = tok->cur - tok->start + 1;
1529         int quote = c;
1530         int triple = 0;
1531         int tripcount = 0;
1532         for (;;) {
1533             c = tok_nextc(tok);
1534             if (c == '\n') {
1535                 if (!triple) {
1536                     tok->done = E_EOLS;
1537                     tok_backup(tok, c);
1538                     return ERRORTOKEN;
1539                 }
1540                 tripcount = 0;
1541                 tok->cont_line = 1; /* multiline string. */
1542             }
1543             else if (c == EOF) {
1544                 if (triple)
1545                     tok->done = E_EOFS;
1546                 else
1547                     tok->done = E_EOLS;
1548                 tok->cur = tok->inp;
1549                 return ERRORTOKEN;
1550             }
1551             else if (c == quote) {
1552                 tripcount++;
1553                 if (tok->cur - tok->start == quote2) {
1554                     c = tok_nextc(tok);
1555                     if (c == quote) {
1556                         triple = 1;
1557                         tripcount = 0;
1558                         continue;
1559                     }
1560                     tok_backup(tok, c);
1561                 }
1562                 if (!triple || tripcount == 3)
1563                     break;
1564             }
1565             else if (c == '\\') {
                      /* Consume the character after the backslash so
                         that an escaped quote does not end the string. */
1566                 tripcount = 0;
1567                 c = tok_nextc(tok);
1568                 if (c == EOF) {
1569                     tok->done = E_EOLS;
1570                     tok->cur = tok->inp;
1571                     return ERRORTOKEN;
1572                 }
1573             }
1574             else
1575                 tripcount = 0;
1576         }
1577         *p_start = tok->start;
1578         *p_end = tok->cur;
1579         return STRING;
1580     }
1581
1582     /* Line continuation */
1583     if (c == '\\') {
1584         c = tok_nextc(tok);
1585         if (c != '\n') {
1586             tok->done = E_LINECONT;
1587             tok->cur = tok->inp;
1588             return ERRORTOKEN;
1589         }
1590         tok->cont_line = 1;
1591         goto again; /* Read next line */
1592     }
1593
1594     /* Check for two-character token */
1595     {
1596         int c2 = tok_nextc(tok);
1597         int token = PyToken_TwoChars(c, c2);
1598 #ifndef PGEN
1599         if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1600             if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1601                                    "<> not supported in 3.x; use !=",
1602                                    tok->filename, tok->lineno,
1603                                    NULL, NULL)) {
1604                 return ERRORTOKEN;
1605             }
1606         }
1607 #endif
1608         if (token != OP) {
                  /* A third character is only examined when the first
                     two already form a token. */
1609             int c3 = tok_nextc(tok);
1610             int token3 = PyToken_ThreeChars(c, c2, c3);
1611             if (token3 != OP) {
1612                 token = token3;
1613             } else {
1614                 tok_backup(tok, c3);
1615             }
1616             *p_start = tok->start;
1617             *p_end = tok->cur;
1618             return token;
1619         }
1620         tok_backup(tok, c2);
1621     }
1622
1623     /* Keep track of parentheses nesting level */
1624     switch (c) {
1625     case '(':
1626     case '[':
1627     case '{':
1628         tok->level++;
1629         break;
1630     case ')':
1631     case ']':
1632     case '}':
1633         tok->level--;
1634         break;
1635     }
1636
1637     /* Punctuation character */
1638     *p_start = tok->start;
1639     *p_end = tok->cur;
1640     return PyToken_OneChar(c);
1641 }
1642
1643 int
1644 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1645 {
1646     int result = tok_get(tok, p_start, p_end);
1647     if (tok->decoding_erred) {
1648         result = ERRORTOKEN;
1649         tok->done = E_DECODE;
1650     }
1651     return result;
1652 }
1653
1654 /* This function is only called from parsetok. However, it cannot live
1655    there, as it must be empty for PGEN, and we can check for PGEN only
1656    in this file. */
1657
1658 #if defined(PGEN) || !defined(Py_USING_UNICODE)
1659 char*
1660 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
1661 {
1662     return NULL;
1663 }
1664 #else
1665 #ifdef Py_USING_UNICODE
1666 static PyObject *
1667 dec_utf8(const char *enc, const char *text, size_t len) {
1668     PyObject *ret = NULL;
1669     PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1670     if (unicode_text) {
1671         ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1672         Py_DECREF(unicode_text);
1673     }
1674     if (!ret) {
1675         PyErr_Clear();
1676     }
1677     return ret;
1678 }
1679 char *
1680 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1681 {
1682     char *text = NULL;
1683     if (tok->encoding) {
1684         /* convert source to original encondig */
1685         PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1686         if (lineobj != NULL) {
1687             int linelen = PyString_Size(lineobj);
1688             const char *line = PyString_AsString(lineobj);
1689             text = PyObject_MALLOC(linelen + 1);
1690             if (text != NULL && line != NULL) {
1691                 if (linelen)
1692                     strncpy(text, line, linelen);
1693                 text[linelen] = '\0';
1694             }
1695             Py_DECREF(lineobj);
1696
1697             /* adjust error offset */
1698             if (*offset > 1) {
1699                 PyObject *offsetobj = dec_utf8(tok->encoding,
1700                                                tok->buf, *offset-1);
1701                 if (offsetobj) {
1702                     *offset = PyString_Size(offsetobj) + 1;
1703                     Py_DECREF(offsetobj);
1704                 }
1705             }
1706
1707         }
1708     }
1709     return text;
1710
1711 }
1712 #endif /* defined(Py_USING_UNICODE) */
1713 #endif
1714
1715
1716 #ifdef Py_DEBUG
1717
1718 void
1719 tok_dump(int type, char *start, char *end)
1720 {
1721     printf("%s", _PyParser_TokenNames[type]);
1722     if (type == NAME || type == NUMBER || type == STRING || type == OP)
1723         printf("(%.*s)", (int)(end - start), start);
1724 }
1725
1726 #endif