2 /* Tokenizer implementation */
5 #include "pgenheaders.h"
10 #include "tokenizer.h"
14 #include "unicodeobject.h"
15 #include "stringobject.h"
16 #include "fileobject.h"
21 extern char *PyOS_Readline(FILE *, FILE *, char *);
22 /* Return malloc'ed string including trailing \n;
23 empty malloc'ed string for EOF;
24 NULL if interrupted */
26 /* Don't ever change this -- it would break the portability of Python code */
/* Convert a possibly signed character to a nonnegative int */
/* XXX This assumes characters are 8 bits wide */
#ifdef __CHAR_UNSIGNED__
#define Py_CHARMASK(c)		(c)
#else
#define Py_CHARMASK(c)		((c) & 0xff)
#endif
38 static struct tok_state
*tok_new(void);
39 static int tok_nextc(struct tok_state
*tok
);
40 static void tok_backup(struct tok_state
*tok
, int c
);
44 char *_PyParser_TokenNames
[] = {
95 /* This table must match the #defines in token.h! */
102 /* Create and initialize a new tok_state structure */
104 static struct tok_state
*
107 struct tok_state
*tok
= PyMem_NEW(struct tok_state
, 1);
110 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= tok
->start
= NULL
;
113 tok
->tabsize
= TABSIZE
;
115 tok
->indstack
[0] = 0;
118 tok
->prompt
= tok
->nextprompt
= NULL
;
121 tok
->filename
= NULL
;
125 tok
->altindstack
[0] = 0;
126 tok
->decoding_state
= 0;
127 tok
->decoding_erred
= 0;
128 tok
->read_coding_spec
= 0;
129 tok
->issued_encoding_warning
= 0;
130 tok
->encoding
= NULL
;
133 tok
->decoding_readline
= NULL
;
134 tok
->decoding_buffer
= NULL
;
142 decoding_fgets(char *s
, int size
, struct tok_state
*tok
)
144 return fgets(s
, size
, tok
->fp
);
148 decoding_feof(struct tok_state
*tok
)
150 return feof(tok
->fp
);
static char *
decode_str(const char *str, struct tok_state *tok)
{
	/* Simple variant: no decoding, hand the string back unchanged. */
	return (char *)str;
}
162 error_ret(struct tok_state
*tok
) /* XXX */
164 tok
->decoding_erred
= 1;
165 if (tok
->fp
!= NULL
&& tok
->buf
!= NULL
) /* see PyTokenizer_Free */
168 return NULL
; /* as if it were EOF */
172 new_string(const char *s
, int len
)
174 char* result
= PyMem_NEW(char, len
+ 1);
175 if (result
!= NULL
) {
176 memcpy(result
, s
, len
);
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
	/* Normalize the first 12 characters of S (lower case, '_' ->
	   '-') and collapse the common spellings of utf-8 and latin-1
	   to their canonical names.  Anything else is returned as-is. */
	char buf[13];
	int i;
	for (i = 0; i < 12; i++) {
		int c = s[i];
		if (c == '\0')
			break;
		else if (c == '_')
			buf[i] = '-';
		else
			/* Cast to unsigned char: passing a negative char
			   to tolower() is undefined behavior (C99 7.4p1). */
			buf[i] = tolower((unsigned char)c);
	}
	buf[i] = '\0';
	if (strcmp(buf, "utf-8") == 0 ||
	    strncmp(buf, "utf-8-", 6) == 0)
		return "utf-8";
	else if (strcmp(buf, "latin-1") == 0 ||
		 strcmp(buf, "iso-8859-1") == 0 ||
		 strcmp(buf, "iso-latin-1") == 0 ||
		 strncmp(buf, "latin-1-", 8) == 0 ||
		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
		 strncmp(buf, "iso-latin-1-", 12) == 0)
		return "iso-8859-1";
	else
		return s;
}
205 /* Return the coding spec in S, or NULL if none is found. */
static char *
get_coding_spec(const char *s, int size)
{
	/* Scan the line S (SIZE bytes) for a "coding[:=] NAME"
	   declaration inside a comment.  Returns a malloc'ed canonical
	   encoding name, or NULL if none is found (or on allocation
	   failure).  Caller owns the result. */
	int i;
	/* Coding spec must be in a comment, and that comment must be
	 * the only statement on the source code line. */
	for (i = 0; i < size - 6; i++) {
		if (s[i] == '#')
			break;
		if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
			return NULL;
	}
	for (; i < size - 6; i++) { /* XXX inefficient search */
		const char* t = s + i;
		if (strncmp(t, "coding", 6) == 0) {
			const char* begin = NULL;
			t += 6;
			if (t[0] != ':' && t[0] != '=')
				continue;
			do {
				t++;
			} while (t[0] == '\x20' || t[0] == '\t');

			begin = t;
			/* Cast to unsigned char: a negative char argument
			   to isalnum() is undefined behavior (C99 7.4p1). */
			while (isalnum((unsigned char)t[0]) ||
			       t[0] == '-' || t[0] == '_' || t[0] == '.')
				t++;

			if (begin < t) {
				char* r = new_string(begin, t - begin);
				char* q;
				if (r == NULL)	/* allocation failed */
					return NULL;
				q = get_normal_name(r);
				if (r != q) {
					PyMem_DEL(r);
					r = new_string(q, strlen(q));
				}
				return r;
			}
		}
	}
	return NULL;
}
249 /* Check whether the line contains a coding spec. If it does,
250 invoke the set_readline function for the new encoding.
251 This function receives the tok_state and the new encoding.
252 Return 1 on success, 0 on failure. */
255 check_coding_spec(const char* line
, int size
, struct tok_state
*tok
,
256 int set_readline(struct tok_state
*, const char *))
262 /* It's a continuation line, so it can't be a coding spec. */
264 cs
= get_coding_spec(line
, size
);
266 tok
->read_coding_spec
= 1;
267 if (tok
->encoding
== NULL
) {
268 assert(tok
->decoding_state
== 1); /* raw */
269 if (strcmp(cs
, "utf-8") == 0 ||
270 strcmp(cs
, "iso-8859-1") == 0) {
273 #ifdef Py_USING_UNICODE
274 r
= set_readline(tok
, cs
);
277 tok
->decoding_state
= -1;
280 /* Without Unicode support, we cannot
281 process the coding spec. Since there
282 won't be any Unicode literals, that
286 } else { /* then, compare cs with BOM */
287 r
= (strcmp(tok
->encoding
, cs
) == 0);
294 /* See whether the file starts with a BOM. If it does,
295 invoke the set_readline function with the new encoding.
296 Return 1 on success, 0 on failure. */
299 check_bom(int get_char(struct tok_state
*),
300 void unget_char(int, struct tok_state
*),
301 int set_readline(struct tok_state
*, const char *),
302 struct tok_state
*tok
)
304 int ch
= get_char(tok
);
305 tok
->decoding_state
= 1;
308 } else if (ch
== 0xEF) {
309 ch
= get_char(tok
); if (ch
!= 0xBB) goto NON_BOM
;
310 ch
= get_char(tok
); if (ch
!= 0xBF) goto NON_BOM
;
312 /* Disable support for UTF-16 BOMs until a decision
313 is made whether this needs to be supported. */
314 } else if (ch
== 0xFE) {
315 ch
= get_char(tok
); if (ch
!= 0xFF) goto NON_BOM
;
316 if (!set_readline(tok
, "utf-16-be")) return 0;
317 tok
->decoding_state
= -1;
318 } else if (ch
== 0xFF) {
319 ch
= get_char(tok
); if (ch
!= 0xFE) goto NON_BOM
;
320 if (!set_readline(tok
, "utf-16-le")) return 0;
321 tok
->decoding_state
= -1;
327 tok
->encoding
= new_string("utf-8", 5); /* resulting is in utf-8 */
330 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
331 unget_char(0xFF, tok
); /* XXX this will cause a syntax error */
335 /* Read a line of text from TOK into S, using the stream in TOK.
336 Return NULL on failure, else S. */
339 fp_readl(char *s
, int size
, struct tok_state
*tok
)
341 #ifndef Py_USING_UNICODE
342 /* In a non-Unicode built, this should never be called. */
343 Py_FatalError("fp_readl should not be called in this build.");
344 return NULL
; /* Keep compiler happy (not reachable) */
347 PyObject
* buf
= tok
->decoding_buffer
;
349 /* Ask for one less byte so we can terminate it */
350 PyObject
*args
= Py_BuildValue("(i)", size
-1);
352 return error_ret(tok
);
353 buf
= PyObject_Call(tok
->decoding_readline
, args
, NULL
);
356 return error_ret(tok
);
358 tok
->decoding_buffer
= NULL
;
360 utf8
= PyUnicode_AsUTF8String(buf
);
363 return error_ret(tok
);
365 const char* str
= PyString_AsString(utf8
);
366 assert(strlen(str
) < (size_t)size
); /* XXX */
369 if (s
[0] == '\0') return NULL
; /* EOF */
375 /* Set the readline function for TOK to a StreamReader's
376 readline function. The StreamReader is named ENC.
378 This function is called from check_bom and check_coding_spec.
380 ENC is usually identical to the future value of tok->encoding,
381 except for the (currently unsupported) case of UTF-16.
383 Return 1 on success, 0 on failure. */
386 fp_setreadl(struct tok_state
*tok
, const char* enc
)
388 PyObject
*reader
, *stream
, *readline
;
390 /* XXX: constify filename argument. */
391 stream
= PyFile_FromFile(tok
->fp
, (char*)tok
->filename
, "rb", NULL
);
395 reader
= PyCodec_StreamReader(enc
, stream
, NULL
);
400 readline
= PyObject_GetAttrString(reader
, "readline");
402 if (readline
== NULL
)
405 tok
->decoding_readline
= readline
;
409 /* Fetch the next byte from TOK. */
411 static int fp_getc(struct tok_state
*tok
) {
412 return getc(tok
->fp
);
415 /* Unfetch the last byte back into TOK. */
417 static void fp_ungetc(int c
, struct tok_state
*tok
) {
421 /* Read a line of input from TOK. Determine encoding
425 decoding_fgets(char *s
, int size
, struct tok_state
*tok
)
428 int warn
= 0, badchar
= 0;
430 if (tok
->decoding_state
< 0) {
431 /* We already have a codec associated with
433 line
= fp_readl(s
, size
, tok
);
435 } else if (tok
->decoding_state
> 0) {
436 /* We want a 'raw' read. */
437 line
= Py_UniversalNewlineFgets(s
, size
,
442 /* We have not yet determined the encoding.
443 If an encoding is found, use the file-pointer
444 reader functions from now on. */
445 if (!check_bom(fp_getc
, fp_ungetc
, fp_setreadl
, tok
))
446 return error_ret(tok
);
447 assert(tok
->decoding_state
!= 0);
450 if (line
!= NULL
&& tok
->lineno
< 2 && !tok
->read_coding_spec
) {
451 if (!check_coding_spec(line
, strlen(line
), tok
, fp_setreadl
)) {
452 return error_ret(tok
);
456 if (warn
&& line
&& !tok
->issued_encoding_warning
&& !tok
->encoding
) {
458 for (c
= (unsigned char *)line
; *c
; c
++)
466 /* Need to add 1 to the line number, since this line
467 has not been counted, yet. */
469 "Non-ASCII character '\\x%.2x' "
470 "in file %.200s on line %i, "
471 "but no encoding declared; "
472 "see http://www.python.org/peps/pep-0263.html for details",
473 badchar
, tok
->filename
, tok
->lineno
+ 1);
474 /* We don't use PyErr_WarnExplicit() here because
475 printing the line in question to e.g. a log file
476 could result in sensitive information being
478 PyErr_Warn(PyExc_DeprecationWarning
, buf
);
479 tok
->issued_encoding_warning
= 1;
486 decoding_feof(struct tok_state
*tok
)
488 if (tok
->decoding_state
>= 0) {
489 return feof(tok
->fp
);
491 PyObject
* buf
= tok
->decoding_buffer
;
493 PyObject
*args
= PyTuple_New(0);
498 buf
= PyObject_Call(tok
->decoding_readline
,
505 tok
->decoding_buffer
= buf
;
508 return PyObject_Length(buf
) == 0;
512 /* Fetch a byte from TOK, using the string buffer. */
514 static int buf_getc(struct tok_state
*tok
) {
515 return Py_CHARMASK(*tok
->str
++);
518 /* Unfetch a byte from TOK, using the string buffer. */
520 static void buf_ungetc(int c
, struct tok_state
*tok
) {
522 assert(Py_CHARMASK(*tok
->str
) == c
); /* tok->cur may point to read-only segment */
525 /* Set the readline function for TOK to ENC. For the string-based
526 tokenizer, this means to just record the encoding. */
528 static int buf_setreadl(struct tok_state
*tok
, const char* enc
) {
533 /* Return a UTF-8 encoding Python string object from the
534 C byte string STR, which is encoded with ENC. */
536 #ifdef Py_USING_UNICODE
538 translate_into_utf8(const char* str
, const char* enc
) {
540 PyObject
* buf
= PyUnicode_Decode(str
, strlen(str
), enc
, NULL
);
543 utf8
= PyUnicode_AsUTF8String(buf
);
549 /* Decode a byte string STR for use as the buffer of TOK.
550 Look for encoding declarations inside STR, and record them
554 decode_str(const char *str
, struct tok_state
*tok
)
556 PyObject
* utf8
= NULL
;
561 if (!check_bom(buf_getc
, buf_ungetc
, buf_setreadl
, tok
))
563 str
= tok
->str
; /* string after BOM if any */
565 #ifdef Py_USING_UNICODE
566 if (tok
->enc
!= NULL
) {
567 utf8
= translate_into_utf8(str
, tok
->enc
);
570 str
= PyString_AsString(utf8
);
573 for (s
= str
;; s
++) {
574 if (*s
== '\0') break;
575 else if (*s
== '\n') {
577 if (lineno
== 2) break;
581 if (!check_coding_spec(str
, s
- str
, tok
, buf_setreadl
))
583 #ifdef Py_USING_UNICODE
584 if (tok
->enc
!= NULL
) {
585 assert(utf8
== NULL
);
586 utf8
= translate_into_utf8(str
, tok
->enc
);
589 str
= PyString_AsString(utf8
);
592 assert(tok
->decoding_buffer
== NULL
);
593 tok
->decoding_buffer
= utf8
; /* CAUTION */
599 /* Set up tokenizer for string */
602 PyTokenizer_FromString(const char *str
)
604 struct tok_state
*tok
= tok_new();
607 str
= (char *)decode_str(str
, tok
);
610 /* XXX: constify members. */
611 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= (char*)str
;
616 /* Set up tokenizer for file */
619 PyTokenizer_FromFile(FILE *fp
, char *ps1
, char *ps2
)
621 struct tok_state
*tok
= tok_new();
624 if ((tok
->buf
= PyMem_NEW(char, BUFSIZ
)) == NULL
) {
628 tok
->cur
= tok
->inp
= tok
->buf
;
629 tok
->end
= tok
->buf
+ BUFSIZ
;
632 tok
->nextprompt
= ps2
;
637 /* Free a tok_state structure */
640 PyTokenizer_Free(struct tok_state
*tok
)
642 if (tok
->encoding
!= NULL
)
643 PyMem_DEL(tok
->encoding
);
645 Py_XDECREF(tok
->decoding_readline
);
646 Py_XDECREF(tok
->decoding_buffer
);
648 if (tok
->fp
!= NULL
&& tok
->buf
!= NULL
)
654 /* Get next char, updating state; error code goes into tok->done */
657 tok_nextc(register struct tok_state
*tok
)
660 if (tok
->cur
!= tok
->inp
) {
661 return Py_CHARMASK(*tok
->cur
++); /* Fast path */
663 if (tok
->done
!= E_OK
)
665 if (tok
->fp
== NULL
) {
666 char *end
= strchr(tok
->inp
, '\n');
670 end
= strchr(tok
->inp
, '\0');
671 if (end
== tok
->inp
) {
676 if (tok
->start
== NULL
)
680 return Py_CHARMASK(*tok
->cur
++);
682 if (tok
->prompt
!= NULL
) {
683 char *new = PyOS_Readline(stdin
, stdout
, tok
->prompt
);
684 if (tok
->nextprompt
!= NULL
)
685 tok
->prompt
= tok
->nextprompt
;
688 else if (*new == '\0') {
692 else if (tok
->start
!= NULL
) {
693 size_t start
= tok
->start
- tok
->buf
;
694 size_t oldlen
= tok
->cur
- tok
->buf
;
695 size_t newlen
= oldlen
+ strlen(new);
696 char *buf
= tok
->buf
;
697 PyMem_RESIZE(buf
, char, newlen
+1);
707 tok
->cur
= tok
->buf
+ oldlen
;
708 strcpy(tok
->buf
+ oldlen
, new);
710 tok
->inp
= tok
->buf
+ newlen
;
711 tok
->end
= tok
->inp
+ 1;
712 tok
->start
= tok
->buf
+ start
;
716 if (tok
->buf
!= NULL
)
720 tok
->inp
= strchr(tok
->buf
, '\0');
721 tok
->end
= tok
->inp
+ 1;
728 if (tok
->start
== NULL
) {
729 if (tok
->buf
== NULL
) {
730 tok
->buf
= PyMem_NEW(char, BUFSIZ
);
731 if (tok
->buf
== NULL
) {
735 tok
->end
= tok
->buf
+ BUFSIZ
;
737 if (decoding_fgets(tok
->buf
, (int)(tok
->end
- tok
->buf
),
744 tok
->inp
= strchr(tok
->buf
, '\0');
745 done
= tok
->inp
[-1] == '\n';
749 cur
= tok
->cur
- tok
->buf
;
750 if (decoding_feof(tok
)) {
758 /* Read until '\n' or EOF */
760 int curstart
= tok
->start
== NULL
? -1 :
761 tok
->start
- tok
->buf
;
762 int curvalid
= tok
->inp
- tok
->buf
;
763 int newsize
= curvalid
+ BUFSIZ
;
764 char *newbuf
= tok
->buf
;
765 PyMem_RESIZE(newbuf
, char, newsize
);
766 if (newbuf
== NULL
) {
772 tok
->inp
= tok
->buf
+ curvalid
;
773 tok
->end
= tok
->buf
+ newsize
;
774 tok
->start
= curstart
< 0 ? NULL
:
776 if (decoding_fgets(tok
->inp
,
777 (int)(tok
->end
- tok
->inp
),
779 /* Last line does not end in \n,
781 strcpy(tok
->inp
, "\n");
783 tok
->inp
= strchr(tok
->inp
, '\0');
784 done
= tok
->inp
[-1] == '\n';
786 tok
->cur
= tok
->buf
+ cur
;
788 /* replace "\r\n" with "\n" */
789 /* For Mac we leave the \r, giving a syntax error */
791 if (pt
>= tok
->buf
&& *pt
== '\r') {
798 if (tok
->done
!= E_OK
) {
799 if (tok
->prompt
!= NULL
)
800 PySys_WriteStderr("\n");
809 /* Back-up one character */
812 tok_backup(register struct tok_state
*tok
, register int c
)
815 if (--tok
->cur
< tok
->buf
)
816 Py_FatalError("tok_backup: begin of buffer");
823 /* Return the token corresponding to a single character */
826 PyToken_OneChar(int c
)
829 case '(': return LPAR
;
830 case ')': return RPAR
;
831 case '[': return LSQB
;
832 case ']': return RSQB
;
833 case ':': return COLON
;
834 case ',': return COMMA
;
835 case ';': return SEMI
;
836 case '+': return PLUS
;
837 case '-': return MINUS
;
838 case '*': return STAR
;
839 case '/': return SLASH
;
840 case '|': return VBAR
;
841 case '&': return AMPER
;
842 case '<': return LESS
;
843 case '>': return GREATER
;
844 case '=': return EQUAL
;
845 case '.': return DOT
;
846 case '%': return PERCENT
;
847 case '`': return BACKQUOTE
;
848 case '{': return LBRACE
;
849 case '}': return RBRACE
;
850 case '^': return CIRCUMFLEX
;
851 case '~': return TILDE
;
858 PyToken_TwoChars(int c1
, int c2
)
863 case '=': return EQEQUAL
;
868 case '=': return NOTEQUAL
;
873 case '>': return NOTEQUAL
;
874 case '=': return LESSEQUAL
;
875 case '<': return LEFTSHIFT
;
880 case '=': return GREATEREQUAL
;
881 case '>': return RIGHTSHIFT
;
886 case '=': return PLUSEQUAL
;
891 case '=': return MINEQUAL
;
896 case '*': return DOUBLESTAR
;
897 case '=': return STAREQUAL
;
902 case '/': return DOUBLESLASH
;
903 case '=': return SLASHEQUAL
;
908 case '=': return VBAREQUAL
;
913 case '=': return PERCENTEQUAL
;
918 case '=': return AMPEREQUAL
;
923 case '=': return CIRCUMFLEXEQUAL
;
931 PyToken_ThreeChars(int c1
, int c2
, int c3
)
939 return LEFTSHIFTEQUAL
;
949 return RIGHTSHIFTEQUAL
;
959 return DOUBLESTAREQUAL
;
969 return DOUBLESLASHEQUAL
;
979 indenterror(struct tok_state
*tok
)
982 tok
->done
= E_TABSPACE
;
986 if (tok
->altwarning
) {
987 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
988 "in indentation\n", tok
->filename
);
995 /* Get next token, after space stripping etc. */
998 tok_get(register struct tok_state
*tok
, char **p_start
, char **p_end
)
1003 *p_start
= *p_end
= NULL
;
1008 /* Get indentation level */
1010 register int col
= 0;
1011 register int altcol
= 0;
1017 else if (c
== '\t') {
1018 col
= (col
/tok
->tabsize
+ 1) * tok
->tabsize
;
1019 altcol
= (altcol
/tok
->alttabsize
+ 1)
1022 else if (c
== '\014') /* Control-L (formfeed) */
1023 col
= altcol
= 0; /* For Emacs users */
1028 if (c
== '#' || c
== '\n') {
1029 /* Lines with only whitespace and/or comments
1030 shouldn't affect the indentation and are
1031 not passed to the parser as NEWLINE tokens,
1032 except *totally* empty lines in interactive
1033 mode, which signal the end of a command group. */
1034 if (col
== 0 && c
== '\n' && tok
->prompt
!= NULL
)
1035 blankline
= 0; /* Let it through */
1037 blankline
= 1; /* Ignore completely */
1038 /* We can't jump back right here since we still
1039 may need to skip to the end of a comment */
1041 if (!blankline
&& tok
->level
== 0) {
1042 if (col
== tok
->indstack
[tok
->indent
]) {
1044 if (altcol
!= tok
->altindstack
[tok
->indent
]) {
1045 if (indenterror(tok
))
1049 else if (col
> tok
->indstack
[tok
->indent
]) {
1050 /* Indent -- always one */
1051 if (tok
->indent
+1 >= MAXINDENT
) {
1052 tok
->done
= E_TOODEEP
;
1053 tok
->cur
= tok
->inp
;
1056 if (altcol
<= tok
->altindstack
[tok
->indent
]) {
1057 if (indenterror(tok
))
1061 tok
->indstack
[++tok
->indent
] = col
;
1062 tok
->altindstack
[tok
->indent
] = altcol
;
1064 else /* col < tok->indstack[tok->indent] */ {
1065 /* Dedent -- any number, must be consistent */
1066 while (tok
->indent
> 0 &&
1067 col
< tok
->indstack
[tok
->indent
]) {
1071 if (col
!= tok
->indstack
[tok
->indent
]) {
1072 tok
->done
= E_DEDENT
;
1073 tok
->cur
= tok
->inp
;
1076 if (altcol
!= tok
->altindstack
[tok
->indent
]) {
1077 if (indenterror(tok
))
1084 tok
->start
= tok
->cur
;
1086 /* Return pending indents/dedents */
1087 if (tok
->pendin
!= 0) {
1088 if (tok
->pendin
< 0) {
1103 } while (c
== ' ' || c
== '\t' || c
== '\014');
1105 /* Set start of current token */
1106 tok
->start
= tok
->cur
- 1;
1108 /* Skip comment, while looking for tab-setting magic */
1110 static char *tabforms
[] = {
1111 "tab-width:", /* Emacs */
1112 ":tabstop=", /* vim, full form */
1113 ":ts=", /* vim, abbreviated form */
1114 "set tabsize=", /* will vi never die? */
1115 /* more templates can be added here to support other editors */
1121 *tp
++ = c
= tok_nextc(tok
);
1122 } while (c
!= EOF
&& c
!= '\n' &&
1123 tp
- cbuf
+ 1 < sizeof(cbuf
));
1126 cp
< tabforms
+ sizeof(tabforms
)/sizeof(tabforms
[0]);
1128 if ((tp
= strstr(cbuf
, *cp
))) {
1129 int newsize
= atoi(tp
+ strlen(*cp
));
1131 if (newsize
>= 1 && newsize
<= 40) {
1132 tok
->tabsize
= newsize
;
1135 "Tab size set to %d\n",
1140 while (c
!= EOF
&& c
!= '\n')
1144 /* Check for EOF and errors now */
1146 return tok
->done
== E_EOF
? ENDMARKER
: ERRORTOKEN
;
1149 /* Identifier (most frequent token!) */
1150 if (isalpha(c
) || c
== '_') {
1151 /* Process r"", u"" and ur"" */
1156 if (c
== '"' || c
== '\'')
1162 if (c
== 'r' || c
== 'R')
1164 if (c
== '"' || c
== '\'')
1168 while (isalnum(c
) || c
== '_') {
1172 *p_start
= tok
->start
;
1180 if (blankline
|| tok
->level
> 0)
1182 *p_start
= tok
->start
;
1183 *p_end
= tok
->cur
- 1; /* Leave '\n' out of the string */
1191 "File contains \\r characters (incorrect line endings?)\n");
1192 tok
->done
= E_TOKEN
;
1193 tok
->cur
= tok
->inp
;
1197 /* Period or number starting with period? */
1205 *p_start
= tok
->start
;
1214 /* Hex or octal -- maybe. */
1218 #ifndef WITHOUT_COMPLEX
1219 if (c
== 'j' || c
== 'J')
1222 if (c
== 'x' || c
== 'X') {
1226 } while (isxdigit(c
));
1229 int found_decimal
= 0;
1230 /* Octal; c is first char of it */
1231 /* There's no 'isoctdigit' macro, sigh */
1232 while ('0' <= c
&& c
< '8') {
1239 } while (isdigit(c
));
1243 else if (c
== 'e' || c
== 'E')
1245 #ifndef WITHOUT_COMPLEX
1246 else if (c
== 'j' || c
== 'J')
1249 else if (found_decimal
) {
1250 tok
->done
= E_TOKEN
;
1255 if (c
== 'l' || c
== 'L')
1262 } while (isdigit(c
));
1263 if (c
== 'l' || c
== 'L')
1266 /* Accept floating point numbers. */
1272 } while (isdigit(c
));
1274 if (c
== 'e' || c
== 'E') {
1278 if (c
== '+' || c
== '-')
1281 tok
->done
= E_TOKEN
;
1287 } while (isdigit(c
));
1289 #ifndef WITHOUT_COMPLEX
1290 if (c
== 'j' || c
== 'J')
1291 /* Imaginary part */
1298 *p_start
= tok
->start
;
1305 if (c
== '\'' || c
== '"') {
1306 int quote2
= tok
->cur
- tok
->start
+ 1;
1319 tok
->cont_line
= 1; /* multiline string. */
1321 else if (c
== EOF
) {
1326 tok
->cur
= tok
->inp
;
1329 else if (c
== quote
) {
1331 if (tok
->cur
- tok
->start
== quote2
) {
1340 if (!triple
|| tripcount
== 3)
1343 else if (c
== '\\') {
1348 tok
->cur
= tok
->inp
;
1355 *p_start
= tok
->start
;
1360 /* Line continuation */
1364 tok
->done
= E_TOKEN
;
1365 tok
->cur
= tok
->inp
;
1369 goto again
; /* Read next line */
1372 /* Check for two-character token */
1374 int c2
= tok_nextc(tok
);
1375 int token
= PyToken_TwoChars(c
, c2
);
1377 int c3
= tok_nextc(tok
);
1378 int token3
= PyToken_ThreeChars(c
, c2
, c3
);
1382 tok_backup(tok
, c3
);
1384 *p_start
= tok
->start
;
1388 tok_backup(tok
, c2
);
1391 /* Keep track of parentheses nesting level */
1405 /* Punctuation character */
1406 *p_start
= tok
->start
;
1408 return PyToken_OneChar(c
);
1412 PyTokenizer_Get(struct tok_state
*tok
, char **p_start
, char **p_end
)
1414 int result
= tok_get(tok
, p_start
, p_end
);
1415 if (tok
->decoding_erred
) {
1416 result
= ERRORTOKEN
;
1417 tok
->done
= E_DECODE
;
1425 tok_dump(int type
, char *start
, char *end
)
1427 printf("%s", _PyParser_TokenNames
[type
]);
1428 if (type
== NAME
|| type
== NUMBER
|| type
== STRING
|| type
== OP
)
1429 printf("(%.*s)", (int)(end
- start
), start
);