2 /* Tokenizer implementation */
5 #include "pgenheaders.h"
12 extern char *PyOS_Readline(char *);
13 /* Return malloc'ed string including trailing \n;
14 empty malloc'ed string for EOF;
15 NULL if interrupted */
17 /* Don't ever change this -- it would break the portability of Python code */
20 /* Convert a possibly signed character to a nonnegative int */
21 /* XXX This assumes characters are 8 bits wide */
22 #ifdef __CHAR_UNSIGNED__
23 #define Py_CHARMASK(c) (c)
25 #define Py_CHARMASK(c) ((c) & 0xff)
/* Forward declarations of the file-local (static) tokenizer helpers that are
   defined further down in this file:
     tok_new    -- create and initialize a new tok_state structure
     tok_nextc  -- get the next character, updating the tokenizer state
     tok_backup -- back up one character */
29 static struct tok_state
*tok_new(void);
30 static int tok_nextc(struct tok_state
*tok
);
31 static void tok_backup(struct tok_state
*tok
, int c
);
35 char *_PyParser_TokenNames
[] = {
86 /* This table must match the #defines in token.h! */
93 /* Create and initialize a new tok_state structure */
95 static struct tok_state
*
98 struct tok_state
*tok
= PyMem_NEW(struct tok_state
, 1);
101 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= tok
->start
= NULL
;
104 tok
->tabsize
= TABSIZE
;
106 tok
->indstack
[0] = 0;
109 tok
->prompt
= tok
->nextprompt
= NULL
;
112 tok
->filename
= NULL
;
116 tok
->altindstack
[0] = 0;
121 /* Set up tokenizer for string */
124 PyTokenizer_FromString(char *str
)
126 struct tok_state
*tok
= tok_new();
129 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= str
;
134 /* Set up tokenizer for file */
137 PyTokenizer_FromFile(FILE *fp
, char *ps1
, char *ps2
)
139 struct tok_state
*tok
= tok_new();
142 if ((tok
->buf
= PyMem_NEW(char, BUFSIZ
)) == NULL
) {
146 tok
->cur
= tok
->inp
= tok
->buf
;
147 tok
->end
= tok
->buf
+ BUFSIZ
;
150 tok
->nextprompt
= ps2
;
155 /* Free a tok_state structure */
158 PyTokenizer_Free(struct tok_state
*tok
)
160 if (tok
->fp
!= NULL
&& tok
->buf
!= NULL
)
166 /* Get next char, updating state; error code goes into tok->done */
169 tok_nextc(register struct tok_state
*tok
)
172 if (tok
->cur
!= tok
->inp
) {
173 return Py_CHARMASK(*tok
->cur
++); /* Fast path */
175 if (tok
->done
!= E_OK
)
177 if (tok
->fp
== NULL
) {
178 char *end
= strchr(tok
->inp
, '\n');
182 end
= strchr(tok
->inp
, '\0');
183 if (end
== tok
->inp
) {
188 if (tok
->start
== NULL
)
192 return Py_CHARMASK(*tok
->cur
++);
194 if (tok
->prompt
!= NULL
) {
195 char *new = PyOS_Readline(tok
->prompt
);
196 if (tok
->nextprompt
!= NULL
)
197 tok
->prompt
= tok
->nextprompt
;
200 else if (*new == '\0') {
204 else if (tok
->start
!= NULL
) {
205 size_t start
= tok
->start
- tok
->buf
;
206 size_t oldlen
= tok
->cur
- tok
->buf
;
207 size_t newlen
= oldlen
+ strlen(new);
208 char *buf
= tok
->buf
;
209 PyMem_RESIZE(buf
, char, newlen
+1);
219 tok
->cur
= tok
->buf
+ oldlen
;
220 strcpy(tok
->buf
+ oldlen
, new);
222 tok
->inp
= tok
->buf
+ newlen
;
223 tok
->end
= tok
->inp
+ 1;
224 tok
->start
= tok
->buf
+ start
;
228 if (tok
->buf
!= NULL
)
232 tok
->inp
= strchr(tok
->buf
, '\0');
233 tok
->end
= tok
->inp
+ 1;
240 if (tok
->start
== NULL
) {
241 if (tok
->buf
== NULL
) {
242 tok
->buf
= PyMem_NEW(char, BUFSIZ
);
243 if (tok
->buf
== NULL
) {
247 tok
->end
= tok
->buf
+ BUFSIZ
;
249 if (Py_UniversalNewlineFgets(tok
->buf
, (int)(tok
->end
- tok
->buf
),
250 tok
->fp
, NULL
) == NULL
) {
256 tok
->inp
= strchr(tok
->buf
, '\0');
257 done
= tok
->inp
[-1] == '\n';
261 cur
= tok
->cur
- tok
->buf
;
270 /* Read until '\n' or EOF */
272 int curstart
= tok
->start
== NULL
? -1 :
273 tok
->start
- tok
->buf
;
274 int curvalid
= tok
->inp
- tok
->buf
;
275 int newsize
= curvalid
+ BUFSIZ
;
276 char *newbuf
= tok
->buf
;
277 PyMem_RESIZE(newbuf
, char, newsize
);
278 if (newbuf
== NULL
) {
284 tok
->inp
= tok
->buf
+ curvalid
;
285 tok
->end
= tok
->buf
+ newsize
;
286 tok
->start
= curstart
< 0 ? NULL
:
288 if (Py_UniversalNewlineFgets(tok
->inp
,
289 (int)(tok
->end
- tok
->inp
),
290 tok
->fp
, NULL
) == NULL
) {
291 /* Last line does not end in \n, so fake one */
293 strcpy(tok
->inp
, "\n");
295 tok
->inp
= strchr(tok
->inp
, '\0');
296 done
= tok
->inp
[-1] == '\n';
298 tok
->cur
= tok
->buf
+ cur
;
300 /* replace "\r\n" with "\n" */
301 /* For Mac we leave the \r, giving a syntax error */
303 if (pt
>= tok
->buf
&& *pt
== '\r') {
310 if (tok
->done
!= E_OK
) {
311 if (tok
->prompt
!= NULL
)
312 PySys_WriteStderr("\n");
321 /* Back-up one character */
324 tok_backup(register struct tok_state
*tok
, register int c
)
327 if (--tok
->cur
< tok
->buf
)
328 Py_FatalError("tok_backup: begin of buffer");
335 /* Return the token corresponding to a single character */
338 PyToken_OneChar(int c
)
341 case '(': return LPAR
;
342 case ')': return RPAR
;
343 case '[': return LSQB
;
344 case ']': return RSQB
;
345 case ':': return COLON
;
346 case ',': return COMMA
;
347 case ';': return SEMI
;
348 case '+': return PLUS
;
349 case '-': return MINUS
;
350 case '*': return STAR
;
351 case '/': return SLASH
;
352 case '|': return VBAR
;
353 case '&': return AMPER
;
354 case '<': return LESS
;
355 case '>': return GREATER
;
356 case '=': return EQUAL
;
357 case '.': return DOT
;
358 case '%': return PERCENT
;
359 case '`': return BACKQUOTE
;
360 case '{': return LBRACE
;
361 case '}': return RBRACE
;
362 case '^': return CIRCUMFLEX
;
363 case '~': return TILDE
;
370 PyToken_TwoChars(int c1
, int c2
)
375 case '=': return EQEQUAL
;
380 case '=': return NOTEQUAL
;
385 case '>': return NOTEQUAL
;
386 case '=': return LESSEQUAL
;
387 case '<': return LEFTSHIFT
;
392 case '=': return GREATEREQUAL
;
393 case '>': return RIGHTSHIFT
;
398 case '=': return PLUSEQUAL
;
403 case '=': return MINEQUAL
;
408 case '*': return DOUBLESTAR
;
409 case '=': return STAREQUAL
;
414 case '/': return DOUBLESLASH
;
415 case '=': return SLASHEQUAL
;
420 case '=': return VBAREQUAL
;
425 case '=': return PERCENTEQUAL
;
430 case '=': return AMPEREQUAL
;
435 case '=': return CIRCUMFLEXEQUAL
;
443 PyToken_ThreeChars(int c1
, int c2
, int c3
)
451 return LEFTSHIFTEQUAL
;
461 return RIGHTSHIFTEQUAL
;
471 return DOUBLESTAREQUAL
;
481 return DOUBLESLASHEQUAL
;
491 indenterror(struct tok_state
*tok
)
494 tok
->done
= E_TABSPACE
;
498 if (tok
->altwarning
) {
499 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
500 "in indentation\n", tok
->filename
);
507 /* Get next token, after space stripping etc. */
510 PyTokenizer_Get(register struct tok_state
*tok
, char **p_start
,
516 *p_start
= *p_end
= NULL
;
521 /* Get indentation level */
523 register int col
= 0;
524 register int altcol
= 0;
530 else if (c
== '\t') {
531 col
= (col
/tok
->tabsize
+ 1) * tok
->tabsize
;
532 altcol
= (altcol
/tok
->alttabsize
+ 1)
535 else if (c
== '\014') /* Control-L (formfeed) */
536 col
= altcol
= 0; /* For Emacs users */
541 if (c
== '#' || c
== '\n') {
542 /* Lines with only whitespace and/or comments
543 shouldn't affect the indentation and are
544 not passed to the parser as NEWLINE tokens,
545 except *totally* empty lines in interactive
546 mode, which signal the end of a command group. */
547 if (col
== 0 && c
== '\n' && tok
->prompt
!= NULL
)
548 blankline
= 0; /* Let it through */
550 blankline
= 1; /* Ignore completely */
551 /* We can't jump back right here since we still
552 may need to skip to the end of a comment */
554 if (!blankline
&& tok
->level
== 0) {
555 if (col
== tok
->indstack
[tok
->indent
]) {
557 if (altcol
!= tok
->altindstack
[tok
->indent
]) {
558 if (indenterror(tok
))
562 else if (col
> tok
->indstack
[tok
->indent
]) {
563 /* Indent -- always one */
564 if (tok
->indent
+1 >= MAXINDENT
) {
565 tok
->done
= E_TOODEEP
;
569 if (altcol
<= tok
->altindstack
[tok
->indent
]) {
570 if (indenterror(tok
))
574 tok
->indstack
[++tok
->indent
] = col
;
575 tok
->altindstack
[tok
->indent
] = altcol
;
577 else /* col < tok->indstack[tok->indent] */ {
578 /* Dedent -- any number, must be consistent */
579 while (tok
->indent
> 0 &&
580 col
< tok
->indstack
[tok
->indent
]) {
584 if (col
!= tok
->indstack
[tok
->indent
]) {
585 tok
->done
= E_DEDENT
;
589 if (altcol
!= tok
->altindstack
[tok
->indent
]) {
590 if (indenterror(tok
))
597 tok
->start
= tok
->cur
;
599 /* Return pending indents/dedents */
600 if (tok
->pendin
!= 0) {
601 if (tok
->pendin
< 0) {
616 } while (c
== ' ' || c
== '\t' || c
== '\014');
618 /* Set start of current token */
619 tok
->start
= tok
->cur
- 1;
621 /* Skip comment, while looking for tab-setting magic */
623 static char *tabforms
[] = {
624 "tab-width:", /* Emacs */
625 ":tabstop=", /* vim, full form */
626 ":ts=", /* vim, abbreviated form */
627 "set tabsize=", /* will vi never die? */
628 /* more templates can be added here to support other editors */
634 *tp
++ = c
= tok_nextc(tok
);
635 } while (c
!= EOF
&& c
!= '\n' &&
636 tp
- cbuf
+ 1 < sizeof(cbuf
));
639 cp
< tabforms
+ sizeof(tabforms
)/sizeof(tabforms
[0]);
641 if ((tp
= strstr(cbuf
, *cp
))) {
642 int newsize
= atoi(tp
+ strlen(*cp
));
644 if (newsize
>= 1 && newsize
<= 40) {
645 tok
->tabsize
= newsize
;
648 "Tab size set to %d\n",
653 while (c
!= EOF
&& c
!= '\n')
657 /* Check for EOF and errors now */
659 return tok
->done
== E_EOF
? ENDMARKER
: ERRORTOKEN
;
662 /* Identifier (most frequent token!) */
663 if (isalpha(c
) || c
== '_') {
664 /* Process r"", u"" and ur"" */
669 if (c
== '"' || c
== '\'')
675 if (c
== 'r' || c
== 'R')
677 if (c
== '"' || c
== '\'')
681 while (isalnum(c
) || c
== '_') {
685 *p_start
= tok
->start
;
693 if (blankline
|| tok
->level
> 0)
695 *p_start
= tok
->start
;
696 *p_end
= tok
->cur
- 1; /* Leave '\n' out of the string */
703 "File contains \\r characters (incorrect line endings?)\n");
709 /* Period or number starting with period? */
717 *p_start
= tok
->start
;
726 /* Hex or octal -- maybe. */
730 #ifndef WITHOUT_COMPLEX
731 if (c
== 'j' || c
== 'J')
734 if (c
== 'x' || c
== 'X') {
738 } while (isxdigit(c
));
741 int found_decimal
= 0;
742 /* Octal; c is first char of it */
743 /* There's no 'isoctdigit' macro, sigh */
744 while ('0' <= c
&& c
< '8') {
751 } while (isdigit(c
));
755 else if (c
== 'e' || c
== 'E')
757 #ifndef WITHOUT_COMPLEX
758 else if (c
== 'j' || c
== 'J')
761 else if (found_decimal
) {
767 if (c
== 'l' || c
== 'L')
774 } while (isdigit(c
));
775 if (c
== 'l' || c
== 'L')
778 /* Accept floating point numbers. */
784 } while (isdigit(c
));
786 if (c
== 'e' || c
== 'E') {
790 if (c
== '+' || c
== '-')
799 } while (isdigit(c
));
801 #ifndef WITHOUT_COMPLEX
802 if (c
== 'j' || c
== 'J')
810 *p_start
= tok
->start
;
817 if (c
== '\'' || c
== '"') {
818 int quote2
= tok
->cur
- tok
->start
+ 1;
837 else if (c
== quote
) {
839 if (tok
->cur
- tok
->start
== quote2
) {
848 if (!triple
|| tripcount
== 3)
851 else if (c
== '\\') {
863 *p_start
= tok
->start
;
868 /* Line continuation */
876 goto again
; /* Read next line */
879 /* Check for two-character token */
881 int c2
= tok_nextc(tok
);
882 int token
= PyToken_TwoChars(c
, c2
);
884 int c3
= tok_nextc(tok
);
885 int token3
= PyToken_ThreeChars(c
, c2
, c3
);
891 *p_start
= tok
->start
;
898 /* Keep track of parentheses nesting level */
912 /* Punctuation character */
913 *p_start
= tok
->start
;
915 return PyToken_OneChar(c
);
922 tok_dump(int type
, char *start
, char *end
)
924 printf("%s", _PyParser_TokenNames
[type
]);
925 if (type
== NAME
|| type
== NUMBER
|| type
== STRING
|| type
== OP
)
926 printf("(%.*s)", (int)(end
- start
), start
);