2 /* Tokenizer implementation */
4 #include "pgenheaders.h"
11 extern char *PyOS_Readline(char *);
12 /* Return malloc'ed string including trailing \n;
13 empty malloc'ed string for EOF;
14 NULL if interrupted */
16 /* Don't ever change this -- it would break the portability of Python code */
19 /* Convert a possibly signed character to a nonnegative int */
20 /* XXX This assumes characters are 8 bits wide */
21 #ifdef __CHAR_UNSIGNED__
22 #define Py_CHARMASK(c) (c)
24 #define Py_CHARMASK(c) ((c) & 0xff)
28 static struct tok_state
*tok_new(void);
29 static int tok_nextc(struct tok_state
*tok
);
30 static void tok_backup(struct tok_state
*tok
, int c
);
34 char *_PyParser_TokenNames
[] = {
83 /* This table must match the #defines in token.h! */
90 /* Create and initialize a new tok_state structure */
92 static struct tok_state
*
95 struct tok_state
*tok
= PyMem_NEW(struct tok_state
, 1);
98 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= tok
->start
= NULL
;
101 tok
->tabsize
= TABSIZE
;
103 tok
->indstack
[0] = 0;
106 tok
->prompt
= tok
->nextprompt
= NULL
;
109 tok
->filename
= NULL
;
113 tok
->altindstack
[0] = 0;
118 /* Set up tokenizer for string */
121 PyTokenizer_FromString(char *str
)
123 struct tok_state
*tok
= tok_new();
126 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= str
;
131 /* Set up tokenizer for file */
134 PyTokenizer_FromFile(FILE *fp
, char *ps1
, char *ps2
)
136 struct tok_state
*tok
= tok_new();
139 if ((tok
->buf
= PyMem_NEW(char, BUFSIZ
)) == NULL
) {
143 tok
->cur
= tok
->inp
= tok
->buf
;
144 tok
->end
= tok
->buf
+ BUFSIZ
;
147 tok
->nextprompt
= ps2
;
152 /* Free a tok_state structure */
155 PyTokenizer_Free(struct tok_state
*tok
)
157 if (tok
->fp
!= NULL
&& tok
->buf
!= NULL
)
163 /* Get next char, updating state; error code goes into tok->done */
166 tok_nextc(register struct tok_state
*tok
)
169 if (tok
->cur
!= tok
->inp
) {
170 return Py_CHARMASK(*tok
->cur
++); /* Fast path */
172 if (tok
->done
!= E_OK
)
174 if (tok
->fp
== NULL
) {
175 char *end
= strchr(tok
->inp
, '\n');
179 end
= strchr(tok
->inp
, '\0');
180 if (end
== tok
->inp
) {
185 if (tok
->start
== NULL
)
189 return Py_CHARMASK(*tok
->cur
++);
191 if (tok
->prompt
!= NULL
) {
192 char *new = PyOS_Readline(tok
->prompt
);
193 if (tok
->nextprompt
!= NULL
)
194 tok
->prompt
= tok
->nextprompt
;
197 else if (*new == '\0') {
201 else if (tok
->start
!= NULL
) {
202 size_t start
= tok
->start
- tok
->buf
;
203 size_t oldlen
= tok
->cur
- tok
->buf
;
204 size_t newlen
= oldlen
+ strlen(new);
205 char *buf
= tok
->buf
;
206 PyMem_RESIZE(buf
, char, newlen
+1);
216 tok
->cur
= tok
->buf
+ oldlen
;
217 strcpy(tok
->buf
+ oldlen
, new);
219 tok
->inp
= tok
->buf
+ newlen
;
220 tok
->end
= tok
->inp
+ 1;
221 tok
->start
= tok
->buf
+ start
;
225 if (tok
->buf
!= NULL
)
229 tok
->inp
= strchr(tok
->buf
, '\0');
230 tok
->end
= tok
->inp
+ 1;
237 if (tok
->start
== NULL
) {
238 if (tok
->buf
== NULL
) {
239 tok
->buf
= PyMem_NEW(char, BUFSIZ
);
240 if (tok
->buf
== NULL
) {
244 tok
->end
= tok
->buf
+ BUFSIZ
;
246 if (fgets(tok
->buf
, (int)(tok
->end
- tok
->buf
),
253 tok
->inp
= strchr(tok
->buf
, '\0');
254 done
= tok
->inp
[-1] == '\n';
258 cur
= tok
->cur
- tok
->buf
;
267 /* Read until '\n' or EOF */
269 int curstart
= tok
->start
== NULL
? -1 :
270 tok
->start
- tok
->buf
;
271 int curvalid
= tok
->inp
- tok
->buf
;
272 int newsize
= curvalid
+ BUFSIZ
;
273 char *newbuf
= tok
->buf
;
274 PyMem_RESIZE(newbuf
, char, newsize
);
275 if (newbuf
== NULL
) {
281 tok
->inp
= tok
->buf
+ curvalid
;
282 tok
->end
= tok
->buf
+ newsize
;
283 tok
->start
= curstart
< 0 ? NULL
:
286 (int)(tok
->end
- tok
->inp
),
288 /* Last line does not end in \n,
290 strcpy(tok
->inp
, "\n");
292 tok
->inp
= strchr(tok
->inp
, '\0');
293 done
= tok
->inp
[-1] == '\n';
295 tok
->cur
= tok
->buf
+ cur
;
297 /* replace "\r\n" with "\n" */
298 /* For Mac we leave the \r, giving a syntax error */
300 if (pt
>= tok
->buf
&& *pt
== '\r') {
307 if (tok
->done
!= E_OK
) {
308 if (tok
->prompt
!= NULL
)
309 PySys_WriteStderr("\n");
318 /* Back-up one character */
321 tok_backup(register struct tok_state
*tok
, register int c
)
324 if (--tok
->cur
< tok
->buf
)
325 Py_FatalError("tok_backup: begin of buffer");
332 /* Return the token corresponding to a single character */
335 PyToken_OneChar(int c
)
338 case '(': return LPAR
;
339 case ')': return RPAR
;
340 case '[': return LSQB
;
341 case ']': return RSQB
;
342 case ':': return COLON
;
343 case ',': return COMMA
;
344 case ';': return SEMI
;
345 case '+': return PLUS
;
346 case '-': return MINUS
;
347 case '*': return STAR
;
348 case '/': return SLASH
;
349 case '|': return VBAR
;
350 case '&': return AMPER
;
351 case '<': return LESS
;
352 case '>': return GREATER
;
353 case '=': return EQUAL
;
354 case '.': return DOT
;
355 case '%': return PERCENT
;
356 case '`': return BACKQUOTE
;
357 case '{': return LBRACE
;
358 case '}': return RBRACE
;
359 case '^': return CIRCUMFLEX
;
360 case '~': return TILDE
;
367 PyToken_TwoChars(int c1
, int c2
)
372 case '=': return EQEQUAL
;
377 case '=': return NOTEQUAL
;
382 case '>': return NOTEQUAL
;
383 case '=': return LESSEQUAL
;
384 case '<': return LEFTSHIFT
;
389 case '=': return GREATEREQUAL
;
390 case '>': return RIGHTSHIFT
;
395 case '=': return PLUSEQUAL
;
400 case '=': return MINEQUAL
;
405 case '*': return DOUBLESTAR
;
406 case '=': return STAREQUAL
;
411 case '=': return SLASHEQUAL
;
416 case '=': return VBAREQUAL
;
421 case '=': return PERCENTEQUAL
;
426 case '=': return AMPEREQUAL
;
431 case '=': return CIRCUMFLEXEQUAL
;
439 PyToken_ThreeChars(int c1
, int c2
, int c3
)
447 return LEFTSHIFTEQUAL
;
458 return RIGHTSHIFTEQUAL
;
469 return DOUBLESTAREQUAL
;
480 indenterror(struct tok_state
*tok
)
483 tok
->done
= E_TABSPACE
;
487 if (tok
->altwarning
) {
488 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
489 "in indentation\n", tok
->filename
);
496 /* Get next token, after space stripping etc. */
499 PyTokenizer_Get(register struct tok_state
*tok
, char **p_start
,
505 *p_start
= *p_end
= NULL
;
510 /* Get indentation level */
512 register int col
= 0;
513 register int altcol
= 0;
519 else if (c
== '\t') {
520 col
= (col
/tok
->tabsize
+ 1) * tok
->tabsize
;
521 altcol
= (altcol
/tok
->alttabsize
+ 1)
524 else if (c
== '\014') /* Control-L (formfeed) */
525 col
= altcol
= 0; /* For Emacs users */
530 if (c
== '#' || c
== '\n') {
531 /* Lines with only whitespace and/or comments
532 shouldn't affect the indentation and are
533 not passed to the parser as NEWLINE tokens,
534 except *totally* empty lines in interactive
535 mode, which signal the end of a command group. */
536 if (col
== 0 && c
== '\n' && tok
->prompt
!= NULL
)
537 blankline
= 0; /* Let it through */
539 blankline
= 1; /* Ignore completely */
540 /* We can't jump back right here since we still
541 may need to skip to the end of a comment */
543 if (!blankline
&& tok
->level
== 0) {
544 if (col
== tok
->indstack
[tok
->indent
]) {
546 if (altcol
!= tok
->altindstack
[tok
->indent
]) {
547 if (indenterror(tok
))
551 else if (col
> tok
->indstack
[tok
->indent
]) {
552 /* Indent -- always one */
553 if (tok
->indent
+1 >= MAXINDENT
) {
554 tok
->done
= E_TOODEEP
;
558 if (altcol
<= tok
->altindstack
[tok
->indent
]) {
559 if (indenterror(tok
))
563 tok
->indstack
[++tok
->indent
] = col
;
564 tok
->altindstack
[tok
->indent
] = altcol
;
566 else /* col < tok->indstack[tok->indent] */ {
567 /* Dedent -- any number, must be consistent */
568 while (tok
->indent
> 0 &&
569 col
< tok
->indstack
[tok
->indent
]) {
573 if (col
!= tok
->indstack
[tok
->indent
]) {
574 tok
->done
= E_DEDENT
;
578 if (altcol
!= tok
->altindstack
[tok
->indent
]) {
579 if (indenterror(tok
))
586 tok
->start
= tok
->cur
;
588 /* Return pending indents/dedents */
589 if (tok
->pendin
!= 0) {
590 if (tok
->pendin
< 0) {
605 } while (c
== ' ' || c
== '\t' || c
== '\014');
607 /* Set start of current token */
608 tok
->start
= tok
->cur
- 1;
610 /* Skip comment, while looking for tab-setting magic */
612 static char *tabforms
[] = {
613 "tab-width:", /* Emacs */
614 ":tabstop=", /* vim, full form */
615 ":ts=", /* vim, abbreviated form */
616 "set tabsize=", /* will vi never die? */
617 /* more templates can be added here to support other editors */
623 *tp
++ = c
= tok_nextc(tok
);
624 } while (c
!= EOF
&& c
!= '\n' &&
625 tp
- cbuf
+ 1 < sizeof(cbuf
));
628 cp
< tabforms
+ sizeof(tabforms
)/sizeof(tabforms
[0]);
630 if ((tp
= strstr(cbuf
, *cp
))) {
631 int newsize
= atoi(tp
+ strlen(*cp
));
633 if (newsize
>= 1 && newsize
<= 40) {
634 tok
->tabsize
= newsize
;
637 "Tab size set to %d\n",
642 while (c
!= EOF
&& c
!= '\n')
646 /* Check for EOF and errors now */
648 return tok
->done
== E_EOF
? ENDMARKER
: ERRORTOKEN
;
651 /* Identifier (most frequent token!) */
652 if (isalpha(c
) || c
== '_') {
653 /* Process r"", u"" and ur"" */
658 if (c
== '"' || c
== '\'')
664 if (c
== 'r' || c
== 'R')
666 if (c
== '"' || c
== '\'')
670 while (isalnum(c
) || c
== '_') {
674 *p_start
= tok
->start
;
682 if (blankline
|| tok
->level
> 0)
684 *p_start
= tok
->start
;
685 *p_end
= tok
->cur
- 1; /* Leave '\n' out of the string */
692 "File contains \\r characters (incorrect line endings?)\n");
698 /* Period or number starting with period? */
706 *p_start
= tok
->start
;
719 #ifndef WITHOUT_COMPLEX
720 if (c
== 'j' || c
== 'J')
723 if (c
== 'x' || c
== 'X') {
727 } while (isxdigit(c
));
730 /* XXX This is broken! E.g.,
731 09.9 should be accepted as float! */
732 /* Octal; c is first char of it */
733 /* There's no 'isoctdigit' macro, sigh */
734 while ('0' <= c
&& c
< '8') {
738 if (c
== 'l' || c
== 'L')
745 } while (isdigit(c
));
746 if (c
== 'l' || c
== 'L')
749 /* Accept floating point numbers.
750 XXX This accepts incomplete things like
751 XXX 12e or 1e+; worry run-time */
757 } while (isdigit(c
));
759 if (c
== 'e' || c
== 'E') {
762 if (c
== '+' || c
== '-')
768 #ifndef WITHOUT_COMPLEX
769 if (c
== 'j' || c
== 'J')
777 *p_start
= tok
->start
;
784 if (c
== '\'' || c
== '"') {
785 int quote2
= tok
->cur
- tok
->start
+ 1;
804 else if (c
== quote
) {
806 if (tok
->cur
- tok
->start
== quote2
) {
815 if (!triple
|| tripcount
== 3)
818 else if (c
== '\\') {
830 *p_start
= tok
->start
;
835 /* Line continuation */
843 goto again
; /* Read next line */
846 /* Check for two-character token */
848 int c2
= tok_nextc(tok
);
849 int token
= PyToken_TwoChars(c
, c2
);
851 int c3
= tok_nextc(tok
);
852 int token3
= PyToken_ThreeChars(c
, c2
, c3
);
858 *p_start
= tok
->start
;
865 /* Keep track of parentheses nesting level */
879 /* Punctuation character */
880 *p_start
= tok
->start
;
882 return PyToken_OneChar(c
);
889 tok_dump(int type
, char *start
, char *end
)
891 printf("%s", _PyParser_TokenNames
[type
]);
892 if (type
== NAME
|| type
== NUMBER
|| type
== STRING
|| type
== OP
)
893 printf("(%.*s)", (int)(end
- start
), start
);