1 /***********************************************************
2 Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
7 Permission to use, copy, modify, and distribute this software and its
8 documentation for any purpose and without fee is hereby granted,
9 provided that the above copyright notice appear in all copies and that
10 both that copyright notice and this permission notice appear in
11 supporting documentation, and that the names of Stichting Mathematisch
12 Centrum or CWI or Corporation for National Research Initiatives or
13 CNRI not be used in advertising or publicity pertaining to
14 distribution of the software without specific, written prior
17 While CWI is the initial source for this software, a modified version
18 is made available by the Corporation for National Research Initiatives
19 (CNRI) at the Internet address ftp://ftp.python.org.
21 STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
22 REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
23 MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
24 CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
25 DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
26 PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
27 TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
28 PERFORMANCE OF THIS SOFTWARE.
30 ******************************************************************/
32 /* Tokenizer implementation */
34 #include "pgenheaders.h"
38 #include "tokenizer.h"
41 extern char *PyOS_Readline
Py_PROTO((char *));
42 /* Return malloc'ed string including trailing \n;
43 empty malloc'ed string for EOF;
44 NULL if interrupted */
46 /* Don't ever change this -- it would break the portability of Python code */
49 /* Convert a possibly signed character to a nonnegative int */
50 /* XXX This assumes characters are 8 bits wide */
/* Two definitions are listed: a pass-through, under the
   __CHAR_UNSIGNED__ test, and one that masks the value to its low
   8 bits.  NOTE(review): the directives that separate and close the
   two alternatives are not visible in this listing. */
51 #ifdef __CHAR_UNSIGNED__
52 #define Py_CHARMASK(c) (c)
54 #define Py_CHARMASK(c) ((c) & 0xff)
/* Forward declarations of file-local (static) helpers.  Each parameter
   list is wrapped in the Py_PROTO macro:
     tok_new    -- takes no arguments, returns a struct tok_state pointer
     tok_nextc  -- takes the tokenizer state, returns an int
     tok_backup -- takes the tokenizer state and an int c */
58 static struct tok_state
*tok_new
Py_PROTO((void));
59 static int tok_nextc
Py_PROTO((struct tok_state
*tok
));
60 static void tok_backup
Py_PROTO((struct tok_state
*tok
, int c
));
/* Array of token name strings; tok_dump indexes it by token type and
   prints the entry with "%s".
   NOTE(review): the initializer entries are not visible in this listing. */
64 char *_PyParser_TokenNames
[] = {
102 /* This table must match the #defines in token.h! */
109 /* Create and initialize a new tok_state structure */
/* NOTE(review): the function header, the allocation-failure check (if
   any) and the return statement are not visible in this listing. */
111 static struct tok_state
*
/* Allocate room for one struct tok_state. */
114 struct tok_state
*tok
= PyMem_NEW(struct tok_state
, 1);
/* No input buffer yet: all five buffer pointers start out NULL. */
117 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= tok
->start
= NULL
;
/* tabsize (used when a tab advances the column count) defaults to
   TABSIZE. */
120 tok
->tabsize
= TABSIZE
;
/* Bottom entry of the indentation stack is column 0. */
122 tok
->indstack
[0] = 0;
/* Neither prompt is set here. */
125 tok
->prompt
= tok
->nextprompt
= NULL
;
/* No file name is recorded here. */
128 tok
->filename
= NULL
;
/* Bottom entry of the alternate-column indentation stack is 0 too. */
132 tok
->altindstack
[0] = 0;
137 /* Set up tokenizer for string */
/* Parameter: str -- the input text.
   NOTE(review): the return type, parameter declaration, failure check
   and return statement are not visible in this listing. */
140 PyTokenizer_FromString(str
)
/* Start from a freshly initialized state. */
143 struct tok_state
*tok
= tok_new();
/* buf, cur, end and inp are all set to the caller's str pointer; no
   copy of the text is made in the lines visible here. */
146 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= str
;
151 /* Set up tokenizer for file */
/* Parameters: fp, ps1, ps2.  ps2 is stored as the follow-up prompt
   below.  NOTE(review): the parameter declarations, the uses of fp and
   ps1, and the return statement are not visible in this listing. */
154 PyTokenizer_FromFile(fp
, ps1
, ps2
)
/* Start from a freshly initialized state. */
158 struct tok_state
*tok
= tok_new();
/* Allocate the initial input buffer of BUFSIZ chars; the branch taken
   on allocation failure is not visible in this listing. */
161 if ((tok
->buf
= PyMem_NEW(char, BUFSIZ
)) == NULL
) {
/* The buffer holds no data yet: cur and inp both sit at its start. */
165 tok
->cur
= tok
->inp
= tok
->buf
;
/* end marks one past the last allocated char. */
166 tok
->end
= tok
->buf
+ BUFSIZ
;
/* tok_nextc switches prompt to nextprompt after reading a line. */
169 tok
->nextprompt
= ps2
;
174 /* Free a tok_state structure */
177 PyTokenizer_Free(tok
)
178 struct tok_state
*tok
;
/* Guard: applies only when the tokenizer has a file and a buffer.
   NOTE(review): the guarded statement is not visible in this listing;
   presumably it releases buf, which in the string case is the caller's
   own pointer (see PyTokenizer_FromString) -- confirm. */
180 if (tok
->fp
!= NULL
&& tok
->buf
!= NULL
)
186 /* Get next char, updating state; error code goes into tok->done */
/* Three input sources are distinguished below: a string (fp == NULL),
   an interactive prompt (prompt != NULL), and a plain file read with
   fgets.  NOTE(review): the function header, the enclosing loop and
   several branches are not visible in this listing. */
190 register struct tok_state
*tok
;
193 if (tok
->cur
!= tok
->inp
) {
194 return Py_CHARMASK(*tok
->cur
++); /* Fast path */
/* Nothing buffered: a state other than E_OK is checked before any
   further input is read. */
196 if (tok
->done
!= E_OK
)
/* String input: look for the end of the next line in place. */
198 if (tok
->fp
== NULL
) {
199 char *end
= strchr(tok
->inp
, '\n');
/* Alternative search for the terminating NUL (its guarding condition
   is not visible in this listing). */
203 end
= strchr(tok
->inp
, '\0');
204 if (end
== tok
->inp
) {
209 if (tok
->start
== NULL
)
213 return Py_CHARMASK(*tok
->cur
++);
/* Interactive input: read one line with PyOS_Readline. */
215 if (tok
->prompt
!= NULL
) {
216 char *new = PyOS_Readline(tok
->prompt
);
/* Subsequent lines use the follow-up prompt when one is set. */
217 if (tok
->nextprompt
!= NULL
)
218 tok
->prompt
= tok
->nextprompt
;
/* An empty string from PyOS_Readline means EOF (see the comment on its
   declaration). */
221 else if (*new == '\0') {
/* A token is in progress: append the new line to the existing buffer.
   Positions are saved as offsets because realloc may move the buffer. */
225 else if (tok
->start
!= NULL
) {
226 int start
= tok
->start
- tok
->buf
;
227 int oldlen
= tok
->cur
- tok
->buf
;
228 int newlen
= oldlen
+ strlen(new);
229 char *buf
= realloc(tok
->buf
, newlen
+1);
/* Re-derive the pointers from the saved offsets and copy the new line
   in after the old data. */
239 tok
->cur
= tok
->buf
+ oldlen
;
240 strcpy(tok
->buf
+ oldlen
, new);
242 tok
->inp
= tok
->buf
+ newlen
;
243 tok
->end
= tok
->inp
+ 1;
244 tok
->start
= tok
->buf
+ start
;
248 if (tok
->buf
!= NULL
)
/* inp is placed on the terminating NUL of buf, end just past it. */
252 tok
->inp
= strchr(tok
->buf
, '\0');
253 tok
->end
= tok
->inp
+ 1;
/* Plain file input. */
260 if (tok
->start
== NULL
) {
/* Allocate the buffer on demand. */
261 if (tok
->buf
== NULL
) {
262 tok
->buf
= PyMem_NEW(char, BUFSIZ
);
263 if (tok
->buf
== NULL
) {
267 tok
->end
= tok
->buf
+ BUFSIZ
;
/* Read up to one buffer's worth of a line. */
269 if (fgets(tok
->buf
, (int)(tok
->end
- tok
->buf
),
/* inp goes to the end of what was read; done records whether that
   data ends in a newline. */
276 tok
->inp
= strchr(tok
->buf
, '\0');
277 done
= tok
->inp
[-1] == '\n';
/* cur is saved as an offset; it is re-based on buf further down. */
281 cur
= tok
->cur
- tok
->buf
;
290 /* Read until '\n' or EOF */
/* Grow the buffer by BUFSIZ beyond the valid data, keeping start and
   inp as offsets (-1 stands for a NULL start) across the resize. */
292 int curstart
= tok
->start
== NULL
? -1 :
293 tok
->start
- tok
->buf
;
294 int curvalid
= tok
->inp
- tok
->buf
;
295 int newsize
= curvalid
+ BUFSIZ
;
296 char *newbuf
= tok
->buf
;
297 PyMem_RESIZE(newbuf
, char, newsize
);
298 if (newbuf
== NULL
) {
304 tok
->inp
= tok
->buf
+ curvalid
;
305 tok
->end
= tok
->buf
+ newsize
;
306 tok
->start
= curstart
< 0 ? NULL
:
309 (int)(tok
->end
- tok
->inp
),
/* NOTE(review): the comment opened on the next line has no closing
   marker in this listing.  As listed, everything up to the end of the
   'replace "\r\n"' comment below is lexically part of it, including
   the strcpy of "\n" and the updates of inp, done and cur.  Restore
   the missing terminator when repairing the file. */
311 /* Last line does not end in \n,
313 strcpy(tok
->inp
, "\n");
315 tok
->inp
= strchr(tok
->inp
, '\0');
316 done
= tok
->inp
[-1] == '\n';
318 tok
->cur
= tok
->buf
+ cur
;
320 /* replace "\r\n" with "\n" */
321 /* For Mac we leave the \r, giving a syntax error */
323 if (pt
>= tok
->buf
&& *pt
== '\r') {
/* When the state is not E_OK and a prompt is set, a newline is written
   to stderr. */
330 if (tok
->done
!= E_OK
) {
331 if (tok
->prompt
!= NULL
)
332 fprintf(stderr
, "\n");
341 /* Back-up one character */
/* NOTE(review): the function header and any use of the second
   parameter (c in the forward declaration) are not visible in this
   listing. */
345 register struct tok_state
*tok
;
/* Step cur back by one; ending up before the start of the buffer is
   reported through Py_FatalError. */
349 if (--tok
->cur
< tok
->buf
)
350 Py_FatalError("tok_backup: begin of buffer");
357 /* Return the token corresponding to a single character */
/* Each case label below maps one punctuation character to its token
   constant.  NOTE(review): the function header, the switch statement
   and any default case are not visible in this listing. */
364 case '(': return LPAR
;
365 case ')': return RPAR
;
366 case '[': return LSQB
;
367 case ']': return RSQB
;
368 case ':': return COLON
;
369 case ',': return COMMA
;
370 case ';': return SEMI
;
371 case '+': return PLUS
;
372 case '-': return MINUS
;
373 case '*': return STAR
;
374 case '/': return SLASH
;
375 case '|': return VBAR
;
376 case '&': return AMPER
;
377 case '<': return LESS
;
378 case '>': return GREATER
;
379 case '=': return EQUAL
;
380 case '.': return DOT
;
381 case '%': return PERCENT
;
382 case '`': return BACKQUOTE
;
383 case '{': return LBRACE
;
384 case '}': return RBRACE
;
385 case '^': return CIRCUMFLEX
;
386 case '~': return TILDE
;
/* Map a pair of characters (c1, c2) to a two-character operator token.
   NOTE(review): the switch statements that select on c1 and c2 are not
   visible in this listing; only the inner case labels are shown, so
   which first character each group belongs to must be confirmed
   against the full source. */
393 PyToken_TwoChars(c1
, c2
)
399 case '=': return EQEQUAL
;
404 case '=': return NOTEQUAL
;
/* A second case label that also returns NOTEQUAL. */
409 case '>': return NOTEQUAL
;
410 case '=': return LESSEQUAL
;
411 case '<': return LEFTSHIFT
;
416 case '=': return GREATEREQUAL
;
417 case '>': return RIGHTSHIFT
;
422 case '*': return DOUBLESTAR
;
/* NOTE(review): the function header is not visible in this listing;
   presumably this is the indenterror(tok) helper called from the
   indentation checks in PyTokenizer_Get -- confirm.  The conditions
   enclosing the statements below are also only partly visible. */
432 struct tok_state
*tok
;
/* Record an indentation error in the tokenizer state. */
435 tok
->done
= E_INDENT
;
/* When altwarning is set, a diagnostic about inconsistent tab/space
   usage is printed to stderr. */
439 if (tok
->altwarning
) {
440 fprintf(stderr
, "%s: inconsistent tab/space usage\n",
448 /* Get next token, after space stripping etc. */
/* NOTE(review): many lines of this function (local declarations,
   labels, loop headers and most return statements) are not visible in
   this listing; the added comments describe only what is shown. */
451 PyTokenizer_Get(tok
, p_start
, p_end
)
452 register struct tok_state
*tok
; /* In/out: tokenizer state */
453 char **p_start
, **p_end
; /* Out: point to start/end of token */
/* Both output pointers start out NULL. */
458 *p_start
= *p_end
= NULL
;
463 /* Get indentation level */
/* Two column counters are kept: col and altcol. */
465 register int col
= 0;
466 register int altcol
= 0;
/* A tab moves col to the next multiple of tabsize.  altcol is advanced
   in the same way using alttabsize (the end of that expression is not
   visible in this listing). */
472 else if (c
== '\t') {
473 col
= (col
/tok
->tabsize
+ 1) * tok
->tabsize
;
474 altcol
= (altcol
/tok
->alttabsize
+ 1)
477 else if (c
== '\014') /* Control-L (formfeed) */
478 col
= altcol
= 0; /* For Emacs users */
483 if (c
== '#' || c
== '\n') {
484 /* Lines with only whitespace and/or comments
485 shouldn't affect the indentation and are
486 not passed to the parser as NEWLINE tokens,
487 except *totally* empty lines in interactive
488 mode, which signal the end of a command group. */
489 if (col
== 0 && c
== '\n' && tok
->prompt
!= NULL
)
490 blankline
= 0; /* Let it through */
492 blankline
= 1; /* Ignore completely */
493 /* We can't jump back right here since we still
494 may need to skip to the end of a comment */
/* Indentation is compared against the stack only for non-blank lines
   when level is 0. */
496 if (!blankline
&& tok
->level
== 0) {
/* Same column as the current level: the alternate column must match
   the alternate stack as well, otherwise indenterror() is consulted. */
497 if (col
== tok
->indstack
[tok
->indent
]) {
499 if (altcol
!= tok
->altindstack
[tok
->indent
]) {
500 if (indenterror(tok
))
504 else if (col
> tok
->indstack
[tok
->indent
]) {
505 /* Indent -- always one */
/* The indentation stack holds at most MAXINDENT entries. */
506 if (tok
->indent
+1 >= MAXINDENT
) {
507 fprintf(stderr
, "excessive indent\n");
/* The alternate column must have increased too. */
512 if (altcol
<= tok
->altindstack
[tok
->indent
]) {
513 if (indenterror(tok
))
/* Push the new level onto both stacks. */
517 tok
->indstack
[++tok
->indent
] = col
;
518 tok
->altindstack
[tok
->indent
] = altcol
;
520 else /* col < tok->indstack[tok->indent] */ {
521 /* Dedent -- any number, must be consistent */
522 while (tok
->indent
> 0 &&
523 col
< tok
->indstack
[tok
->indent
]) {
/* After the loop, col must equal the stack entry that was reached. */
527 if (col
!= tok
->indstack
[tok
->indent
]) {
529 "inconsistent dedent\n");
534 if (altcol
!= tok
->altindstack
[tok
->indent
]) {
535 if (indenterror(tok
))
542 tok
->start
= tok
->cur
;
544 /* Return pending indents/dedents */
545 if (tok
->pendin
!= 0) {
546 if (tok
->pendin
< 0) {
/* Skip spaces, tabs and formfeeds. */
561 } while (c
== ' ' || c
== '\t' || c
== '\014');
563 /* Set start of current token */
564 tok
->start
= tok
->cur
- 1;
568 /* Hack to allow overriding the tabsize in the file.
569 This is also recognized by vi, when it occurs near the
570 beginning or end of the file. (Will vi never die...?)
571 For Python it must be at the beginning of the file! */
572 /* XXX The real vi syntax is actually different :-( */
573 /* XXX Should recognize Emacs syntax, too */
576 " vi:set tabsize=%d:", &x
) == 1 &&
578 /* fprintf(stderr, "# vi:set tabsize=%d:\n", x); */
/* Loop until end of line or EOF. */
583 } while (c
!= EOF
&& c
!= '\n');
586 /* Check for EOF and errors now */
588 return tok
->done
== E_EOF
? ENDMARKER
: ERRORTOKEN
;
591 /* Identifier (most frequent token!) */
592 if (isalpha(c
) || c
== '_') {
597 if (c
== '"' || c
== '\'')
600 while (isalnum(c
) || c
== '_') {
604 *p_start
= tok
->start
;
612 if (blankline
|| tok
->level
> 0)
614 *p_start
= tok
->start
;
615 *p_end
= tok
->cur
- 1; /* Leave '\n' out of the string */
622 "File contains \\r characters (incorrect line endings?)\n");
628 /* Period or number starting with period? */
636 *p_start
= tok
->start
;
649 #ifndef WITHOUT_COMPLEX
650 if (c
== 'j' || c
== 'J')
653 if (c
== 'x' || c
== 'X') {
657 } while (isxdigit(c
));
660 /* XXX This is broken! E.g.,
661 09.9 should be accepted as float! */
662 /* Octal; c is first char of it */
663 /* There's no 'isoctdigit' macro, sigh */
664 while ('0' <= c
&& c
< '8') {
668 if (c
== 'l' || c
== 'L')
675 } while (isdigit(c
));
676 if (c
== 'l' || c
== 'L')
679 /* Accept floating point numbers.
680 XXX This accepts incomplete things like
681 XXX 12e or 1e+; worry run-time */
687 } while (isdigit(c
));
689 if (c
== 'e' || c
== 'E') {
692 if (c
== '+' || c
== '-')
698 #ifndef WITHOUT_COMPLEX
699 if (c
== 'j' || c
== 'J')
707 *p_start
= tok
->start
;
714 if (c
== '\'' || c
== '"') {
/* quote2 is the value tok->cur - tok->start will have once the
   character right after the opening quote has been read.  It is
   compared below to recognise a quote character that directly follows
   the opening one. */
715 int quote2
= tok
->cur
- tok
->start
+ 1;
734 else if (c
== quote
) {
736 if (tok
->cur
- tok
->start
== quote2
) {
745 if (!triple
|| tripcount
== 3)
748 else if (c
== '\\') {
760 *p_start
= tok
->start
;
765 /* Line continuation */
773 goto again
; /* Read next line */
776 /* Check for two-character token */
/* Read one more character and ask PyToken_TwoChars about the pair. */
778 int c2
= tok_nextc(tok
);
779 int token
= PyToken_TwoChars(c
, c2
);
781 *p_start
= tok
->start
;
788 /* Keep track of parentheses nesting level */
802 /* Punctuation character */
803 *p_start
= tok
->start
;
805 return PyToken_OneChar(c
);
/* Print a token: its name and, for some types, its text.
   Parameters: type indexes _PyParser_TokenNames; start and end delimit
   the token text, of which end - start characters are printed.
   NOTE(review): the parameter declarations are not visible in this
   listing. */
812 tok_dump(type
, start
, end
)
/* The name comes from the token-name table. */
816 printf("%s", _PyParser_TokenNames
[type
]);
/* For NAME, NUMBER, STRING and OP tokens the text follows in
   parentheses; "%.*s" limits the output to end - start characters. */
817 if (type
== NAME
|| type
== NUMBER
|| type
== STRING
|| type
== OP
)
818 printf("(%.*s)", (int)(end
- start
), start
);