1 /***********************************************************
2 Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
7 Permission to use, copy, modify, and distribute this software and its
8 documentation for any purpose and without fee is hereby granted,
9 provided that the above copyright notice appear in all copies and that
10 both that copyright notice and this permission notice appear in
11 supporting documentation, and that the names of Stichting Mathematisch
12 Centrum or CWI or Corporation for National Research Initiatives or
13 CNRI not be used in advertising or publicity pertaining to
14 distribution of the software without specific, written prior
17 While CWI is the initial source for this software, a modified version
18 is made available by the Corporation for National Research Initiatives
19 (CNRI) at the Internet address ftp://ftp.python.org.
21 STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
22 REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
23 MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
24 CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
25 DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
26 PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
27 TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
28 PERFORMANCE OF THIS SOFTWARE.
30 ******************************************************************/
32 /* Tokenizer implementation */
34 #include "pgenheaders.h"
38 #include "tokenizer.h"
41 extern char *PyOS_Readline
Py_PROTO((char *));
42 /* Return malloc'ed string including trailing \n;
43 empty malloc'ed string for EOF;
44 NULL if interrupted */
46 /* Don't ever change this -- it would break the portability of Python code */
49 /* Convert a possibly signed character to a nonnegative int */
50 /* XXX This assumes characters are 8 bits wide */
51 #ifdef __CHAR_UNSIGNED__
52 #define Py_CHARMASK(c) (c)
54 #define Py_CHARMASK(c) ((c) & 0xff)
58 static struct tok_state
*tok_new
Py_PROTO((void));
59 static int tok_nextc
Py_PROTO((struct tok_state
*tok
));
60 static void tok_backup
Py_PROTO((struct tok_state
*tok
, int c
));
64 char *_PyParser_TokenNames
[] = {
102 /* This table must match the #defines in token.h! */
109 /* Create and initialize a new tok_state structure */
111 static struct tok_state
*
114 struct tok_state
*tok
= PyMem_NEW(struct tok_state
, 1);
117 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= tok
->start
= NULL
;
120 tok
->tabsize
= TABSIZE
;
122 tok
->indstack
[0] = 0;
125 tok
->prompt
= tok
->nextprompt
= NULL
;
128 tok
->filename
= NULL
;
132 tok
->altindstack
[0] = 0;
137 /* Set up tokenizer for string */
140 PyTokenizer_FromString(str
)
143 struct tok_state
*tok
= tok_new();
146 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= str
;
151 /* Set up tokenizer for file */
154 PyTokenizer_FromFile(fp
, ps1
, ps2
)
158 struct tok_state
*tok
= tok_new();
161 if ((tok
->buf
= PyMem_NEW(char, BUFSIZ
)) == NULL
) {
165 tok
->cur
= tok
->inp
= tok
->buf
;
166 tok
->end
= tok
->buf
+ BUFSIZ
;
169 tok
->nextprompt
= ps2
;
174 /* Free a tok_state structure */
177 PyTokenizer_Free(tok
)
178 struct tok_state
*tok
;
180 if (tok
->fp
!= NULL
&& tok
->buf
!= NULL
)
186 /* Get next char, updating state; error code goes into tok->done */
190 register struct tok_state
*tok
;
193 if (tok
->cur
!= tok
->inp
) {
194 return Py_CHARMASK(*tok
->cur
++); /* Fast path */
196 if (tok
->done
!= E_OK
)
198 if (tok
->fp
== NULL
) {
199 char *end
= strchr(tok
->inp
, '\n');
203 end
= strchr(tok
->inp
, '\0');
204 if (end
== tok
->inp
) {
209 if (tok
->start
== NULL
)
213 return Py_CHARMASK(*tok
->cur
++);
215 if (tok
->prompt
!= NULL
) {
216 char *new = PyOS_Readline(tok
->prompt
);
217 if (tok
->nextprompt
!= NULL
)
218 tok
->prompt
= tok
->nextprompt
;
221 else if (*new == '\0') {
225 else if (tok
->start
!= NULL
) {
226 int start
= tok
->start
- tok
->buf
;
227 int oldlen
= tok
->cur
- tok
->buf
;
228 int newlen
= oldlen
+ strlen(new);
229 char *buf
= realloc(tok
->buf
, newlen
+1);
239 tok
->cur
= tok
->buf
+ oldlen
;
240 strcpy(tok
->buf
+ oldlen
, new);
242 tok
->inp
= tok
->buf
+ newlen
;
243 tok
->end
= tok
->inp
+ 1;
244 tok
->start
= tok
->buf
+ start
;
248 if (tok
->buf
!= NULL
)
252 tok
->inp
= strchr(tok
->buf
, '\0');
253 tok
->end
= tok
->inp
+ 1;
260 if (tok
->start
== NULL
) {
261 if (tok
->buf
== NULL
) {
262 tok
->buf
= PyMem_NEW(char, BUFSIZ
);
263 if (tok
->buf
== NULL
) {
267 tok
->end
= tok
->buf
+ BUFSIZ
;
269 if (fgets(tok
->buf
, (int)(tok
->end
- tok
->buf
),
276 tok
->inp
= strchr(tok
->buf
, '\0');
277 done
= tok
->inp
[-1] == '\n';
281 cur
= tok
->cur
- tok
->buf
;
290 /* Read until '\n' or EOF */
292 int curstart
= tok
->start
== NULL
? -1 :
293 tok
->start
- tok
->buf
;
294 int curvalid
= tok
->inp
- tok
->buf
;
295 int newsize
= curvalid
+ BUFSIZ
;
296 char *newbuf
= tok
->buf
;
297 PyMem_RESIZE(newbuf
, char, newsize
);
298 if (newbuf
== NULL
) {
304 tok
->inp
= tok
->buf
+ curvalid
;
305 tok
->end
= tok
->buf
+ newsize
;
306 tok
->start
= curstart
< 0 ? NULL
:
309 (int)(tok
->end
- tok
->inp
),
311 /* Last line does not end in \n, so append one */
313 strcpy(tok
->inp
, "\n");
315 tok
->inp
= strchr(tok
->inp
, '\0');
316 done
= tok
->inp
[-1] == '\n';
318 tok
->cur
= tok
->buf
+ cur
;
320 /* replace "\r\n" with "\n" */
321 /* For Mac we leave the \r, giving a syntax error */
323 if (pt
>= tok
->buf
&& *pt
== '\r') {
330 if (tok
->done
!= E_OK
) {
331 if (tok
->prompt
!= NULL
)
332 PySys_WriteStderr("\n");
341 /* Back-up one character */
345 register struct tok_state
*tok
;
349 if (--tok
->cur
< tok
->buf
)
350 Py_FatalError("tok_backup: begin of buffer");
357 /* Return the token corresponding to a single character */
364 case '(': return LPAR
;
365 case ')': return RPAR
;
366 case '[': return LSQB
;
367 case ']': return RSQB
;
368 case ':': return COLON
;
369 case ',': return COMMA
;
370 case ';': return SEMI
;
371 case '+': return PLUS
;
372 case '-': return MINUS
;
373 case '*': return STAR
;
374 case '/': return SLASH
;
375 case '|': return VBAR
;
376 case '&': return AMPER
;
377 case '<': return LESS
;
378 case '>': return GREATER
;
379 case '=': return EQUAL
;
380 case '.': return DOT
;
381 case '%': return PERCENT
;
382 case '`': return BACKQUOTE
;
383 case '{': return LBRACE
;
384 case '}': return RBRACE
;
385 case '^': return CIRCUMFLEX
;
386 case '~': return TILDE
;
393 PyToken_TwoChars(c1
, c2
)
399 case '=': return EQEQUAL
;
404 case '=': return NOTEQUAL
;
409 case '>': return NOTEQUAL
;
410 case '=': return LESSEQUAL
;
411 case '<': return LEFTSHIFT
;
416 case '=': return GREATEREQUAL
;
417 case '>': return RIGHTSHIFT
;
422 case '*': return DOUBLESTAR
;
432 struct tok_state
*tok
;
435 tok
->done
= E_INDENT
;
439 if (tok
->altwarning
) {
440 PySys_WriteStderr("%s: inconsistent tab/space usage\n",
448 /* Get next token, after space stripping etc. */
451 PyTokenizer_Get(tok
, p_start
, p_end
)
452 register struct tok_state
*tok
; /* In/out: tokenizer state */
453 char **p_start
, **p_end
; /* Out: point to start/end of token */
458 *p_start
= *p_end
= NULL
;
463 /* Get indentation level */
465 register int col
= 0;
466 register int altcol
= 0;
472 else if (c
== '\t') {
473 col
= (col
/tok
->tabsize
+ 1) * tok
->tabsize
;
474 altcol
= (altcol
/tok
->alttabsize
+ 1)
477 else if (c
== '\014') /* Control-L (formfeed) */
478 col
= altcol
= 0; /* For Emacs users */
483 if (c
== '#' || c
== '\n') {
484 /* Lines with only whitespace and/or comments
485 shouldn't affect the indentation and are
486 not passed to the parser as NEWLINE tokens,
487 except *totally* empty lines in interactive
488 mode, which signal the end of a command group. */
489 if (col
== 0 && c
== '\n' && tok
->prompt
!= NULL
)
490 blankline
= 0; /* Let it through */
492 blankline
= 1; /* Ignore completely */
493 /* We can't jump back right here since we still
494 may need to skip to the end of a comment */
496 if (!blankline
&& tok
->level
== 0) {
497 if (col
== tok
->indstack
[tok
->indent
]) {
499 if (altcol
!= tok
->altindstack
[tok
->indent
]) {
500 if (indenterror(tok
))
504 else if (col
> tok
->indstack
[tok
->indent
]) {
505 /* Indent -- always one */
506 if (tok
->indent
+1 >= MAXINDENT
) {
508 "excessive indent\n");
513 if (altcol
<= tok
->altindstack
[tok
->indent
]) {
514 if (indenterror(tok
))
518 tok
->indstack
[++tok
->indent
] = col
;
519 tok
->altindstack
[tok
->indent
] = altcol
;
521 else /* col < tok->indstack[tok->indent] */ {
522 /* Dedent -- any number, must be consistent */
523 while (tok
->indent
> 0 &&
524 col
< tok
->indstack
[tok
->indent
]) {
528 if (col
!= tok
->indstack
[tok
->indent
]) {
530 "inconsistent dedent\n");
535 if (altcol
!= tok
->altindstack
[tok
->indent
]) {
536 if (indenterror(tok
))
543 tok
->start
= tok
->cur
;
545 /* Return pending indents/dedents */
546 if (tok
->pendin
!= 0) {
547 if (tok
->pendin
< 0) {
562 } while (c
== ' ' || c
== '\t' || c
== '\014');
564 /* Set start of current token */
565 tok
->start
= tok
->cur
- 1;
569 /* Hack to allow overriding the tabsize in the file.
570 This is also recognized by vi, when it occurs near the
571 beginning or end of the file. (Will vi never die...?)
572 For Python it must be at the beginning of the file! */
573 /* XXX The real vi syntax is actually different :-( */
574 /* XXX Should recognize Emacs syntax, too */
577 " vi:set tabsize=%d:", &x
) == 1 &&
579 /* PySys_WriteStderr("# vi:set tabsize=%d:\n", x); */
584 } while (c
!= EOF
&& c
!= '\n');
587 /* Check for EOF and errors now */
589 return tok
->done
== E_EOF
? ENDMARKER
: ERRORTOKEN
;
592 /* Identifier (most frequent token!) */
593 if (isalpha(c
) || c
== '_') {
598 if (c
== '"' || c
== '\'')
601 while (isalnum(c
) || c
== '_') {
605 *p_start
= tok
->start
;
613 if (blankline
|| tok
->level
> 0)
615 *p_start
= tok
->start
;
616 *p_end
= tok
->cur
- 1; /* Leave '\n' out of the string */
623 "File contains \\r characters (incorrect line endings?)\n");
629 /* Period or number starting with period? */
637 *p_start
= tok
->start
;
650 #ifndef WITHOUT_COMPLEX
651 if (c
== 'j' || c
== 'J')
654 if (c
== 'x' || c
== 'X') {
658 } while (isxdigit(c
));
661 /* XXX This is broken! E.g.,
662 09.9 should be accepted as float! */
663 /* Octal; c is first char of it */
664 /* There's no 'isoctdigit' macro, sigh */
665 while ('0' <= c
&& c
< '8') {
669 if (c
== 'l' || c
== 'L')
676 } while (isdigit(c
));
677 if (c
== 'l' || c
== 'L')
680 /* Accept floating point numbers.
681 XXX This accepts incomplete things like
682 XXX 12e or 1e+; those are left to be caught at run time */
688 } while (isdigit(c
));
690 if (c
== 'e' || c
== 'E') {
693 if (c
== '+' || c
== '-')
699 #ifndef WITHOUT_COMPLEX
700 if (c
== 'j' || c
== 'J')
708 *p_start
= tok
->start
;
715 if (c
== '\'' || c
== '"') {
716 int quote2
= tok
->cur
- tok
->start
+ 1;
735 else if (c
== quote
) {
737 if (tok
->cur
- tok
->start
== quote2
) {
746 if (!triple
|| tripcount
== 3)
749 else if (c
== '\\') {
761 *p_start
= tok
->start
;
766 /* Line continuation */
774 goto again
; /* Read next line */
777 /* Check for two-character token */
779 int c2
= tok_nextc(tok
);
780 int token
= PyToken_TwoChars(c
, c2
);
782 *p_start
= tok
->start
;
789 /* Keep track of parentheses nesting level */
803 /* Punctuation character */
804 *p_start
= tok
->start
;
806 return PyToken_OneChar(c
);
813 tok_dump(type
, start
, end
)
817 printf("%s", _PyParser_TokenNames
[type
]);
818 if (type
== NAME
|| type
== NUMBER
|| type
== STRING
|| type
== OP
)
819 printf("(%.*s)", (int)(end
- start
), start
);