Clarify portability and main program.
[python/dscho.git] / Parser / tokenizer.c
blob63d1b050334bd4564dd86004ee6d1c7ece897470
1 /***********************************************************
2 Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
3 The Netherlands.
5 All Rights Reserved
7 Permission to use, copy, modify, and distribute this software and its
8 documentation for any purpose and without fee is hereby granted,
9 provided that the above copyright notice appear in all copies and that
10 both that copyright notice and this permission notice appear in
11 supporting documentation, and that the names of Stichting Mathematisch
12 Centrum or CWI or Corporation for National Research Initiatives or
13 CNRI not be used in advertising or publicity pertaining to
14 distribution of the software without specific, written prior
15 permission.
17 While CWI is the initial source for this software, a modified version
18 is made available by the Corporation for National Research Initiatives
19 (CNRI) at the Internet address ftp://ftp.python.org.
21 STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
22 REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
23 MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
24 CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
25 DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
26 PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
27 TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
28 PERFORMANCE OF THIS SOFTWARE.
30 ******************************************************************/
32 /* Tokenizer implementation */
34 #include "pgenheaders.h"
36 #include <ctype.h>
38 #include "tokenizer.h"
39 #include "errcode.h"
41 extern char *PyOS_Readline Py_PROTO((char *));
42 /* Return malloc'ed string including trailing \n;
43 empty malloc'ed string for EOF;
44 NULL if interrupted */
/* Default width of a tab stop when computing indentation columns.
   Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Convert a possibly signed character to a nonnegative int, so that the
   result can never be confused with EOF. */
/* XXX This assumes characters are 8 bits wide */
#ifdef __CHAR_UNSIGNED__
#define Py_CHARMASK(c)		(c)
#else
#define Py_CHARMASK(c)		((c) & 0xff)
#endif
57 /* Forward */
58 static struct tok_state *tok_new Py_PROTO((void));
59 static int tok_nextc Py_PROTO((struct tok_state *tok));
60 static void tok_backup Py_PROTO((struct tok_state *tok, int c));
/* Token names, indexed by token number.
   The entries appear in the order of the token codes, so element i is the
   printable name of token i (tok_dump() below indexes it that way). */

char *_PyParser_TokenNames[] = {
	"ENDMARKER",
	"NAME",
	"NUMBER",
	"STRING",
	"NEWLINE",
	"INDENT",
	"DEDENT",
	"LPAR",
	"RPAR",
	"LSQB",
	"RSQB",
	"COLON",
	"COMMA",
	"SEMI",
	"PLUS",
	"MINUS",
	"STAR",
	"SLASH",
	"VBAR",
	"AMPER",
	"LESS",
	"GREATER",
	"EQUAL",
	"DOT",
	"PERCENT",
	"BACKQUOTE",
	"LBRACE",
	"RBRACE",
	"EQEQUAL",
	"NOTEQUAL",
	"LESSEQUAL",
	"GREATEREQUAL",
	"TILDE",
	"CIRCUMFLEX",
	"LEFTSHIFT",
	"RIGHTSHIFT",
	"DOUBLESTAR",
	/* This table must match the #defines in token.h! */
	"OP",
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};
109 /* Create and initialize a new tok_state structure */
111 static struct tok_state *
112 tok_new()
114 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
115 if (tok == NULL)
116 return NULL;
117 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
118 tok->done = E_OK;
119 tok->fp = NULL;
120 tok->tabsize = TABSIZE;
121 tok->indent = 0;
122 tok->indstack[0] = 0;
123 tok->atbol = 1;
124 tok->pendin = 0;
125 tok->prompt = tok->nextprompt = NULL;
126 tok->lineno = 0;
127 tok->level = 0;
128 tok->filename = NULL;
129 tok->altwarning = 0;
130 tok->alterror = 0;
131 tok->alttabsize = 1;
132 tok->altindstack[0] = 0;
133 return tok;
137 /* Set up tokenizer for string */
139 struct tok_state *
140 PyTokenizer_FromString(str)
141 char *str;
143 struct tok_state *tok = tok_new();
144 if (tok == NULL)
145 return NULL;
146 tok->buf = tok->cur = tok->end = tok->inp = str;
147 return tok;
151 /* Set up tokenizer for file */
153 struct tok_state *
154 PyTokenizer_FromFile(fp, ps1, ps2)
155 FILE *fp;
156 char *ps1, *ps2;
158 struct tok_state *tok = tok_new();
159 if (tok == NULL)
160 return NULL;
161 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
162 PyMem_DEL(tok);
163 return NULL;
165 tok->cur = tok->inp = tok->buf;
166 tok->end = tok->buf + BUFSIZ;
167 tok->fp = fp;
168 tok->prompt = ps1;
169 tok->nextprompt = ps2;
170 return tok;
174 /* Free a tok_state structure */
176 void
177 PyTokenizer_Free(tok)
178 struct tok_state *tok;
180 if (tok->fp != NULL && tok->buf != NULL)
181 PyMem_DEL(tok->buf);
182 PyMem_DEL(tok);
186 /* Get next char, updating state; error code goes into tok->done */
188 static int
189 tok_nextc(tok)
190 register struct tok_state *tok;
192 for (;;) {
193 if (tok->cur != tok->inp) {
194 return Py_CHARMASK(*tok->cur++); /* Fast path */
196 if (tok->done != E_OK)
197 return EOF;
198 if (tok->fp == NULL) {
199 char *end = strchr(tok->inp, '\n');
200 if (end != NULL)
201 end++;
202 else {
203 end = strchr(tok->inp, '\0');
204 if (end == tok->inp) {
205 tok->done = E_EOF;
206 return EOF;
209 if (tok->start == NULL)
210 tok->buf = tok->cur;
211 tok->lineno++;
212 tok->inp = end;
213 return Py_CHARMASK(*tok->cur++);
215 if (tok->prompt != NULL) {
216 char *new = PyOS_Readline(tok->prompt);
217 if (tok->nextprompt != NULL)
218 tok->prompt = tok->nextprompt;
219 if (new == NULL)
220 tok->done = E_INTR;
221 else if (*new == '\0') {
222 free(new);
223 tok->done = E_EOF;
225 else if (tok->start != NULL) {
226 int start = tok->start - tok->buf;
227 int oldlen = tok->cur - tok->buf;
228 int newlen = oldlen + strlen(new);
229 char *buf = realloc(tok->buf, newlen+1);
230 tok->lineno++;
231 if (buf == NULL) {
232 free(tok->buf);
233 tok->buf = NULL;
234 free(new);
235 tok->done = E_NOMEM;
236 return EOF;
238 tok->buf = buf;
239 tok->cur = tok->buf + oldlen;
240 strcpy(tok->buf + oldlen, new);
241 free(new);
242 tok->inp = tok->buf + newlen;
243 tok->end = tok->inp + 1;
244 tok->start = tok->buf + start;
246 else {
247 tok->lineno++;
248 if (tok->buf != NULL)
249 free(tok->buf);
250 tok->buf = new;
251 tok->cur = tok->buf;
252 tok->inp = strchr(tok->buf, '\0');
253 tok->end = tok->inp + 1;
256 else {
257 int done = 0;
258 int cur = 0;
259 char *pt;
260 if (tok->start == NULL) {
261 if (tok->buf == NULL) {
262 tok->buf = PyMem_NEW(char, BUFSIZ);
263 if (tok->buf == NULL) {
264 tok->done = E_NOMEM;
265 return EOF;
267 tok->end = tok->buf + BUFSIZ;
269 if (fgets(tok->buf, (int)(tok->end - tok->buf),
270 tok->fp) == NULL) {
271 tok->done = E_EOF;
272 done = 1;
274 else {
275 tok->done = E_OK;
276 tok->inp = strchr(tok->buf, '\0');
277 done = tok->inp[-1] == '\n';
280 else {
281 cur = tok->cur - tok->buf;
282 if (feof(tok->fp)) {
283 tok->done = E_EOF;
284 done = 1;
286 else
287 tok->done = E_OK;
289 tok->lineno++;
290 /* Read until '\n' or EOF */
291 while (!done) {
292 int curstart = tok->start == NULL ? -1 :
293 tok->start - tok->buf;
294 int curvalid = tok->inp - tok->buf;
295 int newsize = curvalid + BUFSIZ;
296 char *newbuf = tok->buf;
297 PyMem_RESIZE(newbuf, char, newsize);
298 if (newbuf == NULL) {
299 tok->done = E_NOMEM;
300 tok->cur = tok->inp;
301 return EOF;
303 tok->buf = newbuf;
304 tok->inp = tok->buf + curvalid;
305 tok->end = tok->buf + newsize;
306 tok->start = curstart < 0 ? NULL :
307 tok->buf + curstart;
308 if (fgets(tok->inp,
309 (int)(tok->end - tok->inp),
310 tok->fp) == NULL) {
311 /* Last line does not end in \n,
312 fake one */
313 strcpy(tok->inp, "\n");
315 tok->inp = strchr(tok->inp, '\0');
316 done = tok->inp[-1] == '\n';
318 tok->cur = tok->buf + cur;
319 #ifndef macintosh
320 /* replace "\r\n" with "\n" */
321 /* For Mac we leave the \r, giving a syntax error */
322 pt = tok->inp - 2;
323 if (pt >= tok->buf && *pt == '\r') {
324 *pt++ = '\n';
325 *pt = '\0';
326 tok->inp = pt;
328 #endif
330 if (tok->done != E_OK) {
331 if (tok->prompt != NULL)
332 PySys_WriteStderr("\n");
333 tok->cur = tok->inp;
334 return EOF;
337 /*NOTREACHED*/
341 /* Back-up one character */
343 static void
344 tok_backup(tok, c)
345 register struct tok_state *tok;
346 register int c;
348 if (c != EOF) {
349 if (--tok->cur < tok->buf)
350 Py_FatalError("tok_backup: begin of buffer");
351 if (*tok->cur != c)
352 *tok->cur = c;
357 /* Return the token corresponding to a single character */
360 PyToken_OneChar(c)
361 int c;
363 switch (c) {
364 case '(': return LPAR;
365 case ')': return RPAR;
366 case '[': return LSQB;
367 case ']': return RSQB;
368 case ':': return COLON;
369 case ',': return COMMA;
370 case ';': return SEMI;
371 case '+': return PLUS;
372 case '-': return MINUS;
373 case '*': return STAR;
374 case '/': return SLASH;
375 case '|': return VBAR;
376 case '&': return AMPER;
377 case '<': return LESS;
378 case '>': return GREATER;
379 case '=': return EQUAL;
380 case '.': return DOT;
381 case '%': return PERCENT;
382 case '`': return BACKQUOTE;
383 case '{': return LBRACE;
384 case '}': return RBRACE;
385 case '^': return CIRCUMFLEX;
386 case '~': return TILDE;
387 default: return OP;
393 PyToken_TwoChars(c1, c2)
394 int c1, c2;
396 switch (c1) {
397 case '=':
398 switch (c2) {
399 case '=': return EQEQUAL;
401 break;
402 case '!':
403 switch (c2) {
404 case '=': return NOTEQUAL;
406 break;
407 case '<':
408 switch (c2) {
409 case '>': return NOTEQUAL;
410 case '=': return LESSEQUAL;
411 case '<': return LEFTSHIFT;
413 break;
414 case '>':
415 switch (c2) {
416 case '=': return GREATEREQUAL;
417 case '>': return RIGHTSHIFT;
419 break;
420 case '*':
421 switch (c2) {
422 case '*': return DOUBLESTAR;
424 break;
426 return OP;
430 static int
431 indenterror(tok)
432 struct tok_state *tok;
434 if (tok->alterror) {
435 tok->done = E_INDENT;
436 tok->cur = tok->inp;
437 return 1;
439 if (tok->altwarning) {
440 PySys_WriteStderr("%s: inconsistent tab/space usage\n",
441 tok->filename);
442 tok->altwarning = 0;
444 return 0;
448 /* Get next token, after space stripping etc. */
451 PyTokenizer_Get(tok, p_start, p_end)
452 register struct tok_state *tok; /* In/out: tokenizer state */
453 char **p_start, **p_end; /* Out: point to start/end of token */
455 register int c;
456 int blankline;
458 *p_start = *p_end = NULL;
459 nextline:
460 tok->start = NULL;
461 blankline = 0;
463 /* Get indentation level */
464 if (tok->atbol) {
465 register int col = 0;
466 register int altcol = 0;
467 tok->atbol = 0;
468 for (;;) {
469 c = tok_nextc(tok);
470 if (c == ' ')
471 col++, altcol++;
472 else if (c == '\t') {
473 col = (col/tok->tabsize + 1) * tok->tabsize;
474 altcol = (altcol/tok->alttabsize + 1)
475 * tok->alttabsize;
477 else if (c == '\014') /* Control-L (formfeed) */
478 col = altcol = 0; /* For Emacs users */
479 else
480 break;
482 tok_backup(tok, c);
483 if (c == '#' || c == '\n') {
484 /* Lines with only whitespace and/or comments
485 shouldn't affect the indentation and are
486 not passed to the parser as NEWLINE tokens,
487 except *totally* empty lines in interactive
488 mode, which signal the end of a command group. */
489 if (col == 0 && c == '\n' && tok->prompt != NULL)
490 blankline = 0; /* Let it through */
491 else
492 blankline = 1; /* Ignore completely */
493 /* We can't jump back right here since we still
494 may need to skip to the end of a comment */
496 if (!blankline && tok->level == 0) {
497 if (col == tok->indstack[tok->indent]) {
498 /* No change */
499 if (altcol != tok->altindstack[tok->indent]) {
500 if (indenterror(tok))
501 return ERRORTOKEN;
504 else if (col > tok->indstack[tok->indent]) {
505 /* Indent -- always one */
506 if (tok->indent+1 >= MAXINDENT) {
507 PySys_WriteStderr(
508 "excessive indent\n");
509 tok->done = E_TOKEN;
510 tok->cur = tok->inp;
511 return ERRORTOKEN;
513 if (altcol <= tok->altindstack[tok->indent]) {
514 if (indenterror(tok))
515 return ERRORTOKEN;
517 tok->pendin++;
518 tok->indstack[++tok->indent] = col;
519 tok->altindstack[tok->indent] = altcol;
521 else /* col < tok->indstack[tok->indent] */ {
522 /* Dedent -- any number, must be consistent */
523 while (tok->indent > 0 &&
524 col < tok->indstack[tok->indent]) {
525 tok->pendin--;
526 tok->indent--;
528 if (col != tok->indstack[tok->indent]) {
529 fprintf(stderr,
530 "inconsistent dedent\n");
531 tok->done = E_TOKEN;
532 tok->cur = tok->inp;
533 return ERRORTOKEN;
535 if (altcol != tok->altindstack[tok->indent]) {
536 if (indenterror(tok))
537 return ERRORTOKEN;
543 tok->start = tok->cur;
545 /* Return pending indents/dedents */
546 if (tok->pendin != 0) {
547 if (tok->pendin < 0) {
548 tok->pendin++;
549 return DEDENT;
551 else {
552 tok->pendin--;
553 return INDENT;
557 again:
558 tok->start = NULL;
559 /* Skip spaces */
560 do {
561 c = tok_nextc(tok);
562 } while (c == ' ' || c == '\t' || c == '\014');
564 /* Set start of current token */
565 tok->start = tok->cur - 1;
567 /* Skip comment */
568 if (c == '#') {
569 /* Hack to allow overriding the tabsize in the file.
570 This is also recognized by vi, when it occurs near the
571 beginning or end of the file. (Will vi never die...?)
572 For Python it must be at the beginning of the file! */
573 /* XXX The real vi syntax is actually different :-( */
574 /* XXX Should recognize Emacs syntax, too */
575 int x;
576 if (sscanf(tok->cur,
577 " vi:set tabsize=%d:", &x) == 1 &&
578 x >= 1 && x <= 40) {
579 /* PySys_WriteStderr("# vi:set tabsize=%d:\n", x); */
580 tok->tabsize = x;
582 do {
583 c = tok_nextc(tok);
584 } while (c != EOF && c != '\n');
587 /* Check for EOF and errors now */
588 if (c == EOF) {
589 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
592 /* Identifier (most frequent token!) */
593 if (isalpha(c) || c == '_') {
594 switch (c) {
595 case 'r':
596 case 'R':
597 c = tok_nextc(tok);
598 if (c == '"' || c == '\'')
599 goto letter_quote;
601 while (isalnum(c) || c == '_') {
602 c = tok_nextc(tok);
604 tok_backup(tok, c);
605 *p_start = tok->start;
606 *p_end = tok->cur;
607 return NAME;
610 /* Newline */
611 if (c == '\n') {
612 tok->atbol = 1;
613 if (blankline || tok->level > 0)
614 goto nextline;
615 *p_start = tok->start;
616 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
617 return NEWLINE;
620 #ifdef macintosh
621 if (c == '\r') {
622 PySys_WriteStderr(
623 "File contains \\r characters (incorrect line endings?)\n");
624 tok->done = E_TOKEN;
625 tok->cur = tok->inp;
626 return ERRORTOKEN;
628 #endif
629 /* Period or number starting with period? */
630 if (c == '.') {
631 c = tok_nextc(tok);
632 if (isdigit(c)) {
633 goto fraction;
635 else {
636 tok_backup(tok, c);
637 *p_start = tok->start;
638 *p_end = tok->cur;
639 return DOT;
643 /* Number */
644 if (isdigit(c)) {
645 if (c == '0') {
646 /* Hex or octal */
647 c = tok_nextc(tok);
648 if (c == '.')
649 goto fraction;
650 #ifndef WITHOUT_COMPLEX
651 if (c == 'j' || c == 'J')
652 goto imaginary;
653 #endif
654 if (c == 'x' || c == 'X') {
655 /* Hex */
656 do {
657 c = tok_nextc(tok);
658 } while (isxdigit(c));
660 else {
661 /* XXX This is broken! E.g.,
662 09.9 should be accepted as float! */
663 /* Octal; c is first char of it */
664 /* There's no 'isoctdigit' macro, sigh */
665 while ('0' <= c && c < '8') {
666 c = tok_nextc(tok);
669 if (c == 'l' || c == 'L')
670 c = tok_nextc(tok);
672 else {
673 /* Decimal */
674 do {
675 c = tok_nextc(tok);
676 } while (isdigit(c));
677 if (c == 'l' || c == 'L')
678 c = tok_nextc(tok);
679 else {
680 /* Accept floating point numbers.
681 XXX This accepts incomplete things like
682 XXX 12e or 1e+; worry run-time */
683 if (c == '.') {
684 fraction:
685 /* Fraction */
686 do {
687 c = tok_nextc(tok);
688 } while (isdigit(c));
690 if (c == 'e' || c == 'E') {
691 /* Exponent part */
692 c = tok_nextc(tok);
693 if (c == '+' || c == '-')
694 c = tok_nextc(tok);
695 while (isdigit(c)) {
696 c = tok_nextc(tok);
699 #ifndef WITHOUT_COMPLEX
700 if (c == 'j' || c == 'J')
701 /* Imaginary part */
702 imaginary:
703 c = tok_nextc(tok);
704 #endif
707 tok_backup(tok, c);
708 *p_start = tok->start;
709 *p_end = tok->cur;
710 return NUMBER;
713 letter_quote:
714 /* String */
715 if (c == '\'' || c == '"') {
716 int quote2 = tok->cur - tok->start + 1;
717 int quote = c;
718 int triple = 0;
719 int tripcount = 0;
720 for (;;) {
721 c = tok_nextc(tok);
722 if (c == '\n') {
723 if (!triple) {
724 tok->done = E_TOKEN;
725 tok_backup(tok, c);
726 return ERRORTOKEN;
728 tripcount = 0;
730 else if (c == EOF) {
731 tok->done = E_TOKEN;
732 tok->cur = tok->inp;
733 return ERRORTOKEN;
735 else if (c == quote) {
736 tripcount++;
737 if (tok->cur - tok->start == quote2) {
738 c = tok_nextc(tok);
739 if (c == quote) {
740 triple = 1;
741 tripcount = 0;
742 continue;
744 tok_backup(tok, c);
746 if (!triple || tripcount == 3)
747 break;
749 else if (c == '\\') {
750 tripcount = 0;
751 c = tok_nextc(tok);
752 if (c == EOF) {
753 tok->done = E_TOKEN;
754 tok->cur = tok->inp;
755 return ERRORTOKEN;
758 else
759 tripcount = 0;
761 *p_start = tok->start;
762 *p_end = tok->cur;
763 return STRING;
766 /* Line continuation */
767 if (c == '\\') {
768 c = tok_nextc(tok);
769 if (c != '\n') {
770 tok->done = E_TOKEN;
771 tok->cur = tok->inp;
772 return ERRORTOKEN;
774 goto again; /* Read next line */
777 /* Check for two-character token */
779 int c2 = tok_nextc(tok);
780 int token = PyToken_TwoChars(c, c2);
781 if (token != OP) {
782 *p_start = tok->start;
783 *p_end = tok->cur;
784 return token;
786 tok_backup(tok, c2);
789 /* Keep track of parentheses nesting level */
790 switch (c) {
791 case '(':
792 case '[':
793 case '{':
794 tok->level++;
795 break;
796 case ')':
797 case ']':
798 case '}':
799 tok->level--;
800 break;
803 /* Punctuation character */
804 *p_start = tok->start;
805 *p_end = tok->cur;
806 return PyToken_OneChar(c);
#ifdef Py_DEBUG

/* Debugging aid: print the name of token 'type' to stdout; for NAME,
   NUMBER, STRING and OP tokens the token text between start and end
   follows in parentheses.  No newline is written. */

void
tok_dump(type, start, end)
	int type;
	char *start, *end;
{
	printf("%s", _PyParser_TokenNames[type]);
	if (type == NAME || type == NUMBER || type == STRING || type == OP)
		printf("(%.*s)", (int)(end - start), start);
}

#endif