Updated for 2.1a3
[python/dscho.git] / Parser / parser.c
blob6eaa925ef3c5da21780ff863d2d2eba04e31fa4c
2 /* Parser implementation */
4 /* For a description, see the comments at end of this file */
6 /* XXX To do: error recovery */
8 #include "pgenheaders.h"
9 #include "assert.h"
10 #include "token.h"
11 #include "grammar.h"
12 #include "node.h"
13 #include "parser.h"
14 #include "errcode.h"
17 #ifdef Py_DEBUG
18 extern int Py_DebugFlag;
19 #define D(x) if (!Py_DebugFlag); else x
20 #else
21 #define D(x)
22 #endif
25 /* STACK DATA TYPE */
27 static void s_reset(stack *);
29 static void
30 s_reset(stack *s)
32 s->s_top = &s->s_base[MAXSTACK];
35 #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
37 static int
38 s_push(register stack *s, dfa *d, node *parent)
40 register stackentry *top;
41 if (s->s_top == s->s_base) {
42 fprintf(stderr, "s_push: parser stack overflow\n");
43 return E_NOMEM;
45 top = --s->s_top;
46 top->s_dfa = d;
47 top->s_parent = parent;
48 top->s_state = 0;
49 return 0;
52 #ifdef Py_DEBUG
54 static void
55 s_pop(register stack *s)
57 if (s_empty(s))
58 Py_FatalError("s_pop: parser stack underflow -- FATAL");
59 s->s_top++;
62 #else /* !Py_DEBUG */
64 #define s_pop(s) (s)->s_top++
66 #endif
69 /* PARSER CREATION */
71 parser_state *
72 PyParser_New(grammar *g, int start)
74 parser_state *ps;
76 if (!g->g_accel)
77 PyGrammar_AddAccelerators(g);
78 ps = PyMem_NEW(parser_state, 1);
79 if (ps == NULL)
80 return NULL;
81 ps->p_grammar = g;
82 ps->p_tree = PyNode_New(start);
83 if (ps->p_tree == NULL) {
84 PyMem_DEL(ps);
85 return NULL;
87 s_reset(&ps->p_stack);
88 (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree);
89 return ps;
92 void
93 PyParser_Delete(parser_state *ps)
95 /* NB If you want to save the parse tree,
96 you must set p_tree to NULL before calling delparser! */
97 PyNode_Free(ps->p_tree);
98 PyMem_DEL(ps);
102 /* PARSER STACK OPERATIONS */
104 static int
105 shift(register stack *s, int type, char *str, int newstate, int lineno)
107 int err;
108 assert(!s_empty(s));
109 err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno);
110 if (err)
111 return err;
112 s->s_top->s_state = newstate;
113 return 0;
116 static int
117 push(register stack *s, int type, dfa *d, int newstate, int lineno)
119 int err;
120 register node *n;
121 n = s->s_top->s_parent;
122 assert(!s_empty(s));
123 err = PyNode_AddChild(n, type, (char *)NULL, lineno);
124 if (err)
125 return err;
126 s->s_top->s_state = newstate;
127 return s_push(s, d, CHILD(n, NCH(n)-1));
131 /* PARSER PROPER */
133 static int
134 classify(grammar *g, int type, char *str)
136 register int n = g->g_ll.ll_nlabels;
138 if (type == NAME) {
139 register char *s = str;
140 register label *l = g->g_ll.ll_label;
141 register int i;
142 for (i = n; i > 0; i--, l++) {
143 if (l->lb_type == NAME && l->lb_str != NULL &&
144 l->lb_str[0] == s[0] &&
145 strcmp(l->lb_str, s) == 0) {
146 D(printf("It's a keyword\n"));
147 return n - i;
153 register label *l = g->g_ll.ll_label;
154 register int i;
155 for (i = n; i > 0; i--, l++) {
156 if (l->lb_type == type && l->lb_str == NULL) {
157 D(printf("It's a token we know\n"));
158 return n - i;
163 D(printf("Illegal token\n"));
164 return -1;
168 PyParser_AddToken(register parser_state *ps, register int type, char *str,
169 int lineno, int *expected_ret)
171 register int ilabel;
172 int err;
174 D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));
176 /* Find out which label this token is */
177 ilabel = classify(ps->p_grammar, type, str);
178 if (ilabel < 0)
179 return E_SYNTAX;
181 /* Loop until the token is shifted or an error occurred */
182 for (;;) {
183 /* Fetch the current dfa and state */
184 register dfa *d = ps->p_stack.s_top->s_dfa;
185 register state *s = &d->d_state[ps->p_stack.s_top->s_state];
187 D(printf(" DFA '%s', state %d:",
188 d->d_name, ps->p_stack.s_top->s_state));
190 /* Check accelerator */
191 if (s->s_lower <= ilabel && ilabel < s->s_upper) {
192 register int x = s->s_accel[ilabel - s->s_lower];
193 if (x != -1) {
194 if (x & (1<<7)) {
195 /* Push non-terminal */
196 int nt = (x >> 8) + NT_OFFSET;
197 int arrow = x & ((1<<7)-1);
198 dfa *d1 = PyGrammar_FindDFA(
199 ps->p_grammar, nt);
200 if ((err = push(&ps->p_stack, nt, d1,
201 arrow, lineno)) > 0) {
202 D(printf(" MemError: push\n"));
203 return err;
205 D(printf(" Push ...\n"));
206 continue;
209 /* Shift the token */
210 if ((err = shift(&ps->p_stack, type, str,
211 x, lineno)) > 0) {
212 D(printf(" MemError: shift.\n"));
213 return err;
215 D(printf(" Shift.\n"));
216 /* Pop while we are in an accept-only state */
217 while (s = &d->d_state
218 [ps->p_stack.s_top->s_state],
219 s->s_accept && s->s_narcs == 1) {
220 D(printf(" Direct pop.\n"));
221 s_pop(&ps->p_stack);
222 if (s_empty(&ps->p_stack)) {
223 D(printf(" ACCEPT.\n"));
224 return E_DONE;
226 d = ps->p_stack.s_top->s_dfa;
228 return E_OK;
232 if (s->s_accept) {
233 /* Pop this dfa and try again */
234 s_pop(&ps->p_stack);
235 D(printf(" Pop ...\n"));
236 if (s_empty(&ps->p_stack)) {
237 D(printf(" Error: bottom of stack.\n"));
238 return E_SYNTAX;
240 continue;
243 /* Stuck, report syntax error */
244 D(printf(" Error.\n"));
245 if (expected_ret) {
246 if (s->s_lower == s->s_upper - 1) {
247 /* Only one possible expected token */
248 *expected_ret = ps->p_grammar->
249 g_ll.ll_label[s->s_lower].lb_type;
251 else
252 *expected_ret = -1;
254 return E_SYNTAX;
259 #ifdef Py_DEBUG
261 /* DEBUG OUTPUT */
263 void
264 dumptree(grammar *g, node *n)
266 int i;
268 if (n == NULL)
269 printf("NIL");
270 else {
271 label l;
272 l.lb_type = TYPE(n);
273 l.lb_str = STR(n);
274 printf("%s", PyGrammar_LabelRepr(&l));
275 if (ISNONTERMINAL(TYPE(n))) {
276 printf("(");
277 for (i = 0; i < NCH(n); i++) {
278 if (i > 0)
279 printf(",");
280 dumptree(g, CHILD(n, i));
282 printf(")");
287 void
288 showtree(grammar *g, node *n)
290 int i;
292 if (n == NULL)
293 return;
294 if (ISNONTERMINAL(TYPE(n))) {
295 for (i = 0; i < NCH(n); i++)
296 showtree(g, CHILD(n, i));
298 else if (ISTERMINAL(TYPE(n))) {
299 printf("%s", _PyParser_TokenNames[TYPE(n)]);
300 if (TYPE(n) == NUMBER || TYPE(n) == NAME)
301 printf("(%s)", STR(n));
302 printf(" ");
304 else
305 printf("? ");
308 void
309 printtree(parser_state *ps)
311 if (Py_DebugFlag) {
312 printf("Parse tree:\n");
313 dumptree(ps->p_grammar, ps->p_tree);
314 printf("\n");
315 printf("Tokens:\n");
316 showtree(ps->p_grammar, ps->p_tree);
317 printf("\n");
319 printf("Listing:\n");
320 PyNode_ListTree(ps->p_tree);
321 printf("\n");
324 #endif /* Py_DEBUG */
328 Description
329 -----------
331 The parser's interface is different than usual: the function addtoken()
332 must be called for each token in the input. This makes it possible to
333 turn it into an incremental parsing system later. The parsing system
334 constructs a parse tree as it goes.
336 A parsing rule is represented as a Deterministic Finite-state Automaton
337 (DFA). A node in a DFA represents a state of the parser; an arc represents
338 a transition. Transitions are either labeled with terminal symbols or
339 with non-terminals. When the parser decides to follow an arc labeled
340 with a non-terminal, it is invoked recursively with the DFA representing
341 the parsing rule for that as its initial state; when that DFA accepts,
342 the parser that invoked it continues. The parse tree constructed by the
343 recursively called parser is inserted as a child in the current parse tree.
345 The DFA's can be constructed automatically from a more conventional
346 language description. An extended LL(1) grammar (ELL(1)) is suitable.
347 Certain restrictions make the parser's life easier: rules that can produce
348 the empty string should be outlawed (there are other ways to put loops
349 or optional parts in the language). To avoid the need to construct
350 FIRST sets, we can require that all but the last alternative of a rule
351 (really: arc going out of a DFA's state) must begin with a terminal
352 symbol.
354 As an example, consider this grammar:
356 expr: term (OP term)*
357 term: CONSTANT | '(' expr ')'
359 The DFA corresponding to the rule for expr is:
361 ------->.---term-->.------->
364 \----OP----/
366 The parse tree generated for the input a+b is:
368 (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))