Added all documentation.
[python/dscho.git] / Parser / pgen.c
blobf2adae919cbe8889870e124b272fab4515213238
1 /***********************************************************
2 Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
3 The Netherlands.
5 All Rights Reserved
7 Permission to use, copy, modify, and distribute this software and its
8 documentation for any purpose and without fee is hereby granted,
9 provided that the above copyright notice appear in all copies and that
10 both that copyright notice and this permission notice appear in
11 supporting documentation, and that the names of Stichting Mathematisch
12 Centrum or CWI or Corporation for National Research Initiatives or
13 CNRI not be used in advertising or publicity pertaining to
14 distribution of the software without specific, written prior
15 permission.
17 While CWI is the initial source for this software, a modified version
18 is made available by the Corporation for National Research Initiatives
19 (CNRI) at the Internet address ftp://ftp.python.org.
21 STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
22 REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
23 MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
24 CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
25 DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
26 PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
27 TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
28 PERFORMANCE OF THIS SOFTWARE.
30 ******************************************************************/
32 /* Parser generator */
33 /* XXX This file is not yet fully PROTOized */
35 /* For a description, see the comments at end of this file */
37 #include "pgenheaders.h"
38 #include "assert.h"
39 #include "token.h"
40 #include "node.h"
41 #include "grammar.h"
42 #include "metagrammar.h"
43 #include "pgen.h"
45 extern int Py_DebugFlag;
48 /* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
50 typedef struct _nfaarc {
51 int ar_label;
52 int ar_arrow;
53 } nfaarc;
55 typedef struct _nfastate {
56 int st_narcs;
57 nfaarc *st_arc;
58 } nfastate;
60 typedef struct _nfa {
61 int nf_type;
62 char *nf_name;
63 int nf_nstates;
64 nfastate *nf_state;
65 int nf_start, nf_finish;
66 } nfa;
68 /* Forward */
69 static void compile_rhs Py_PROTO((labellist *ll,
70 nfa *nf, node *n, int *pa, int *pb));
71 static void compile_alt Py_PROTO((labellist *ll,
72 nfa *nf, node *n, int *pa, int *pb));
73 static void compile_item Py_PROTO((labellist *ll,
74 nfa *nf, node *n, int *pa, int *pb));
75 static void compile_atom Py_PROTO((labellist *ll,
76 nfa *nf, node *n, int *pa, int *pb));
78 static int
79 addnfastate(nf)
80 nfa *nf;
82 nfastate *st;
84 PyMem_RESIZE(nf->nf_state, nfastate, nf->nf_nstates + 1);
85 if (nf->nf_state == NULL)
86 Py_FatalError("out of mem");
87 st = &nf->nf_state[nf->nf_nstates++];
88 st->st_narcs = 0;
89 st->st_arc = NULL;
90 return st - nf->nf_state;
93 static void
94 addnfaarc(nf, from, to, lbl)
95 nfa *nf;
96 int from, to, lbl;
98 nfastate *st;
99 nfaarc *ar;
101 st = &nf->nf_state[from];
102 PyMem_RESIZE(st->st_arc, nfaarc, st->st_narcs + 1);
103 if (st->st_arc == NULL)
104 Py_FatalError("out of mem");
105 ar = &st->st_arc[st->st_narcs++];
106 ar->ar_label = lbl;
107 ar->ar_arrow = to;
110 static nfa *
111 newnfa(name)
112 char *name;
114 nfa *nf;
115 static int type = NT_OFFSET; /* All types will be disjunct */
117 nf = PyMem_NEW(nfa, 1);
118 if (nf == NULL)
119 Py_FatalError("no mem for new nfa");
120 nf->nf_type = type++;
121 nf->nf_name = name; /* XXX strdup(name) ??? */
122 nf->nf_nstates = 0;
123 nf->nf_state = NULL;
124 nf->nf_start = nf->nf_finish = -1;
125 return nf;
128 typedef struct _nfagrammar {
129 int gr_nnfas;
130 nfa **gr_nfa;
131 labellist gr_ll;
132 } nfagrammar;
134 /* Forward */
135 static void compile_rule Py_PROTO((nfagrammar *gr, node *n));
137 static nfagrammar *
138 newnfagrammar()
140 nfagrammar *gr;
142 gr = PyMem_NEW(nfagrammar, 1);
143 if (gr == NULL)
144 Py_FatalError("no mem for new nfa grammar");
145 gr->gr_nnfas = 0;
146 gr->gr_nfa = NULL;
147 gr->gr_ll.ll_nlabels = 0;
148 gr->gr_ll.ll_label = NULL;
149 addlabel(&gr->gr_ll, ENDMARKER, "EMPTY");
150 return gr;
153 static nfa *
154 addnfa(gr, name)
155 nfagrammar *gr;
156 char *name;
158 nfa *nf;
160 nf = newnfa(name);
161 PyMem_RESIZE(gr->gr_nfa, nfa *, gr->gr_nnfas + 1);
162 if (gr->gr_nfa == NULL)
163 Py_FatalError("out of mem");
164 gr->gr_nfa[gr->gr_nnfas++] = nf;
165 addlabel(&gr->gr_ll, NAME, nf->nf_name);
166 return nf;
169 #ifdef Py_DEBUG
171 static char REQNFMT[] = "metacompile: less than %d children\n";
173 #define REQN(i, count) \
174 if (i < count) { \
175 fprintf(stderr, REQNFMT, count); \
176 Py_FatalError("REQN"); \
177 } else
179 #else
180 #define REQN(i, count) /* empty */
181 #endif
183 static nfagrammar *
184 metacompile(n)
185 node *n;
187 nfagrammar *gr;
188 int i;
190 printf("Compiling (meta-) parse tree into NFA grammar\n");
191 gr = newnfagrammar();
192 REQ(n, MSTART);
193 i = n->n_nchildren - 1; /* Last child is ENDMARKER */
194 n = n->n_child;
195 for (; --i >= 0; n++) {
196 if (n->n_type != NEWLINE)
197 compile_rule(gr, n);
199 return gr;
202 static void
203 compile_rule(gr, n)
204 nfagrammar *gr;
205 node *n;
207 nfa *nf;
209 REQ(n, RULE);
210 REQN(n->n_nchildren, 4);
211 n = n->n_child;
212 REQ(n, NAME);
213 nf = addnfa(gr, n->n_str);
214 n++;
215 REQ(n, COLON);
216 n++;
217 REQ(n, RHS);
218 compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish);
219 n++;
220 REQ(n, NEWLINE);
223 static void
224 compile_rhs(ll, nf, n, pa, pb)
225 labellist *ll;
226 nfa *nf;
227 node *n;
228 int *pa, *pb;
230 int i;
231 int a, b;
233 REQ(n, RHS);
234 i = n->n_nchildren;
235 REQN(i, 1);
236 n = n->n_child;
237 REQ(n, ALT);
238 compile_alt(ll, nf, n, pa, pb);
239 if (--i <= 0)
240 return;
241 n++;
242 a = *pa;
243 b = *pb;
244 *pa = addnfastate(nf);
245 *pb = addnfastate(nf);
246 addnfaarc(nf, *pa, a, EMPTY);
247 addnfaarc(nf, b, *pb, EMPTY);
248 for (; --i >= 0; n++) {
249 REQ(n, VBAR);
250 REQN(i, 1);
251 --i;
252 n++;
253 REQ(n, ALT);
254 compile_alt(ll, nf, n, &a, &b);
255 addnfaarc(nf, *pa, a, EMPTY);
256 addnfaarc(nf, b, *pb, EMPTY);
260 static void
261 compile_alt(ll, nf, n, pa, pb)
262 labellist *ll;
263 nfa *nf;
264 node *n;
265 int *pa, *pb;
267 int i;
268 int a, b;
270 REQ(n, ALT);
271 i = n->n_nchildren;
272 REQN(i, 1);
273 n = n->n_child;
274 REQ(n, ITEM);
275 compile_item(ll, nf, n, pa, pb);
276 --i;
277 n++;
278 for (; --i >= 0; n++) {
279 if (n->n_type == COMMA) { /* XXX Temporary */
280 REQN(i, 1);
281 --i;
282 n++;
284 REQ(n, ITEM);
285 compile_item(ll, nf, n, &a, &b);
286 addnfaarc(nf, *pb, a, EMPTY);
287 *pb = b;
291 static void
292 compile_item(ll, nf, n, pa, pb)
293 labellist *ll;
294 nfa *nf;
295 node *n;
296 int *pa, *pb;
298 int i;
299 int a, b;
301 REQ(n, ITEM);
302 i = n->n_nchildren;
303 REQN(i, 1);
304 n = n->n_child;
305 if (n->n_type == LSQB) {
306 REQN(i, 3);
307 n++;
308 REQ(n, RHS);
309 *pa = addnfastate(nf);
310 *pb = addnfastate(nf);
311 addnfaarc(nf, *pa, *pb, EMPTY);
312 compile_rhs(ll, nf, n, &a, &b);
313 addnfaarc(nf, *pa, a, EMPTY);
314 addnfaarc(nf, b, *pb, EMPTY);
315 REQN(i, 1);
316 n++;
317 REQ(n, RSQB);
319 else {
320 compile_atom(ll, nf, n, pa, pb);
321 if (--i <= 0)
322 return;
323 n++;
324 addnfaarc(nf, *pb, *pa, EMPTY);
325 if (n->n_type == STAR)
326 *pb = *pa;
327 else
328 REQ(n, PLUS);
332 static void
333 compile_atom(ll, nf, n, pa, pb)
334 labellist *ll;
335 nfa *nf;
336 node *n;
337 int *pa, *pb;
339 int i;
341 REQ(n, ATOM);
342 i = n->n_nchildren;
343 REQN(i, 1);
344 n = n->n_child;
345 if (n->n_type == LPAR) {
346 REQN(i, 3);
347 n++;
348 REQ(n, RHS);
349 compile_rhs(ll, nf, n, pa, pb);
350 n++;
351 REQ(n, RPAR);
353 else if (n->n_type == NAME || n->n_type == STRING) {
354 *pa = addnfastate(nf);
355 *pb = addnfastate(nf);
356 addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str));
358 else
359 REQ(n, NAME);
362 static void
363 dumpstate(ll, nf, istate)
364 labellist *ll;
365 nfa *nf;
366 int istate;
368 nfastate *st;
369 int i;
370 nfaarc *ar;
372 printf("%c%2d%c",
373 istate == nf->nf_start ? '*' : ' ',
374 istate,
375 istate == nf->nf_finish ? '.' : ' ');
376 st = &nf->nf_state[istate];
377 ar = st->st_arc;
378 for (i = 0; i < st->st_narcs; i++) {
379 if (i > 0)
380 printf("\n ");
381 printf("-> %2d %s", ar->ar_arrow,
382 PyGrammar_LabelRepr(&ll->ll_label[ar->ar_label]));
383 ar++;
385 printf("\n");
388 static void
389 dumpnfa(ll, nf)
390 labellist *ll;
391 nfa *nf;
393 int i;
395 printf("NFA '%s' has %d states; start %d, finish %d\n",
396 nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish);
397 for (i = 0; i < nf->nf_nstates; i++)
398 dumpstate(ll, nf, i);
402 /* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
404 static void
405 addclosure(ss, nf, istate)
406 bitset ss;
407 nfa *nf;
408 int istate;
410 if (addbit(ss, istate)) {
411 nfastate *st = &nf->nf_state[istate];
412 nfaarc *ar = st->st_arc;
413 int i;
415 for (i = st->st_narcs; --i >= 0; ) {
416 if (ar->ar_label == EMPTY)
417 addclosure(ss, nf, ar->ar_arrow);
418 ar++;
423 typedef struct _ss_arc {
424 bitset sa_bitset;
425 int sa_arrow;
426 int sa_label;
427 } ss_arc;
429 typedef struct _ss_state {
430 bitset ss_ss;
431 int ss_narcs;
432 ss_arc *ss_arc;
433 int ss_deleted;
434 int ss_finish;
435 int ss_rename;
436 } ss_state;
438 typedef struct _ss_dfa {
439 int sd_nstates;
440 ss_state *sd_state;
441 } ss_dfa;
443 /* Forward */
444 static void printssdfa Py_PROTO((int xx_nstates, ss_state *xx_state, int nbits,
445 labellist *ll, char *msg));
446 static void simplify Py_PROTO((int xx_nstates, ss_state *xx_state));
447 static void convert Py_PROTO((dfa *d, int xx_nstates, ss_state *xx_state));
449 static void
450 makedfa(gr, nf, d)
451 nfagrammar *gr;
452 nfa *nf;
453 dfa *d;
455 int nbits = nf->nf_nstates;
456 bitset ss;
457 int xx_nstates;
458 ss_state *xx_state, *yy;
459 ss_arc *zz;
460 int istate, jstate, iarc, jarc, ibit;
461 nfastate *st;
462 nfaarc *ar;
464 ss = newbitset(nbits);
465 addclosure(ss, nf, nf->nf_start);
466 xx_state = PyMem_NEW(ss_state, 1);
467 if (xx_state == NULL)
468 Py_FatalError("no mem for xx_state in makedfa");
469 xx_nstates = 1;
470 yy = &xx_state[0];
471 yy->ss_ss = ss;
472 yy->ss_narcs = 0;
473 yy->ss_arc = NULL;
474 yy->ss_deleted = 0;
475 yy->ss_finish = testbit(ss, nf->nf_finish);
476 if (yy->ss_finish)
477 printf("Error: nonterminal '%s' may produce empty.\n",
478 nf->nf_name);
480 /* This algorithm is from a book written before
481 the invention of structured programming... */
483 /* For each unmarked state... */
484 for (istate = 0; istate < xx_nstates; ++istate) {
485 yy = &xx_state[istate];
486 ss = yy->ss_ss;
487 /* For all its states... */
488 for (ibit = 0; ibit < nf->nf_nstates; ++ibit) {
489 if (!testbit(ss, ibit))
490 continue;
491 st = &nf->nf_state[ibit];
492 /* For all non-empty arcs from this state... */
493 for (iarc = 0; iarc < st->st_narcs; iarc++) {
494 ar = &st->st_arc[iarc];
495 if (ar->ar_label == EMPTY)
496 continue;
497 /* Look up in list of arcs from this state */
498 for (jarc = 0; jarc < yy->ss_narcs; ++jarc) {
499 zz = &yy->ss_arc[jarc];
500 if (ar->ar_label == zz->sa_label)
501 goto found;
503 /* Add new arc for this state */
504 PyMem_RESIZE(yy->ss_arc, ss_arc,
505 yy->ss_narcs + 1);
506 if (yy->ss_arc == NULL)
507 Py_FatalError("out of mem");
508 zz = &yy->ss_arc[yy->ss_narcs++];
509 zz->sa_label = ar->ar_label;
510 zz->sa_bitset = newbitset(nbits);
511 zz->sa_arrow = -1;
512 found: ;
513 /* Add destination */
514 addclosure(zz->sa_bitset, nf, ar->ar_arrow);
517 /* Now look up all the arrow states */
518 for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) {
519 zz = &xx_state[istate].ss_arc[jarc];
520 for (jstate = 0; jstate < xx_nstates; jstate++) {
521 if (samebitset(zz->sa_bitset,
522 xx_state[jstate].ss_ss, nbits)) {
523 zz->sa_arrow = jstate;
524 goto done;
527 PyMem_RESIZE(xx_state, ss_state, xx_nstates + 1);
528 if (xx_state == NULL)
529 Py_FatalError("out of mem");
530 zz->sa_arrow = xx_nstates;
531 yy = &xx_state[xx_nstates++];
532 yy->ss_ss = zz->sa_bitset;
533 yy->ss_narcs = 0;
534 yy->ss_arc = NULL;
535 yy->ss_deleted = 0;
536 yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish);
537 done: ;
541 if (Py_DebugFlag)
542 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
543 "before minimizing");
545 simplify(xx_nstates, xx_state);
547 if (Py_DebugFlag)
548 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
549 "after minimizing");
551 convert(d, xx_nstates, xx_state);
553 /* XXX cleanup */
556 static void
557 printssdfa(xx_nstates, xx_state, nbits, ll, msg)
558 int xx_nstates;
559 ss_state *xx_state;
560 int nbits;
561 labellist *ll;
562 char *msg;
564 int i, ibit, iarc;
565 ss_state *yy;
566 ss_arc *zz;
568 printf("Subset DFA %s\n", msg);
569 for (i = 0; i < xx_nstates; i++) {
570 yy = &xx_state[i];
571 if (yy->ss_deleted)
572 continue;
573 printf(" Subset %d", i);
574 if (yy->ss_finish)
575 printf(" (finish)");
576 printf(" { ");
577 for (ibit = 0; ibit < nbits; ibit++) {
578 if (testbit(yy->ss_ss, ibit))
579 printf("%d ", ibit);
581 printf("}\n");
582 for (iarc = 0; iarc < yy->ss_narcs; iarc++) {
583 zz = &yy->ss_arc[iarc];
584 printf(" Arc to state %d, label %s\n",
585 zz->sa_arrow,
586 PyGrammar_LabelRepr(
587 &ll->ll_label[zz->sa_label]));
593 /* PART THREE -- SIMPLIFY DFA */
595 /* Simplify the DFA by repeatedly eliminating states that are
596 equivalent to another oner. This is NOT Algorithm 3.3 from
597 [Aho&Ullman 77]. It does not always finds the minimal DFA,
598 but it does usually make a much smaller one... (For an example
599 of sub-optimal behaviour, try S: x a b+ | y a b+.)
602 static int
603 samestate(s1, s2)
604 ss_state *s1, *s2;
606 int i;
608 if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish)
609 return 0;
610 for (i = 0; i < s1->ss_narcs; i++) {
611 if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow ||
612 s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label)
613 return 0;
615 return 1;
618 static void
619 renamestates(xx_nstates, xx_state, from, to)
620 int xx_nstates;
621 ss_state *xx_state;
622 int from, to;
624 int i, j;
626 if (Py_DebugFlag)
627 printf("Rename state %d to %d.\n", from, to);
628 for (i = 0; i < xx_nstates; i++) {
629 if (xx_state[i].ss_deleted)
630 continue;
631 for (j = 0; j < xx_state[i].ss_narcs; j++) {
632 if (xx_state[i].ss_arc[j].sa_arrow == from)
633 xx_state[i].ss_arc[j].sa_arrow = to;
638 static void
639 simplify(xx_nstates, xx_state)
640 int xx_nstates;
641 ss_state *xx_state;
643 int changes;
644 int i, j;
646 do {
647 changes = 0;
648 for (i = 1; i < xx_nstates; i++) {
649 if (xx_state[i].ss_deleted)
650 continue;
651 for (j = 0; j < i; j++) {
652 if (xx_state[j].ss_deleted)
653 continue;
654 if (samestate(&xx_state[i], &xx_state[j])) {
655 xx_state[i].ss_deleted++;
656 renamestates(xx_nstates, xx_state,
657 i, j);
658 changes++;
659 break;
663 } while (changes);
667 /* PART FOUR -- GENERATE PARSING TABLES */
669 /* Convert the DFA into a grammar that can be used by our parser */
671 static void
672 convert(d, xx_nstates, xx_state)
673 dfa *d;
674 int xx_nstates;
675 ss_state *xx_state;
677 int i, j;
678 ss_state *yy;
679 ss_arc *zz;
681 for (i = 0; i < xx_nstates; i++) {
682 yy = &xx_state[i];
683 if (yy->ss_deleted)
684 continue;
685 yy->ss_rename = addstate(d);
688 for (i = 0; i < xx_nstates; i++) {
689 yy = &xx_state[i];
690 if (yy->ss_deleted)
691 continue;
692 for (j = 0; j < yy->ss_narcs; j++) {
693 zz = &yy->ss_arc[j];
694 addarc(d, yy->ss_rename,
695 xx_state[zz->sa_arrow].ss_rename,
696 zz->sa_label);
698 if (yy->ss_finish)
699 addarc(d, yy->ss_rename, yy->ss_rename, 0);
702 d->d_initial = 0;
706 /* PART FIVE -- GLUE IT ALL TOGETHER */
708 static grammar *
709 maketables(gr)
710 nfagrammar *gr;
712 int i;
713 nfa *nf;
714 dfa *d;
715 grammar *g;
717 if (gr->gr_nnfas == 0)
718 return NULL;
719 g = newgrammar(gr->gr_nfa[0]->nf_type);
720 /* XXX first rule must be start rule */
721 g->g_ll = gr->gr_ll;
723 for (i = 0; i < gr->gr_nnfas; i++) {
724 nf = gr->gr_nfa[i];
725 if (Py_DebugFlag) {
726 printf("Dump of NFA for '%s' ...\n", nf->nf_name);
727 dumpnfa(&gr->gr_ll, nf);
729 printf("Making DFA for '%s' ...\n", nf->nf_name);
730 d = adddfa(g, nf->nf_type, nf->nf_name);
731 makedfa(gr, gr->gr_nfa[i], d);
734 return g;
737 grammar *
738 pgen(n)
739 node *n;
741 nfagrammar *gr;
742 grammar *g;
744 gr = metacompile(n);
745 g = maketables(gr);
746 translatelabels(g);
747 addfirstsets(g);
748 return g;
754 Description
755 -----------
757 Input is a grammar in extended BNF (using * for repetition, + for
758 at-least-once repetition, [] for optional parts, | for alternatives and
759 () for grouping). This has already been parsed and turned into a parse
760 tree.
762 Each rule is considered as a regular expression in its own right.
763 It is turned into a Non-deterministic Finite Automaton (NFA), which
764 is then turned into a Deterministic Finite Automaton (DFA), which is then
765 optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
766 or similar compiler books (this technique is more often used for lexical
767 analyzers).
769 The DFA's are used by the parser as parsing tables in a special way
770 that's probably unique. Before they are usable, the FIRST sets of all
771 non-terminals are computed.
773 Reference
774 ---------
776 [Aho&Ullman 77]
777 Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
778 (first edition)