Use py_resource module
[python/dscho.git] / Parser / pgen.c
blobdb08d779698d314eac58153c52b306273ece963f
1 /***********************************************************
2 Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
3 The Netherlands.
5 All Rights Reserved
7 Permission to use, copy, modify, and distribute this software and its
8 documentation for any purpose and without fee is hereby granted,
9 provided that the above copyright notice appear in all copies and that
10 both that copyright notice and this permission notice appear in
11 supporting documentation, and that the names of Stichting Mathematisch
12 Centrum or CWI not be used in advertising or publicity pertaining to
13 distribution of the software without specific, written prior permission.
15 STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
16 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
18 FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
20 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
21 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
23 ******************************************************************/
25 /* Parser generator */
26 /* XXX This file is not yet fully PROTOized */
28 /* For a description, see the comments at end of this file */
30 #include "pgenheaders.h"
31 #include "assert.h"
32 #include "token.h"
33 #include "node.h"
34 #include "grammar.h"
35 #include "metagrammar.h"
36 #include "pgen.h"
38 extern int debugging;
41 /* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
43 typedef struct _nfaarc {
44 int ar_label;
45 int ar_arrow;
46 } nfaarc;
48 typedef struct _nfastate {
49 int st_narcs;
50 nfaarc *st_arc;
51 } nfastate;
53 typedef struct _nfa {
54 int nf_type;
55 char *nf_name;
56 int nf_nstates;
57 nfastate *nf_state;
58 int nf_start, nf_finish;
59 } nfa;
61 /* Forward */
62 static compile_rhs PROTO((labellist *ll, nfa *nf, node *n, int *pa, int *pb));
63 static compile_alt PROTO((labellist *ll, nfa *nf, node *n, int *pa, int *pb));
64 static compile_item PROTO((labellist *ll, nfa *nf, node *n, int *pa, int *pb));
65 static compile_atom PROTO((labellist *ll, nfa *nf, node *n, int *pa, int *pb));
67 static int
68 addnfastate(nf)
69 nfa *nf;
71 nfastate *st;
73 RESIZE(nf->nf_state, nfastate, nf->nf_nstates + 1);
74 if (nf->nf_state == NULL)
75 fatal("out of mem");
76 st = &nf->nf_state[nf->nf_nstates++];
77 st->st_narcs = 0;
78 st->st_arc = NULL;
79 return st - nf->nf_state;
82 static void
83 addnfaarc(nf, from, to, lbl)
84 nfa *nf;
85 int from, to, lbl;
87 nfastate *st;
88 nfaarc *ar;
90 st = &nf->nf_state[from];
91 RESIZE(st->st_arc, nfaarc, st->st_narcs + 1);
92 if (st->st_arc == NULL)
93 fatal("out of mem");
94 ar = &st->st_arc[st->st_narcs++];
95 ar->ar_label = lbl;
96 ar->ar_arrow = to;
99 static nfa *
100 newnfa(name)
101 char *name;
103 nfa *nf;
104 static type = NT_OFFSET; /* All types will be disjunct */
106 nf = NEW(nfa, 1);
107 if (nf == NULL)
108 fatal("no mem for new nfa");
109 nf->nf_type = type++;
110 nf->nf_name = name; /* XXX strdup(name) ??? */
111 nf->nf_nstates = 0;
112 nf->nf_state = NULL;
113 nf->nf_start = nf->nf_finish = -1;
114 return nf;
117 typedef struct _nfagrammar {
118 int gr_nnfas;
119 nfa **gr_nfa;
120 labellist gr_ll;
121 } nfagrammar;
123 /* Forward */
124 static compile_rule PROTO((nfagrammar *gr, node *n));
126 static nfagrammar *
127 newnfagrammar()
129 nfagrammar *gr;
131 gr = NEW(nfagrammar, 1);
132 if (gr == NULL)
133 fatal("no mem for new nfa grammar");
134 gr->gr_nnfas = 0;
135 gr->gr_nfa = NULL;
136 gr->gr_ll.ll_nlabels = 0;
137 gr->gr_ll.ll_label = NULL;
138 addlabel(&gr->gr_ll, ENDMARKER, "EMPTY");
139 return gr;
142 static nfa *
143 addnfa(gr, name)
144 nfagrammar *gr;
145 char *name;
147 nfa *nf;
149 nf = newnfa(name);
150 RESIZE(gr->gr_nfa, nfa *, gr->gr_nnfas + 1);
151 if (gr->gr_nfa == NULL)
152 fatal("out of mem");
153 gr->gr_nfa[gr->gr_nnfas++] = nf;
154 addlabel(&gr->gr_ll, NAME, nf->nf_name);
155 return nf;
158 #ifdef DEBUG
160 static char REQNFMT[] = "metacompile: less than %d children\n";
162 #define REQN(i, count) \
163 if (i < count) { \
164 fprintf(stderr, REQNFMT, count); \
165 fatal("REQN"); \
166 } else
168 #else
169 #define REQN(i, count) /* empty */
170 #endif
172 static nfagrammar *
173 metacompile(n)
174 node *n;
176 nfagrammar *gr;
177 int i;
179 printf("Compiling (meta-) parse tree into NFA grammar\n");
180 gr = newnfagrammar();
181 REQ(n, MSTART);
182 i = n->n_nchildren - 1; /* Last child is ENDMARKER */
183 n = n->n_child;
184 for (; --i >= 0; n++) {
185 if (n->n_type != NEWLINE)
186 compile_rule(gr, n);
188 return gr;
191 static
192 compile_rule(gr, n)
193 nfagrammar *gr;
194 node *n;
196 nfa *nf;
198 REQ(n, RULE);
199 REQN(n->n_nchildren, 4);
200 n = n->n_child;
201 REQ(n, NAME);
202 nf = addnfa(gr, n->n_str);
203 n++;
204 REQ(n, COLON);
205 n++;
206 REQ(n, RHS);
207 compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish);
208 n++;
209 REQ(n, NEWLINE);
212 static
213 compile_rhs(ll, nf, n, pa, pb)
214 labellist *ll;
215 nfa *nf;
216 node *n;
217 int *pa, *pb;
219 int i;
220 int a, b;
222 REQ(n, RHS);
223 i = n->n_nchildren;
224 REQN(i, 1);
225 n = n->n_child;
226 REQ(n, ALT);
227 compile_alt(ll, nf, n, pa, pb);
228 if (--i <= 0)
229 return;
230 n++;
231 a = *pa;
232 b = *pb;
233 *pa = addnfastate(nf);
234 *pb = addnfastate(nf);
235 addnfaarc(nf, *pa, a, EMPTY);
236 addnfaarc(nf, b, *pb, EMPTY);
237 for (; --i >= 0; n++) {
238 REQ(n, VBAR);
239 REQN(i, 1);
240 --i;
241 n++;
242 REQ(n, ALT);
243 compile_alt(ll, nf, n, &a, &b);
244 addnfaarc(nf, *pa, a, EMPTY);
245 addnfaarc(nf, b, *pb, EMPTY);
249 static
250 compile_alt(ll, nf, n, pa, pb)
251 labellist *ll;
252 nfa *nf;
253 node *n;
254 int *pa, *pb;
256 int i;
257 int a, b;
259 REQ(n, ALT);
260 i = n->n_nchildren;
261 REQN(i, 1);
262 n = n->n_child;
263 REQ(n, ITEM);
264 compile_item(ll, nf, n, pa, pb);
265 --i;
266 n++;
267 for (; --i >= 0; n++) {
268 if (n->n_type == COMMA) { /* XXX Temporary */
269 REQN(i, 1);
270 --i;
271 n++;
273 REQ(n, ITEM);
274 compile_item(ll, nf, n, &a, &b);
275 addnfaarc(nf, *pb, a, EMPTY);
276 *pb = b;
280 static
281 compile_item(ll, nf, n, pa, pb)
282 labellist *ll;
283 nfa *nf;
284 node *n;
285 int *pa, *pb;
287 int i;
288 int a, b;
290 REQ(n, ITEM);
291 i = n->n_nchildren;
292 REQN(i, 1);
293 n = n->n_child;
294 if (n->n_type == LSQB) {
295 REQN(i, 3);
296 n++;
297 REQ(n, RHS);
298 *pa = addnfastate(nf);
299 *pb = addnfastate(nf);
300 addnfaarc(nf, *pa, *pb, EMPTY);
301 compile_rhs(ll, nf, n, &a, &b);
302 addnfaarc(nf, *pa, a, EMPTY);
303 addnfaarc(nf, b, *pb, EMPTY);
304 REQN(i, 1);
305 n++;
306 REQ(n, RSQB);
308 else {
309 compile_atom(ll, nf, n, pa, pb);
310 if (--i <= 0)
311 return;
312 n++;
313 addnfaarc(nf, *pb, *pa, EMPTY);
314 if (n->n_type == STAR)
315 *pb = *pa;
316 else
317 REQ(n, PLUS);
321 static
322 compile_atom(ll, nf, n, pa, pb)
323 labellist *ll;
324 nfa *nf;
325 node *n;
326 int *pa, *pb;
328 int i;
330 REQ(n, ATOM);
331 i = n->n_nchildren;
332 REQN(i, 1);
333 n = n->n_child;
334 if (n->n_type == LPAR) {
335 REQN(i, 3);
336 n++;
337 REQ(n, RHS);
338 compile_rhs(ll, nf, n, pa, pb);
339 n++;
340 REQ(n, RPAR);
342 else if (n->n_type == NAME || n->n_type == STRING) {
343 *pa = addnfastate(nf);
344 *pb = addnfastate(nf);
345 addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str));
347 else
348 REQ(n, NAME);
351 static void
352 dumpstate(ll, nf, istate)
353 labellist *ll;
354 nfa *nf;
355 int istate;
357 nfastate *st;
358 int i;
359 nfaarc *ar;
361 printf("%c%2d%c",
362 istate == nf->nf_start ? '*' : ' ',
363 istate,
364 istate == nf->nf_finish ? '.' : ' ');
365 st = &nf->nf_state[istate];
366 ar = st->st_arc;
367 for (i = 0; i < st->st_narcs; i++) {
368 if (i > 0)
369 printf("\n ");
370 printf("-> %2d %s", ar->ar_arrow,
371 labelrepr(&ll->ll_label[ar->ar_label]));
372 ar++;
374 printf("\n");
377 static void
378 dumpnfa(ll, nf)
379 labellist *ll;
380 nfa *nf;
382 int i;
384 printf("NFA '%s' has %d states; start %d, finish %d\n",
385 nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish);
386 for (i = 0; i < nf->nf_nstates; i++)
387 dumpstate(ll, nf, i);
391 /* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
393 static void
394 addclosure(ss, nf, istate)
395 bitset ss;
396 nfa *nf;
397 int istate;
399 if (addbit(ss, istate)) {
400 nfastate *st = &nf->nf_state[istate];
401 nfaarc *ar = st->st_arc;
402 int i;
404 for (i = st->st_narcs; --i >= 0; ) {
405 if (ar->ar_label == EMPTY)
406 addclosure(ss, nf, ar->ar_arrow);
407 ar++;
412 typedef struct _ss_arc {
413 bitset sa_bitset;
414 int sa_arrow;
415 int sa_label;
416 } ss_arc;
418 typedef struct _ss_state {
419 bitset ss_ss;
420 int ss_narcs;
421 ss_arc *ss_arc;
422 int ss_deleted;
423 int ss_finish;
424 int ss_rename;
425 } ss_state;
427 typedef struct _ss_dfa {
428 int sd_nstates;
429 ss_state *sd_state;
430 } ss_dfa;
432 /* Forward */
433 static printssdfa PROTO((int xx_nstates, ss_state *xx_state, int nbits,
434 labellist *ll, char *msg));
435 static simplify PROTO((int xx_nstates, ss_state *xx_state));
436 static convert PROTO((dfa *d, int xx_nstates, ss_state *xx_state));
438 static
439 makedfa(gr, nf, d)
440 nfagrammar *gr;
441 nfa *nf;
442 dfa *d;
444 int nbits = nf->nf_nstates;
445 bitset ss;
446 int xx_nstates;
447 ss_state *xx_state, *yy;
448 ss_arc *zz;
449 int istate, jstate, iarc, jarc, ibit;
450 nfastate *st;
451 nfaarc *ar;
453 ss = newbitset(nbits);
454 addclosure(ss, nf, nf->nf_start);
455 xx_state = NEW(ss_state, 1);
456 if (xx_state == NULL)
457 fatal("no mem for xx_state in makedfa");
458 xx_nstates = 1;
459 yy = &xx_state[0];
460 yy->ss_ss = ss;
461 yy->ss_narcs = 0;
462 yy->ss_arc = NULL;
463 yy->ss_deleted = 0;
464 yy->ss_finish = testbit(ss, nf->nf_finish);
465 if (yy->ss_finish)
466 printf("Error: nonterminal '%s' may produce empty.\n",
467 nf->nf_name);
469 /* This algorithm is from a book written before
470 the invention of structured programming... */
472 /* For each unmarked state... */
473 for (istate = 0; istate < xx_nstates; ++istate) {
474 yy = &xx_state[istate];
475 ss = yy->ss_ss;
476 /* For all its states... */
477 for (ibit = 0; ibit < nf->nf_nstates; ++ibit) {
478 if (!testbit(ss, ibit))
479 continue;
480 st = &nf->nf_state[ibit];
481 /* For all non-empty arcs from this state... */
482 for (iarc = 0; iarc < st->st_narcs; iarc++) {
483 ar = &st->st_arc[iarc];
484 if (ar->ar_label == EMPTY)
485 continue;
486 /* Look up in list of arcs from this state */
487 for (jarc = 0; jarc < yy->ss_narcs; ++jarc) {
488 zz = &yy->ss_arc[jarc];
489 if (ar->ar_label == zz->sa_label)
490 goto found;
492 /* Add new arc for this state */
493 RESIZE(yy->ss_arc, ss_arc, yy->ss_narcs + 1);
494 if (yy->ss_arc == NULL)
495 fatal("out of mem");
496 zz = &yy->ss_arc[yy->ss_narcs++];
497 zz->sa_label = ar->ar_label;
498 zz->sa_bitset = newbitset(nbits);
499 zz->sa_arrow = -1;
500 found: ;
501 /* Add destination */
502 addclosure(zz->sa_bitset, nf, ar->ar_arrow);
505 /* Now look up all the arrow states */
506 for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) {
507 zz = &xx_state[istate].ss_arc[jarc];
508 for (jstate = 0; jstate < xx_nstates; jstate++) {
509 if (samebitset(zz->sa_bitset,
510 xx_state[jstate].ss_ss, nbits)) {
511 zz->sa_arrow = jstate;
512 goto done;
515 RESIZE(xx_state, ss_state, xx_nstates + 1);
516 if (xx_state == NULL)
517 fatal("out of mem");
518 zz->sa_arrow = xx_nstates;
519 yy = &xx_state[xx_nstates++];
520 yy->ss_ss = zz->sa_bitset;
521 yy->ss_narcs = 0;
522 yy->ss_arc = NULL;
523 yy->ss_deleted = 0;
524 yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish);
525 done: ;
529 if (debugging)
530 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
531 "before minimizing");
533 simplify(xx_nstates, xx_state);
535 if (debugging)
536 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
537 "after minimizing");
539 convert(d, xx_nstates, xx_state);
541 /* XXX cleanup */
544 static
545 printssdfa(xx_nstates, xx_state, nbits, ll, msg)
546 int xx_nstates;
547 ss_state *xx_state;
548 int nbits;
549 labellist *ll;
550 char *msg;
552 int i, ibit, iarc;
553 ss_state *yy;
554 ss_arc *zz;
556 printf("Subset DFA %s\n", msg);
557 for (i = 0; i < xx_nstates; i++) {
558 yy = &xx_state[i];
559 if (yy->ss_deleted)
560 continue;
561 printf(" Subset %d", i);
562 if (yy->ss_finish)
563 printf(" (finish)");
564 printf(" { ");
565 for (ibit = 0; ibit < nbits; ibit++) {
566 if (testbit(yy->ss_ss, ibit))
567 printf("%d ", ibit);
569 printf("}\n");
570 for (iarc = 0; iarc < yy->ss_narcs; iarc++) {
571 zz = &yy->ss_arc[iarc];
572 printf(" Arc to state %d, label %s\n",
573 zz->sa_arrow,
574 labelrepr(&ll->ll_label[zz->sa_label]));
580 /* PART THREE -- SIMPLIFY DFA */
582 /* Simplify the DFA by repeatedly eliminating states that are
583 equivalent to another oner. This is NOT Algorithm 3.3 from
584 [Aho&Ullman 77]. It does not always finds the minimal DFA,
585 but it does usually make a much smaller one... (For an example
586 of sub-optimal behaviour, try S: x a b+ | y a b+.)
589 static int
590 samestate(s1, s2)
591 ss_state *s1, *s2;
593 int i;
595 if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish)
596 return 0;
597 for (i = 0; i < s1->ss_narcs; i++) {
598 if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow ||
599 s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label)
600 return 0;
602 return 1;
605 static void
606 renamestates(xx_nstates, xx_state, from, to)
607 int xx_nstates;
608 ss_state *xx_state;
609 int from, to;
611 int i, j;
613 if (debugging)
614 printf("Rename state %d to %d.\n", from, to);
615 for (i = 0; i < xx_nstates; i++) {
616 if (xx_state[i].ss_deleted)
617 continue;
618 for (j = 0; j < xx_state[i].ss_narcs; j++) {
619 if (xx_state[i].ss_arc[j].sa_arrow == from)
620 xx_state[i].ss_arc[j].sa_arrow = to;
625 static
626 simplify(xx_nstates, xx_state)
627 int xx_nstates;
628 ss_state *xx_state;
630 int changes;
631 int i, j;
633 do {
634 changes = 0;
635 for (i = 1; i < xx_nstates; i++) {
636 if (xx_state[i].ss_deleted)
637 continue;
638 for (j = 0; j < i; j++) {
639 if (xx_state[j].ss_deleted)
640 continue;
641 if (samestate(&xx_state[i], &xx_state[j])) {
642 xx_state[i].ss_deleted++;
643 renamestates(xx_nstates, xx_state, i, j);
644 changes++;
645 break;
649 } while (changes);
653 /* PART FOUR -- GENERATE PARSING TABLES */
655 /* Convert the DFA into a grammar that can be used by our parser */
657 static
658 convert(d, xx_nstates, xx_state)
659 dfa *d;
660 int xx_nstates;
661 ss_state *xx_state;
663 int i, j;
664 ss_state *yy;
665 ss_arc *zz;
667 for (i = 0; i < xx_nstates; i++) {
668 yy = &xx_state[i];
669 if (yy->ss_deleted)
670 continue;
671 yy->ss_rename = addstate(d);
674 for (i = 0; i < xx_nstates; i++) {
675 yy = &xx_state[i];
676 if (yy->ss_deleted)
677 continue;
678 for (j = 0; j < yy->ss_narcs; j++) {
679 zz = &yy->ss_arc[j];
680 addarc(d, yy->ss_rename,
681 xx_state[zz->sa_arrow].ss_rename,
682 zz->sa_label);
684 if (yy->ss_finish)
685 addarc(d, yy->ss_rename, yy->ss_rename, 0);
688 d->d_initial = 0;
692 /* PART FIVE -- GLUE IT ALL TOGETHER */
694 static grammar *
695 maketables(gr)
696 nfagrammar *gr;
698 int i;
699 nfa *nf;
700 dfa *d;
701 grammar *g;
703 if (gr->gr_nnfas == 0)
704 return NULL;
705 g = newgrammar(gr->gr_nfa[0]->nf_type);
706 /* XXX first rule must be start rule */
707 g->g_ll = gr->gr_ll;
709 for (i = 0; i < gr->gr_nnfas; i++) {
710 nf = gr->gr_nfa[i];
711 if (debugging) {
712 printf("Dump of NFA for '%s' ...\n", nf->nf_name);
713 dumpnfa(&gr->gr_ll, nf);
715 printf("Making DFA for '%s' ...\n", nf->nf_name);
716 d = adddfa(g, nf->nf_type, nf->nf_name);
717 makedfa(gr, gr->gr_nfa[i], d);
720 return g;
723 grammar *
724 pgen(n)
725 node *n;
727 nfagrammar *gr;
728 grammar *g;
730 gr = metacompile(n);
731 g = maketables(gr);
732 translatelabels(g);
733 addfirstsets(g);
734 return g;
740 Description
741 -----------
743 Input is a grammar in extended BNF (using * for repetition, + for
744 at-least-once repetition, [] for optional parts, | for alternatives and
745 () for grouping). This has already been parsed and turned into a parse
746 tree.
748 Each rule is considered as a regular expression in its own right.
749 It is turned into a Non-deterministic Finite Automaton (NFA), which
750 is then turned into a Deterministic Finite Automaton (DFA), which is then
751 optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
752 or similar compiler books (this technique is more often used for lexical
753 analyzers).
755 The DFA's are used by the parser as parsing tables in a special way
756 that's probably unique. Before they are usable, the FIRST sets of all
757 non-terminals are computed.
759 Reference
760 ---------
762 [Aho&Ullman 77]
763 Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
764 (first edition)