1 /***********************************************************
2 Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
7 Permission to use, copy, modify, and distribute this software and its
8 documentation for any purpose and without fee is hereby granted,
9 provided that the above copyright notice appear in all copies and that
10 both that copyright notice and this permission notice appear in
11 supporting documentation, and that the names of Stichting Mathematisch
12 Centrum or CWI not be used in advertising or publicity pertaining to
13 distribution of the software without specific, written prior permission.
15 STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
16 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
18 FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
20 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
21 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
23 ******************************************************************/
25 /* Parser generator */
26 /* XXX This file is not yet fully PROTOized */
28 /* For a description, see the comments at end of this file */
30 #include "pgenheaders.h"
35 #include "metagrammar.h"
41 /* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
43 typedef struct _nfaarc
{
48 typedef struct _nfastate
{
58 int nf_start
, nf_finish
;
62 static compile_rhs
PROTO((labellist
*ll
, nfa
*nf
, node
*n
, int *pa
, int *pb
));
63 static compile_alt
PROTO((labellist
*ll
, nfa
*nf
, node
*n
, int *pa
, int *pb
));
64 static compile_item
PROTO((labellist
*ll
, nfa
*nf
, node
*n
, int *pa
, int *pb
));
65 static compile_atom
PROTO((labellist
*ll
, nfa
*nf
, node
*n
, int *pa
, int *pb
));
73 RESIZE(nf
->nf_state
, nfastate
, nf
->nf_nstates
+ 1);
74 if (nf
->nf_state
== NULL
)
76 st
= &nf
->nf_state
[nf
->nf_nstates
++];
79 return st
- nf
->nf_state
;
83 addnfaarc(nf
, from
, to
, lbl
)
90 st
= &nf
->nf_state
[from
];
91 RESIZE(st
->st_arc
, nfaarc
, st
->st_narcs
+ 1);
92 if (st
->st_arc
== NULL
)
94 ar
= &st
->st_arc
[st
->st_narcs
++];
104 static type
= NT_OFFSET
; /* All types will be disjunct */
108 fatal("no mem for new nfa");
109 nf
->nf_type
= type
++;
110 nf
->nf_name
= name
; /* XXX strdup(name) ??? */
113 nf
->nf_start
= nf
->nf_finish
= -1;
117 typedef struct _nfagrammar
{
124 static compile_rule
PROTO((nfagrammar
*gr
, node
*n
));
131 gr
= NEW(nfagrammar
, 1);
133 fatal("no mem for new nfa grammar");
136 gr
->gr_ll
.ll_nlabels
= 0;
137 gr
->gr_ll
.ll_label
= NULL
;
138 addlabel(&gr
->gr_ll
, ENDMARKER
, "EMPTY");
150 RESIZE(gr
->gr_nfa
, nfa
*, gr
->gr_nnfas
+ 1);
151 if (gr
->gr_nfa
== NULL
)
153 gr
->gr_nfa
[gr
->gr_nnfas
++] = nf
;
154 addlabel(&gr
->gr_ll
, NAME
, nf
->nf_name
);
160 static char REQNFMT
[] = "metacompile: less than %d children\n";
162 #define REQN(i, count) \
164 fprintf(stderr, REQNFMT, count); \
169 #define REQN(i, count) /* empty */
179 printf("Compiling (meta-) parse tree into NFA grammar\n");
180 gr
= newnfagrammar();
182 i
= n
->n_nchildren
- 1; /* Last child is ENDMARKER */
184 for (; --i
>= 0; n
++) {
185 if (n
->n_type
!= NEWLINE
)
199 REQN(n
->n_nchildren
, 4);
202 nf
= addnfa(gr
, n
->n_str
);
207 compile_rhs(&gr
->gr_ll
, nf
, n
, &nf
->nf_start
, &nf
->nf_finish
);
213 compile_rhs(ll
, nf
, n
, pa
, pb
)
227 compile_alt(ll
, nf
, n
, pa
, pb
);
233 *pa
= addnfastate(nf
);
234 *pb
= addnfastate(nf
);
235 addnfaarc(nf
, *pa
, a
, EMPTY
);
236 addnfaarc(nf
, b
, *pb
, EMPTY
);
237 for (; --i
>= 0; n
++) {
243 compile_alt(ll
, nf
, n
, &a
, &b
);
244 addnfaarc(nf
, *pa
, a
, EMPTY
);
245 addnfaarc(nf
, b
, *pb
, EMPTY
);
250 compile_alt(ll
, nf
, n
, pa
, pb
)
264 compile_item(ll
, nf
, n
, pa
, pb
);
267 for (; --i
>= 0; n
++) {
268 if (n
->n_type
== COMMA
) { /* XXX Temporary */
274 compile_item(ll
, nf
, n
, &a
, &b
);
275 addnfaarc(nf
, *pb
, a
, EMPTY
);
281 compile_item(ll
, nf
, n
, pa
, pb
)
294 if (n
->n_type
== LSQB
) {
298 *pa
= addnfastate(nf
);
299 *pb
= addnfastate(nf
);
300 addnfaarc(nf
, *pa
, *pb
, EMPTY
);
301 compile_rhs(ll
, nf
, n
, &a
, &b
);
302 addnfaarc(nf
, *pa
, a
, EMPTY
);
303 addnfaarc(nf
, b
, *pb
, EMPTY
);
309 compile_atom(ll
, nf
, n
, pa
, pb
);
313 addnfaarc(nf
, *pb
, *pa
, EMPTY
);
314 if (n
->n_type
== STAR
)
322 compile_atom(ll
, nf
, n
, pa
, pb
)
334 if (n
->n_type
== LPAR
) {
338 compile_rhs(ll
, nf
, n
, pa
, pb
);
342 else if (n
->n_type
== NAME
|| n
->n_type
== STRING
) {
343 *pa
= addnfastate(nf
);
344 *pb
= addnfastate(nf
);
345 addnfaarc(nf
, *pa
, *pb
, addlabel(ll
, n
->n_type
, n
->n_str
));
352 dumpstate(ll
, nf
, istate
)
362 istate
== nf
->nf_start
? '*' : ' ',
364 istate
== nf
->nf_finish
? '.' : ' ');
365 st
= &nf
->nf_state
[istate
];
367 for (i
= 0; i
< st
->st_narcs
; i
++) {
370 printf("-> %2d %s", ar
->ar_arrow
,
371 labelrepr(&ll
->ll_label
[ar
->ar_label
]));
384 printf("NFA '%s' has %d states; start %d, finish %d\n",
385 nf
->nf_name
, nf
->nf_nstates
, nf
->nf_start
, nf
->nf_finish
);
386 for (i
= 0; i
< nf
->nf_nstates
; i
++)
387 dumpstate(ll
, nf
, i
);
391 /* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
394 addclosure(ss
, nf
, istate
)
399 if (addbit(ss
, istate
)) {
400 nfastate
*st
= &nf
->nf_state
[istate
];
401 nfaarc
*ar
= st
->st_arc
;
404 for (i
= st
->st_narcs
; --i
>= 0; ) {
405 if (ar
->ar_label
== EMPTY
)
406 addclosure(ss
, nf
, ar
->ar_arrow
);
412 typedef struct _ss_arc
{
418 typedef struct _ss_state
{
427 typedef struct _ss_dfa
{
433 static printssdfa
PROTO((int xx_nstates
, ss_state
*xx_state
, int nbits
,
434 labellist
*ll
, char *msg
));
435 static simplify
PROTO((int xx_nstates
, ss_state
*xx_state
));
436 static convert
PROTO((dfa
*d
, int xx_nstates
, ss_state
*xx_state
));
444 int nbits
= nf
->nf_nstates
;
447 ss_state
*xx_state
, *yy
;
449 int istate
, jstate
, iarc
, jarc
, ibit
;
453 ss
= newbitset(nbits
);
454 addclosure(ss
, nf
, nf
->nf_start
);
455 xx_state
= NEW(ss_state
, 1);
456 if (xx_state
== NULL
)
457 fatal("no mem for xx_state in makedfa");
464 yy
->ss_finish
= testbit(ss
, nf
->nf_finish
);
466 printf("Error: nonterminal '%s' may produce empty.\n",
469 /* This algorithm is from a book written before
470 the invention of structured programming... */
472 /* For each unmarked state... */
473 for (istate
= 0; istate
< xx_nstates
; ++istate
) {
474 yy
= &xx_state
[istate
];
476 /* For all its states... */
477 for (ibit
= 0; ibit
< nf
->nf_nstates
; ++ibit
) {
478 if (!testbit(ss
, ibit
))
480 st
= &nf
->nf_state
[ibit
];
481 /* For all non-empty arcs from this state... */
482 for (iarc
= 0; iarc
< st
->st_narcs
; iarc
++) {
483 ar
= &st
->st_arc
[iarc
];
484 if (ar
->ar_label
== EMPTY
)
486 /* Look up in list of arcs from this state */
487 for (jarc
= 0; jarc
< yy
->ss_narcs
; ++jarc
) {
488 zz
= &yy
->ss_arc
[jarc
];
489 if (ar
->ar_label
== zz
->sa_label
)
492 /* Add new arc for this state */
493 RESIZE(yy
->ss_arc
, ss_arc
, yy
->ss_narcs
+ 1);
494 if (yy
->ss_arc
== NULL
)
496 zz
= &yy
->ss_arc
[yy
->ss_narcs
++];
497 zz
->sa_label
= ar
->ar_label
;
498 zz
->sa_bitset
= newbitset(nbits
);
501 /* Add destination */
502 addclosure(zz
->sa_bitset
, nf
, ar
->ar_arrow
);
505 /* Now look up all the arrow states */
506 for (jarc
= 0; jarc
< xx_state
[istate
].ss_narcs
; jarc
++) {
507 zz
= &xx_state
[istate
].ss_arc
[jarc
];
508 for (jstate
= 0; jstate
< xx_nstates
; jstate
++) {
509 if (samebitset(zz
->sa_bitset
,
510 xx_state
[jstate
].ss_ss
, nbits
)) {
511 zz
->sa_arrow
= jstate
;
515 RESIZE(xx_state
, ss_state
, xx_nstates
+ 1);
516 if (xx_state
== NULL
)
518 zz
->sa_arrow
= xx_nstates
;
519 yy
= &xx_state
[xx_nstates
++];
520 yy
->ss_ss
= zz
->sa_bitset
;
524 yy
->ss_finish
= testbit(yy
->ss_ss
, nf
->nf_finish
);
530 printssdfa(xx_nstates
, xx_state
, nbits
, &gr
->gr_ll
,
531 "before minimizing");
533 simplify(xx_nstates
, xx_state
);
536 printssdfa(xx_nstates
, xx_state
, nbits
, &gr
->gr_ll
,
539 convert(d
, xx_nstates
, xx_state
);
545 printssdfa(xx_nstates
, xx_state
, nbits
, ll
, msg
)
556 printf("Subset DFA %s\n", msg
);
557 for (i
= 0; i
< xx_nstates
; i
++) {
561 printf(" Subset %d", i
);
565 for (ibit
= 0; ibit
< nbits
; ibit
++) {
566 if (testbit(yy
->ss_ss
, ibit
))
570 for (iarc
= 0; iarc
< yy
->ss_narcs
; iarc
++) {
571 zz
= &yy
->ss_arc
[iarc
];
572 printf(" Arc to state %d, label %s\n",
574 labelrepr(&ll
->ll_label
[zz
->sa_label
]));
580 /* PART THREE -- SIMPLIFY DFA */
582 /* Simplify the DFA by repeatedly eliminating states that are
583 equivalent to another oner. This is NOT Algorithm 3.3 from
584 [Aho&Ullman 77]. It does not always finds the minimal DFA,
585 but it does usually make a much smaller one... (For an example
586 of sub-optimal behaviour, try S: x a b+ | y a b+.)
595 if (s1
->ss_narcs
!= s2
->ss_narcs
|| s1
->ss_finish
!= s2
->ss_finish
)
597 for (i
= 0; i
< s1
->ss_narcs
; i
++) {
598 if (s1
->ss_arc
[i
].sa_arrow
!= s2
->ss_arc
[i
].sa_arrow
||
599 s1
->ss_arc
[i
].sa_label
!= s2
->ss_arc
[i
].sa_label
)
606 renamestates(xx_nstates
, xx_state
, from
, to
)
614 printf("Rename state %d to %d.\n", from
, to
);
615 for (i
= 0; i
< xx_nstates
; i
++) {
616 if (xx_state
[i
].ss_deleted
)
618 for (j
= 0; j
< xx_state
[i
].ss_narcs
; j
++) {
619 if (xx_state
[i
].ss_arc
[j
].sa_arrow
== from
)
620 xx_state
[i
].ss_arc
[j
].sa_arrow
= to
;
626 simplify(xx_nstates
, xx_state
)
635 for (i
= 1; i
< xx_nstates
; i
++) {
636 if (xx_state
[i
].ss_deleted
)
638 for (j
= 0; j
< i
; j
++) {
639 if (xx_state
[j
].ss_deleted
)
641 if (samestate(&xx_state
[i
], &xx_state
[j
])) {
642 xx_state
[i
].ss_deleted
++;
643 renamestates(xx_nstates
, xx_state
, i
, j
);
653 /* PART FOUR -- GENERATE PARSING TABLES */
655 /* Convert the DFA into a grammar that can be used by our parser */
658 convert(d
, xx_nstates
, xx_state
)
667 for (i
= 0; i
< xx_nstates
; i
++) {
671 yy
->ss_rename
= addstate(d
);
674 for (i
= 0; i
< xx_nstates
; i
++) {
678 for (j
= 0; j
< yy
->ss_narcs
; j
++) {
680 addarc(d
, yy
->ss_rename
,
681 xx_state
[zz
->sa_arrow
].ss_rename
,
685 addarc(d
, yy
->ss_rename
, yy
->ss_rename
, 0);
692 /* PART FIVE -- GLUE IT ALL TOGETHER */
703 if (gr
->gr_nnfas
== 0)
705 g
= newgrammar(gr
->gr_nfa
[0]->nf_type
);
706 /* XXX first rule must be start rule */
709 for (i
= 0; i
< gr
->gr_nnfas
; i
++) {
712 printf("Dump of NFA for '%s' ...\n", nf
->nf_name
);
713 dumpnfa(&gr
->gr_ll
, nf
);
715 printf("Making DFA for '%s' ...\n", nf
->nf_name
);
716 d
= adddfa(g
, nf
->nf_type
, nf
->nf_name
);
717 makedfa(gr
, gr
->gr_nfa
[i
], d
);
743 Input is a grammar in extended BNF (using * for repetition, + for
744 at-least-once repetition, [] for optional parts, | for alternatives and
745 () for grouping). This has already been parsed and turned into a parse
748 Each rule is considered as a regular expression in its own right.
749 It is turned into a Non-deterministic Finite Automaton (NFA), which
750 is then turned into a Deterministic Finite Automaton (DFA), which is then
751 optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
752 or similar compiler books (this technique is more often used for lexical
755 The DFA's are used by the parser as parsing tables in a special way
756 that's probably unique. Before they are usable, the FIRST sets of all
757 non-terminals are computed.
763 Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977