1 /***********************************************************
2 Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
7 Permission to use, copy, modify, and distribute this software and its
8 documentation for any purpose and without fee is hereby granted,
9 provided that the above copyright notice appear in all copies and that
10 both that copyright notice and this permission notice appear in
11 supporting documentation, and that the names of Stichting Mathematisch
12 Centrum or CWI or Corporation for National Research Initiatives or
13 CNRI not be used in advertising or publicity pertaining to
14 distribution of the software without specific, written prior
17 While CWI is the initial source for this software, a modified version
18 is made available by the Corporation for National Research Initiatives
19 (CNRI) at the Internet address ftp://ftp.python.org.
21 STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
22 REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
23 MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
24 CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
25 DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
26 PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
27 TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
28 PERFORMANCE OF THIS SOFTWARE.
30 ******************************************************************/
32 /* Parser generator */
33 /* XXX This file is not yet fully PROTOized */
35 /* For a description, see the comments at end of this file */
37 #include "pgenheaders.h"
42 #include "metagrammar.h"
45 extern int Py_DebugFlag
;
48 /* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
50 typedef struct _nfaarc
{
55 typedef struct _nfastate
{
65 int nf_start
, nf_finish
;
69 static void compile_rhs
Py_PROTO((labellist
*ll
,
70 nfa
*nf
, node
*n
, int *pa
, int *pb
));
71 static void compile_alt
Py_PROTO((labellist
*ll
,
72 nfa
*nf
, node
*n
, int *pa
, int *pb
));
73 static void compile_item
Py_PROTO((labellist
*ll
,
74 nfa
*nf
, node
*n
, int *pa
, int *pb
));
75 static void compile_atom
Py_PROTO((labellist
*ll
,
76 nfa
*nf
, node
*n
, int *pa
, int *pb
));
84 PyMem_RESIZE(nf
->nf_state
, nfastate
, nf
->nf_nstates
+ 1);
85 if (nf
->nf_state
== NULL
)
86 Py_FatalError("out of mem");
87 st
= &nf
->nf_state
[nf
->nf_nstates
++];
90 return st
- nf
->nf_state
;
94 addnfaarc(nf
, from
, to
, lbl
)
101 st
= &nf
->nf_state
[from
];
102 PyMem_RESIZE(st
->st_arc
, nfaarc
, st
->st_narcs
+ 1);
103 if (st
->st_arc
== NULL
)
104 Py_FatalError("out of mem");
105 ar
= &st
->st_arc
[st
->st_narcs
++];
115 static int type
= NT_OFFSET
; /* All types will be disjunct */
117 nf
= PyMem_NEW(nfa
, 1);
119 Py_FatalError("no mem for new nfa");
120 nf
->nf_type
= type
++;
121 nf
->nf_name
= name
; /* XXX strdup(name) ??? */
124 nf
->nf_start
= nf
->nf_finish
= -1;
128 typedef struct _nfagrammar
{
135 static void compile_rule
Py_PROTO((nfagrammar
*gr
, node
*n
));
142 gr
= PyMem_NEW(nfagrammar
, 1);
144 Py_FatalError("no mem for new nfa grammar");
147 gr
->gr_ll
.ll_nlabels
= 0;
148 gr
->gr_ll
.ll_label
= NULL
;
149 addlabel(&gr
->gr_ll
, ENDMARKER
, "EMPTY");
161 PyMem_RESIZE(gr
->gr_nfa
, nfa
*, gr
->gr_nnfas
+ 1);
162 if (gr
->gr_nfa
== NULL
)
163 Py_FatalError("out of mem");
164 gr
->gr_nfa
[gr
->gr_nnfas
++] = nf
;
165 addlabel(&gr
->gr_ll
, NAME
, nf
->nf_name
);
171 static char REQNFMT
[] = "metacompile: less than %d children\n";
173 #define REQN(i, count) \
175 fprintf(stderr, REQNFMT, count); \
176 Py_FatalError("REQN"); \
180 #define REQN(i, count) /* empty */
190 printf("Compiling (meta-) parse tree into NFA grammar\n");
191 gr
= newnfagrammar();
193 i
= n
->n_nchildren
- 1; /* Last child is ENDMARKER */
195 for (; --i
>= 0; n
++) {
196 if (n
->n_type
!= NEWLINE
)
210 REQN(n
->n_nchildren
, 4);
213 nf
= addnfa(gr
, n
->n_str
);
218 compile_rhs(&gr
->gr_ll
, nf
, n
, &nf
->nf_start
, &nf
->nf_finish
);
224 compile_rhs(ll
, nf
, n
, pa
, pb
)
238 compile_alt(ll
, nf
, n
, pa
, pb
);
244 *pa
= addnfastate(nf
);
245 *pb
= addnfastate(nf
);
246 addnfaarc(nf
, *pa
, a
, EMPTY
);
247 addnfaarc(nf
, b
, *pb
, EMPTY
);
248 for (; --i
>= 0; n
++) {
254 compile_alt(ll
, nf
, n
, &a
, &b
);
255 addnfaarc(nf
, *pa
, a
, EMPTY
);
256 addnfaarc(nf
, b
, *pb
, EMPTY
);
261 compile_alt(ll
, nf
, n
, pa
, pb
)
275 compile_item(ll
, nf
, n
, pa
, pb
);
278 for (; --i
>= 0; n
++) {
279 if (n
->n_type
== COMMA
) { /* XXX Temporary */
285 compile_item(ll
, nf
, n
, &a
, &b
);
286 addnfaarc(nf
, *pb
, a
, EMPTY
);
292 compile_item(ll
, nf
, n
, pa
, pb
)
305 if (n
->n_type
== LSQB
) {
309 *pa
= addnfastate(nf
);
310 *pb
= addnfastate(nf
);
311 addnfaarc(nf
, *pa
, *pb
, EMPTY
);
312 compile_rhs(ll
, nf
, n
, &a
, &b
);
313 addnfaarc(nf
, *pa
, a
, EMPTY
);
314 addnfaarc(nf
, b
, *pb
, EMPTY
);
320 compile_atom(ll
, nf
, n
, pa
, pb
);
324 addnfaarc(nf
, *pb
, *pa
, EMPTY
);
325 if (n
->n_type
== STAR
)
333 compile_atom(ll
, nf
, n
, pa
, pb
)
345 if (n
->n_type
== LPAR
) {
349 compile_rhs(ll
, nf
, n
, pa
, pb
);
353 else if (n
->n_type
== NAME
|| n
->n_type
== STRING
) {
354 *pa
= addnfastate(nf
);
355 *pb
= addnfastate(nf
);
356 addnfaarc(nf
, *pa
, *pb
, addlabel(ll
, n
->n_type
, n
->n_str
));
363 dumpstate(ll
, nf
, istate
)
373 istate
== nf
->nf_start
? '*' : ' ',
375 istate
== nf
->nf_finish
? '.' : ' ');
376 st
= &nf
->nf_state
[istate
];
378 for (i
= 0; i
< st
->st_narcs
; i
++) {
381 printf("-> %2d %s", ar
->ar_arrow
,
382 PyGrammar_LabelRepr(&ll
->ll_label
[ar
->ar_label
]));
395 printf("NFA '%s' has %d states; start %d, finish %d\n",
396 nf
->nf_name
, nf
->nf_nstates
, nf
->nf_start
, nf
->nf_finish
);
397 for (i
= 0; i
< nf
->nf_nstates
; i
++)
398 dumpstate(ll
, nf
, i
);
402 /* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
405 addclosure(ss
, nf
, istate
)
410 if (addbit(ss
, istate
)) {
411 nfastate
*st
= &nf
->nf_state
[istate
];
412 nfaarc
*ar
= st
->st_arc
;
415 for (i
= st
->st_narcs
; --i
>= 0; ) {
416 if (ar
->ar_label
== EMPTY
)
417 addclosure(ss
, nf
, ar
->ar_arrow
);
423 typedef struct _ss_arc
{
429 typedef struct _ss_state
{
438 typedef struct _ss_dfa
{
444 static void printssdfa
Py_PROTO((int xx_nstates
, ss_state
*xx_state
, int nbits
,
445 labellist
*ll
, char *msg
));
446 static void simplify
Py_PROTO((int xx_nstates
, ss_state
*xx_state
));
447 static void convert
Py_PROTO((dfa
*d
, int xx_nstates
, ss_state
*xx_state
));
455 int nbits
= nf
->nf_nstates
;
458 ss_state
*xx_state
, *yy
;
460 int istate
, jstate
, iarc
, jarc
, ibit
;
464 ss
= newbitset(nbits
);
465 addclosure(ss
, nf
, nf
->nf_start
);
466 xx_state
= PyMem_NEW(ss_state
, 1);
467 if (xx_state
== NULL
)
468 Py_FatalError("no mem for xx_state in makedfa");
475 yy
->ss_finish
= testbit(ss
, nf
->nf_finish
);
477 printf("Error: nonterminal '%s' may produce empty.\n",
480 /* This algorithm is from a book written before
481 the invention of structured programming... */
483 /* For each unmarked state... */
484 for (istate
= 0; istate
< xx_nstates
; ++istate
) {
485 yy
= &xx_state
[istate
];
487 /* For all its states... */
488 for (ibit
= 0; ibit
< nf
->nf_nstates
; ++ibit
) {
489 if (!testbit(ss
, ibit
))
491 st
= &nf
->nf_state
[ibit
];
492 /* For all non-empty arcs from this state... */
493 for (iarc
= 0; iarc
< st
->st_narcs
; iarc
++) {
494 ar
= &st
->st_arc
[iarc
];
495 if (ar
->ar_label
== EMPTY
)
497 /* Look up in list of arcs from this state */
498 for (jarc
= 0; jarc
< yy
->ss_narcs
; ++jarc
) {
499 zz
= &yy
->ss_arc
[jarc
];
500 if (ar
->ar_label
== zz
->sa_label
)
503 /* Add new arc for this state */
504 PyMem_RESIZE(yy
->ss_arc
, ss_arc
,
506 if (yy
->ss_arc
== NULL
)
507 Py_FatalError("out of mem");
508 zz
= &yy
->ss_arc
[yy
->ss_narcs
++];
509 zz
->sa_label
= ar
->ar_label
;
510 zz
->sa_bitset
= newbitset(nbits
);
513 /* Add destination */
514 addclosure(zz
->sa_bitset
, nf
, ar
->ar_arrow
);
517 /* Now look up all the arrow states */
518 for (jarc
= 0; jarc
< xx_state
[istate
].ss_narcs
; jarc
++) {
519 zz
= &xx_state
[istate
].ss_arc
[jarc
];
520 for (jstate
= 0; jstate
< xx_nstates
; jstate
++) {
521 if (samebitset(zz
->sa_bitset
,
522 xx_state
[jstate
].ss_ss
, nbits
)) {
523 zz
->sa_arrow
= jstate
;
527 PyMem_RESIZE(xx_state
, ss_state
, xx_nstates
+ 1);
528 if (xx_state
== NULL
)
529 Py_FatalError("out of mem");
530 zz
->sa_arrow
= xx_nstates
;
531 yy
= &xx_state
[xx_nstates
++];
532 yy
->ss_ss
= zz
->sa_bitset
;
536 yy
->ss_finish
= testbit(yy
->ss_ss
, nf
->nf_finish
);
542 printssdfa(xx_nstates
, xx_state
, nbits
, &gr
->gr_ll
,
543 "before minimizing");
545 simplify(xx_nstates
, xx_state
);
548 printssdfa(xx_nstates
, xx_state
, nbits
, &gr
->gr_ll
,
551 convert(d
, xx_nstates
, xx_state
);
557 printssdfa(xx_nstates
, xx_state
, nbits
, ll
, msg
)
568 printf("Subset DFA %s\n", msg
);
569 for (i
= 0; i
< xx_nstates
; i
++) {
573 printf(" Subset %d", i
);
577 for (ibit
= 0; ibit
< nbits
; ibit
++) {
578 if (testbit(yy
->ss_ss
, ibit
))
582 for (iarc
= 0; iarc
< yy
->ss_narcs
; iarc
++) {
583 zz
= &yy
->ss_arc
[iarc
];
584 printf(" Arc to state %d, label %s\n",
587 &ll
->ll_label
[zz
->sa_label
]));
593 /* PART THREE -- SIMPLIFY DFA */
595 /* Simplify the DFA by repeatedly eliminating states that are
596 equivalent to another oner. This is NOT Algorithm 3.3 from
597 [Aho&Ullman 77]. It does not always finds the minimal DFA,
598 but it does usually make a much smaller one... (For an example
599 of sub-optimal behaviour, try S: x a b+ | y a b+.)
608 if (s1
->ss_narcs
!= s2
->ss_narcs
|| s1
->ss_finish
!= s2
->ss_finish
)
610 for (i
= 0; i
< s1
->ss_narcs
; i
++) {
611 if (s1
->ss_arc
[i
].sa_arrow
!= s2
->ss_arc
[i
].sa_arrow
||
612 s1
->ss_arc
[i
].sa_label
!= s2
->ss_arc
[i
].sa_label
)
619 renamestates(xx_nstates
, xx_state
, from
, to
)
627 printf("Rename state %d to %d.\n", from
, to
);
628 for (i
= 0; i
< xx_nstates
; i
++) {
629 if (xx_state
[i
].ss_deleted
)
631 for (j
= 0; j
< xx_state
[i
].ss_narcs
; j
++) {
632 if (xx_state
[i
].ss_arc
[j
].sa_arrow
== from
)
633 xx_state
[i
].ss_arc
[j
].sa_arrow
= to
;
639 simplify(xx_nstates
, xx_state
)
648 for (i
= 1; i
< xx_nstates
; i
++) {
649 if (xx_state
[i
].ss_deleted
)
651 for (j
= 0; j
< i
; j
++) {
652 if (xx_state
[j
].ss_deleted
)
654 if (samestate(&xx_state
[i
], &xx_state
[j
])) {
655 xx_state
[i
].ss_deleted
++;
656 renamestates(xx_nstates
, xx_state
,
667 /* PART FOUR -- GENERATE PARSING TABLES */
669 /* Convert the DFA into a grammar that can be used by our parser */
672 convert(d
, xx_nstates
, xx_state
)
681 for (i
= 0; i
< xx_nstates
; i
++) {
685 yy
->ss_rename
= addstate(d
);
688 for (i
= 0; i
< xx_nstates
; i
++) {
692 for (j
= 0; j
< yy
->ss_narcs
; j
++) {
694 addarc(d
, yy
->ss_rename
,
695 xx_state
[zz
->sa_arrow
].ss_rename
,
699 addarc(d
, yy
->ss_rename
, yy
->ss_rename
, 0);
706 /* PART FIVE -- GLUE IT ALL TOGETHER */
717 if (gr
->gr_nnfas
== 0)
719 g
= newgrammar(gr
->gr_nfa
[0]->nf_type
);
720 /* XXX first rule must be start rule */
723 for (i
= 0; i
< gr
->gr_nnfas
; i
++) {
726 printf("Dump of NFA for '%s' ...\n", nf
->nf_name
);
727 dumpnfa(&gr
->gr_ll
, nf
);
729 printf("Making DFA for '%s' ...\n", nf
->nf_name
);
730 d
= adddfa(g
, nf
->nf_type
, nf
->nf_name
);
731 makedfa(gr
, gr
->gr_nfa
[i
], d
);
757 Input is a grammar in extended BNF (using * for repetition, + for
758 at-least-once repetition, [] for optional parts, | for alternatives and
759 () for grouping). This has already been parsed and turned into a parse
762 Each rule is considered as a regular expression in its own right.
763 It is turned into a Non-deterministic Finite Automaton (NFA), which
764 is then turned into a Deterministic Finite Automaton (DFA), which is then
765 optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
766 or similar compiler books (this technique is more often used for lexical
769 The DFA's are used by the parser as parsing tables in a special way
770 that's probably unique. Before they are usable, the FIRST sets of all
771 non-terminals are computed.
777 Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977