3 /* XXX This file is not yet fully PROTOized */
5 /* For a description, see the comments at end of this file */
8 #include "pgenheaders.h"
12 #include "metagrammar.h"
15 extern int Py_DebugFlag
;
18 /* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
20 typedef struct _nfaarc
{
25 typedef struct _nfastate
{
35 int nf_start
, nf_finish
;
39 static void compile_rhs(labellist
*ll
,
40 nfa
*nf
, node
*n
, int *pa
, int *pb
);
41 static void compile_alt(labellist
*ll
,
42 nfa
*nf
, node
*n
, int *pa
, int *pb
);
43 static void compile_item(labellist
*ll
,
44 nfa
*nf
, node
*n
, int *pa
, int *pb
);
45 static void compile_atom(labellist
*ll
,
46 nfa
*nf
, node
*n
, int *pa
, int *pb
);
53 PyMem_RESIZE(nf
->nf_state
, nfastate
, nf
->nf_nstates
+ 1);
54 if (nf
->nf_state
== NULL
)
55 Py_FatalError("out of mem");
56 st
= &nf
->nf_state
[nf
->nf_nstates
++];
59 return st
- nf
->nf_state
;
63 addnfaarc(nfa
*nf
, int from
, int to
, int lbl
)
68 st
= &nf
->nf_state
[from
];
69 PyMem_RESIZE(st
->st_arc
, nfaarc
, st
->st_narcs
+ 1);
70 if (st
->st_arc
== NULL
)
71 Py_FatalError("out of mem");
72 ar
= &st
->st_arc
[st
->st_narcs
++];
81 static int type
= NT_OFFSET
; /* All types will be disjunct */
83 nf
= PyMem_NEW(nfa
, 1);
85 Py_FatalError("no mem for new nfa");
87 nf
->nf_name
= name
; /* XXX strdup(name) ??? */
90 nf
->nf_start
= nf
->nf_finish
= -1;
94 typedef struct _nfagrammar
{
101 static void compile_rule(nfagrammar
*gr
, node
*n
);
108 gr
= PyMem_NEW(nfagrammar
, 1);
110 Py_FatalError("no mem for new nfa grammar");
113 gr
->gr_ll
.ll_nlabels
= 0;
114 gr
->gr_ll
.ll_label
= NULL
;
115 addlabel(&gr
->gr_ll
, ENDMARKER
, "EMPTY");
120 addnfa(nfagrammar
*gr
, char *name
)
125 PyMem_RESIZE(gr
->gr_nfa
, nfa
*, gr
->gr_nnfas
+ 1);
126 if (gr
->gr_nfa
== NULL
)
127 Py_FatalError("out of mem");
128 gr
->gr_nfa
[gr
->gr_nnfas
++] = nf
;
129 addlabel(&gr
->gr_ll
, NAME
, nf
->nf_name
);
135 static char REQNFMT
[] = "metacompile: less than %d children\n";
137 #define REQN(i, count) \
139 fprintf(stderr, REQNFMT, count); \
140 Py_FatalError("REQN"); \
144 #define REQN(i, count) /* empty */
154 printf("Compiling (meta-) parse tree into NFA grammar\n");
155 gr
= newnfagrammar();
157 i
= n
->n_nchildren
- 1; /* Last child is ENDMARKER */
159 for (; --i
>= 0; n
++) {
160 if (n
->n_type
!= NEWLINE
)
167 compile_rule(nfagrammar
*gr
, node
*n
)
172 REQN(n
->n_nchildren
, 4);
175 nf
= addnfa(gr
, n
->n_str
);
180 compile_rhs(&gr
->gr_ll
, nf
, n
, &nf
->nf_start
, &nf
->nf_finish
);
186 compile_rhs(labellist
*ll
, nfa
*nf
, node
*n
, int *pa
, int *pb
)
196 compile_alt(ll
, nf
, n
, pa
, pb
);
202 *pa
= addnfastate(nf
);
203 *pb
= addnfastate(nf
);
204 addnfaarc(nf
, *pa
, a
, EMPTY
);
205 addnfaarc(nf
, b
, *pb
, EMPTY
);
206 for (; --i
>= 0; n
++) {
212 compile_alt(ll
, nf
, n
, &a
, &b
);
213 addnfaarc(nf
, *pa
, a
, EMPTY
);
214 addnfaarc(nf
, b
, *pb
, EMPTY
);
219 compile_alt(labellist
*ll
, nfa
*nf
, node
*n
, int *pa
, int *pb
)
229 compile_item(ll
, nf
, n
, pa
, pb
);
232 for (; --i
>= 0; n
++) {
233 if (n
->n_type
== COMMA
) { /* XXX Temporary */
239 compile_item(ll
, nf
, n
, &a
, &b
);
240 addnfaarc(nf
, *pb
, a
, EMPTY
);
246 compile_item(labellist
*ll
, nfa
*nf
, node
*n
, int *pa
, int *pb
)
255 if (n
->n_type
== LSQB
) {
259 *pa
= addnfastate(nf
);
260 *pb
= addnfastate(nf
);
261 addnfaarc(nf
, *pa
, *pb
, EMPTY
);
262 compile_rhs(ll
, nf
, n
, &a
, &b
);
263 addnfaarc(nf
, *pa
, a
, EMPTY
);
264 addnfaarc(nf
, b
, *pb
, EMPTY
);
270 compile_atom(ll
, nf
, n
, pa
, pb
);
274 addnfaarc(nf
, *pb
, *pa
, EMPTY
);
275 if (n
->n_type
== STAR
)
283 compile_atom(labellist
*ll
, nfa
*nf
, node
*n
, int *pa
, int *pb
)
291 if (n
->n_type
== LPAR
) {
295 compile_rhs(ll
, nf
, n
, pa
, pb
);
299 else if (n
->n_type
== NAME
|| n
->n_type
== STRING
) {
300 *pa
= addnfastate(nf
);
301 *pb
= addnfastate(nf
);
302 addnfaarc(nf
, *pa
, *pb
, addlabel(ll
, n
->n_type
, n
->n_str
));
309 dumpstate(labellist
*ll
, nfa
*nf
, int istate
)
316 istate
== nf
->nf_start
? '*' : ' ',
318 istate
== nf
->nf_finish
? '.' : ' ');
319 st
= &nf
->nf_state
[istate
];
321 for (i
= 0; i
< st
->st_narcs
; i
++) {
324 printf("-> %2d %s", ar
->ar_arrow
,
325 PyGrammar_LabelRepr(&ll
->ll_label
[ar
->ar_label
]));
332 dumpnfa(labellist
*ll
, nfa
*nf
)
336 printf("NFA '%s' has %d states; start %d, finish %d\n",
337 nf
->nf_name
, nf
->nf_nstates
, nf
->nf_start
, nf
->nf_finish
);
338 for (i
= 0; i
< nf
->nf_nstates
; i
++)
339 dumpstate(ll
, nf
, i
);
343 /* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
346 addclosure(bitset ss
, nfa
*nf
, int istate
)
348 if (addbit(ss
, istate
)) {
349 nfastate
*st
= &nf
->nf_state
[istate
];
350 nfaarc
*ar
= st
->st_arc
;
353 for (i
= st
->st_narcs
; --i
>= 0; ) {
354 if (ar
->ar_label
== EMPTY
)
355 addclosure(ss
, nf
, ar
->ar_arrow
);
361 typedef struct _ss_arc
{
367 typedef struct _ss_state
{
376 typedef struct _ss_dfa
{
382 static void printssdfa(int xx_nstates
, ss_state
*xx_state
, int nbits
,
383 labellist
*ll
, char *msg
);
384 static void simplify(int xx_nstates
, ss_state
*xx_state
);
385 static void convert(dfa
*d
, int xx_nstates
, ss_state
*xx_state
);
388 makedfa(nfagrammar
*gr
, nfa
*nf
, dfa
*d
)
390 int nbits
= nf
->nf_nstates
;
393 ss_state
*xx_state
, *yy
;
395 int istate
, jstate
, iarc
, jarc
, ibit
;
399 ss
= newbitset(nbits
);
400 addclosure(ss
, nf
, nf
->nf_start
);
401 xx_state
= PyMem_NEW(ss_state
, 1);
402 if (xx_state
== NULL
)
403 Py_FatalError("no mem for xx_state in makedfa");
410 yy
->ss_finish
= testbit(ss
, nf
->nf_finish
);
412 printf("Error: nonterminal '%s' may produce empty.\n",
415 /* This algorithm is from a book written before
416 the invention of structured programming... */
418 /* For each unmarked state... */
419 for (istate
= 0; istate
< xx_nstates
; ++istate
) {
420 yy
= &xx_state
[istate
];
422 /* For all its states... */
423 for (ibit
= 0; ibit
< nf
->nf_nstates
; ++ibit
) {
424 if (!testbit(ss
, ibit
))
426 st
= &nf
->nf_state
[ibit
];
427 /* For all non-empty arcs from this state... */
428 for (iarc
= 0; iarc
< st
->st_narcs
; iarc
++) {
429 ar
= &st
->st_arc
[iarc
];
430 if (ar
->ar_label
== EMPTY
)
432 /* Look up in list of arcs from this state */
433 for (jarc
= 0; jarc
< yy
->ss_narcs
; ++jarc
) {
434 zz
= &yy
->ss_arc
[jarc
];
435 if (ar
->ar_label
== zz
->sa_label
)
438 /* Add new arc for this state */
439 PyMem_RESIZE(yy
->ss_arc
, ss_arc
,
441 if (yy
->ss_arc
== NULL
)
442 Py_FatalError("out of mem");
443 zz
= &yy
->ss_arc
[yy
->ss_narcs
++];
444 zz
->sa_label
= ar
->ar_label
;
445 zz
->sa_bitset
= newbitset(nbits
);
448 /* Add destination */
449 addclosure(zz
->sa_bitset
, nf
, ar
->ar_arrow
);
452 /* Now look up all the arrow states */
453 for (jarc
= 0; jarc
< xx_state
[istate
].ss_narcs
; jarc
++) {
454 zz
= &xx_state
[istate
].ss_arc
[jarc
];
455 for (jstate
= 0; jstate
< xx_nstates
; jstate
++) {
456 if (samebitset(zz
->sa_bitset
,
457 xx_state
[jstate
].ss_ss
, nbits
)) {
458 zz
->sa_arrow
= jstate
;
462 PyMem_RESIZE(xx_state
, ss_state
, xx_nstates
+ 1);
463 if (xx_state
== NULL
)
464 Py_FatalError("out of mem");
465 zz
->sa_arrow
= xx_nstates
;
466 yy
= &xx_state
[xx_nstates
++];
467 yy
->ss_ss
= zz
->sa_bitset
;
471 yy
->ss_finish
= testbit(yy
->ss_ss
, nf
->nf_finish
);
477 printssdfa(xx_nstates
, xx_state
, nbits
, &gr
->gr_ll
,
478 "before minimizing");
480 simplify(xx_nstates
, xx_state
);
483 printssdfa(xx_nstates
, xx_state
, nbits
, &gr
->gr_ll
,
486 convert(d
, xx_nstates
, xx_state
);
492 printssdfa(int xx_nstates
, ss_state
*xx_state
, int nbits
,
493 labellist
*ll
, char *msg
)
499 printf("Subset DFA %s\n", msg
);
500 for (i
= 0; i
< xx_nstates
; i
++) {
504 printf(" Subset %d", i
);
508 for (ibit
= 0; ibit
< nbits
; ibit
++) {
509 if (testbit(yy
->ss_ss
, ibit
))
513 for (iarc
= 0; iarc
< yy
->ss_narcs
; iarc
++) {
514 zz
= &yy
->ss_arc
[iarc
];
515 printf(" Arc to state %d, label %s\n",
518 &ll
->ll_label
[zz
->sa_label
]));
524 /* PART THREE -- SIMPLIFY DFA */
526 /* Simplify the DFA by repeatedly eliminating states that are
527 equivalent to another oner. This is NOT Algorithm 3.3 from
528 [Aho&Ullman 77]. It does not always finds the minimal DFA,
529 but it does usually make a much smaller one... (For an example
530 of sub-optimal behavior, try S: x a b+ | y a b+.)
534 samestate(ss_state
*s1
, ss_state
*s2
)
538 if (s1
->ss_narcs
!= s2
->ss_narcs
|| s1
->ss_finish
!= s2
->ss_finish
)
540 for (i
= 0; i
< s1
->ss_narcs
; i
++) {
541 if (s1
->ss_arc
[i
].sa_arrow
!= s2
->ss_arc
[i
].sa_arrow
||
542 s1
->ss_arc
[i
].sa_label
!= s2
->ss_arc
[i
].sa_label
)
549 renamestates(int xx_nstates
, ss_state
*xx_state
, int from
, int to
)
554 printf("Rename state %d to %d.\n", from
, to
);
555 for (i
= 0; i
< xx_nstates
; i
++) {
556 if (xx_state
[i
].ss_deleted
)
558 for (j
= 0; j
< xx_state
[i
].ss_narcs
; j
++) {
559 if (xx_state
[i
].ss_arc
[j
].sa_arrow
== from
)
560 xx_state
[i
].ss_arc
[j
].sa_arrow
= to
;
566 simplify(int xx_nstates
, ss_state
*xx_state
)
573 for (i
= 1; i
< xx_nstates
; i
++) {
574 if (xx_state
[i
].ss_deleted
)
576 for (j
= 0; j
< i
; j
++) {
577 if (xx_state
[j
].ss_deleted
)
579 if (samestate(&xx_state
[i
], &xx_state
[j
])) {
580 xx_state
[i
].ss_deleted
++;
581 renamestates(xx_nstates
, xx_state
,
592 /* PART FOUR -- GENERATE PARSING TABLES */
594 /* Convert the DFA into a grammar that can be used by our parser */
597 convert(dfa
*d
, int xx_nstates
, ss_state
*xx_state
)
603 for (i
= 0; i
< xx_nstates
; i
++) {
607 yy
->ss_rename
= addstate(d
);
610 for (i
= 0; i
< xx_nstates
; i
++) {
614 for (j
= 0; j
< yy
->ss_narcs
; j
++) {
616 addarc(d
, yy
->ss_rename
,
617 xx_state
[zz
->sa_arrow
].ss_rename
,
621 addarc(d
, yy
->ss_rename
, yy
->ss_rename
, 0);
628 /* PART FIVE -- GLUE IT ALL TOGETHER */
631 maketables(nfagrammar
*gr
)
638 if (gr
->gr_nnfas
== 0)
640 g
= newgrammar(gr
->gr_nfa
[0]->nf_type
);
641 /* XXX first rule must be start rule */
644 for (i
= 0; i
< gr
->gr_nnfas
; i
++) {
647 printf("Dump of NFA for '%s' ...\n", nf
->nf_name
);
648 dumpnfa(&gr
->gr_ll
, nf
);
649 printf("Making DFA for '%s' ...\n", nf
->nf_name
);
651 d
= adddfa(g
, nf
->nf_type
, nf
->nf_name
);
652 makedfa(gr
, gr
->gr_nfa
[i
], d
);
677 Input is a grammar in extended BNF (using * for repetition, + for
678 at-least-once repetition, [] for optional parts, | for alternatives and
679 () for grouping). This has already been parsed and turned into a parse
682 Each rule is considered as a regular expression in its own right.
683 It is turned into a Non-deterministic Finite Automaton (NFA), which
684 is then turned into a Deterministic Finite Automaton (DFA), which is then
685 optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
686 or similar compiler books (this technique is more often used for lexical
689 The DFA's are used by the parser as parsing tables in a special way
690 that's probably unique. Before they are usable, the FIRST sets of all
691 non-terminals are computed.
697 Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977