3 /* XXX This file is not yet fully PROTOized */
5 /* For a description, see the comments at end of this file */
7 #include "pgenheaders.h"
12 #include "metagrammar.h"
15 extern int Py_DebugFlag
;
18 /* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
20 typedef struct _nfaarc
{
25 typedef struct _nfastate
{
35 int nf_start
, nf_finish
;
39 static void compile_rhs(labellist
*ll
,
40 nfa
*nf
, node
*n
, int *pa
, int *pb
);
41 static void compile_alt(labellist
*ll
,
42 nfa
*nf
, node
*n
, int *pa
, int *pb
);
43 static void compile_item(labellist
*ll
,
44 nfa
*nf
, node
*n
, int *pa
, int *pb
);
45 static void compile_atom(labellist
*ll
,
46 nfa
*nf
, node
*n
, int *pa
, int *pb
);
53 PyMem_RESIZE(nf
->nf_state
, nfastate
, nf
->nf_nstates
+ 1);
54 if (nf
->nf_state
== NULL
)
55 Py_FatalError("out of mem");
56 st
= &nf
->nf_state
[nf
->nf_nstates
++];
59 return st
- nf
->nf_state
;
63 addnfaarc(nfa
*nf
, int from
, int to
, int lbl
)
68 st
= &nf
->nf_state
[from
];
69 PyMem_RESIZE(st
->st_arc
, nfaarc
, st
->st_narcs
+ 1);
70 if (st
->st_arc
== NULL
)
71 Py_FatalError("out of mem");
72 ar
= &st
->st_arc
[st
->st_narcs
++];
81 static int type
= NT_OFFSET
; /* All types will be disjunct */
83 nf
= PyMem_NEW(nfa
, 1);
85 Py_FatalError("no mem for new nfa");
87 nf
->nf_name
= name
; /* XXX strdup(name) ??? */
90 nf
->nf_start
= nf
->nf_finish
= -1;
94 typedef struct _nfagrammar
{
101 static void compile_rule(nfagrammar
*gr
, node
*n
);
108 gr
= PyMem_NEW(nfagrammar
, 1);
110 Py_FatalError("no mem for new nfa grammar");
113 gr
->gr_ll
.ll_nlabels
= 0;
114 gr
->gr_ll
.ll_label
= NULL
;
115 addlabel(&gr
->gr_ll
, ENDMARKER
, "EMPTY");
120 addnfa(nfagrammar
*gr
, char *name
)
125 PyMem_RESIZE(gr
->gr_nfa
, nfa
*, gr
->gr_nnfas
+ 1);
126 if (gr
->gr_nfa
== NULL
)
127 Py_FatalError("out of mem");
128 gr
->gr_nfa
[gr
->gr_nnfas
++] = nf
;
129 addlabel(&gr
->gr_ll
, NAME
, nf
->nf_name
);
135 static char REQNFMT
[] = "metacompile: less than %d children\n";
137 #define REQN(i, count) \
139 fprintf(stderr, REQNFMT, count); \
140 Py_FatalError("REQN"); \
144 #define REQN(i, count) /* empty */
153 printf("Compiling (meta-) parse tree into NFA grammar\n");
154 gr
= newnfagrammar();
156 i
= n
->n_nchildren
- 1; /* Last child is ENDMARKER */
158 for (; --i
>= 0; n
++) {
159 if (n
->n_type
!= NEWLINE
)
166 compile_rule(nfagrammar
*gr
, node
*n
)
171 REQN(n
->n_nchildren
, 4);
174 nf
= addnfa(gr
, n
->n_str
);
179 compile_rhs(&gr
->gr_ll
, nf
, n
, &nf
->nf_start
, &nf
->nf_finish
);
185 compile_rhs(labellist
*ll
, nfa
*nf
, node
*n
, int *pa
, int *pb
)
195 compile_alt(ll
, nf
, n
, pa
, pb
);
201 *pa
= addnfastate(nf
);
202 *pb
= addnfastate(nf
);
203 addnfaarc(nf
, *pa
, a
, EMPTY
);
204 addnfaarc(nf
, b
, *pb
, EMPTY
);
205 for (; --i
>= 0; n
++) {
211 compile_alt(ll
, nf
, n
, &a
, &b
);
212 addnfaarc(nf
, *pa
, a
, EMPTY
);
213 addnfaarc(nf
, b
, *pb
, EMPTY
);
218 compile_alt(labellist
*ll
, nfa
*nf
, node
*n
, int *pa
, int *pb
)
228 compile_item(ll
, nf
, n
, pa
, pb
);
231 for (; --i
>= 0; n
++) {
232 if (n
->n_type
== COMMA
) { /* XXX Temporary */
238 compile_item(ll
, nf
, n
, &a
, &b
);
239 addnfaarc(nf
, *pb
, a
, EMPTY
);
245 compile_item(labellist
*ll
, nfa
*nf
, node
*n
, int *pa
, int *pb
)
254 if (n
->n_type
== LSQB
) {
258 *pa
= addnfastate(nf
);
259 *pb
= addnfastate(nf
);
260 addnfaarc(nf
, *pa
, *pb
, EMPTY
);
261 compile_rhs(ll
, nf
, n
, &a
, &b
);
262 addnfaarc(nf
, *pa
, a
, EMPTY
);
263 addnfaarc(nf
, b
, *pb
, EMPTY
);
269 compile_atom(ll
, nf
, n
, pa
, pb
);
273 addnfaarc(nf
, *pb
, *pa
, EMPTY
);
274 if (n
->n_type
== STAR
)
282 compile_atom(labellist
*ll
, nfa
*nf
, node
*n
, int *pa
, int *pb
)
290 if (n
->n_type
== LPAR
) {
294 compile_rhs(ll
, nf
, n
, pa
, pb
);
298 else if (n
->n_type
== NAME
|| n
->n_type
== STRING
) {
299 *pa
= addnfastate(nf
);
300 *pb
= addnfastate(nf
);
301 addnfaarc(nf
, *pa
, *pb
, addlabel(ll
, n
->n_type
, n
->n_str
));
308 dumpstate(labellist
*ll
, nfa
*nf
, int istate
)
315 istate
== nf
->nf_start
? '*' : ' ',
317 istate
== nf
->nf_finish
? '.' : ' ');
318 st
= &nf
->nf_state
[istate
];
320 for (i
= 0; i
< st
->st_narcs
; i
++) {
323 printf("-> %2d %s", ar
->ar_arrow
,
324 PyGrammar_LabelRepr(&ll
->ll_label
[ar
->ar_label
]));
331 dumpnfa(labellist
*ll
, nfa
*nf
)
335 printf("NFA '%s' has %d states; start %d, finish %d\n",
336 nf
->nf_name
, nf
->nf_nstates
, nf
->nf_start
, nf
->nf_finish
);
337 for (i
= 0; i
< nf
->nf_nstates
; i
++)
338 dumpstate(ll
, nf
, i
);
342 /* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
345 addclosure(bitset ss
, nfa
*nf
, int istate
)
347 if (addbit(ss
, istate
)) {
348 nfastate
*st
= &nf
->nf_state
[istate
];
349 nfaarc
*ar
= st
->st_arc
;
352 for (i
= st
->st_narcs
; --i
>= 0; ) {
353 if (ar
->ar_label
== EMPTY
)
354 addclosure(ss
, nf
, ar
->ar_arrow
);
360 typedef struct _ss_arc
{
366 typedef struct _ss_state
{
375 typedef struct _ss_dfa
{
381 static void printssdfa(int xx_nstates
, ss_state
*xx_state
, int nbits
,
382 labellist
*ll
, char *msg
);
383 static void simplify(int xx_nstates
, ss_state
*xx_state
);
384 static void convert(dfa
*d
, int xx_nstates
, ss_state
*xx_state
);
387 makedfa(nfagrammar
*gr
, nfa
*nf
, dfa
*d
)
389 int nbits
= nf
->nf_nstates
;
392 ss_state
*xx_state
, *yy
;
394 int istate
, jstate
, iarc
, jarc
, ibit
;
398 ss
= newbitset(nbits
);
399 addclosure(ss
, nf
, nf
->nf_start
);
400 xx_state
= PyMem_NEW(ss_state
, 1);
401 if (xx_state
== NULL
)
402 Py_FatalError("no mem for xx_state in makedfa");
409 yy
->ss_finish
= testbit(ss
, nf
->nf_finish
);
411 printf("Error: nonterminal '%s' may produce empty.\n",
414 /* This algorithm is from a book written before
415 the invention of structured programming... */
417 /* For each unmarked state... */
418 for (istate
= 0; istate
< xx_nstates
; ++istate
) {
419 yy
= &xx_state
[istate
];
421 /* For all its states... */
422 for (ibit
= 0; ibit
< nf
->nf_nstates
; ++ibit
) {
423 if (!testbit(ss
, ibit
))
425 st
= &nf
->nf_state
[ibit
];
426 /* For all non-empty arcs from this state... */
427 for (iarc
= 0; iarc
< st
->st_narcs
; iarc
++) {
428 ar
= &st
->st_arc
[iarc
];
429 if (ar
->ar_label
== EMPTY
)
431 /* Look up in list of arcs from this state */
432 for (jarc
= 0; jarc
< yy
->ss_narcs
; ++jarc
) {
433 zz
= &yy
->ss_arc
[jarc
];
434 if (ar
->ar_label
== zz
->sa_label
)
437 /* Add new arc for this state */
438 PyMem_RESIZE(yy
->ss_arc
, ss_arc
,
440 if (yy
->ss_arc
== NULL
)
441 Py_FatalError("out of mem");
442 zz
= &yy
->ss_arc
[yy
->ss_narcs
++];
443 zz
->sa_label
= ar
->ar_label
;
444 zz
->sa_bitset
= newbitset(nbits
);
447 /* Add destination */
448 addclosure(zz
->sa_bitset
, nf
, ar
->ar_arrow
);
451 /* Now look up all the arrow states */
452 for (jarc
= 0; jarc
< xx_state
[istate
].ss_narcs
; jarc
++) {
453 zz
= &xx_state
[istate
].ss_arc
[jarc
];
454 for (jstate
= 0; jstate
< xx_nstates
; jstate
++) {
455 if (samebitset(zz
->sa_bitset
,
456 xx_state
[jstate
].ss_ss
, nbits
)) {
457 zz
->sa_arrow
= jstate
;
461 PyMem_RESIZE(xx_state
, ss_state
, xx_nstates
+ 1);
462 if (xx_state
== NULL
)
463 Py_FatalError("out of mem");
464 zz
->sa_arrow
= xx_nstates
;
465 yy
= &xx_state
[xx_nstates
++];
466 yy
->ss_ss
= zz
->sa_bitset
;
470 yy
->ss_finish
= testbit(yy
->ss_ss
, nf
->nf_finish
);
476 printssdfa(xx_nstates
, xx_state
, nbits
, &gr
->gr_ll
,
477 "before minimizing");
479 simplify(xx_nstates
, xx_state
);
482 printssdfa(xx_nstates
, xx_state
, nbits
, &gr
->gr_ll
,
485 convert(d
, xx_nstates
, xx_state
);
491 printssdfa(int xx_nstates
, ss_state
*xx_state
, int nbits
,
492 labellist
*ll
, char *msg
)
498 printf("Subset DFA %s\n", msg
);
499 for (i
= 0; i
< xx_nstates
; i
++) {
503 printf(" Subset %d", i
);
507 for (ibit
= 0; ibit
< nbits
; ibit
++) {
508 if (testbit(yy
->ss_ss
, ibit
))
512 for (iarc
= 0; iarc
< yy
->ss_narcs
; iarc
++) {
513 zz
= &yy
->ss_arc
[iarc
];
514 printf(" Arc to state %d, label %s\n",
517 &ll
->ll_label
[zz
->sa_label
]));
523 /* PART THREE -- SIMPLIFY DFA */
525 /* Simplify the DFA by repeatedly eliminating states that are
526 equivalent to another oner. This is NOT Algorithm 3.3 from
527 [Aho&Ullman 77]. It does not always finds the minimal DFA,
528 but it does usually make a much smaller one... (For an example
529 of sub-optimal behavior, try S: x a b+ | y a b+.)
533 samestate(ss_state
*s1
, ss_state
*s2
)
537 if (s1
->ss_narcs
!= s2
->ss_narcs
|| s1
->ss_finish
!= s2
->ss_finish
)
539 for (i
= 0; i
< s1
->ss_narcs
; i
++) {
540 if (s1
->ss_arc
[i
].sa_arrow
!= s2
->ss_arc
[i
].sa_arrow
||
541 s1
->ss_arc
[i
].sa_label
!= s2
->ss_arc
[i
].sa_label
)
548 renamestates(int xx_nstates
, ss_state
*xx_state
, int from
, int to
)
553 printf("Rename state %d to %d.\n", from
, to
);
554 for (i
= 0; i
< xx_nstates
; i
++) {
555 if (xx_state
[i
].ss_deleted
)
557 for (j
= 0; j
< xx_state
[i
].ss_narcs
; j
++) {
558 if (xx_state
[i
].ss_arc
[j
].sa_arrow
== from
)
559 xx_state
[i
].ss_arc
[j
].sa_arrow
= to
;
565 simplify(int xx_nstates
, ss_state
*xx_state
)
572 for (i
= 1; i
< xx_nstates
; i
++) {
573 if (xx_state
[i
].ss_deleted
)
575 for (j
= 0; j
< i
; j
++) {
576 if (xx_state
[j
].ss_deleted
)
578 if (samestate(&xx_state
[i
], &xx_state
[j
])) {
579 xx_state
[i
].ss_deleted
++;
580 renamestates(xx_nstates
, xx_state
,
591 /* PART FOUR -- GENERATE PARSING TABLES */
593 /* Convert the DFA into a grammar that can be used by our parser */
596 convert(dfa
*d
, int xx_nstates
, ss_state
*xx_state
)
602 for (i
= 0; i
< xx_nstates
; i
++) {
606 yy
->ss_rename
= addstate(d
);
609 for (i
= 0; i
< xx_nstates
; i
++) {
613 for (j
= 0; j
< yy
->ss_narcs
; j
++) {
615 addarc(d
, yy
->ss_rename
,
616 xx_state
[zz
->sa_arrow
].ss_rename
,
620 addarc(d
, yy
->ss_rename
, yy
->ss_rename
, 0);
627 /* PART FIVE -- GLUE IT ALL TOGETHER */
630 maketables(nfagrammar
*gr
)
637 if (gr
->gr_nnfas
== 0)
639 g
= newgrammar(gr
->gr_nfa
[0]->nf_type
);
640 /* XXX first rule must be start rule */
643 for (i
= 0; i
< gr
->gr_nnfas
; i
++) {
646 printf("Dump of NFA for '%s' ...\n", nf
->nf_name
);
647 dumpnfa(&gr
->gr_ll
, nf
);
649 printf("Making DFA for '%s' ...\n", nf
->nf_name
);
650 d
= adddfa(g
, nf
->nf_type
, nf
->nf_name
);
651 makedfa(gr
, gr
->gr_nfa
[i
], d
);
676 Input is a grammar in extended BNF (using * for repetition, + for
677 at-least-once repetition, [] for optional parts, | for alternatives and
678 () for grouping). This has already been parsed and turned into a parse
681 Each rule is considered as a regular expression in its own right.
682 It is turned into a Non-deterministic Finite Automaton (NFA), which
683 is then turned into a Deterministic Finite Automaton (DFA), which is then
684 optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
685 or similar compiler books (this technique is more often used for lexical
688 The DFA's are used by the parser as parsing tables in a special way
689 that's probably unique. Before they are usable, the FIRST sets of all
690 non-terminals are computed.
696 Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977