Parser/pgen.c

   1
   2 /* Parser generator */
   3 /* XXX This file is not yet fully PROTOized */
   4
   5 /* For a description, see the comments at end of this file */
   6
   7 #include "Python.h"
   8 #include "pgenheaders.h"
   9 #include "token.h"
  10 #include "node.h"
  11 #include "grammar.h"
  12 #include "metagrammar.h"
  13 #include "pgen.h"
  14
  15 extern int Py_DebugFlag;
  16
  17
  18 /* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
  19
  20 typedef struct _nfaarc {
  21         int     ar_label;
  22         int     ar_arrow;
  23 } nfaarc;
  24
  25 typedef struct _nfastate {
  26         int     st_narcs;
  27         nfaarc  *st_arc;
  28 } nfastate;
  29
  30 typedef struct _nfa {
  31         int             nf_type;
  32         char            *nf_name;
  33         int             nf_nstates;
  34         nfastate        *nf_state;
  35         int             nf_start, nf_finish;
  36 } nfa;
  37
  38 /* Forward */
  39 static void compile_rhs(labellist *ll,
  40                         nfa *nf, node *n, int *pa, int *pb);
  41 static void compile_alt(labellist *ll,
  42                         nfa *nf, node *n, int *pa, int *pb);
  43 static void compile_item(labellist *ll,
  44                          nfa *nf, node *n, int *pa, int *pb);
  45 static void compile_atom(labellist *ll,
  46                          nfa *nf, node *n, int *pa, int *pb);
  47
  48 static int
  49 addnfastate(nfa *nf)
  50 {
  51         nfastate *st;
  52
  53         PyMem_RESIZE(nf->nf_state, nfastate, nf->nf_nstates + 1);
  54         if (nf->nf_state == NULL)
  55                 Py_FatalError("out of mem");
  56         st = &nf->nf_state[nf->nf_nstates++];
  57         st->st_narcs = 0;
  58         st->st_arc = NULL;
  59         return st - nf->nf_state;
  60 }
  61
  62 static void
  63 addnfaarc(nfa *nf, int from, int to, int lbl)
  64 {
  65         nfastate *st;
  66         nfaarc *ar;
  67
  68         st = &nf->nf_state[from];
  69         PyMem_RESIZE(st->st_arc, nfaarc, st->st_narcs + 1);
  70         if (st->st_arc == NULL)
  71                 Py_FatalError("out of mem");
  72         ar = &st->st_arc[st->st_narcs++];
  73         ar->ar_label = lbl;
  74         ar->ar_arrow = to;
  75 }
  76
  77 static nfa *
  78 newnfa(char *name)
  79 {
  80         nfa *nf;
  81         static int type = NT_OFFSET; /* All types will be disjunct */
  82
  83         nf = PyMem_NEW(nfa, 1);
  84         if (nf == NULL)
  85                 Py_FatalError("no mem for new nfa");
  86         nf->nf_type = type++;
  87         nf->nf_name = name; /* XXX strdup(name) ??? */
  88         nf->nf_nstates = 0;
  89         nf->nf_state = NULL;
  90         nf->nf_start = nf->nf_finish = -1;
  91         return nf;
  92 }
  93
  94 typedef struct _nfagrammar {
  95         int             gr_nnfas;
  96         nfa             **gr_nfa;
  97         labellist       gr_ll;
  98 } nfagrammar;
  99
 100 /* Forward */
 101 static void compile_rule(nfagrammar *gr, node *n);
 102
 103 static nfagrammar *
 104 newnfagrammar(void)
 105 {
 106         nfagrammar *gr;
 107
 108         gr = PyMem_NEW(nfagrammar, 1);
 109         if (gr == NULL)
 110                 Py_FatalError("no mem for new nfa grammar");
 111         gr->gr_nnfas = 0;
 112         gr->gr_nfa = NULL;
 113         gr->gr_ll.ll_nlabels = 0;
 114         gr->gr_ll.ll_label = NULL;
 115         addlabel(&gr->gr_ll, ENDMARKER, "EMPTY");
 116         return gr;
 117 }
 118
 119 static nfa *
 120 addnfa(nfagrammar *gr, char *name)
 121 {
 122         nfa *nf;
 123
 124         nf = newnfa(name);
 125         PyMem_RESIZE(gr->gr_nfa, nfa *, gr->gr_nnfas + 1);
 126         if (gr->gr_nfa == NULL)
 127                 Py_FatalError("out of mem");
 128         gr->gr_nfa[gr->gr_nnfas++] = nf;
 129         addlabel(&gr->gr_ll, NAME, nf->nf_name);
 130         return nf;
 131 }
 132
 133 #ifdef Py_DEBUG
 134
 135 static char REQNFMT[] = "metacompile: less than %d children\n";
 136
 137 #define REQN(i, count) \
 138         if (i < count) { \
 139                 fprintf(stderr, REQNFMT, count); \
 140                 Py_FatalError("REQN"); \
 141         } else
 142
 143 #else
 144 #define REQN(i, count)  /* empty */
 145 #endif
 146
 147 static nfagrammar *
 148 metacompile(node *n)
 149 {
 150         nfagrammar *gr;
 151         int i;
 152
 153         if (Py_DebugFlag)
 154                 printf("Compiling (meta-) parse tree into NFA grammar\n");
 155         gr = newnfagrammar();
 156         REQ(n, MSTART);
 157         i = n->n_nchildren - 1; /* Last child is ENDMARKER */
 158         n = n->n_child;
 159         for (; --i >= 0; n++) {
 160                 if (n->n_type != NEWLINE)
 161                         compile_rule(gr, n);
 162         }
 163         return gr;
 164 }
 165
 166 static void
 167 compile_rule(nfagrammar *gr, node *n)
 168 {
 169         nfa *nf;
 170
 171         REQ(n, RULE);
 172         REQN(n->n_nchildren, 4);
 173         n = n->n_child;
 174         REQ(n, NAME);
 175         nf = addnfa(gr, n->n_str);
 176         n++;
 177         REQ(n, COLON);
 178         n++;
 179         REQ(n, RHS);
 180         compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish);
 181         n++;
 182         REQ(n, NEWLINE);
 183 }
 184
 185 static void
 186 compile_rhs(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
 187 {
 188         int i;
 189         int a, b;
 190
 191         REQ(n, RHS);
 192         i = n->n_nchildren;
 193         REQN(i, 1);
 194         n = n->n_child;
 195         REQ(n, ALT);
 196         compile_alt(ll, nf, n, pa, pb);
 197         if (--i <= 0)
 198                 return;
 199         n++;
 200         a = *pa;
 201         b = *pb;
 202         *pa = addnfastate(nf);
 203         *pb = addnfastate(nf);
 204         addnfaarc(nf, *pa, a, EMPTY);
 205         addnfaarc(nf, b, *pb, EMPTY);
 206         for (; --i >= 0; n++) {
 207                 REQ(n, VBAR);
 208                 REQN(i, 1);
 209                 --i;
 210                 n++;
 211                 REQ(n, ALT);
 212                 compile_alt(ll, nf, n, &a, &b);
 213                 addnfaarc(nf, *pa, a, EMPTY);
 214                 addnfaarc(nf, b, *pb, EMPTY);
 215         }
 216 }
 217
 218 static void
 219 compile_alt(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
 220 {
 221         int i;
 222         int a, b;
 223
 224         REQ(n, ALT);
 225         i = n->n_nchildren;
 226         REQN(i, 1);
 227         n = n->n_child;
 228         REQ(n, ITEM);
 229         compile_item(ll, nf, n, pa, pb);
 230         --i;
 231         n++;
 232         for (; --i >= 0; n++) {
 233                 if (n->n_type == COMMA) { /* XXX Temporary */
 234                         REQN(i, 1);
 235                         --i;
 236                         n++;
 237                 }
 238                 REQ(n, ITEM);
 239                 compile_item(ll, nf, n, &a, &b);
 240                 addnfaarc(nf, *pb, a, EMPTY);
 241                 *pb = b;
 242         }
 243 }
 244
 245 static void
 246 compile_item(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
 247 {
 248         int i;
 249         int a, b;
 250
 251         REQ(n, ITEM);
 252         i = n->n_nchildren;
 253         REQN(i, 1);
 254         n = n->n_child;
 255         if (n->n_type == LSQB) {
 256                 REQN(i, 3);
 257                 n++;
 258                 REQ(n, RHS);
 259                 *pa = addnfastate(nf);
 260                 *pb = addnfastate(nf);
 261                 addnfaarc(nf, *pa, *pb, EMPTY);
 262                 compile_rhs(ll, nf, n, &a, &b);
 263                 addnfaarc(nf, *pa, a, EMPTY);
 264                 addnfaarc(nf, b, *pb, EMPTY);
 265                 REQN(i, 1);
 266                 n++;
 267                 REQ(n, RSQB);
 268         }
 269         else {
 270                 compile_atom(ll, nf, n, pa, pb);
 271                 if (--i <= 0)
 272                         return;
 273                 n++;
 274                 addnfaarc(nf, *pb, *pa, EMPTY);
 275                 if (n->n_type == STAR)
 276                         *pb = *pa;
 277                 else
 278                         REQ(n, PLUS);
 279         }
 280 }
 281
 282 static void
 283 compile_atom(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
 284 {
 285         int i;
 286
 287         REQ(n, ATOM);
 288         i = n->n_nchildren;
 289         REQN(i, 1);
 290         n = n->n_child;
 291         if (n->n_type == LPAR) {
 292                 REQN(i, 3);
 293                 n++;
 294                 REQ(n, RHS);
 295                 compile_rhs(ll, nf, n, pa, pb);
 296                 n++;
 297                 REQ(n, RPAR);
 298         }
 299         else if (n->n_type == NAME || n->n_type == STRING) {
 300                 *pa = addnfastate(nf);
 301                 *pb = addnfastate(nf);
 302                 addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str));
 303         }
 304         else
 305                 REQ(n, NAME);
 306 }
 307
 308 static void
 309 dumpstate(labellist *ll, nfa *nf, int istate)
 310 {
 311         nfastate *st;
 312         int i;
 313         nfaarc *ar;
 314
 315         printf("%c%2d%c",
 316                 istate == nf->nf_start ? '*' : ' ',
 317                 istate,
 318                 istate == nf->nf_finish ? '.' : ' ');
 319         st = &nf->nf_state[istate];
 320         ar = st->st_arc;
 321         for (i = 0; i < st->st_narcs; i++) {
 322                 if (i > 0)
 323                         printf("\n    ");
 324                 printf("-> %2d  %s", ar->ar_arrow,
 325                         PyGrammar_LabelRepr(&ll->ll_label[ar->ar_label]));
 326                 ar++;
 327         }
 328         printf("\n");
 329 }
 330
 331 static void
 332 dumpnfa(labellist *ll, nfa *nf)
 333 {
 334         int i;
 335
 336         printf("NFA '%s' has %d states; start %d, finish %d\n",
 337                 nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish);
 338         for (i = 0; i < nf->nf_nstates; i++)
 339                 dumpstate(ll, nf, i);
 340 }
 341
 342
 343 /* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
 344
 345 static void
 346 addclosure(bitset ss, nfa *nf, int istate)
 347 {
 348         if (addbit(ss, istate)) {
 349                 nfastate *st = &nf->nf_state[istate];
 350                 nfaarc *ar = st->st_arc;
 351                 int i;
 352
 353                 for (i = st->st_narcs; --i >= 0; ) {
 354                         if (ar->ar_label == EMPTY)
 355                                 addclosure(ss, nf, ar->ar_arrow);
 356                         ar++;
 357                 }
 358         }
 359 }
 360
 361 typedef struct _ss_arc {
 362         bitset  sa_bitset;
 363         int     sa_arrow;
 364         int     sa_label;
 365 } ss_arc;
 366
 367 typedef struct _ss_state {
 368         bitset  ss_ss;
 369         int     ss_narcs;
 370         ss_arc  *ss_arc;
 371         int     ss_deleted;
 372         int     ss_finish;
 373         int     ss_rename;
 374 } ss_state;
 375
 376 typedef struct _ss_dfa {
 377         int     sd_nstates;
 378         ss_state *sd_state;
 379 } ss_dfa;
 380
 381 /* Forward */
 382 static void printssdfa(int xx_nstates, ss_state *xx_state, int nbits,
 383                        labellist *ll, char *msg);
 384 static void simplify(int xx_nstates, ss_state *xx_state);
 385 static void convert(dfa *d, int xx_nstates, ss_state *xx_state);
 386
 387 static void
 388 makedfa(nfagrammar *gr, nfa *nf, dfa *d)
 389 {
 390         int nbits = nf->nf_nstates;
 391         bitset ss;
 392         int xx_nstates;
 393         ss_state *xx_state, *yy;
 394         ss_arc *zz;
 395         int istate, jstate, iarc, jarc, ibit;
 396         nfastate *st;
 397         nfaarc *ar;
 398
 399         ss = newbitset(nbits);
 400         addclosure(ss, nf, nf->nf_start);
 401         xx_state = PyMem_NEW(ss_state, 1);
 402         if (xx_state == NULL)
 403                 Py_FatalError("no mem for xx_state in makedfa");
 404         xx_nstates = 1;
 405         yy = &xx_state[0];
 406         yy->ss_ss = ss;
 407         yy->ss_narcs = 0;
 408         yy->ss_arc = NULL;
 409         yy->ss_deleted = 0;
 410         yy->ss_finish = testbit(ss, nf->nf_finish);
 411         if (yy->ss_finish)
 412                 printf("Error: nonterminal '%s' may produce empty.\n",
 413                         nf->nf_name);
 414
 415         /* This algorithm is from a book written before
 416            the invention of structured programming... */
 417
 418         /* For each unmarked state... */
 419         for (istate = 0; istate < xx_nstates; ++istate) {
 420                 yy = &xx_state[istate];
 421                 ss = yy->ss_ss;
 422                 /* For all its states... */
 423                 for (ibit = 0; ibit < nf->nf_nstates; ++ibit) {
 424                         if (!testbit(ss, ibit))
 425                                 continue;
 426                         st = &nf->nf_state[ibit];
 427                         /* For all non-empty arcs from this state... */
 428                         for (iarc = 0; iarc < st->st_narcs; iarc++) {
 429                                 ar = &st->st_arc[iarc];
 430                                 if (ar->ar_label == EMPTY)
 431                                         continue;
 432                                 /* Look up in list of arcs from this state */
 433                                 for (jarc = 0; jarc < yy->ss_narcs; ++jarc) {
 434                                         zz = &yy->ss_arc[jarc];
 435                                         if (ar->ar_label == zz->sa_label)
 436                                                 goto found;
 437                                 }
 438                                 /* Add new arc for this state */
 439                                 PyMem_RESIZE(yy->ss_arc, ss_arc,
 440                                              yy->ss_narcs + 1);
 441                                 if (yy->ss_arc == NULL)
 442                                         Py_FatalError("out of mem");
 443                                 zz = &yy->ss_arc[yy->ss_narcs++];
 444                                 zz->sa_label = ar->ar_label;
 445                                 zz->sa_bitset = newbitset(nbits);
 446                                 zz->sa_arrow = -1;
 447                          found: ;
 448                                 /* Add destination */
 449                                 addclosure(zz->sa_bitset, nf, ar->ar_arrow);
 450                         }
 451                 }
 452                 /* Now look up all the arrow states */
 453                 for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) {
 454                         zz = &xx_state[istate].ss_arc[jarc];
 455                         for (jstate = 0; jstate < xx_nstates; jstate++) {
 456                                 if (samebitset(zz->sa_bitset,
 457                                         xx_state[jstate].ss_ss, nbits)) {
 458                                         zz->sa_arrow = jstate;
 459                                         goto done;
 460                                 }
 461                         }
 462                         PyMem_RESIZE(xx_state, ss_state, xx_nstates + 1);
 463                         if (xx_state == NULL)
 464                                 Py_FatalError("out of mem");
 465                         zz->sa_arrow = xx_nstates;
 466                         yy = &xx_state[xx_nstates++];
 467                         yy->ss_ss = zz->sa_bitset;
 468                         yy->ss_narcs = 0;
 469                         yy->ss_arc = NULL;
 470                         yy->ss_deleted = 0;
 471                         yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish);
 472                  done:  ;
 473                 }
 474         }
 475
 476         if (Py_DebugFlag)
 477                 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
 478                                                 "before minimizing");
 479
 480         simplify(xx_nstates, xx_state);
 481
 482         if (Py_DebugFlag)
 483                 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
 484                                                 "after minimizing");
 485
 486         convert(d, xx_nstates, xx_state);
 487
 488         /* XXX cleanup */
 489 }
 490
 491 static void
 492 printssdfa(int xx_nstates, ss_state *xx_state, int nbits,
 493            labellist *ll, char *msg)
 494 {
 495         int i, ibit, iarc;
 496         ss_state *yy;
 497         ss_arc *zz;
 498
 499         printf("Subset DFA %s\n", msg);
 500         for (i = 0; i < xx_nstates; i++) {
 501                 yy = &xx_state[i];
 502                 if (yy->ss_deleted)
 503                         continue;
 504                 printf(" Subset %d", i);
 505                 if (yy->ss_finish)
 506                         printf(" (finish)");
 507                 printf(" { ");
 508                 for (ibit = 0; ibit < nbits; ibit++) {
 509                         if (testbit(yy->ss_ss, ibit))
 510                                 printf("%d ", ibit);
 511                 }
 512                 printf("}\n");
 513                 for (iarc = 0; iarc < yy->ss_narcs; iarc++) {
 514                         zz = &yy->ss_arc[iarc];
 515                         printf("  Arc to state %d, label %s\n",
 516                                 zz->sa_arrow,
 517                                 PyGrammar_LabelRepr(
 518                                         &ll->ll_label[zz->sa_label]));
 519                 }
 520         }
 521 }
 522
 523
 524 /* PART THREE -- SIMPLIFY DFA */
 525
  526 /* Simplify the DFA by repeatedly eliminating states that are
  527    equivalent to another one.  This is NOT Algorithm 3.3 from
  528    [Aho&Ullman 77].  It does not always find the minimal DFA,
  529    but it does usually make a much smaller one...  (For an example
  530    of sub-optimal behavior, try S: x a b+ | y a b+.)
  531 */
 532
 533 static int
 534 samestate(ss_state *s1, ss_state *s2)
 535 {
 536         int i;
 537
 538         if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish)
 539                 return 0;
 540         for (i = 0; i < s1->ss_narcs; i++) {
 541                 if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow ||
 542                         s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label)
 543                         return 0;
 544         }
 545         return 1;
 546 }
 547
 548 static void
 549 renamestates(int xx_nstates, ss_state *xx_state, int from, int to)
 550 {
 551         int i, j;
 552
 553         if (Py_DebugFlag)
 554                 printf("Rename state %d to %d.\n", from, to);
 555         for (i = 0; i < xx_nstates; i++) {
 556                 if (xx_state[i].ss_deleted)
 557                         continue;
 558                 for (j = 0; j < xx_state[i].ss_narcs; j++) {
 559                         if (xx_state[i].ss_arc[j].sa_arrow == from)
 560                                 xx_state[i].ss_arc[j].sa_arrow = to;
 561                 }
 562         }
 563 }
 564
 565 static void
 566 simplify(int xx_nstates, ss_state *xx_state)
 567 {
 568         int changes;
 569         int i, j;
 570
 571         do {
 572                 changes = 0;
 573                 for (i = 1; i < xx_nstates; i++) {
 574                         if (xx_state[i].ss_deleted)
 575                                 continue;
 576                         for (j = 0; j < i; j++) {
 577                                 if (xx_state[j].ss_deleted)
 578                                         continue;
 579                                 if (samestate(&xx_state[i], &xx_state[j])) {
 580                                         xx_state[i].ss_deleted++;
 581                                         renamestates(xx_nstates, xx_state,
 582                                                      i, j);
 583                                         changes++;
 584                                         break;
 585                                 }
 586                         }
 587                 }
 588         } while (changes);
 589 }
 590
 591
 592 /* PART FOUR -- GENERATE PARSING TABLES */
 593
 594 /* Convert the DFA into a grammar that can be used by our parser */
 595
 596 static void
 597 convert(dfa *d, int xx_nstates, ss_state *xx_state)
 598 {
 599         int i, j;
 600         ss_state *yy;
 601         ss_arc *zz;
 602
 603         for (i = 0; i < xx_nstates; i++) {
 604                 yy = &xx_state[i];
 605                 if (yy->ss_deleted)
 606                         continue;
 607                 yy->ss_rename = addstate(d);
 608         }
 609
 610         for (i = 0; i < xx_nstates; i++) {
 611                 yy = &xx_state[i];
 612                 if (yy->ss_deleted)
 613                         continue;
 614                 for (j = 0; j < yy->ss_narcs; j++) {
 615                         zz = &yy->ss_arc[j];
 616                         addarc(d, yy->ss_rename,
 617                                 xx_state[zz->sa_arrow].ss_rename,
 618                                 zz->sa_label);
 619                 }
 620                 if (yy->ss_finish)
 621                         addarc(d, yy->ss_rename, yy->ss_rename, 0);
 622         }
 623
 624         d->d_initial = 0;
 625 }
 626
 627
 628 /* PART FIVE -- GLUE IT ALL TOGETHER */
 629
 630 static grammar *
 631 maketables(nfagrammar *gr)
 632 {
 633         int i;
 634         nfa *nf;
 635         dfa *d;
 636         grammar *g;
 637
 638         if (gr->gr_nnfas == 0)
 639                 return NULL;
 640         g = newgrammar(gr->gr_nfa[0]->nf_type);
 641                         /* XXX first rule must be start rule */
 642         g->g_ll = gr->gr_ll;
 643
 644         for (i = 0; i < gr->gr_nnfas; i++) {
 645                 nf = gr->gr_nfa[i];
 646                 if (Py_DebugFlag) {
 647                         printf("Dump of NFA for '%s' ...\n", nf->nf_name);
 648                         dumpnfa(&gr->gr_ll, nf);
 649                         printf("Making DFA for '%s' ...\n", nf->nf_name);
 650                 }
 651                 d = adddfa(g, nf->nf_type, nf->nf_name);
 652                 makedfa(gr, gr->gr_nfa[i], d);
 653         }
 654
 655         return g;
 656 }
 657
 658 grammar *
 659 pgen(node *n)
 660 {
 661         nfagrammar *gr;
 662         grammar *g;
 663
 664         gr = metacompile(n);
 665         g = maketables(gr);
 666         translatelabels(g);
 667         addfirstsets(g);
 668         return g;
 669 }
 670
 671
 672 /*
 673
 674 Description
 675 -----------
 676
 677 Input is a grammar in extended BNF (using * for repetition, + for
 678 at-least-once repetition, [] for optional parts, | for alternatives and
 679 () for grouping).  This has already been parsed and turned into a parse
 680 tree.
 681
 682 Each rule is considered as a regular expression in its own right.
 683 It is turned into a Non-deterministic Finite Automaton (NFA), which
 684 is then turned into a Deterministic Finite Automaton (DFA), which is then
 685 optimized to reduce the number of states.  See [Aho&Ullman 77] chapter 3,
 686 or similar compiler books (this technique is more often used for lexical
 687 analyzers).
 688
 689 The DFA's are used by the parser as parsing tables in a special way
 690 that's probably unique.  Before they are usable, the FIRST sets of all
 691 non-terminals are computed.
 692
 693 Reference
 694 ---------
 695
 696 [Aho&Ullman 77]
 697         Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
 698         (first edition)
 699
 700 */