src/backend/regex/regc_nfa.c

   1 /*
   2  * NFA utilities.
   3  * This file is #included by regcomp.c.
   4  *
   5  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
   6  *
   7  * Development of this software was funded, in part, by Cray Research Inc.,
   8  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
   9  * Corporation, none of whom are responsible for the results.  The author
  10  * thanks all of them.
  11  *
  12  * Redistribution and use in source and binary forms -- with or without
  13  * modification -- are permitted for any purpose, provided that
  14  * redistributions in source form retain this entire copyright notice and
  15  * indicate the origin and nature of any modifications.
  16  *
  17  * I'd appreciate being given credit for this package in the documentation
  18  * of software which uses it, but that is not a requirement.
  19  *
  20  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
  21  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
  22  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
  23  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  29  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30  *
  31  * src/backend/regex/regc_nfa.c
  32  *
  33  *
  34  * One or two things that technically ought to be in here
  35  * are actually in color.c, thanks to some incestuous relationships in
  36  * the color chains.
  37  */
  38
  39 #define NISERR()        VISERR(nfa->v)
  40 #define NERR(e)         VERR(nfa->v, (e))
  41
  42
  43 /*
  44  * newnfa - set up an NFA
  45  */
  46 static struct nfa *                             /* the NFA, or NULL */
  47 newnfa(struct vars *v,
  48            struct colormap *cm,
  49            struct nfa *parent)          /* NULL if primary NFA */
  50 {
  51         struct nfa *nfa;
  52
  53         nfa = (struct nfa *) MALLOC(sizeof(struct nfa));
  54         if (nfa == NULL)
  55         {
  56                 ERR(REG_ESPACE);
  57                 return NULL;
  58         }
  59
  60         /* Make the NFA minimally valid, so freenfa() will behave sanely */
  61         nfa->states = NULL;
  62         nfa->slast = NULL;
  63         nfa->freestates = NULL;
  64         nfa->freearcs = NULL;
  65         nfa->lastsb = NULL;
  66         nfa->lastab = NULL;
  67         nfa->lastsbused = 0;
  68         nfa->lastabused = 0;
  69         nfa->nstates = 0;
  70         nfa->cm = cm;
  71         nfa->v = v;
  72         nfa->bos[0] = nfa->bos[1] = COLORLESS;
  73         nfa->eos[0] = nfa->eos[1] = COLORLESS;
  74         nfa->flags = 0;
  75         nfa->minmatchall = nfa->maxmatchall = -1;
  76         nfa->parent = parent;           /* Precedes newfstate so parent is valid. */
  77
  78         /* Create required infrastructure */
  79         nfa->post = newfstate(nfa, '@');        /* number 0 */
  80         nfa->pre = newfstate(nfa, '>'); /* number 1 */
  81         nfa->init = newstate(nfa);      /* may become invalid later */
  82         nfa->final = newstate(nfa);
  83         if (ISERR())
  84         {
  85                 freenfa(nfa);
  86                 return NULL;
  87         }
  88         rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->pre, nfa->init);
  89         newarc(nfa, '^', 1, nfa->pre, nfa->init);
  90         newarc(nfa, '^', 0, nfa->pre, nfa->init);
  91         rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->final, nfa->post);
  92         newarc(nfa, '$', 1, nfa->final, nfa->post);
  93         newarc(nfa, '$', 0, nfa->final, nfa->post);
  94
  95         if (ISERR())
  96         {
  97                 freenfa(nfa);
  98                 return NULL;
  99         }
 100         return nfa;
 101 }
 102
 103 /*
 104  * freenfa - free an entire NFA
 105  */
 106 static void
 107 freenfa(struct nfa *nfa)
 108 {
 109         struct statebatch *sb;
 110         struct statebatch *sbnext;
 111         struct arcbatch *ab;
 112         struct arcbatch *abnext;
 113
 114         for (sb = nfa->lastsb; sb != NULL; sb = sbnext)
 115         {
 116                 sbnext = sb->next;
 117                 nfa->v->spaceused -= STATEBATCHSIZE(sb->nstates);
 118                 FREE(sb);
 119         }
 120         nfa->lastsb = NULL;
 121         for (ab = nfa->lastab; ab != NULL; ab = abnext)
 122         {
 123                 abnext = ab->next;
 124                 nfa->v->spaceused -= ARCBATCHSIZE(ab->narcs);
 125                 FREE(ab);
 126         }
 127         nfa->lastab = NULL;
 128
 129         nfa->nstates = -1;
 130         FREE(nfa);
 131 }
 132
 133 /*
 134  * newstate - allocate an NFA state, with zero flag value
 135  */
 136 static struct state *                   /* NULL on error */
 137 newstate(struct nfa *nfa)
 138 {
 139         struct state *s;
 140
 141         /*
 142          * This is a handy place to check for operation cancel during regex
 143          * compilation, since no code path will go very long without making a new
 144          * state or arc.
 145          */
 146         INTERRUPT(nfa->v->re);
 147
 148         /* first, recycle anything that's on the freelist */
 149         if (nfa->freestates != NULL)
 150         {
 151                 s = nfa->freestates;
 152                 nfa->freestates = s->next;
 153         }
 154         /* otherwise, is there anything left in the last statebatch? */
 155         else if (nfa->lastsb != NULL && nfa->lastsbused < nfa->lastsb->nstates)
 156         {
 157                 s = &nfa->lastsb->s[nfa->lastsbused++];
 158         }
 159         /* otherwise, need to allocate a new statebatch */
 160         else
 161         {
 162                 struct statebatch *newSb;
 163                 size_t          nstates;
 164
 165                 if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
 166                 {
 167                         NERR(REG_ETOOBIG);
 168                         return NULL;
 169                 }
 170                 nstates = (nfa->lastsb != NULL) ? nfa->lastsb->nstates * 2 : FIRSTSBSIZE;
 171                 if (nstates > MAXSBSIZE)
 172                         nstates = MAXSBSIZE;
 173                 newSb = (struct statebatch *) MALLOC(STATEBATCHSIZE(nstates));
 174                 if (newSb == NULL)
 175                 {
 176                         NERR(REG_ESPACE);
 177                         return NULL;
 178                 }
 179                 nfa->v->spaceused += STATEBATCHSIZE(nstates);
 180                 newSb->nstates = nstates;
 181                 newSb->next = nfa->lastsb;
 182                 nfa->lastsb = newSb;
 183                 nfa->lastsbused = 1;
 184                 s = &newSb->s[0];
 185         }
 186
 187         assert(nfa->nstates >= 0);
 188         s->no = nfa->nstates++;
 189         s->flag = 0;
 190         if (nfa->states == NULL)
 191                 nfa->states = s;
 192         s->nins = 0;
 193         s->ins = NULL;
 194         s->nouts = 0;
 195         s->outs = NULL;
 196         s->tmp = NULL;
 197         s->next = NULL;
 198         if (nfa->slast != NULL)
 199         {
 200                 assert(nfa->slast->next == NULL);
 201                 nfa->slast->next = s;
 202         }
 203         s->prev = nfa->slast;
 204         nfa->slast = s;
 205         return s;
 206 }
 207
 208 /*
 209  * newfstate - allocate an NFA state with a specified flag value
 210  */
 211 static struct state *                   /* NULL on error */
 212 newfstate(struct nfa *nfa, int flag)
 213 {
 214         struct state *s;
 215
 216         s = newstate(nfa);
 217         if (s != NULL)
 218                 s->flag = (char) flag;
 219         return s;
 220 }
 221
 222 /*
 223  * dropstate - delete a state's inarcs and outarcs and free it
 224  */
 225 static void
 226 dropstate(struct nfa *nfa,
 227                   struct state *s)
 228 {
 229         struct arc *a;
 230
 231         while ((a = s->ins) != NULL)
 232                 freearc(nfa, a);
 233         while ((a = s->outs) != NULL)
 234                 freearc(nfa, a);
 235         freestate(nfa, s);
 236 }
 237
 238 /*
 239  * freestate - free a state, which has no in-arcs or out-arcs
 240  */
 241 static void
 242 freestate(struct nfa *nfa,
 243                   struct state *s)
 244 {
 245         assert(s != NULL);
 246         assert(s->nins == 0 && s->nouts == 0);
 247
 248         s->no = FREESTATE;
 249         s->flag = 0;
 250         if (s->next != NULL)
 251                 s->next->prev = s->prev;
 252         else
 253         {
 254                 assert(s == nfa->slast);
 255                 nfa->slast = s->prev;
 256         }
 257         if (s->prev != NULL)
 258                 s->prev->next = s->next;
 259         else
 260         {
 261                 assert(s == nfa->states);
 262                 nfa->states = s->next;
 263         }
 264         s->prev = NULL;
 265         s->next = nfa->freestates;      /* don't delete it, put it on the free list */
 266         nfa->freestates = s;
 267 }
 268
 269 /*
 270  * newarc - set up a new arc within an NFA
 271  *
 272  * This function checks to make sure that no duplicate arcs are created.
 273  * In general we never want duplicates.
 274  *
 275  * However: in principle, a RAINBOW arc is redundant with any plain arc
 276  * (unless that arc is for a pseudocolor).  But we don't try to recognize
 277  * that redundancy, either here or in allied operations such as moveins().
 278  * The pseudocolor consideration makes that more costly than it seems worth.
 279  */
 280 static void
 281 newarc(struct nfa *nfa,
 282            int t,
 283            color co,
 284            struct state *from,
 285            struct state *to)
 286 {
 287         struct arc *a;
 288
 289         assert(from != NULL && to != NULL);
 290
 291         /*
 292          * This is a handy place to check for operation cancel during regex
 293          * compilation, since no code path will go very long without making a new
 294          * state or arc.
 295          */
 296         INTERRUPT(nfa->v->re);
 297
 298         /* check for duplicate arc, using whichever chain is shorter */
 299         if (from->nouts <= to->nins)
 300         {
 301                 for (a = from->outs; a != NULL; a = a->outchain)
 302                         if (a->to == to && a->co == co && a->type == t)
 303                                 return;
 304         }
 305         else
 306         {
 307                 for (a = to->ins; a != NULL; a = a->inchain)
 308                         if (a->from == from && a->co == co && a->type == t)
 309                                 return;
 310         }
 311
 312         /* no dup, so create the arc */
 313         createarc(nfa, t, co, from, to);
 314 }
 315
 316 /*
 317  * createarc - create a new arc within an NFA
 318  *
 319  * This function must *only* be used after verifying that there is no existing
 320  * identical arc (same type/color/from/to).
 321  */
 322 static void
 323 createarc(struct nfa *nfa,
 324                   int t,
 325                   color co,
 326                   struct state *from,
 327                   struct state *to)
 328 {
 329         struct arc *a;
 330
 331         a = allocarc(nfa);
 332         if (NISERR())
 333                 return;
 334         assert(a != NULL);
 335
 336         a->type = t;
 337         a->co = co;
 338         a->to = to;
 339         a->from = from;
 340
 341         /*
 342          * Put the new arc on the beginning, not the end, of the chains; it's
 343          * simpler here, and freearc() is the same cost either way.  See also the
 344          * logic in moveins() and its cohorts, as well as fixempties().
 345          */
 346         a->inchain = to->ins;
 347         a->inchainRev = NULL;
 348         if (to->ins)
 349                 to->ins->inchainRev = a;
 350         to->ins = a;
 351         a->outchain = from->outs;
 352         a->outchainRev = NULL;
 353         if (from->outs)
 354                 from->outs->outchainRev = a;
 355         from->outs = a;
 356
 357         from->nouts++;
 358         to->nins++;
 359
 360         if (COLORED(a) && nfa->parent == NULL)
 361                 colorchain(nfa->cm, a);
 362 }
 363
 364 /*
 365  * allocarc - allocate a new arc within an NFA
 366  */
 367 static struct arc *                             /* NULL for failure */
 368 allocarc(struct nfa *nfa)
 369 {
 370         struct arc *a;
 371
 372         /* first, recycle anything that's on the freelist */
 373         if (nfa->freearcs != NULL)
 374         {
 375                 a = nfa->freearcs;
 376                 nfa->freearcs = a->freechain;
 377         }
 378         /* otherwise, is there anything left in the last arcbatch? */
 379         else if (nfa->lastab != NULL && nfa->lastabused < nfa->lastab->narcs)
 380         {
 381                 a = &nfa->lastab->a[nfa->lastabused++];
 382         }
 383         /* otherwise, need to allocate a new arcbatch */
 384         else
 385         {
 386                 struct arcbatch *newAb;
 387                 size_t          narcs;
 388
 389                 if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
 390                 {
 391                         NERR(REG_ETOOBIG);
 392                         return NULL;
 393                 }
 394                 narcs = (nfa->lastab != NULL) ? nfa->lastab->narcs * 2 : FIRSTABSIZE;
 395                 if (narcs > MAXABSIZE)
 396                         narcs = MAXABSIZE;
 397                 newAb = (struct arcbatch *) MALLOC(ARCBATCHSIZE(narcs));
 398                 if (newAb == NULL)
 399                 {
 400                         NERR(REG_ESPACE);
 401                         return NULL;
 402                 }
 403                 nfa->v->spaceused += ARCBATCHSIZE(narcs);
 404                 newAb->narcs = narcs;
 405                 newAb->next = nfa->lastab;
 406                 nfa->lastab = newAb;
 407                 nfa->lastabused = 1;
 408                 a = &newAb->a[0];
 409         }
 410
 411         return a;
 412 }
 413
 414 /*
 415  * freearc - free an arc
 416  */
 417 static void
 418 freearc(struct nfa *nfa,
 419                 struct arc *victim)
 420 {
 421         struct state *from = victim->from;
 422         struct state *to = victim->to;
 423         struct arc *predecessor;
 424
 425         assert(victim->type != 0);
 426
 427         /* take it off color chain if necessary */
 428         if (COLORED(victim) && nfa->parent == NULL)
 429                 uncolorchain(nfa->cm, victim);
 430
 431         /* take it off source's out-chain */
 432         assert(from != NULL);
 433         predecessor = victim->outchainRev;
 434         if (predecessor == NULL)
 435         {
 436                 assert(from->outs == victim);
 437                 from->outs = victim->outchain;
 438         }
 439         else
 440         {
 441                 assert(predecessor->outchain == victim);
 442                 predecessor->outchain = victim->outchain;
 443         }
 444         if (victim->outchain != NULL)
 445         {
 446                 assert(victim->outchain->outchainRev == victim);
 447                 victim->outchain->outchainRev = predecessor;
 448         }
 449         from->nouts--;
 450
 451         /* take it off target's in-chain */
 452         assert(to != NULL);
 453         predecessor = victim->inchainRev;
 454         if (predecessor == NULL)
 455         {
 456                 assert(to->ins == victim);
 457                 to->ins = victim->inchain;
 458         }
 459         else
 460         {
 461                 assert(predecessor->inchain == victim);
 462                 predecessor->inchain = victim->inchain;
 463         }
 464         if (victim->inchain != NULL)
 465         {
 466                 assert(victim->inchain->inchainRev == victim);
 467                 victim->inchain->inchainRev = predecessor;
 468         }
 469         to->nins--;
 470
 471         /* clean up and place on NFA's free list */
 472         victim->type = 0;
 473         victim->from = NULL;            /* precautions... */
 474         victim->to = NULL;
 475         victim->inchain = NULL;
 476         victim->inchainRev = NULL;
 477         victim->outchain = NULL;
 478         victim->outchainRev = NULL;
 479         victim->freechain = nfa->freearcs;
 480         nfa->freearcs = victim;
 481 }
 482
 483 /*
 484  * changearcsource - flip an arc to have a different from state
 485  *
 486  * Caller must have verified that there is no pre-existing duplicate arc.
 487  */
 488 static void
 489 changearcsource(struct arc *a, struct state *newfrom)
 490 {
 491         struct state *oldfrom = a->from;
 492         struct arc *predecessor;
 493
 494         assert(oldfrom != newfrom);
 495
 496         /* take it off old source's out-chain */
 497         assert(oldfrom != NULL);
 498         predecessor = a->outchainRev;
 499         if (predecessor == NULL)
 500         {
 501                 assert(oldfrom->outs == a);
 502                 oldfrom->outs = a->outchain;
 503         }
 504         else
 505         {
 506                 assert(predecessor->outchain == a);
 507                 predecessor->outchain = a->outchain;
 508         }
 509         if (a->outchain != NULL)
 510         {
 511                 assert(a->outchain->outchainRev == a);
 512                 a->outchain->outchainRev = predecessor;
 513         }
 514         oldfrom->nouts--;
 515
 516         a->from = newfrom;
 517
 518         /* prepend it to new source's out-chain */
 519         a->outchain = newfrom->outs;
 520         a->outchainRev = NULL;
 521         if (newfrom->outs)
 522                 newfrom->outs->outchainRev = a;
 523         newfrom->outs = a;
 524         newfrom->nouts++;
 525 }
 526
 527 /*
 528  * changearctarget - flip an arc to have a different to state
 529  *
 530  * Caller must have verified that there is no pre-existing duplicate arc.
 531  */
 532 static void
 533 changearctarget(struct arc *a, struct state *newto)
 534 {
 535         struct state *oldto = a->to;
 536         struct arc *predecessor;
 537
 538         assert(oldto != newto);
 539
 540         /* take it off old target's in-chain */
 541         assert(oldto != NULL);
 542         predecessor = a->inchainRev;
 543         if (predecessor == NULL)
 544         {
 545                 assert(oldto->ins == a);
 546                 oldto->ins = a->inchain;
 547         }
 548         else
 549         {
 550                 assert(predecessor->inchain == a);
 551                 predecessor->inchain = a->inchain;
 552         }
 553         if (a->inchain != NULL)
 554         {
 555                 assert(a->inchain->inchainRev == a);
 556                 a->inchain->inchainRev = predecessor;
 557         }
 558         oldto->nins--;
 559
 560         a->to = newto;
 561
 562         /* prepend it to new target's in-chain */
 563         a->inchain = newto->ins;
 564         a->inchainRev = NULL;
 565         if (newto->ins)
 566                 newto->ins->inchainRev = a;
 567         newto->ins = a;
 568         newto->nins++;
 569 }
 570
 571 /*
 572  * hasnonemptyout - Does state have a non-EMPTY out arc?
 573  */
 574 static int
 575 hasnonemptyout(struct state *s)
 576 {
 577         struct arc *a;
 578
 579         for (a = s->outs; a != NULL; a = a->outchain)
 580         {
 581                 if (a->type != EMPTY)
 582                         return 1;
 583         }
 584         return 0;
 585 }
 586
 587 /*
 588  * findarc - find arc, if any, from given source with given type and color
 589  * If there is more than one such arc, the result is random.
 590  */
 591 static struct arc *
 592 findarc(struct state *s,
 593                 int type,
 594                 color co)
 595 {
 596         struct arc *a;
 597
 598         for (a = s->outs; a != NULL; a = a->outchain)
 599                 if (a->type == type && a->co == co)
 600                         return a;
 601         return NULL;
 602 }
 603
 604 /*
 605  * cparc - allocate a new arc within an NFA, copying details from old one
 606  */
 607 static void
 608 cparc(struct nfa *nfa,
 609           struct arc *oa,
 610           struct state *from,
 611           struct state *to)
 612 {
 613         newarc(nfa, oa->type, oa->co, from, to);
 614 }
 615
 616 /*
 617  * sortins - sort the in arcs of a state by from/color/type
 618  */
 619 static void
 620 sortins(struct nfa *nfa,
 621                 struct state *s)
 622 {
 623         struct arc **sortarray;
 624         struct arc *a;
 625         int                     n = s->nins;
 626         int                     i;
 627
 628         if (n <= 1)
 629                 return;                                 /* nothing to do */
 630         /* make an array of arc pointers ... */
 631         sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
 632         if (sortarray == NULL)
 633         {
 634                 NERR(REG_ESPACE);
 635                 return;
 636         }
 637         i = 0;
 638         for (a = s->ins; a != NULL; a = a->inchain)
 639                 sortarray[i++] = a;
 640         assert(i == n);
 641         /* ... sort the array */
 642         qsort(sortarray, n, sizeof(struct arc *), sortins_cmp);
 643         /* ... and rebuild arc list in order */
 644         /* it seems worth special-casing first and last items to simplify loop */
 645         a = sortarray[0];
 646         s->ins = a;
 647         a->inchain = sortarray[1];
 648         a->inchainRev = NULL;
 649         for (i = 1; i < n - 1; i++)
 650         {
 651                 a = sortarray[i];
 652                 a->inchain = sortarray[i + 1];
 653                 a->inchainRev = sortarray[i - 1];
 654         }
 655         a = sortarray[i];
 656         a->inchain = NULL;
 657         a->inchainRev = sortarray[i - 1];
 658         FREE(sortarray);
 659 }
 660
 661 static int
 662 sortins_cmp(const void *a, const void *b)
 663 {
 664         const struct arc *aa = *((const struct arc *const *) a);
 665         const struct arc *bb = *((const struct arc *const *) b);
 666
 667         /* we check the fields in the order they are most likely to be different */
 668         if (aa->from->no < bb->from->no)
 669                 return -1;
 670         if (aa->from->no > bb->from->no)
 671                 return 1;
 672         if (aa->co < bb->co)
 673                 return -1;
 674         if (aa->co > bb->co)
 675                 return 1;
 676         if (aa->type < bb->type)
 677                 return -1;
 678         if (aa->type > bb->type)
 679                 return 1;
 680         return 0;
 681 }
 682
 683 /*
 684  * sortouts - sort the out arcs of a state by to/color/type
 685  */
 686 static void
 687 sortouts(struct nfa *nfa,
 688                  struct state *s)
 689 {
 690         struct arc **sortarray;
 691         struct arc *a;
 692         int                     n = s->nouts;
 693         int                     i;
 694
 695         if (n <= 1)
 696                 return;                                 /* nothing to do */
 697         /* make an array of arc pointers ... */
 698         sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
 699         if (sortarray == NULL)
 700         {
 701                 NERR(REG_ESPACE);
 702                 return;
 703         }
 704         i = 0;
 705         for (a = s->outs; a != NULL; a = a->outchain)
 706                 sortarray[i++] = a;
 707         assert(i == n);
 708         /* ... sort the array */
 709         qsort(sortarray, n, sizeof(struct arc *), sortouts_cmp);
 710         /* ... and rebuild arc list in order */
 711         /* it seems worth special-casing first and last items to simplify loop */
 712         a = sortarray[0];
 713         s->outs = a;
 714         a->outchain = sortarray[1];
 715         a->outchainRev = NULL;
 716         for (i = 1; i < n - 1; i++)
 717         {
 718                 a = sortarray[i];
 719                 a->outchain = sortarray[i + 1];
 720                 a->outchainRev = sortarray[i - 1];
 721         }
 722         a = sortarray[i];
 723         a->outchain = NULL;
 724         a->outchainRev = sortarray[i - 1];
 725         FREE(sortarray);
 726 }
 727
 728 static int
 729 sortouts_cmp(const void *a, const void *b)
 730 {
 731         const struct arc *aa = *((const struct arc *const *) a);
 732         const struct arc *bb = *((const struct arc *const *) b);
 733
 734         /* we check the fields in the order they are most likely to be different */
 735         if (aa->to->no < bb->to->no)
 736                 return -1;
 737         if (aa->to->no > bb->to->no)
 738                 return 1;
 739         if (aa->co < bb->co)
 740                 return -1;
 741         if (aa->co > bb->co)
 742                 return 1;
 743         if (aa->type < bb->type)
 744                 return -1;
 745         if (aa->type > bb->type)
 746                 return 1;
 747         return 0;
 748 }
 749
 750 /*
 751  * Common decision logic about whether to use arc-by-arc operations or
 752  * sort/merge.  If there's just a few source arcs we cannot recoup the
 753  * cost of sorting the destination arc list, no matter how large it is.
 754  * Otherwise, limit the number of arc-by-arc comparisons to about 1000
 755  * (a somewhat arbitrary choice, but the breakeven point would probably
 756  * be machine dependent anyway).
 757  */
 758 #define BULK_ARC_OP_USE_SORT(nsrcarcs, ndestarcs) \
 759         ((nsrcarcs) < 4 ? 0 : ((nsrcarcs) > 32 || (ndestarcs) > 32))
 760
 761 /*
 762  * moveins - move all in arcs of a state to another state
 763  *
 764  * You might think this could be done better by just updating the
 765  * existing arcs, and you would be right if it weren't for the need
 766  * for duplicate suppression, which makes it easier to just make new
 767  * ones to exploit the suppression built into newarc.
 768  *
 769  * However, if we have a whole lot of arcs to deal with, retail duplicate
 770  * checks become too slow.  In that case we proceed by sorting and merging
 771  * the arc lists, and then we can indeed just update the arcs in-place.
 772  *
 773  * On the other hand, it's also true that this is frequently called with
 774  * a brand-new newState that has no existing in-arcs.  In that case,
 775  * de-duplication is unnecessary, so we can just blindly move all the arcs.
 776  */
 777 static void
 778 moveins(struct nfa *nfa,
 779                 struct state *oldState,
 780                 struct state *newState)
 781 {
 782         assert(oldState != newState);
 783
 784         if (newState->nins == 0)
 785         {
 786                 /* No need for de-duplication */
 787                 struct arc *a;
 788
 789                 while ((a = oldState->ins) != NULL)
 790                 {
 791                         createarc(nfa, a->type, a->co, a->from, newState);
 792                         freearc(nfa, a);
 793                 }
 794         }
 795         else if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
 796         {
 797                 /* With not too many arcs, just do them one at a time */
 798                 struct arc *a;
 799
 800                 while ((a = oldState->ins) != NULL)
 801                 {
 802                         cparc(nfa, a, a->from, newState);
 803                         freearc(nfa, a);
 804                 }
 805         }
 806         else
 807         {
 808                 /*
 809                  * With many arcs, use a sort-merge approach.  Note changearctarget()
 810                  * will put the arc onto the front of newState's chain, so it does not
 811                  * break our walk through the sorted part of the chain.
 812                  */
 813                 struct arc *oa;
 814                 struct arc *na;
 815
 816                 /*
 817                  * Because we bypass newarc() in this code path, we'd better include a
 818                  * cancel check.
 819                  */
 820                 INTERRUPT(nfa->v->re);
 821
 822                 sortins(nfa, oldState);
 823                 sortins(nfa, newState);
 824                 if (NISERR())
 825                         return;                         /* might have failed to sort */
 826                 oa = oldState->ins;
 827                 na = newState->ins;
 828                 while (oa != NULL && na != NULL)
 829                 {
 830                         struct arc *a = oa;
 831
 832                         switch (sortins_cmp(&oa, &na))
 833                         {
 834                                 case -1:
 835                                         /* newState does not have anything matching oa */
 836                                         oa = oa->inchain;
 837
 838                                         /*
 839                                          * Rather than doing createarc+freearc, we can just unlink
 840                                          * and relink the existing arc struct.
 841                                          */
 842                                         changearctarget(a, newState);
 843                                         break;
 844                                 case 0:
 845                                         /* match, advance in both lists */
 846                                         oa = oa->inchain;
 847                                         na = na->inchain;
 848                                         /* ... and drop duplicate arc from oldState */
 849                                         freearc(nfa, a);
 850                                         break;
 851                                 case +1:
 852                                         /* advance only na; oa might have a match later */
 853                                         na = na->inchain;
 854                                         break;
 855                                 default:
 856                                         assert(NOTREACHED);
 857                         }
 858                 }
 859                 while (oa != NULL)
 860                 {
 861                         /* newState does not have anything matching oa */
 862                         struct arc *a = oa;
 863
 864                         oa = oa->inchain;
 865                         changearctarget(a, newState);
 866                 }
 867         }
 868
 869         assert(oldState->nins == 0);
 870         assert(oldState->ins == NULL);
 871 }
 872
 873 /*
 874  * copyins - copy in arcs of a state to another state
 875  *
 876  * The comments for moveins() apply here as well.  However, in current
 877  * usage, this is *only* called with brand-new target states, so that
 878  * only the "no need for de-duplication" code path is ever reached.
 879  * We keep the rest #ifdef'd out in case it's needed in the future.
 880  */
 881 static void
 882 copyins(struct nfa *nfa,
 883                 struct state *oldState,
 884                 struct state *newState)
 885 {
 886         assert(oldState != newState);
 887         assert(newState->nins == 0);    /* see comment above */
 888
 889         if (newState->nins == 0)
 890         {
 891                 /* No need for de-duplication */
 892                 struct arc *a;
 893
 894                 for (a = oldState->ins; a != NULL; a = a->inchain)
 895                         createarc(nfa, a->type, a->co, a->from, newState);
 896         }
 897 #ifdef NOT_USED                                 /* see comment above */
 898         else if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
 899         {
 900                 /* With not too many arcs, just do them one at a time */
 901                 struct arc *a;
 902
 903                 for (a = oldState->ins; a != NULL; a = a->inchain)
 904                         cparc(nfa, a, a->from, newState);
 905         }
 906         else
 907         {
 908                 /*
 909                  * With many arcs, use a sort-merge approach.  Note that createarc()
 910                  * will put new arcs onto the front of newState's chain, so it does
 911                  * not break our walk through the sorted part of the chain.
 912                  */
 913                 struct arc *oa;
 914                 struct arc *na;
 915
 916                 /*
 917                  * Because we bypass newarc() in this code path, we'd better include a
 918                  * cancel check.
 919                  */
 920                 INTERRUPT(nfa->v->re);
 921
 922                 sortins(nfa, oldState);
 923                 sortins(nfa, newState);
 924                 if (NISERR())
 925                         return;                         /* might have failed to sort */
 926                 oa = oldState->ins;
 927                 na = newState->ins;
 928                 while (oa != NULL && na != NULL)
 929                 {
 930                         struct arc *a = oa;
 931
 932                         switch (sortins_cmp(&oa, &na))
 933                         {
 934                                 case -1:
 935                                         /* newState does not have anything matching oa */
 936                                         oa = oa->inchain;
 937                                         createarc(nfa, a->type, a->co, a->from, newState);
 938                                         break;
 939                                 case 0:
 940                                         /* match, advance in both lists */
 941                                         oa = oa->inchain;
 942                                         na = na->inchain;
 943                                         break;
 944                                 case +1:
 945                                         /* advance only na; oa might have a match later */
 946                                         na = na->inchain;
 947                                         break;
 948                                 default:
 949                                         assert(NOTREACHED);
 950                         }
 951                 }
 952                 while (oa != NULL)
 953                 {
 954                         /* newState does not have anything matching oa */
 955                         struct arc *a = oa;
 956
 957                         oa = oa->inchain;
 958                         createarc(nfa, a->type, a->co, a->from, newState);
 959                 }
 960         }
 961 #endif                                                  /* NOT_USED */
 962 }
 963
 964 /*
 965  * mergeins - merge a list of inarcs into a state
 966  *
 967  * This is much like copyins, but the source arcs are listed in an array,
 968  * and are not guaranteed unique.  It's okay to clobber the array contents.
 969  */
 970 static void
 971 mergeins(struct nfa *nfa,
 972                  struct state *s,
 973                  struct arc **arcarray,
 974                  int arccount)
 975 {
 976         struct arc *na;
 977         int                     i;
 978         int                     j;
 979
 980         if (arccount <= 0)
 981                 return;
 982
 983         /*
 984          * Because we bypass newarc() in this code path, we'd better include a
 985          * cancel check.
 986          */
 987         INTERRUPT(nfa->v->re);
 988
 989         /* Sort existing inarcs as well as proposed new ones */
 990         sortins(nfa, s);
 991         if (NISERR())
 992                 return;                                 /* might have failed to sort */
 993
 994         qsort(arcarray, arccount, sizeof(struct arc *), sortins_cmp);
 995
 996         /*
 997          * arcarray very likely includes dups, so we must eliminate them.  (This
 998          * could be folded into the next loop, but it's not worth the trouble.)
 999          */
1000         j = 0;
1001         for (i = 1; i < arccount; i++)
1002         {
1003                 switch (sortins_cmp(&arcarray[j], &arcarray[i]))
1004                 {
1005                         case -1:
1006                                 /* non-dup */
1007                                 arcarray[++j] = arcarray[i];
1008                                 break;
1009                         case 0:
1010                                 /* dup */
1011                                 break;
1012                         default:
1013                                 /* trouble */
1014                                 assert(NOTREACHED);
1015                 }
1016         }
1017         arccount = j + 1;
1018
1019         /*
1020          * Now merge into s' inchain.  Note that createarc() will put new arcs
1021          * onto the front of s's chain, so it does not break our walk through the
1022          * sorted part of the chain.
1023          */
1024         i = 0;
1025         na = s->ins;
1026         while (i < arccount && na != NULL)
1027         {
1028                 struct arc *a = arcarray[i];
1029
1030                 switch (sortins_cmp(&a, &na))
1031                 {
1032                         case -1:
1033                                 /* s does not have anything matching a */
1034                                 createarc(nfa, a->type, a->co, a->from, s);
1035                                 i++;
1036                                 break;
1037                         case 0:
1038                                 /* match, advance in both lists */
1039                                 i++;
1040                                 na = na->inchain;
1041                                 break;
1042                         case +1:
1043                                 /* advance only na; array might have a match later */
1044                                 na = na->inchain;
1045                                 break;
1046                         default:
1047                                 assert(NOTREACHED);
1048                 }
1049         }
1050         while (i < arccount)
1051         {
1052                 /* s does not have anything matching a */
1053                 struct arc *a = arcarray[i];
1054
1055                 createarc(nfa, a->type, a->co, a->from, s);
1056                 i++;
1057         }
1058 }
1059
1060 /*
1061  * moveouts - move all out arcs of a state to another state
1062  *
1063  * See comments for moveins()
1064  */
1065 static void
1066 moveouts(struct nfa *nfa,
1067                  struct state *oldState,
1068                  struct state *newState)
1069 {
1070         assert(oldState != newState);
1071
1072         if (newState->nouts == 0)
1073         {
1074                 /* No need for de-duplication */
1075                 struct arc *a;
1076
1077                 while ((a = oldState->outs) != NULL)
1078                 {
1079                         createarc(nfa, a->type, a->co, newState, a->to);
1080                         freearc(nfa, a);
1081                 }
1082         }
1083         else if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
1084         {
1085                 /* With not too many arcs, just do them one at a time */
1086                 struct arc *a;
1087
1088                 while ((a = oldState->outs) != NULL)
1089                 {
1090                         cparc(nfa, a, newState, a->to);
1091                         freearc(nfa, a);
1092                 }
1093         }
1094         else
1095         {
1096                 /*
1097                  * With many arcs, use a sort-merge approach.  Note changearcsource()
1098                  * will put the arc onto the front of newState's chain, so it does not
1099                  * break our walk through the sorted part of the chain.
1100                  */
1101                 struct arc *oa;
1102                 struct arc *na;
1103
1104                 /*
1105                  * Because we bypass newarc() in this code path, we'd better include a
1106                  * cancel check.
1107                  */
1108                 INTERRUPT(nfa->v->re);
1109
1110                 sortouts(nfa, oldState);
1111                 sortouts(nfa, newState);
1112                 if (NISERR())
1113                         return;                         /* might have failed to sort */
1114                 oa = oldState->outs;
1115                 na = newState->outs;
1116                 while (oa != NULL && na != NULL)
1117                 {
1118                         struct arc *a = oa;
1119
1120                         switch (sortouts_cmp(&oa, &na))
1121                         {
1122                                 case -1:
1123                                         /* newState does not have anything matching oa */
1124                                         oa = oa->outchain;
1125
1126                                         /*
1127                                          * Rather than doing createarc+freearc, we can just unlink
1128                                          * and relink the existing arc struct.
1129                                          */
1130                                         changearcsource(a, newState);
1131                                         break;
1132                                 case 0:
1133                                         /* match, advance in both lists */
1134                                         oa = oa->outchain;
1135                                         na = na->outchain;
1136                                         /* ... and drop duplicate arc from oldState */
1137                                         freearc(nfa, a);
1138                                         break;
1139                                 case +1:
1140                                         /* advance only na; oa might have a match later */
1141                                         na = na->outchain;
1142                                         break;
1143                                 default:
1144                                         assert(NOTREACHED);
1145                         }
1146                 }
1147                 while (oa != NULL)
1148                 {
1149                         /* newState does not have anything matching oa */
1150                         struct arc *a = oa;
1151
1152                         oa = oa->outchain;
1153                         changearcsource(a, newState);
1154                 }
1155         }
1156
1157         assert(oldState->nouts == 0);
1158         assert(oldState->outs == NULL);
1159 }
1160
1161 /*
1162  * copyouts - copy out arcs of a state to another state
1163  *
1164  * See comments for copyins()
1165  */
1166 static void
1167 copyouts(struct nfa *nfa,
1168                  struct state *oldState,
1169                  struct state *newState)
1170 {
1171         assert(oldState != newState);
1172         assert(newState->nouts == 0);   /* see comment above */
1173
1174         if (newState->nouts == 0)
1175         {
1176                 /* No need for de-duplication */
1177                 struct arc *a;
1178
1179                 for (a = oldState->outs; a != NULL; a = a->outchain)
1180                         createarc(nfa, a->type, a->co, newState, a->to);
1181         }
1182 #ifdef NOT_USED                                 /* see comment above */
1183         else if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
1184         {
1185                 /* With not too many arcs, just do them one at a time */
1186                 struct arc *a;
1187
1188                 for (a = oldState->outs; a != NULL; a = a->outchain)
1189                         cparc(nfa, a, newState, a->to);
1190         }
1191         else
1192         {
1193                 /*
1194                  * With many arcs, use a sort-merge approach.  Note that createarc()
1195                  * will put new arcs onto the front of newState's chain, so it does
1196                  * not break our walk through the sorted part of the chain.
1197                  */
1198                 struct arc *oa;
1199                 struct arc *na;
1200
1201                 /*
1202                  * Because we bypass newarc() in this code path, we'd better include a
1203                  * cancel check.
1204                  */
1205                 INTERRUPT(nfa->v->re);
1206
1207                 sortouts(nfa, oldState);
1208                 sortouts(nfa, newState);
1209                 if (NISERR())
1210                         return;                         /* might have failed to sort */
1211                 oa = oldState->outs;
1212                 na = newState->outs;
1213                 while (oa != NULL && na != NULL)
1214                 {
1215                         struct arc *a = oa;
1216
1217                         switch (sortouts_cmp(&oa, &na))
1218                         {
1219                                 case -1:
1220                                         /* newState does not have anything matching oa */
1221                                         oa = oa->outchain;
1222                                         createarc(nfa, a->type, a->co, newState, a->to);
1223                                         break;
1224                                 case 0:
1225                                         /* match, advance in both lists */
1226                                         oa = oa->outchain;
1227                                         na = na->outchain;
1228                                         break;
1229                                 case +1:
1230                                         /* advance only na; oa might have a match later */
1231                                         na = na->outchain;
1232                                         break;
1233                                 default:
1234                                         assert(NOTREACHED);
1235                         }
1236                 }
1237                 while (oa != NULL)
1238                 {
1239                         /* newState does not have anything matching oa */
1240                         struct arc *a = oa;
1241
1242                         oa = oa->outchain;
1243                         createarc(nfa, a->type, a->co, newState, a->to);
1244                 }
1245         }
1246 #endif                                                  /* NOT_USED */
1247 }
1248
1249 /*
1250  * cloneouts - copy out arcs of a state to another state pair, modifying type
1251  *
1252  * This is only used to convert PLAIN arcs to AHEAD/BEHIND arcs, which share
1253  * the same interpretation of "co".  It wouldn't be sensible with LACONs.
1254  */
1255 static void
1256 cloneouts(struct nfa *nfa,
1257                   struct state *old,
1258                   struct state *from,
1259                   struct state *to,
1260                   int type)
1261 {
1262         struct arc *a;
1263
1264         assert(old != from);
1265         assert(type == AHEAD || type == BEHIND);
1266
1267         for (a = old->outs; a != NULL; a = a->outchain)
1268         {
1269                 assert(a->type == PLAIN);
1270                 newarc(nfa, type, a->co, from, to);
1271         }
1272 }
1273
1274 /*
1275  * delsub - delete a sub-NFA, updating subre pointers if necessary
1276  *
1277  * This uses a recursive traversal of the sub-NFA, marking already-seen
1278  * states using their tmp pointer.
1279  */
1280 static void
1281 delsub(struct nfa *nfa,
1282            struct state *lp,            /* the sub-NFA goes from here... */
1283            struct state *rp)            /* ...to here, *not* inclusive */
1284 {
1285         assert(lp != rp);
1286
1287         rp->tmp = rp;                           /* mark end */
1288
1289         deltraverse(nfa, lp, lp);
1290         if (NISERR())
1291                 return;                                 /* asserts might not hold after failure */
1292         assert(lp->nouts == 0 && rp->nins == 0);        /* did the job */
1293         assert(lp->no != FREESTATE && rp->no != FREESTATE); /* no more */
1294
1295         rp->tmp = NULL;                         /* unmark end */
1296         lp->tmp = NULL;                         /* and begin, marked by deltraverse */
1297 }
1298
1299 /*
1300  * deltraverse - the recursive heart of delsub
1301  * This routine's basic job is to destroy all out-arcs of the state.
1302  */
1303 static void
1304 deltraverse(struct nfa *nfa,
1305                         struct state *leftend,
1306                         struct state *s)
1307 {
1308         struct arc *a;
1309         struct state *to;
1310
1311         /* Since this is recursive, it could be driven to stack overflow */
1312         if (STACK_TOO_DEEP(nfa->v->re))
1313         {
1314                 NERR(REG_ETOOBIG);
1315                 return;
1316         }
1317
1318         if (s->nouts == 0)
1319                 return;                                 /* nothing to do */
1320         if (s->tmp != NULL)
1321                 return;                                 /* already in progress */
1322
1323         s->tmp = s;                                     /* mark as in progress */
1324
1325         while ((a = s->outs) != NULL)
1326         {
1327                 to = a->to;
1328                 deltraverse(nfa, leftend, to);
1329                 if (NISERR())
1330                         return;                         /* asserts might not hold after failure */
1331                 assert(to->nouts == 0 || to->tmp != NULL);
1332                 freearc(nfa, a);
1333                 if (to->nins == 0 && to->tmp == NULL)
1334                 {
1335                         assert(to->nouts == 0);
1336                         freestate(nfa, to);
1337                 }
1338         }
1339
1340         assert(s->no != FREESTATE); /* we're still here */
1341         assert(s == leftend || s->nins != 0);   /* and still reachable */
1342         assert(s->nouts == 0);          /* but have no outarcs */
1343
1344         s->tmp = NULL;                          /* we're done here */
1345 }
1346
1347 /*
1348  * dupnfa - duplicate sub-NFA
1349  *
1350  * Another recursive traversal, this time using tmp to point to duplicates
1351  * as well as mark already-seen states.  (You knew there was a reason why
1352  * it's a state pointer, didn't you? :-))
1353  */
1354 static void
1355 dupnfa(struct nfa *nfa,
1356            struct state *start,         /* duplicate of subNFA starting here */
1357            struct state *stop,          /* and stopping here */
1358            struct state *from,          /* stringing duplicate from here */
1359            struct state *to)            /* to here */
1360 {
1361         if (start == stop)
1362         {
1363                 newarc(nfa, EMPTY, 0, from, to);
1364                 return;
1365         }
1366
1367         stop->tmp = to;
1368         duptraverse(nfa, start, from);
1369         /* done, except for clearing out the tmp pointers */
1370
1371         stop->tmp = NULL;
1372         cleartraverse(nfa, start);
1373 }
1374
1375 /*
1376  * duptraverse - recursive heart of dupnfa
1377  */
1378 static void
1379 duptraverse(struct nfa *nfa,
1380                         struct state *s,
1381                         struct state *stmp) /* s's duplicate, or NULL */
1382 {
1383         struct arc *a;
1384
1385         /* Since this is recursive, it could be driven to stack overflow */
1386         if (STACK_TOO_DEEP(nfa->v->re))
1387         {
1388                 NERR(REG_ETOOBIG);
1389                 return;
1390         }
1391
1392         if (s->tmp != NULL)
1393                 return;                                 /* already done */
1394
1395         s->tmp = (stmp == NULL) ? newstate(nfa) : stmp;
1396         if (s->tmp == NULL)
1397         {
1398                 assert(NISERR());
1399                 return;
1400         }
1401
1402         for (a = s->outs; a != NULL && !NISERR(); a = a->outchain)
1403         {
1404                 duptraverse(nfa, a->to, (struct state *) NULL);
1405                 if (NISERR())
1406                         break;
1407                 assert(a->to->tmp != NULL);
1408                 cparc(nfa, a, s->tmp, a->to->tmp);
1409         }
1410 }
1411
1412 /*
1413  * removeconstraints - remove any constraints in an NFA
1414  *
1415  * Constraint arcs are replaced by empty arcs, essentially treating all
1416  * constraints as automatically satisfied.
1417  */
1418 static void
1419 removeconstraints(struct nfa *nfa,
1420                                   struct state *start,  /* process subNFA starting here */
1421                                   struct state *stop)   /* and stopping here */
1422 {
1423         if (start == stop)
1424                 return;
1425
1426         stop->tmp = stop;
1427         removetraverse(nfa, start);
1428         /* done, except for clearing out the tmp pointers */
1429
1430         stop->tmp = NULL;
1431         cleartraverse(nfa, start);
1432 }
1433
1434 /*
1435  * removetraverse - recursive heart of removeconstraints
1436  */
1437 static void
1438 removetraverse(struct nfa *nfa,
1439                            struct state *s)
1440 {
1441         struct arc *a;
1442         struct arc *oa;
1443
1444         /* Since this is recursive, it could be driven to stack overflow */
1445         if (STACK_TOO_DEEP(nfa->v->re))
1446         {
1447                 NERR(REG_ETOOBIG);
1448                 return;
1449         }
1450
1451         if (s->tmp != NULL)
1452                 return;                                 /* already done */
1453
1454         s->tmp = s;
1455         for (a = s->outs; a != NULL && !NISERR(); a = oa)
1456         {
1457                 removetraverse(nfa, a->to);
1458                 if (NISERR())
1459                         break;
1460                 oa = a->outchain;
1461                 switch (a->type)
1462                 {
1463                         case PLAIN:
1464                         case EMPTY:
1465                         case CANTMATCH:
1466                                 /* nothing to do */
1467                                 break;
1468                         case AHEAD:
1469                         case BEHIND:
1470                         case '^':
1471                         case '$':
1472                         case LACON:
1473                                 /* replace it */
1474                                 newarc(nfa, EMPTY, 0, s, a->to);
1475                                 freearc(nfa, a);
1476                                 break;
1477                         default:
1478                                 NERR(REG_ASSERT);
1479                                 break;
1480                 }
1481         }
1482 }
1483
1484 /*
1485  * cleartraverse - recursive cleanup for algorithms that leave tmp ptrs set
1486  */
1487 static void
1488 cleartraverse(struct nfa *nfa,
1489                           struct state *s)
1490 {
1491         struct arc *a;
1492
1493         /* Since this is recursive, it could be driven to stack overflow */
1494         if (STACK_TOO_DEEP(nfa->v->re))
1495         {
1496                 NERR(REG_ETOOBIG);
1497                 return;
1498         }
1499
1500         if (s->tmp == NULL)
1501                 return;
1502         s->tmp = NULL;
1503
1504         for (a = s->outs; a != NULL; a = a->outchain)
1505                 cleartraverse(nfa, a->to);
1506 }
1507
1508 /*
1509  * single_color_transition - does getting from s1 to s2 cross one PLAIN arc?
1510  *
1511  * If traversing from s1 to s2 requires a single PLAIN match (possibly of any
1512  * of a set of colors), return a state whose outarc list contains only PLAIN
1513  * arcs of those color(s).  Otherwise return NULL.
1514  *
1515  * This is used before optimizing the NFA, so there may be EMPTY arcs, which
1516  * we should ignore; the possibility of an EMPTY is why the result state could
1517  * be different from s1.
1518  *
1519  * It's worth troubling to handle multiple parallel PLAIN arcs here because a
1520  * bracket construct such as [abc] might yield either one or several parallel
1521  * PLAIN arcs depending on earlier atoms in the expression.  We'd rather that
1522  * that implementation detail not create user-visible performance differences.
1523  */
1524 static struct state *
1525 single_color_transition(struct state *s1, struct state *s2)
1526 {
1527         struct arc *a;
1528
1529         /* Ignore leading EMPTY arc, if any */
1530         if (s1->nouts == 1 && s1->outs->type == EMPTY)
1531                 s1 = s1->outs->to;
1532         /* Likewise for any trailing EMPTY arc */
1533         if (s2->nins == 1 && s2->ins->type == EMPTY)
1534                 s2 = s2->ins->from;
1535         /* Perhaps we could have a single-state loop in between, if so reject */
1536         if (s1 == s2)
1537                 return NULL;
1538         /* s1 must have at least one outarc... */
1539         if (s1->outs == NULL)
1540                 return NULL;
1541         /* ... and they must all be PLAIN arcs to s2 */
1542         for (a = s1->outs; a != NULL; a = a->outchain)
1543         {
1544                 if (a->type != PLAIN || a->to != s2)
1545                         return NULL;
1546         }
1547         /* OK, return s1 as the possessor of the relevant outarcs */
1548         return s1;
1549 }
1550
1551 /*
1552  * specialcolors - fill in special colors for an NFA
1553  */
1554 static void
1555 specialcolors(struct nfa *nfa)
1556 {
1557         /* false colors for BOS, BOL, EOS, EOL */
1558         if (nfa->parent == NULL)
1559         {
1560                 nfa->bos[0] = pseudocolor(nfa->cm);
1561                 nfa->bos[1] = pseudocolor(nfa->cm);
1562                 nfa->eos[0] = pseudocolor(nfa->cm);
1563                 nfa->eos[1] = pseudocolor(nfa->cm);
1564         }
1565         else
1566         {
1567                 assert(nfa->parent->bos[0] != COLORLESS);
1568                 nfa->bos[0] = nfa->parent->bos[0];
1569                 assert(nfa->parent->bos[1] != COLORLESS);
1570                 nfa->bos[1] = nfa->parent->bos[1];
1571                 assert(nfa->parent->eos[0] != COLORLESS);
1572                 nfa->eos[0] = nfa->parent->eos[0];
1573                 assert(nfa->parent->eos[1] != COLORLESS);
1574                 nfa->eos[1] = nfa->parent->eos[1];
1575         }
1576 }
1577
1578 /*
1579  * optimize - optimize an NFA
1580  *
1581  * The main goal of this function is not so much "optimization" (though it
1582  * does try to get rid of useless NFA states) as reducing the NFA to a form
1583  * the regex executor can handle.  The executor, and indeed the cNFA format
1584  * that is its input, can only handle PLAIN and LACON arcs.  The output of
1585  * the regex parser also includes EMPTY (do-nothing) arcs, as well as
1586  * ^, $, AHEAD, and BEHIND constraint arcs, which we must get rid of here.
1587  * We first get rid of EMPTY arcs and then deal with the constraint arcs.
1588  * The hardest part of either job is to get rid of circular loops of the
1589  * target arc type.  We would have to do that in any case, though, as such a
1590  * loop would otherwise allow the executor to cycle through the loop endlessly
1591  * without making any progress in the input string.
1592  */
1593 static long                                             /* re_info bits */
1594 optimize(struct nfa *nfa,
1595                  FILE *f)                               /* for debug output; NULL none */
1596 {
1597 #ifdef REG_DEBUG
1598         int                     verbose = (f != NULL) ? 1 : 0;
1599
1600         if (verbose)
1601                 fprintf(f, "\ninitial cleanup:\n");
1602 #endif
1603         /* If we have any CANTMATCH arcs, drop them; but this is uncommon */
1604         if (nfa->flags & HASCANTMATCH)
1605         {
1606                 removecantmatch(nfa);
1607                 nfa->flags &= ~HASCANTMATCH;
1608         }
1609         cleanup(nfa);                           /* may simplify situation */
1610 #ifdef REG_DEBUG
1611         if (verbose)
1612                 dumpnfa(nfa, f);
1613         if (verbose)
1614                 fprintf(f, "\nempties:\n");
1615 #endif
1616         fixempties(nfa, f);                     /* get rid of EMPTY arcs */
1617 #ifdef REG_DEBUG
1618         if (verbose)
1619                 fprintf(f, "\nconstraints:\n");
1620 #endif
1621         fixconstraintloops(nfa, f); /* get rid of constraint loops */
1622         pullback(nfa, f);                       /* pull back constraints backward */
1623         pushfwd(nfa, f);                        /* push fwd constraints forward */
1624 #ifdef REG_DEBUG
1625         if (verbose)
1626                 fprintf(f, "\nfinal cleanup:\n");
1627 #endif
1628         cleanup(nfa);                           /* final tidying */
1629 #ifdef REG_DEBUG
1630         if (verbose)
1631                 dumpnfa(nfa, f);
1632 #endif
1633         return analyze(nfa);            /* and analysis */
1634 }
1635
1636 /*
1637  * pullback - pull back constraints backward to eliminate them
1638  */
1639 static void
1640 pullback(struct nfa *nfa,
1641                  FILE *f)                               /* for debug output; NULL none */
1642 {
1643         struct state *s;
1644         struct state *nexts;
1645         struct arc *a;
1646         struct arc *nexta;
1647         struct state *intermediates;
1648         int                     progress;
1649
1650         /* find and pull until there are no more */
1651         do
1652         {
1653                 progress = 0;
1654                 for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
1655                 {
1656                         nexts = s->next;
1657                         intermediates = NULL;
1658                         for (a = s->outs; a != NULL && !NISERR(); a = nexta)
1659                         {
1660                                 nexta = a->outchain;
1661                                 if (a->type == '^' || a->type == BEHIND)
1662                                         if (pull(nfa, a, &intermediates))
1663                                                 progress = 1;
1664                         }
1665                         /* clear tmp fields of intermediate states created here */
1666                         while (intermediates != NULL)
1667                         {
1668                                 struct state *ns = intermediates->tmp;
1669
1670                                 intermediates->tmp = NULL;
1671                                 intermediates = ns;
1672                         }
1673                         /* if s is now useless, get rid of it */
1674                         if ((s->nins == 0 || s->nouts == 0) && !s->flag)
1675                                 dropstate(nfa, s);
1676                 }
1677                 if (progress && f != NULL)
1678                         dumpnfa(nfa, f);
1679         } while (progress && !NISERR());
1680         if (NISERR())
1681                 return;
1682
1683         /*
1684          * Any ^ constraints we were able to pull to the start state can now be
1685          * replaced by PLAIN arcs referencing the BOS or BOL colors.  There should
1686          * be no other ^ or BEHIND arcs left in the NFA, though we do not check
1687          * that here (compact() will fail if so).
1688          */
1689         for (a = nfa->pre->outs; a != NULL; a = nexta)
1690         {
1691                 nexta = a->outchain;
1692                 if (a->type == '^')
1693                 {
1694                         assert(a->co == 0 || a->co == 1);
1695                         newarc(nfa, PLAIN, nfa->bos[a->co], a->from, a->to);
1696                         freearc(nfa, a);
1697                 }
1698         }
1699 }
1700
1701 /*
1702  * pull - pull a back constraint backward past its source state
1703  *
1704  * Returns 1 if successful (which it always is unless the source is the
1705  * start state or we have an internal error), 0 if nothing happened.
1706  *
1707  * A significant property of this function is that it deletes no pre-existing
1708  * states, and no outarcs of the constraint's from state other than the given
1709  * constraint arc.  This makes the loops in pullback() safe, at the cost that
1710  * we may leave useless states behind.  Therefore, we leave it to pullback()
1711  * to delete such states.
1712  *
1713  * If the from state has multiple back-constraint outarcs, and/or multiple
1714  * compatible constraint inarcs, we only need to create one new intermediate
1715  * state per combination of predecessor and successor states.  *intermediates
1716  * points to a list of such intermediate states for this from state (chained
1717  * through their tmp fields).
1718  */
1719 static int
1720 pull(struct nfa *nfa,
1721          struct arc *con,
1722          struct state **intermediates)
1723 {
1724         struct state *from = con->from;
1725         struct state *to = con->to;
1726         struct arc *a;
1727         struct arc *nexta;
1728         struct state *s;
1729
1730         assert(from != to);                     /* should have gotten rid of this earlier */
1731         if (from->flag)                         /* can't pull back beyond start */
1732                 return 0;
1733         if (from->nins == 0)
1734         {                                                       /* unreachable */
1735                 freearc(nfa, con);
1736                 return 1;
1737         }
1738
1739         /*
1740          * First, clone from state if necessary to avoid other outarcs.  This may
1741          * seem wasteful, but it simplifies the logic, and we'll get rid of the
1742          * clone state again at the bottom.
1743          */
1744         if (from->nouts > 1)
1745         {
1746                 s = newstate(nfa);
1747                 if (NISERR())
1748                         return 0;
1749                 copyins(nfa, from, s);  /* duplicate inarcs */
1750                 cparc(nfa, con, s, to); /* move constraint arc */
1751                 freearc(nfa, con);
1752                 if (NISERR())
1753                         return 0;
1754                 from = s;
1755                 con = from->outs;
1756         }
1757         assert(from->nouts == 1);
1758
1759         /* propagate the constraint into the from state's inarcs */
1760         for (a = from->ins; a != NULL && !NISERR(); a = nexta)
1761         {
1762                 nexta = a->inchain;
1763                 switch (combine(nfa, con, a))
1764                 {
1765                         case INCOMPATIBLE:      /* destroy the arc */
1766                                 freearc(nfa, a);
1767                                 break;
1768                         case SATISFIED:         /* no action needed */
1769                                 break;
1770                         case COMPATIBLE:        /* swap the two arcs, more or less */
1771                                 /* need an intermediate state, but might have one already */
1772                                 for (s = *intermediates; s != NULL; s = s->tmp)
1773                                 {
1774                                         assert(s->nins > 0 && s->nouts > 0);
1775                                         if (s->ins->from == a->from && s->outs->to == to)
1776                                                 break;
1777                                 }
1778                                 if (s == NULL)
1779                                 {
1780                                         s = newstate(nfa);
1781                                         if (NISERR())
1782                                                 return 0;
1783                                         s->tmp = *intermediates;
1784                                         *intermediates = s;
1785                                 }
1786                                 cparc(nfa, con, a->from, s);
1787                                 cparc(nfa, a, s, to);
1788                                 freearc(nfa, a);
1789                                 break;
1790                         case REPLACEARC:        /* replace arc's color */
1791                                 newarc(nfa, a->type, con->co, a->from, to);
1792                                 freearc(nfa, a);
1793                                 break;
1794                         default:
1795                                 assert(NOTREACHED);
1796                                 break;
1797                 }
1798         }
1799
1800         /* remaining inarcs, if any, incorporate the constraint */
1801         moveins(nfa, from, to);
1802         freearc(nfa, con);
1803         /* from state is now useless, but we leave it to pullback() to clean up */
1804         return 1;
1805 }
1806
1807 /*
1808  * pushfwd - push forward constraints forward to eliminate them
1809  */
1810 static void
1811 pushfwd(struct nfa *nfa,
1812                 FILE *f)                                /* for debug output; NULL none */
1813 {
1814         struct state *s;
1815         struct state *nexts;
1816         struct arc *a;
1817         struct arc *nexta;
1818         struct state *intermediates;
1819         int                     progress;
1820
1821         /* find and push until there are no more */
1822         do
1823         {
1824                 progress = 0;
1825                 for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
1826                 {
1827                         nexts = s->next;
1828                         intermediates = NULL;
1829                         for (a = s->ins; a != NULL && !NISERR(); a = nexta)
1830                         {
1831                                 nexta = a->inchain;
1832                                 if (a->type == '$' || a->type == AHEAD)
1833                                         if (push(nfa, a, &intermediates))
1834                                                 progress = 1;
1835                         }
1836                         /* clear tmp fields of intermediate states created here */
1837                         while (intermediates != NULL)
1838                         {
1839                                 struct state *ns = intermediates->tmp;
1840
1841                                 intermediates->tmp = NULL;
1842                                 intermediates = ns;
1843                         }
1844                         /* if s is now useless, get rid of it */
1845                         if ((s->nins == 0 || s->nouts == 0) && !s->flag)
1846                                 dropstate(nfa, s);
1847                 }
1848                 if (progress && f != NULL)
1849                         dumpnfa(nfa, f);
1850         } while (progress && !NISERR());
1851         if (NISERR())
1852                 return;
1853
1854         /*
1855          * Any $ constraints we were able to push to the post state can now be
1856          * replaced by PLAIN arcs referencing the EOS or EOL colors.  There should
1857          * be no other $ or AHEAD arcs left in the NFA, though we do not check
1858          * that here (compact() will fail if so).
1859          */
1860         for (a = nfa->post->ins; a != NULL; a = nexta)
1861         {
1862                 nexta = a->inchain;
1863                 if (a->type == '$')
1864                 {
1865                         assert(a->co == 0 || a->co == 1);
1866                         newarc(nfa, PLAIN, nfa->eos[a->co], a->from, a->to);
1867                         freearc(nfa, a);
1868                 }
1869         }
1870 }
1871
1872 /*
1873  * push - push a forward constraint forward past its destination state
1874  *
1875  * Returns 1 if successful (which it always is unless the destination is the
1876  * post state or we have an internal error), 0 if nothing happened.
1877  *
1878  * A significant property of this function is that it deletes no pre-existing
1879  * states, and no inarcs of the constraint's to state other than the given
1880  * constraint arc.  This makes the loops in pushfwd() safe, at the cost that
1881  * we may leave useless states behind.  Therefore, we leave it to pushfwd()
1882  * to delete such states.
1883  *
1884  * If the to state has multiple forward-constraint inarcs, and/or multiple
1885  * compatible constraint outarcs, we only need to create one new intermediate
1886  * state per combination of predecessor and successor states.  *intermediates
1887  * points to a list of such intermediate states for this to state (chained
1888  * through their tmp fields).
1889  */
1890 static int
1891 push(struct nfa *nfa,
1892          struct arc *con,
1893          struct state **intermediates)
1894 {
1895         struct state *from = con->from;
1896         struct state *to = con->to;
1897         struct arc *a;
1898         struct arc *nexta;
1899         struct state *s;
1900
1901         assert(to != from);                     /* should have gotten rid of this earlier */
1902         if (to->flag)                           /* can't push forward beyond end */
1903                 return 0;
1904         if (to->nouts == 0)
1905         {                                                       /* dead end */
1906                 freearc(nfa, con);
1907                 return 1;
1908         }
1909
1910         /*
1911          * First, clone to state if necessary to avoid other inarcs.  This may
1912          * seem wasteful, but it simplifies the logic, and we'll get rid of the
1913          * clone state again at the bottom.
1914          */
1915         if (to->nins > 1)
1916         {
1917                 s = newstate(nfa);
1918                 if (NISERR())
1919                         return 0;
1920                 copyouts(nfa, to, s);   /* duplicate outarcs */
1921                 cparc(nfa, con, from, s);       /* move constraint arc */
1922                 freearc(nfa, con);
1923                 if (NISERR())
1924                         return 0;
1925                 to = s;
1926                 con = to->ins;
1927         }
1928         assert(to->nins == 1);
1929
1930         /* propagate the constraint into the to state's outarcs */
1931         for (a = to->outs; a != NULL && !NISERR(); a = nexta)
1932         {
1933                 nexta = a->outchain;
1934                 switch (combine(nfa, con, a))
1935                 {
1936                         case INCOMPATIBLE:      /* destroy the arc */
1937                                 freearc(nfa, a);
1938                                 break;
1939                         case SATISFIED:         /* no action needed */
1940                                 break;
1941                         case COMPATIBLE:        /* swap the two arcs, more or less */
1942                                 /* need an intermediate state, but might have one already */
1943                                 for (s = *intermediates; s != NULL; s = s->tmp)
1944                                 {
1945                                         assert(s->nins > 0 && s->nouts > 0);
1946                                         if (s->ins->from == from && s->outs->to == a->to)
1947                                                 break;
1948                                 }
1949                                 if (s == NULL)
1950                                 {
1951                                         s = newstate(nfa);
1952                                         if (NISERR())
1953                                                 return 0;
1954                                         s->tmp = *intermediates;
1955                                         *intermediates = s;
1956                                 }
1957                                 cparc(nfa, con, s, a->to);
1958                                 cparc(nfa, a, from, s);
1959                                 freearc(nfa, a);
1960                                 break;
1961                         case REPLACEARC:        /* replace arc's color */
1962                                 newarc(nfa, a->type, con->co, from, a->to);
1963                                 freearc(nfa, a);
1964                                 break;
1965                         default:
1966                                 assert(NOTREACHED);
1967                                 break;
1968                 }
1969         }
1970
1971         /* remaining outarcs, if any, incorporate the constraint */
1972         moveouts(nfa, to, from);
1973         freearc(nfa, con);
1974         /* to state is now useless, but we leave it to pushfwd() to clean up */
1975         return 1;
1976 }
1977
1978 /*
1979  * combine - constraint lands on an arc, what happens?
1980  *
1981  * #def INCOMPATIBLE    1       // destroys arc
1982  * #def SATISFIED               2       // constraint satisfied
1983  * #def COMPATIBLE              3       // compatible but not satisfied yet
1984  * #def REPLACEARC              4       // replace arc's color with constraint color
1985  */
1986 static int
1987 combine(struct nfa *nfa,
1988                 struct arc *con,
1989                 struct arc *a)
1990 {
1991 #define  CA(ct,at)       (((ct)<<CHAR_BIT) | (at))
1992
1993         switch (CA(con->type, a->type))
1994         {
1995                 case CA('^', PLAIN):    /* newlines are handled separately */
1996                 case CA('$', PLAIN):
1997                         return INCOMPATIBLE;
1998                         break;
1999                 case CA(AHEAD, PLAIN):  /* color constraints meet colors */
2000                 case CA(BEHIND, PLAIN):
2001                         if (con->co == a->co)
2002                                 return SATISFIED;
2003                         if (con->co == RAINBOW)
2004                         {
2005                                 /* con is satisfied unless arc's color is a pseudocolor */
2006                                 if (!(nfa->cm->cd[a->co].flags & PSEUDO))
2007                                         return SATISFIED;
2008                         }
2009                         else if (a->co == RAINBOW)
2010                         {
2011                                 /* con is incompatible if it's for a pseudocolor */
2012                                 /* (this is hypothetical; we make no such constraints today) */
2013                                 if (nfa->cm->cd[con->co].flags & PSEUDO)
2014                                         return INCOMPATIBLE;
2015                                 /* otherwise, constraint constrains arc to be only its color */
2016                                 return REPLACEARC;
2017                         }
2018                         return INCOMPATIBLE;
2019                         break;
2020                 case CA('^', '^'):              /* collision, similar constraints */
2021                 case CA('$', '$'):
2022                         if (con->co == a->co)   /* true duplication */
2023                                 return SATISFIED;
2024                         return INCOMPATIBLE;
2025                         break;
2026                 case CA(AHEAD, AHEAD):  /* collision, similar constraints */
2027                 case CA(BEHIND, BEHIND):
2028                         if (con->co == a->co)   /* true duplication */
2029                                 return SATISFIED;
2030                         if (con->co == RAINBOW)
2031                         {
2032                                 /* con is satisfied unless arc's color is a pseudocolor */
2033                                 if (!(nfa->cm->cd[a->co].flags & PSEUDO))
2034                                         return SATISFIED;
2035                         }
2036                         else if (a->co == RAINBOW)
2037                         {
2038                                 /* con is incompatible if it's for a pseudocolor */
2039                                 /* (this is hypothetical; we make no such constraints today) */
2040                                 if (nfa->cm->cd[con->co].flags & PSEUDO)
2041                                         return INCOMPATIBLE;
2042                                 /* otherwise, constraint constrains arc to be only its color */
2043                                 return REPLACEARC;
2044                         }
2045                         return INCOMPATIBLE;
2046                         break;
2047                 case CA('^', BEHIND):   /* collision, dissimilar constraints */
2048                 case CA(BEHIND, '^'):
2049                 case CA('$', AHEAD):
2050                 case CA(AHEAD, '$'):
2051                         return INCOMPATIBLE;
2052                         break;
2053                 case CA('^', '$'):              /* constraints passing each other */
2054                 case CA('^', AHEAD):
2055                 case CA(BEHIND, '$'):
2056                 case CA(BEHIND, AHEAD):
2057                 case CA('$', '^'):
2058                 case CA('$', BEHIND):
2059                 case CA(AHEAD, '^'):
2060                 case CA(AHEAD, BEHIND):
2061                 case CA('^', LACON):
2062                 case CA(BEHIND, LACON):
2063                 case CA('$', LACON):
2064                 case CA(AHEAD, LACON):
2065                         return COMPATIBLE;
2066                         break;
2067         }
2068         assert(NOTREACHED);
2069         return INCOMPATIBLE;            /* for benefit of blind compilers */
2070 }
2071
2072 /*
2073  * fixempties - get rid of EMPTY arcs
2074  */
2075 static void
2076 fixempties(struct nfa *nfa,
2077                    FILE *f)                             /* for debug output; NULL none */
2078 {
2079         struct state *s;
2080         struct state *s2;
2081         struct state *nexts;
2082         struct arc *a;
2083         struct arc *nexta;
2084         int                     totalinarcs;
2085         struct arc **inarcsorig;
2086         struct arc **arcarray;
2087         int                     arccount;
2088         int                     prevnins;
2089         int                     nskip;
2090
2091         /*
2092          * First, get rid of any states whose sole out-arc is an EMPTY, since
2093          * they're basically just aliases for their successor.  The parsing
2094          * algorithm creates enough of these that it's worth special-casing this.
2095          */
2096         for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
2097         {
2098                 nexts = s->next;
2099                 if (s->flag || s->nouts != 1)
2100                         continue;
2101                 a = s->outs;
2102                 assert(a != NULL && a->outchain == NULL);
2103                 if (a->type != EMPTY)
2104                         continue;
2105                 if (s != a->to)
2106                         moveins(nfa, s, a->to);
2107                 dropstate(nfa, s);
2108         }
2109
2110         /*
2111          * Similarly, get rid of any state with a single EMPTY in-arc, by folding
2112          * it into its predecessor.
2113          */
2114         for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
2115         {
2116                 nexts = s->next;
2117                 /* while we're at it, ensure tmp fields are clear for next step */
2118                 assert(s->tmp == NULL);
2119                 if (s->flag || s->nins != 1)
2120                         continue;
2121                 a = s->ins;
2122                 assert(a != NULL && a->inchain == NULL);
2123                 if (a->type != EMPTY)
2124                         continue;
2125                 if (s != a->from)
2126                         moveouts(nfa, s, a->from);
2127                 dropstate(nfa, s);
2128         }
2129
2130         if (NISERR())
2131                 return;
2132
2133         /*
2134          * For each remaining NFA state, find all other states from which it is
2135          * reachable by a chain of one or more EMPTY arcs.  Then generate new arcs
2136          * that eliminate the need for each such chain.
2137          *
2138          * We could replace a chain of EMPTY arcs that leads from a "from" state
2139          * to a "to" state either by pushing non-EMPTY arcs forward (linking
2140          * directly from "from"'s predecessors to "to") or by pulling them back
2141          * (linking directly from "from" to "to"'s successors).  We choose to
2142          * always do the former; this choice is somewhat arbitrary, but the
2143          * approach below requires that we uniformly do one or the other.
2144          *
2145          * Suppose we have a chain of N successive EMPTY arcs (where N can easily
2146          * approach the size of the NFA).  All of the intermediate states must
2147          * have additional inarcs and outarcs, else they'd have been removed by
2148          * the steps above.  Assuming their inarcs are mostly not empties, we will
2149          * add O(N^2) arcs to the NFA, since a non-EMPTY inarc leading to any one
2150          * state in the chain must be duplicated to lead to all its successor
2151          * states as well.  So there is no hope of doing less than O(N^2) work;
2152          * however, we should endeavor to keep the big-O cost from being even
2153          * worse than that, which it can easily become without care.  In
2154          * particular, suppose we were to copy all S1's inarcs forward to S2, and
2155          * then also to S3, and then later we consider pushing S2's inarcs forward
2156          * to S3.  If we include the arcs already copied from S1 in that, we'd be
2157          * doing O(N^3) work.  (The duplicate-arc elimination built into newarc()
2158          * and its cohorts would get rid of the extra arcs, but not without cost.)
2159          *
2160          * We can avoid this cost by treating only arcs that existed at the start
2161          * of this phase as candidates to be pushed forward.  To identify those,
2162          * we remember the first inarc each state had to start with.  We rely on
2163          * the fact that newarc() and friends put new arcs on the front of their
2164          * to-states' inchains, and that this phase never deletes arcs, so that
2165          * the original arcs must be the last arcs in their to-states' inchains.
2166          *
2167          * So the process here is that, for each state in the NFA, we gather up
2168          * all non-EMPTY inarcs of states that can reach the target state via
2169          * EMPTY arcs.  We then sort, de-duplicate, and merge these arcs into the
2170          * target state's inchain.  (We can safely use sort-merge for this as long
2171          * as we update each state's original-arcs pointer after we add arcs to
2172          * it; the sort step of mergeins probably changed the order of the old
2173          * arcs.)
2174          *
2175          * Another refinement worth making is that, because we only add non-EMPTY
2176          * arcs during this phase, and all added arcs have the same from-state as
2177          * the non-EMPTY arc they were cloned from, we know ahead of time that any
2178          * states having only EMPTY outarcs will be useless for lack of outarcs
2179          * after we drop the EMPTY arcs.  (They cannot gain non-EMPTY outarcs if
2180          * they had none to start with.)  So we need not bother to update the
2181          * inchains of such states at all.
2182          */
2183
2184         /* Remember the states' first original inarcs */
2185         /* ... and while at it, count how many old inarcs there are altogether */
2186         inarcsorig = (struct arc **) MALLOC(nfa->nstates * sizeof(struct arc *));
2187         if (inarcsorig == NULL)
2188         {
2189                 NERR(REG_ESPACE);
2190                 return;
2191         }
2192         totalinarcs = 0;
2193         for (s = nfa->states; s != NULL; s = s->next)
2194         {
2195                 inarcsorig[s->no] = s->ins;
2196                 totalinarcs += s->nins;
2197         }
2198
2199         /*
2200          * Create a workspace for accumulating the inarcs to be added to the
2201          * current target state.  totalinarcs is probably a considerable
2202          * overestimate of the space needed, but the NFA is unlikely to be large
2203          * enough at this point to make it worth being smarter.
2204          */
2205         arcarray = (struct arc **) MALLOC(totalinarcs * sizeof(struct arc *));
2206         if (arcarray == NULL)
2207         {
2208                 NERR(REG_ESPACE);
2209                 FREE(inarcsorig);
2210                 return;
2211         }
2212
2213         /* And iterate over the target states */
2214         for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
2215         {
2216                 /* Ignore target states without non-EMPTY outarcs, per note above */
2217                 if (!s->flag && !hasnonemptyout(s))
2218                         continue;
2219
2220                 /* Find predecessor states and accumulate their original inarcs */
2221                 arccount = 0;
2222                 for (s2 = emptyreachable(nfa, s, s, inarcsorig); s2 != s; s2 = nexts)
2223                 {
2224                         /* Add s2's original inarcs to arcarray[], but ignore empties */
2225                         for (a = inarcsorig[s2->no]; a != NULL; a = a->inchain)
2226                         {
2227                                 if (a->type != EMPTY)
2228                                         arcarray[arccount++] = a;
2229                         }
2230
2231                         /* Reset the tmp fields as we walk back */
2232                         nexts = s2->tmp;
2233                         s2->tmp = NULL;
2234                 }
2235                 s->tmp = NULL;
2236                 assert(arccount <= totalinarcs);
2237
2238                 /* Remember how many original inarcs this state has */
2239                 prevnins = s->nins;
2240
2241                 /* Add non-duplicate inarcs to target state */
2242                 mergeins(nfa, s, arcarray, arccount);
2243
2244                 /* Now we must update the state's inarcsorig pointer */
2245                 nskip = s->nins - prevnins;
2246                 a = s->ins;
2247                 while (nskip-- > 0)
2248                         a = a->inchain;
2249                 inarcsorig[s->no] = a;
2250         }
2251
2252         FREE(arcarray);
2253         FREE(inarcsorig);
2254
2255         if (NISERR())
2256                 return;
2257
2258         /*
2259          * Now remove all the EMPTY arcs, since we don't need them anymore.
2260          */
2261         for (s = nfa->states; s != NULL; s = s->next)
2262         {
2263                 for (a = s->outs; a != NULL; a = nexta)
2264                 {
2265                         nexta = a->outchain;
2266                         if (a->type == EMPTY)
2267                                 freearc(nfa, a);
2268                 }
2269         }
2270
2271         /*
2272          * And remove any states that have become useless.  (This cleanup is not
2273          * very thorough, and would be even less so if we tried to combine it with
2274          * the previous step; but cleanup() will take care of anything we miss.)
2275          */
2276         for (s = nfa->states; s != NULL; s = nexts)
2277         {
2278                 nexts = s->next;
2279                 if ((s->nins == 0 || s->nouts == 0) && !s->flag)
2280                         dropstate(nfa, s);
2281         }
2282
2283         if (f != NULL)
2284                 dumpnfa(nfa, f);
2285 }
2286
2287 /*
2288  * emptyreachable - recursively find all states that can reach s by EMPTY arcs
2289  *
2290  * The return value is the last such state found.  Its tmp field links back
2291  * to the next-to-last such state, and so on back to s, so that all these
2292  * states can be located without searching the whole NFA.
2293  *
2294  * Since this is only used in fixempties(), we pass in the inarcsorig[] array
2295  * maintained by that function.  This lets us skip over all new inarcs, which
2296  * are certainly not EMPTY arcs.
2297  *
2298  * The maximum recursion depth here is equal to the length of the longest
2299  * loop-free chain of EMPTY arcs, which is surely no more than the size of
2300  * the NFA ... but that could still be enough to cause trouble.
2301  */
2302 static struct state *
2303 emptyreachable(struct nfa *nfa,
2304                            struct state *s,
2305                            struct state *lastfound,
2306                            struct arc **inarcsorig)
2307 {
2308         struct arc *a;
2309
2310         /* Since this is recursive, it could be driven to stack overflow */
2311         if (STACK_TOO_DEEP(nfa->v->re))
2312         {
2313                 NERR(REG_ETOOBIG);
2314                 return lastfound;
2315         }
2316
2317         s->tmp = lastfound;
2318         lastfound = s;
2319         for (a = inarcsorig[s->no]; a != NULL; a = a->inchain)
2320         {
2321                 if (a->type == EMPTY && a->from->tmp == NULL)
2322                         lastfound = emptyreachable(nfa, a->from, lastfound, inarcsorig);
2323         }
2324         return lastfound;
2325 }
2326
2327 /*
2328  * isconstraintarc - detect whether an arc is of a constraint type
2329  */
2330 static inline int
2331 isconstraintarc(struct arc *a)
2332 {
2333         switch (a->type)
2334         {
2335                 case '^':
2336                 case '$':
2337                 case BEHIND:
2338                 case AHEAD:
2339                 case LACON:
2340                         return 1;
2341         }
2342         return 0;
2343 }
2344
2345 /*
2346  * hasconstraintout - does state have a constraint out arc?
2347  */
2348 static int
2349 hasconstraintout(struct state *s)
2350 {
2351         struct arc *a;
2352
2353         for (a = s->outs; a != NULL; a = a->outchain)
2354         {
2355                 if (isconstraintarc(a))
2356                         return 1;
2357         }
2358         return 0;
2359 }
2360
2361 /*
2362  * fixconstraintloops - get rid of loops containing only constraint arcs
2363  *
2364  * A loop of states that contains only constraint arcs is useless, since
2365  * passing around the loop represents no forward progress.  Moreover, it
2366  * would cause infinite looping in pullback/pushfwd, so we need to get rid
2367  * of such loops before doing that.
2368  */
2369 static void
2370 fixconstraintloops(struct nfa *nfa,
2371                                    FILE *f)             /* for debug output; NULL none */
2372 {
2373         struct state *s;
2374         struct state *nexts;
2375         struct arc *a;
2376         struct arc *nexta;
2377         int                     hasconstraints;
2378
2379         /*
2380          * In the trivial case of a state that loops to itself, we can just drop
2381          * the constraint arc altogether.  This is worth special-casing because
2382          * such loops are far more common than loops containing multiple states.
2383          * While we're at it, note whether any constraint arcs survive.
2384          */
2385         hasconstraints = 0;
2386         for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
2387         {
2388                 nexts = s->next;
2389                 /* while we're at it, ensure tmp fields are clear for next step */
2390                 assert(s->tmp == NULL);
2391                 for (a = s->outs; a != NULL && !NISERR(); a = nexta)
2392                 {
2393                         nexta = a->outchain;
2394                         if (isconstraintarc(a))
2395                         {
2396                                 if (a->to == s)
2397                                         freearc(nfa, a);
2398                                 else
2399                                         hasconstraints = 1;
2400                         }
2401                 }
2402                 /* If we removed all the outarcs, the state is useless. */
2403                 if (s->nouts == 0 && !s->flag)
2404                         dropstate(nfa, s);
2405         }
2406
2407         /* Nothing to do if no remaining constraint arcs */
2408         if (NISERR() || !hasconstraints)
2409                 return;
2410
2411         /*
2412          * Starting from each remaining NFA state, search outwards for a
2413          * constraint loop.  If we find a loop, break the loop, then start the
2414          * search over.  (We could possibly retain some state from the first scan,
2415          * but it would complicate things greatly, and multi-state constraint
2416          * loops are rare enough that it's not worth optimizing the case.)
2417          */
2418 restart:
2419         for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
2420         {
2421                 if (findconstraintloop(nfa, s))
2422                         goto restart;
2423         }
2424
2425         if (NISERR())
2426                 return;
2427
2428         /*
2429          * Now remove any states that have become useless.  (This cleanup is not
2430          * very thorough, and would be even less so if we tried to combine it with
2431          * the previous step; but cleanup() will take care of anything we miss.)
2432          *
2433          * Because findconstraintloop intentionally doesn't reset all tmp fields,
2434          * we have to clear them after it's done.  This is a convenient place to
2435          * do that, too.
2436          */
2437         for (s = nfa->states; s != NULL; s = nexts)
2438         {
2439                 nexts = s->next;
2440                 s->tmp = NULL;
2441                 if ((s->nins == 0 || s->nouts == 0) && !s->flag)
2442                         dropstate(nfa, s);
2443         }
2444
2445         if (f != NULL)
2446                 dumpnfa(nfa, f);
2447 }
2448
2449 /*
2450  * findconstraintloop - recursively find a loop of constraint arcs
2451  *
2452  * If we find a loop, break it by calling breakconstraintloop(), then
2453  * return 1; otherwise return 0.
2454  *
2455  * State tmp fields are guaranteed all NULL on a success return, because
2456  * breakconstraintloop does that.  After a failure return, any state that
2457  * is known not to be part of a loop is marked with s->tmp == s; this allows
2458  * us not to have to re-prove that fact on later calls.  (This convention is
2459  * workable because we already eliminated single-state loops.)
2460  *
2461  * Note that the found loop doesn't necessarily include the first state we
2462  * are called on.  Any loop reachable from that state will do.
2463  *
2464  * The maximum recursion depth here is one more than the length of the longest
2465  * loop-free chain of constraint arcs, which is surely no more than the size
2466  * of the NFA ... but that could still be enough to cause trouble.
2467  */
2468 static int
2469 findconstraintloop(struct nfa *nfa, struct state *s)
2470 {
2471         struct arc *a;
2472
2473         /* Since this is recursive, it could be driven to stack overflow */
2474         if (STACK_TOO_DEEP(nfa->v->re))
2475         {
2476                 NERR(REG_ETOOBIG);
2477                 return 1;                               /* to exit as quickly as possible */
2478         }
2479
2480         if (s->tmp != NULL)
2481         {
2482                 /* Already proven uninteresting? */
2483                 if (s->tmp == s)
2484                         return 0;
2485                 /* Found a loop involving s */
2486                 breakconstraintloop(nfa, s);
2487                 /* The tmp fields have been cleaned up by breakconstraintloop */
2488                 return 1;
2489         }
2490         for (a = s->outs; a != NULL; a = a->outchain)
2491         {
2492                 if (isconstraintarc(a))
2493                 {
2494                         struct state *sto = a->to;
2495
2496                         assert(sto != s);
2497                         s->tmp = sto;
2498                         if (findconstraintloop(nfa, sto))
2499                                 return 1;
2500                 }
2501         }
2502
2503         /*
2504          * If we get here, no constraint loop exists leading out from s.  Mark it
2505          * with s->tmp == s so we need not rediscover that fact again later.
2506          */
2507         s->tmp = s;
2508         return 0;
2509 }
2510
2511 /*
2512  * breakconstraintloop - break a loop of constraint arcs
2513  *
2514  * sinitial is any one member state of the loop.  Each loop member's tmp
2515  * field links to its successor within the loop.  (Note that this function
2516  * will reset all the tmp fields to NULL.)
2517  *
2518  * We can break the loop by, for any one state S1 in the loop, cloning its
2519  * loop successor state S2 (and possibly following states), and then moving
2520  * all S1->S2 constraint arcs to point to the cloned S2.  The cloned S2 should
2521  * copy any non-constraint outarcs of S2.  Constraint outarcs should be
2522  * dropped if they point back to S1, else they need to be copied as arcs to
2523  * similarly cloned states S3, S4, etc.  In general, each cloned state copies
2524  * non-constraint outarcs, drops constraint outarcs that would lead to itself
2525  * or any earlier cloned state, and sends other constraint outarcs to newly
2526  * cloned states.  No cloned state will have any inarcs that aren't constraint
2527  * arcs or do not lead from S1 or earlier-cloned states.  It's okay to drop
2528  * constraint back-arcs since they would not take us to any state we've not
2529  * already been in; therefore, no new constraint loop is created.  In this way
2530  * we generate a modified NFA that can still represent every useful state
2531  * sequence, but not sequences that represent state loops with no consumption
2532  * of input data.  Note that the set of cloned states will certainly include
2533  * all of the loop member states other than S1, and it may also include
2534  * non-loop states that are reachable from S2 via constraint arcs.  This is
2535  * important because there is no guarantee that findconstraintloop found a
2536  * maximal loop (and searching for one would be NP-hard, so don't try).
2537  * Frequently the "non-loop states" are actually part of a larger loop that
2538  * we didn't notice, and indeed there may be several overlapping loops.
2539  * This technique ensures convergence in such cases, while considering only
2540  * the originally-found loop does not.
2541  *
2542  * If there is only one S1->S2 constraint arc, then that constraint is
2543  * certainly satisfied when we enter any of the clone states.  This means that
2544  * in the common case where many of the constraint arcs are identically
2545  * labeled, we can merge together clone states linked by a similarly-labeled
2546  * constraint: if we can get to the first one we can certainly get to the
2547  * second, so there's no need to distinguish.  This greatly reduces the number
2548  * of new states needed, so we preferentially break the given loop at a state
2549  * pair where this is true.
2550  *
2551  * Furthermore, it's fairly common to find that a cloned successor state has
2552  * no outarcs, especially if we're a bit aggressive about removing unnecessary
2553  * outarcs.  If that happens, then there is simply not any interesting state
2554  * that can be reached through the predecessor's loop arcs, which means we can
2555  * break the loop just by removing those loop arcs, with no new states added.
2556  */
2557 static void
2558 breakconstraintloop(struct nfa *nfa, struct state *sinitial)
2559 {
2560         struct state *s;
2561         struct state *shead;
2562         struct state *stail;
2563         struct state *sclone;
2564         struct state *nexts;
2565         struct arc *refarc;
2566         struct arc *a;
2567         struct arc *nexta;
2568
2569         /*
2570          * Start by identifying which loop step we want to break at.
2571          * Preferentially this is one with only one constraint arc.  (XXX are
2572          * there any other secondary heuristics we want to use here?)  Set refarc
2573          * to point to the selected lone constraint arc, if there is one.
2574          */
2575         refarc = NULL;
2576         s = sinitial;
2577         do
2578         {
2579                 nexts = s->tmp;
2580                 assert(nexts != s);             /* should not see any one-element loops */
2581                 if (refarc == NULL)
2582                 {
2583                         int                     narcs = 0;
2584
2585                         for (a = s->outs; a != NULL; a = a->outchain)
2586                         {
2587                                 if (a->to == nexts && isconstraintarc(a))
2588                                 {
2589                                         refarc = a;
2590                                         narcs++;
2591                                 }
2592                         }
2593                         assert(narcs > 0);
2594                         if (narcs > 1)
2595                                 refarc = NULL;  /* multiple constraint arcs here, no good */
2596                 }
2597                 s = nexts;
2598         } while (s != sinitial);
2599
2600         if (refarc)
2601         {
2602                 /* break at the refarc */
2603                 shead = refarc->from;
2604                 stail = refarc->to;
2605                 assert(stail == shead->tmp);
2606         }
2607         else
2608         {
2609                 /* for lack of a better idea, break after sinitial */
2610                 shead = sinitial;
2611                 stail = sinitial->tmp;
2612         }
2613
2614         /*
2615          * Reset the tmp fields so that we can use them for local storage in
2616          * clonesuccessorstates.  (findconstraintloop won't mind, since it's just
2617          * going to abandon its search anyway.)
2618          */
2619         for (s = nfa->states; s != NULL; s = s->next)
2620                 s->tmp = NULL;
2621
2622         /*
2623          * Recursively build clone state(s) as needed.
2624          */
2625         sclone = newstate(nfa);
2626         if (sclone == NULL)
2627         {
2628                 assert(NISERR());
2629                 return;
2630         }
2631
2632         clonesuccessorstates(nfa, stail, sclone, shead, refarc,
2633                                                  NULL, NULL, nfa->nstates);
2634
2635         if (NISERR())
2636                 return;
2637
2638         /*
2639          * It's possible that sclone has no outarcs at all, in which case it's
2640          * useless.  (We don't try extremely hard to get rid of useless states
2641          * here, but this is an easy and fairly common case.)
2642          */
2643         if (sclone->nouts == 0)
2644         {
2645                 freestate(nfa, sclone);
2646                 sclone = NULL;
2647         }
2648
2649         /*
2650          * Move shead's constraint-loop arcs to point to sclone, or just drop them
2651          * if we discovered we don't need sclone.
2652          */
2653         for (a = shead->outs; a != NULL; a = nexta)
2654         {
2655                 nexta = a->outchain;
2656                 if (a->to == stail && isconstraintarc(a))
2657                 {
2658                         if (sclone)
2659                                 cparc(nfa, a, shead, sclone);
2660                         freearc(nfa, a);
2661                         if (NISERR())
2662                                 break;
2663                 }
2664         }
2665 }
2666
2667 /*
2668  * clonesuccessorstates - create a tree of constraint-arc successor states
2669  *
2670  * ssource is the state to be cloned, and sclone is the state to copy its
2671  * outarcs into.  sclone's inarcs, if any, should already be set up.
2672  *
2673  * spredecessor is the original predecessor state that we are trying to build
2674  * successors for (it may not be the immediate predecessor of ssource).
2675  * refarc, if not NULL, is the original constraint arc that is known to have
2676  * been traversed out of spredecessor to reach the successor(s).
2677  *
2678  * For each cloned successor state, we transiently create a "donemap" that is
2679  * a boolean array showing which source states we've already visited for this
2680  * clone state.  This prevents infinite recursion as well as useless repeat
2681  * visits to the same state subtree (which can add up fast, since typical NFAs
2682  * have multiple redundant arc pathways).  Each donemap is a char array
2683  * indexed by state number.  The donemaps are all of the same size "nstates",
2684  * which is nfa->nstates as of the start of the recursion.  This is enough to
2685  * have entries for all pre-existing states, but *not* entries for clone
2686  * states created during the recursion.  That's okay since we have no need to
2687  * mark those.
2688  *
2689  * curdonemap is NULL when recursing to a new sclone state, or sclone's
2690  * donemap when we are recursing without having created a new state (which we
2691  * do when we decide we can merge a successor state into the current clone
2692  * state).  outerdonemap is NULL at the top level and otherwise the parent
2693  * clone state's donemap.
2694  *
2695  * The successor states we create and fill here form a strict tree structure,
2696  * with each state having exactly one predecessor, except that the toplevel
2697  * state has no inarcs as yet (breakconstraintloop will add its inarcs from
2698  * spredecessor after we're done).  Thus, we can examine sclone's inarcs back
2699  * to the root, plus refarc if any, to identify the set of constraints already
2700  * known valid at the current point.  This allows us to avoid generating extra
2701  * successor states.
2702  */
2703 static void
2704 clonesuccessorstates(struct nfa *nfa,
2705                                          struct state *ssource,
2706                                          struct state *sclone,
2707                                          struct state *spredecessor,
2708                                          struct arc *refarc,
2709                                          char *curdonemap,
2710                                          char *outerdonemap,
2711                                          int nstates)
2712 {
2713         char       *donemap;
2714         struct arc *a;
2715
2716         /* Since this is recursive, it could be driven to stack overflow */
2717         if (STACK_TOO_DEEP(nfa->v->re))
2718         {
2719                 NERR(REG_ETOOBIG);
2720                 return;
2721         }
2722
2723         /* If this state hasn't already got a donemap, create one */
2724         donemap = curdonemap;
2725         if (donemap == NULL)
2726         {
2727                 donemap = (char *) MALLOC(nstates * sizeof(char));
2728                 if (donemap == NULL)
2729                 {
2730                         NERR(REG_ESPACE);
2731                         return;
2732                 }
2733
2734                 if (outerdonemap != NULL)
2735                 {
2736                         /*
2737                          * Not at outermost recursion level, so copy the outer level's
2738                          * donemap; this ensures that we see states in process of being
2739                          * visited at outer levels, or already merged into predecessor
2740                          * states, as ones we shouldn't traverse back to.
2741                          */
2742                         memcpy(donemap, outerdonemap, nstates * sizeof(char));
2743                 }
2744                 else
2745                 {
2746                         /* At outermost level, only spredecessor is off-limits */
2747                         memset(donemap, 0, nstates * sizeof(char));
2748                         assert(spredecessor->no < nstates);
2749                         donemap[spredecessor->no] = 1;
2750                 }
2751         }
2752
2753         /* Mark ssource as visited in the donemap */
2754         assert(ssource->no < nstates);
2755         assert(donemap[ssource->no] == 0);
2756         donemap[ssource->no] = 1;
2757
2758         /*
2759          * We proceed by first cloning all of ssource's outarcs, creating new
2760          * clone states as needed but not doing more with them than that.  Then in
2761          * a second pass, recurse to process the child clone states.  This allows
2762          * us to have only one child clone state per reachable source state, even
2763          * when there are multiple outarcs leading to the same state.  Also, when
2764          * we do visit a child state, its set of inarcs is known exactly, which
2765          * makes it safe to apply the constraint-is-already-checked optimization.
2766          * Also, this ensures that we've merged all the states we can into the
2767          * current clone before we recurse to any children, thus possibly saving
2768          * them from making extra images of those states.
2769          *
2770          * While this function runs, child clone states of the current state are
2771          * marked by setting their tmp fields to point to the original state they
2772          * were cloned from.  This makes it possible to detect multiple outarcs
2773          * leading to the same state, and also makes it easy to distinguish clone
2774          * states from original states (which will have tmp == NULL).
2775          */
2776         for (a = ssource->outs; a != NULL && !NISERR(); a = a->outchain)
2777         {
2778                 struct state *sto = a->to;
2779
2780                 /*
2781                  * We do not consider cloning successor states that have no constraint
2782                  * outarcs; just link to them as-is.  They cannot be part of a
2783                  * constraint loop so there is no need to make copies.  In particular,
2784                  * this rule keeps us from trying to clone the post state, which would
2785                  * be a bad idea.
2786                  */
2787                 if (isconstraintarc(a) && hasconstraintout(sto))
2788                 {
2789                         struct state *prevclone;
2790                         int                     canmerge;
2791                         struct arc *a2;
2792
2793                         /*
2794                          * Back-link constraint arcs must not be followed.  Nor is there a
2795                          * need to revisit states previously merged into this clone.
2796                          */
2797                         assert(sto->no < nstates);
2798                         if (donemap[sto->no] != 0)
2799                                 continue;
2800
2801                         /*
2802                          * Check whether we already have a child clone state for this
2803                          * source state.
2804                          */
2805                         prevclone = NULL;
2806                         for (a2 = sclone->outs; a2 != NULL; a2 = a2->outchain)
2807                         {
2808                                 if (a2->to->tmp == sto)
2809                                 {
2810                                         prevclone = a2->to;
2811                                         break;
2812                                 }
2813                         }
2814
2815                         /*
2816                          * If this arc is labeled the same as refarc, or the same as any
2817                          * arc we must have traversed to get to sclone, then no additional
2818                          * constraints need to be met to get to sto, so we should just
2819                          * merge its outarcs into sclone.
2820                          */
2821                         if (refarc && a->type == refarc->type && a->co == refarc->co)
2822                                 canmerge = 1;
2823                         else
2824                         {
2825                                 struct state *s;
2826
2827                                 canmerge = 0;
2828                                 for (s = sclone; s->ins; s = s->ins->from)
2829                                 {
2830                                         if (s->nins == 1 &&
2831                                                 a->type == s->ins->type && a->co == s->ins->co)
2832                                         {
2833                                                 canmerge = 1;
2834                                                 break;
2835                                         }
2836                                 }
2837                         }
2838
2839                         if (canmerge)
2840                         {
2841                                 /*
2842                                  * We can merge into sclone.  If we previously made a child
2843                                  * clone state, drop it; there's no need to visit it.  (This
2844                                  * can happen if ssource has multiple pathways to sto, and we
2845                                  * only just now found one that is provably a no-op.)
2846                                  */
2847                                 if (prevclone)
2848                                         dropstate(nfa, prevclone);      /* kills our outarc, too */
2849
2850                                 /* Recurse to merge sto's outarcs into sclone */
2851                                 clonesuccessorstates(nfa,
2852                                                                          sto,
2853                                                                          sclone,
2854                                                                          spredecessor,
2855                                                                          refarc,
2856                                                                          donemap,
2857                                                                          outerdonemap,
2858                                                                          nstates);
2859                                 /* sto should now be marked as previously visited */
2860                                 assert(NISERR() || donemap[sto->no] == 1);
2861                         }
2862                         else if (prevclone)
2863                         {
2864                                 /*
2865                                  * We already have a clone state for this successor, so just
2866                                  * make another arc to it.
2867                                  */
2868                                 cparc(nfa, a, sclone, prevclone);
2869                         }
2870                         else
2871                         {
2872                                 /*
2873                                  * We need to create a new successor clone state.
2874                                  */
2875                                 struct state *stoclone;
2876
2877                                 stoclone = newstate(nfa);
2878                                 if (stoclone == NULL)
2879                                 {
2880                                         assert(NISERR());
2881                                         break;
2882                                 }
2883                                 /* Mark it as to what it's a clone of */
2884                                 stoclone->tmp = sto;
2885                                 /* ... and add the outarc leading to it */
2886                                 cparc(nfa, a, sclone, stoclone);
2887                         }
2888                 }
2889                 else
2890                 {
2891                         /*
2892                          * Non-constraint outarcs just get copied to sclone, as do outarcs
2893                          * leading to states with no constraint outarc.
2894                          */
2895                         cparc(nfa, a, sclone, sto);
2896                 }
2897         }
2898
2899         /*
2900          * If we are at outer level for this clone state, recurse to all its child
2901          * clone states, clearing their tmp fields as we go.  (If we're not
2902          * outermost for sclone, leave this to be done by the outer call level.)
2903          * Note that if we have multiple outarcs leading to the same clone state,
2904          * it will only be recursed-to once.
2905          */
2906         if (curdonemap == NULL)
2907         {
2908                 for (a = sclone->outs; a != NULL && !NISERR(); a = a->outchain)
2909                 {
2910                         struct state *stoclone = a->to;
2911                         struct state *sto = stoclone->tmp;
2912
2913                         if (sto != NULL)
2914                         {
2915                                 stoclone->tmp = NULL;
2916                                 clonesuccessorstates(nfa,
2917                                                                          sto,
2918                                                                          stoclone,
2919                                                                          spredecessor,
2920                                                                          refarc,
2921                                                                          NULL,
2922                                                                          donemap,
2923                                                                          nstates);
2924                         }
2925                 }
2926
2927                 /* Don't forget to free sclone's donemap when done with it */
2928                 FREE(donemap);
2929         }
2930 }
2931
2932 /*
2933  * removecantmatch - remove CANTMATCH arcs, which are no longer useful
2934  * once we are done with the parsing phase.  (We need them only to
2935  * preserve connectedness of NFA subgraphs during parsing.)
2936  */
2937 static void
2938 removecantmatch(struct nfa *nfa)
2939 {
2940         struct state *s;
2941
2942         for (s = nfa->states; s != NULL; s = s->next)
2943         {
2944                 struct arc *a;
2945                 struct arc *nexta;
2946
2947                 for (a = s->outs; a != NULL; a = nexta)
2948                 {
2949                         nexta = a->outchain;
2950                         if (a->type == CANTMATCH)
2951                         {
2952                                 freearc(nfa, a);
2953                                 if (NISERR())
2954                                         return;
2955                         }
2956                 }
2957         }
2958 }
2959
2960 /*
2961  * cleanup - clean up NFA after optimizations
2962  */
2963 static void
2964 cleanup(struct nfa *nfa)
2965 {
2966         struct state *s;
2967         struct state *nexts;
2968         int                     n;
2969
2970         if (NISERR())
2971                 return;
2972
2973         /* clear out unreachable or dead-end states */
2974         /* use pre to mark reachable, then post to mark can-reach-post */
2975         markreachable(nfa, nfa->pre, (struct state *) NULL, nfa->pre);
2976         markcanreach(nfa, nfa->post, nfa->pre, nfa->post);
2977         for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
2978         {
2979                 nexts = s->next;
2980                 if (s->tmp != nfa->post && !s->flag)
2981                         dropstate(nfa, s);
2982         }
2983         assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == nfa->post);
2984         cleartraverse(nfa, nfa->pre);
2985         assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == NULL);
2986         /* the nins==0 (final unreachable) case will be caught later */
2987
2988         /* renumber surviving states */
2989         n = 0;
2990         for (s = nfa->states; s != NULL; s = s->next)
2991                 s->no = n++;
2992         nfa->nstates = n;
2993 }
2994
2995 /*
2996  * markreachable - recursive marking of reachable states
2997  */
2998 static void
2999 markreachable(struct nfa *nfa,
3000                           struct state *s,
3001                           struct state *okay,   /* consider only states with this mark */
3002                           struct state *mark)   /* the value to mark with */
3003 {
3004         struct arc *a;
3005
3006         /* Since this is recursive, it could be driven to stack overflow */
3007         if (STACK_TOO_DEEP(nfa->v->re))
3008         {
3009                 NERR(REG_ETOOBIG);
3010                 return;
3011         }
3012
3013         if (s->tmp != okay)
3014                 return;
3015         s->tmp = mark;
3016
3017         for (a = s->outs; a != NULL; a = a->outchain)
3018                 markreachable(nfa, a->to, okay, mark);
3019 }
3020
3021 /*
3022  * markcanreach - recursive marking of states which can reach here
3023  */
3024 static void
3025 markcanreach(struct nfa *nfa,
3026                          struct state *s,
3027                          struct state *okay,    /* consider only states with this mark */
3028                          struct state *mark)    /* the value to mark with */
3029 {
3030         struct arc *a;
3031
3032         /* Since this is recursive, it could be driven to stack overflow */
3033         if (STACK_TOO_DEEP(nfa->v->re))
3034         {
3035                 NERR(REG_ETOOBIG);
3036                 return;
3037         }
3038
3039         if (s->tmp != okay)
3040                 return;
3041         s->tmp = mark;
3042
3043         for (a = s->ins; a != NULL; a = a->inchain)
3044                 markcanreach(nfa, a->from, okay, mark);
3045 }
3046
3047 /*
3048  * analyze - ascertain potentially-useful facts about an optimized NFA
3049  */
3050 static long                                             /* re_info bits to be ORed in */
3051 analyze(struct nfa *nfa)
3052 {
3053         struct arc *a;
3054         struct arc *aa;
3055
3056         if (NISERR())
3057                 return 0;
3058
3059         /* Detect whether NFA can't match anything */
3060         if (nfa->pre->outs == NULL)
3061                 return REG_UIMPOSSIBLE;
3062
3063         /* Detect whether NFA matches all strings (possibly with length bounds) */
3064         checkmatchall(nfa);
3065
3066         /* Detect whether NFA can possibly match a zero-length string */
3067         for (a = nfa->pre->outs; a != NULL; a = a->outchain)
3068                 for (aa = a->to->outs; aa != NULL; aa = aa->outchain)
3069                         if (aa->to == nfa->post)
3070                                 return REG_UEMPTYMATCH;
3071         return 0;
3072 }
3073
3074 /*
3075  * checkmatchall - does the NFA represent no more than a string length test?
3076  *
3077  * If so, set nfa->minmatchall and nfa->maxmatchall correctly (they are -1
3078  * to begin with) and set the MATCHALL bit in nfa->flags.
3079  *
3080  * To succeed, we require all arcs to be PLAIN RAINBOW arcs, except for those
3081  * for pseudocolors (i.e., BOS/BOL/EOS/EOL).  We must be able to reach the
3082  * post state via RAINBOW arcs, and if there are any loops in the graph, they
3083  * must be loop-to-self arcs, ensuring that each loop iteration consumes
3084  * exactly one character.  (Longer loops are problematic because they create
3085  * non-consecutive possible match lengths; we have no good way to represent
3086  * that situation for lengths beyond the DUPINF limit.)
3087  *
3088  * Pseudocolor arcs complicate things a little.  We know that they can only
3089  * appear as pre-state outarcs (for BOS/BOL) or post-state inarcs (for
3090  * EOS/EOL).  There, they must exactly replicate the parallel RAINBOW arcs,
3091  * e.g. if the pre state has one RAINBOW outarc to state 2, it must have BOS
3092  * and BOL outarcs to state 2, and no others.  Missing or extra pseudocolor
3093  * arcs can occur, meaning that the NFA involves some constraint on the
3094  * adjacent characters, which makes it not a matchall NFA.
3095  */
3096 static void
3097 checkmatchall(struct nfa *nfa)
3098 {
3099         bool      **haspaths;
3100         struct state *s;
3101         int                     i;
3102
3103         /*
3104          * If there are too many states, don't bother trying to detect matchall.
3105          * This limit serves to bound the time and memory we could consume below.
3106          * Note that even if the graph is all-RAINBOW, if there are significantly
3107          * more than DUPINF states then it's likely that there are paths of length
3108          * more than DUPINF, which would force us to fail anyhow.  In practice,
3109          * plausible ways of writing a matchall regex with maximum finite path
3110          * length K tend not to have very many more than K states.
3111          */
3112         if (nfa->nstates > DUPINF * 2)
3113                 return;
3114
3115         /*
3116          * First, scan all the states to verify that only RAINBOW arcs appear,
3117          * plus pseudocolor arcs adjacent to the pre and post states.  This lets
3118          * us quickly eliminate most cases that aren't matchall NFAs.
3119          */
3120         for (s = nfa->states; s != NULL; s = s->next)
3121         {
3122                 struct arc *a;
3123
3124                 for (a = s->outs; a != NULL; a = a->outchain)
3125                 {
3126                         if (a->type != PLAIN)
3127                                 return;                 /* any LACONs make it non-matchall */
3128                         if (a->co != RAINBOW)
3129                         {
3130                                 if (nfa->cm->cd[a->co].flags & PSEUDO)
3131                                 {
3132                                         /*
3133                                          * Pseudocolor arc: verify it's in a valid place (this
3134                                          * seems quite unlikely to fail, but let's be sure).
3135                                          */
3136                                         if (s == nfa->pre &&
3137                                                 (a->co == nfa->bos[0] || a->co == nfa->bos[1]))
3138                                                  /* okay BOS/BOL arc */ ;
3139                                         else if (a->to == nfa->post &&
3140                                                          (a->co == nfa->eos[0] || a->co == nfa->eos[1]))
3141                                                  /* okay EOS/EOL arc */ ;
3142                                         else
3143                                                 return; /* unexpected pseudocolor arc */
3144                                         /* We'll check these arcs some more below. */
3145                                 }
3146                                 else
3147                                         return;         /* any other color makes it non-matchall */
3148                         }
3149                 }
3150                 /* Also, assert that the tmp fields are available for use. */
3151                 assert(s->tmp == NULL);
3152         }
3153
3154         /*
3155          * The next cheapest check we can make is to verify that the BOS/BOL
3156          * outarcs of the pre state reach the same states as its RAINBOW outarcs.
3157          * If they don't, the NFA expresses some constraints on the character
3158          * before the matched string, making it non-matchall.  Likewise, the
3159          * EOS/EOL inarcs of the post state must match its RAINBOW inarcs.
3160          */
3161         if (!check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[0]) ||
3162                 !check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[1]) ||
3163                 !check_in_colors_match(nfa->post, RAINBOW, nfa->eos[0]) ||
3164                 !check_in_colors_match(nfa->post, RAINBOW, nfa->eos[1]))
3165                 return;
3166
3167         /*
3168          * Initialize an array of path-length arrays, in which
3169          * checkmatchall_recurse will return per-state results.  This lets us
3170          * memo-ize the recursive search and avoid exponential time consumption.
3171          */
3172         haspaths = (bool **) MALLOC(nfa->nstates * sizeof(bool *));
3173         if (haspaths == NULL)
3174                 return;                                 /* fail quietly */
3175         memset(haspaths, 0, nfa->nstates * sizeof(bool *));
3176
3177         /*
3178          * Recursively search the graph for all-RAINBOW paths to the "post" state,
3179          * starting at the "pre" state, and computing the lengths of the paths.
3180          * (Given the preceding checks, there should be at least one such path.
3181          * However we could get back a false result anyway, in case there are
3182          * multi-state loops, paths exceeding DUPINF+1 length, or non-algorithmic
3183          * failures such as ENOMEM.)
3184          */
3185         if (checkmatchall_recurse(nfa, nfa->pre, haspaths))
3186         {
3187                 /* The useful result is the path length array for the pre state */
3188                 bool       *haspath = haspaths[nfa->pre->no];
3189                 int                     minmatch,
3190                                         maxmatch,
3191                                         morematch;
3192
3193                 assert(haspath != NULL);
3194
3195                 /*
3196                  * haspath[] now represents the set of possible path lengths; but we
3197                  * want to reduce that to a min and max value, because it doesn't seem
3198                  * worth complicating regexec.c to deal with nonconsecutive possible
3199                  * match lengths.  Find min and max of first run of lengths, then
3200                  * verify there are no nonconsecutive lengths.
3201                  */
3202                 for (minmatch = 0; minmatch <= DUPINF + 1; minmatch++)
3203                 {
3204                         if (haspath[minmatch])
3205                                 break;
3206                 }
3207                 assert(minmatch <= DUPINF + 1); /* else checkmatchall_recurse lied */
3208                 for (maxmatch = minmatch; maxmatch < DUPINF + 1; maxmatch++)
3209                 {
3210                         if (!haspath[maxmatch + 1])
3211                                 break;
3212                 }
3213                 for (morematch = maxmatch + 1; morematch <= DUPINF + 1; morematch++)
3214                 {
3215                         if (haspath[morematch])
3216                         {
3217                                 haspath = NULL; /* fail, there are nonconsecutive lengths */
3218                                 break;
3219                         }
3220                 }
3221
3222                 if (haspath != NULL)
3223                 {
3224                         /*
3225                          * Success, so record the info.  Here we have a fine point: the
3226                          * path length from the pre state includes the pre-to-initial
3227                          * transition, so it's one more than the actually matched string
3228                          * length.  (We avoided counting the final-to-post transition
3229                          * within checkmatchall_recurse, but not this one.)  This is why
3230                          * checkmatchall_recurse allows one more level of path length than
3231                          * might seem necessary.  This decrement also takes care of
3232                          * converting checkmatchall_recurse's definition of "infinity" as
3233                          * "DUPINF+1" to our normal representation as "DUPINF".
3234                          */
3235                         assert(minmatch > 0);   /* else pre and post states were adjacent */
3236                         nfa->minmatchall = minmatch - 1;
3237                         nfa->maxmatchall = maxmatch - 1;
3238                         nfa->flags |= MATCHALL;
3239                 }
3240         }
3241
3242         /* Clean up */
3243         for (i = 0; i < nfa->nstates; i++)
3244         {
3245                 if (haspaths[i] != NULL)
3246                         FREE(haspaths[i]);
3247         }
3248         FREE(haspaths);
3249 }
3250
3251 /*
3252  * checkmatchall_recurse - recursive search for checkmatchall
3253  *
3254  * s is the state to be examined in this recursion level.
3255  * haspaths[] is an array of per-state exit path length arrays.
3256  *
3257  * We return true if the search was performed successfully, false if
3258  * we had to fail because of multi-state loops or other internal reasons.
3259  * (Because "dead" states that can't reach the post state have been
3260  * eliminated, and we already verified that only RAINBOW and matching
3261  * pseudocolor arcs exist, every state should have RAINBOW path(s) to
3262  * the post state.  Hence we take a false result from recursive calls
3263  * as meaning that we'd better fail altogether, not just that that
3264  * particular state can't reach the post state.)
3265  *
3266  * On success, we store a malloc'd result array in haspaths[s->no],
3267  * showing the possible path lengths from s to the post state.
3268  * Each state's haspath[] array is of length DUPINF+2.  The entries from
3269  * k = 0 to DUPINF are true if there is an all-RAINBOW path of length k
3270  * from this state to the string end.  haspath[DUPINF+1] is true if all
3271  * path lengths >= DUPINF+1 are possible.  (Situations that cannot be
3272  * represented under these rules cause failure.)
3273  *
3274  * checkmatchall is responsible for eventually freeing the haspath[] arrays.
3275  */
3276 static bool
3277 checkmatchall_recurse(struct nfa *nfa, struct state *s, bool **haspaths)
3278 {
3279         bool            result = false;
3280         bool            foundloop = false;
3281         bool       *haspath;
3282         struct arc *a;
3283
3284         /*
3285          * Since this is recursive, it could be driven to stack overflow.  But we
3286          * need not treat that as a hard failure; just deem the NFA non-matchall.
3287          */
3288         if (STACK_TOO_DEEP(nfa->v->re))
3289                 return false;
3290
3291         /* In case the search takes a long time, check for cancel */
3292         INTERRUPT(nfa->v->re);
3293
3294         /* Create a haspath array for this state */
3295         haspath = (bool *) MALLOC((DUPINF + 2) * sizeof(bool));
3296         if (haspath == NULL)
3297                 return false;                   /* again, treat as non-matchall */
3298         memset(haspath, 0, (DUPINF + 2) * sizeof(bool));
3299
3300         /* Mark this state as being visited */
3301         assert(s->tmp == NULL);
3302         s->tmp = s;
3303
3304         for (a = s->outs; a != NULL; a = a->outchain)
3305         {
3306                 if (a->co != RAINBOW)
3307                         continue;                       /* ignore pseudocolor arcs */
3308                 if (a->to == nfa->post)
3309                 {
3310                         /* We found an all-RAINBOW path to the post state */
3311                         result = true;
3312
3313                         /*
3314                          * Mark this state as being zero steps away from the string end
3315                          * (the transition to the post state isn't counted).
3316                          */
3317                         haspath[0] = true;
3318                 }
3319                 else if (a->to == s)
3320                 {
3321                         /* We found a cycle of length 1, which we'll deal with below. */
3322                         foundloop = true;
3323                 }
3324                 else if (a->to->tmp != NULL)
3325                 {
3326                         /* It's busy, so we found a cycle of length > 1, so fail. */
3327                         result = false;
3328                         break;
3329                 }
3330                 else
3331                 {
3332                         /* Consider paths forward through this to-state. */
3333                         bool       *nexthaspath;
3334                         int                     i;
3335
3336                         /* If to-state was not already visited, recurse */
3337                         if (haspaths[a->to->no] == NULL)
3338                         {
3339                                 result = checkmatchall_recurse(nfa, a->to, haspaths);
3340                                 /* Fail if any recursive path fails */
3341                                 if (!result)
3342                                         break;
3343                         }
3344                         else
3345                         {
3346                                 /* The previous visit must have found path(s) to the end */
3347                                 result = true;
3348                         }
3349                         assert(a->to->tmp == NULL);
3350                         nexthaspath = haspaths[a->to->no];
3351                         assert(nexthaspath != NULL);
3352
3353                         /*
3354                          * Now, for every path of length i from a->to to the string end,
3355                          * there is a path of length i + 1 from s to the string end.
3356                          */
3357                         if (nexthaspath[DUPINF] != nexthaspath[DUPINF + 1])
3358                         {
3359                                 /*
3360                                  * a->to has a path of length exactly DUPINF, but not longer;
3361                                  * or it has paths of all lengths > DUPINF but not one of
3362                                  * exactly that length.  In either case, we cannot represent
3363                                  * the possible path lengths from s correctly, so fail.
3364                                  */
3365                                 result = false;
3366                                 break;
3367                         }
3368                         /* Merge knowledge of these path lengths into what we have */
3369                         for (i = 0; i < DUPINF; i++)
3370                                 haspath[i + 1] |= nexthaspath[i];
3371                         /* Infinity + 1 is still infinity */
3372                         haspath[DUPINF + 1] |= nexthaspath[DUPINF + 1];
3373                 }
3374         }
3375
3376         if (result && foundloop)
3377         {
3378                 /*
3379                  * If there is a length-1 loop at this state, then find the shortest
3380                  * known path length to the end.  The loop means that every larger
3381                  * path length is possible, too.  (It doesn't matter whether any of
3382                  * the longer lengths were already known possible.)
3383                  */
3384                 int                     i;
3385
3386                 for (i = 0; i <= DUPINF; i++)
3387                 {
3388                         if (haspath[i])
3389                                 break;
3390                 }
3391                 for (i++; i <= DUPINF + 1; i++)
3392                         haspath[i] = true;
3393         }
3394
3395         /* Report out the completed path length map */
3396         assert(s->no < nfa->nstates);
3397         assert(haspaths[s->no] == NULL);
3398         haspaths[s->no] = haspath;
3399
3400         /* Mark state no longer busy */
3401         s->tmp = NULL;
3402
3403         return result;
3404 }
3405
3406 /*
3407  * check_out_colors_match - subroutine for checkmatchall
3408  *
3409  * Check whether the set of states reachable from s by arcs of color co1
3410  * is equivalent to the set reachable by arcs of color co2.
3411  * checkmatchall already verified that all of the NFA's arcs are PLAIN,
3412  * so we need not examine arc types here.
3413  */
3414 static bool
3415 check_out_colors_match(struct state *s, color co1, color co2)
3416 {
3417         bool            result = true;
3418         struct arc *a;
3419
3420         /*
3421          * To do this in linear time, we assume that the NFA contains no duplicate
3422          * arcs.  Run through the out-arcs, marking states reachable by arcs of
3423          * color co1.  Run through again, un-marking states reachable by arcs of
3424          * color co2; if we see a not-marked state, we know this co2 arc is
3425          * unmatched.  Then run through again, checking for still-marked states,
3426          * and in any case leaving all the tmp fields reset to NULL.
3427          */
3428         for (a = s->outs; a != NULL; a = a->outchain)
3429         {
3430                 if (a->co == co1)
3431                 {
3432                         assert(a->to->tmp == NULL);
3433                         a->to->tmp = a->to;
3434                 }
3435         }
3436         for (a = s->outs; a != NULL; a = a->outchain)
3437         {
3438                 if (a->co == co2)
3439                 {
3440                         if (a->to->tmp != NULL)
3441                                 a->to->tmp = NULL;
3442                         else
3443                                 result = false; /* unmatched co2 arc */
3444                 }
3445         }
3446         for (a = s->outs; a != NULL; a = a->outchain)
3447         {
3448                 if (a->co == co1)
3449                 {
3450                         if (a->to->tmp != NULL)
3451                         {
3452                                 result = false; /* unmatched co1 arc */
3453                                 a->to->tmp = NULL;
3454                         }
3455                 }
3456         }
3457         return result;
3458 }
3459
3460 /*
3461  * check_in_colors_match - subroutine for checkmatchall
3462  *
3463  * Check whether the set of states that can reach s by arcs of color co1
3464  * is equivalent to the set that can reach s by arcs of color co2.
3465  * checkmatchall already verified that all of the NFA's arcs are PLAIN,
3466  * so we need not examine arc types here.
3467  */
3468 static bool
3469 check_in_colors_match(struct state *s, color co1, color co2)
3470 {
3471         bool            result = true;
3472         struct arc *a;
3473
3474         /*
3475          * Identical algorithm to check_out_colors_match, except examine the
3476          * from-states of s' inarcs.
3477          */
3478         for (a = s->ins; a != NULL; a = a->inchain)
3479         {
3480                 if (a->co == co1)
3481                 {
3482                         assert(a->from->tmp == NULL);
3483                         a->from->tmp = a->from;
3484                 }
3485         }
3486         for (a = s->ins; a != NULL; a = a->inchain)
3487         {
3488                 if (a->co == co2)
3489                 {
3490                         if (a->from->tmp != NULL)
3491                                 a->from->tmp = NULL;
3492                         else
3493                                 result = false; /* unmatched co2 arc */
3494                 }
3495         }
3496         for (a = s->ins; a != NULL; a = a->inchain)
3497         {
3498                 if (a->co == co1)
3499                 {
3500                         if (a->from->tmp != NULL)
3501                         {
3502                                 result = false; /* unmatched co1 arc */
3503                                 a->from->tmp = NULL;
3504                         }
3505                 }
3506         }
3507         return result;
3508 }
3509
3510 /*
3511  * compact - construct the compact representation of an NFA
3512  */
3513 static void
3514 compact(struct nfa *nfa,
3515                 struct cnfa *cnfa)
3516 {
3517         struct state *s;
3518         struct arc *a;
3519         size_t          nstates;
3520         size_t          narcs;
3521         struct carc *ca;
3522         struct carc *first;
3523
3524         assert(!NISERR());
3525
3526         nstates = 0;
3527         narcs = 0;
3528         for (s = nfa->states; s != NULL; s = s->next)
3529         {
3530                 nstates++;
3531                 narcs += s->nouts + 1;  /* need one extra for endmarker */
3532         }
3533
3534         cnfa->stflags = (char *) MALLOC(nstates * sizeof(char));
3535         cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
3536         cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
3537         if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL)
3538         {
3539                 if (cnfa->stflags != NULL)
3540                         FREE(cnfa->stflags);
3541                 if (cnfa->states != NULL)
3542                         FREE(cnfa->states);
3543                 if (cnfa->arcs != NULL)
3544                         FREE(cnfa->arcs);
3545                 NERR(REG_ESPACE);
3546                 return;
3547         }
3548         cnfa->nstates = nstates;
3549         cnfa->pre = nfa->pre->no;
3550         cnfa->post = nfa->post->no;
3551         cnfa->bos[0] = nfa->bos[0];
3552         cnfa->bos[1] = nfa->bos[1];
3553         cnfa->eos[0] = nfa->eos[0];
3554         cnfa->eos[1] = nfa->eos[1];
3555         cnfa->ncolors = maxcolor(nfa->cm) + 1;
3556         cnfa->flags = nfa->flags;
3557         cnfa->minmatchall = nfa->minmatchall;
3558         cnfa->maxmatchall = nfa->maxmatchall;
3559
3560         ca = cnfa->arcs;
3561         for (s = nfa->states; s != NULL; s = s->next)
3562         {
3563                 assert((size_t) s->no < nstates);
3564                 cnfa->stflags[s->no] = 0;
3565                 cnfa->states[s->no] = ca;
3566                 first = ca;
3567                 for (a = s->outs; a != NULL; a = a->outchain)
3568                         switch (a->type)
3569                         {
3570                                 case PLAIN:
3571                                         ca->co = a->co;
3572                                         ca->to = a->to->no;
3573                                         ca++;
3574                                         break;
3575                                 case LACON:
3576                                         assert(s->no != cnfa->pre);
3577                                         assert(a->co >= 0);
3578                                         ca->co = (color) (cnfa->ncolors + a->co);
3579                                         ca->to = a->to->no;
3580                                         ca++;
3581                                         cnfa->flags |= HASLACONS;
3582                                         break;
3583                                 default:
3584                                         NERR(REG_ASSERT);
3585                                         return;
3586                         }
3587                 carcsort(first, ca - first);
3588                 ca->co = COLORLESS;
3589                 ca->to = 0;
3590                 ca++;
3591         }
3592         assert(ca == &cnfa->arcs[narcs]);
3593         assert(cnfa->nstates != 0);
3594
3595         /* mark no-progress states */
3596         for (a = nfa->pre->outs; a != NULL; a = a->outchain)
3597                 cnfa->stflags[a->to->no] = CNFA_NOPROGRESS;
3598         cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS;
3599 }
3600
3601 /*
3602  * carcsort - sort compacted-NFA arcs by color
3603  */
3604 static void
3605 carcsort(struct carc *first, size_t n)
3606 {
3607         if (n > 1)
3608                 qsort(first, n, sizeof(struct carc), carc_cmp);
3609 }
3610
3611 static int
3612 carc_cmp(const void *a, const void *b)
3613 {
3614         const struct carc *aa = (const struct carc *) a;
3615         const struct carc *bb = (const struct carc *) b;
3616
3617         if (aa->co < bb->co)
3618                 return -1;
3619         if (aa->co > bb->co)
3620                 return +1;
3621         if (aa->to < bb->to)
3622                 return -1;
3623         if (aa->to > bb->to)
3624                 return +1;
3625         /* This is unreached, since there should be no duplicate arcs now: */
3626         return 0;
3627 }
3628
3629 /*
3630  * freecnfa - free a compacted NFA
3631  */
3632 static void
3633 freecnfa(struct cnfa *cnfa)
3634 {
3635         assert(!NULLCNFA(*cnfa));       /* not empty already */
3636         FREE(cnfa->stflags);
3637         FREE(cnfa->states);
3638         FREE(cnfa->arcs);
3639         ZAPCNFA(*cnfa);
3640 }
3641
3642 /*
3643  * dumpnfa - dump an NFA in human-readable form
3644  */
3645 static void
3646 dumpnfa(struct nfa *nfa,
3647                 FILE *f)
3648 {
3649 #ifdef REG_DEBUG
3650         struct state *s;
3651         int                     nstates = 0;
3652         int                     narcs = 0;
3653
3654         fprintf(f, "pre %d, post %d", nfa->pre->no, nfa->post->no);
3655         if (nfa->bos[0] != COLORLESS)
3656                 fprintf(f, ", bos [%ld]", (long) nfa->bos[0]);
3657         if (nfa->bos[1] != COLORLESS)
3658                 fprintf(f, ", bol [%ld]", (long) nfa->bos[1]);
3659         if (nfa->eos[0] != COLORLESS)
3660                 fprintf(f, ", eos [%ld]", (long) nfa->eos[0]);
3661         if (nfa->eos[1] != COLORLESS)
3662                 fprintf(f, ", eol [%ld]", (long) nfa->eos[1]);
3663         if (nfa->flags & HASLACONS)
3664                 fprintf(f, ", haslacons");
3665         if (nfa->flags & HASCANTMATCH)
3666                 fprintf(f, ", hascantmatch");
3667         if (nfa->flags & MATCHALL)
3668         {
3669                 fprintf(f, ", minmatchall %d", nfa->minmatchall);
3670                 if (nfa->maxmatchall == DUPINF)
3671                         fprintf(f, ", maxmatchall inf");
3672                 else
3673                         fprintf(f, ", maxmatchall %d", nfa->maxmatchall);
3674         }
3675         fprintf(f, "\n");
3676         for (s = nfa->states; s != NULL; s = s->next)
3677         {
3678                 dumpstate(s, f);
3679                 nstates++;
3680                 narcs += s->nouts;
3681         }
3682         fprintf(f, "total of %d states, %d arcs\n", nstates, narcs);
3683         if (nfa->parent == NULL)
3684                 dumpcolors(nfa->cm, f);
3685         fflush(f);
3686 #endif
3687 }
3688
3689 #ifdef REG_DEBUG                                /* subordinates of dumpnfa */
3690
3691 /*
3692  * dumpstate - dump an NFA state in human-readable form
3693  */
3694 static void
3695 dumpstate(struct state *s,
3696                   FILE *f)
3697 {
3698         struct arc *a;
3699
3700         fprintf(f, "%d%s%c", s->no, (s->tmp != NULL) ? "T" : "",
3701                         (s->flag) ? s->flag : '.');
3702         if (s->prev != NULL && s->prev->next != s)
3703                 fprintf(f, "\tstate chain bad\n");
3704         if (s->nouts == 0)
3705                 fprintf(f, "\tno out arcs\n");
3706         else
3707                 dumparcs(s, f);
3708         for (a = s->ins; a != NULL; a = a->inchain)
3709         {
3710                 if (a->to != s)
3711                         fprintf(f, "\tlink from %d to %d on %d's in-chain\n",
3712                                         a->from->no, a->to->no, s->no);
3713         }
3714         fflush(f);
3715 }
3716
3717 /*
3718  * dumparcs - dump out-arcs in human-readable form
3719  */
3720 static void
3721 dumparcs(struct state *s,
3722                  FILE *f)
3723 {
3724         int                     pos;
3725         struct arc *a;
3726
3727         /* printing oldest arcs first is usually clearer */
3728         a = s->outs;
3729         assert(a != NULL);
3730         while (a->outchain != NULL)
3731                 a = a->outchain;
3732         pos = 1;
3733         do
3734         {
3735                 dumparc(a, s, f);
3736                 if (pos == 5)
3737                 {
3738                         fprintf(f, "\n");
3739                         pos = 1;
3740                 }
3741                 else
3742                         pos++;
3743                 a = a->outchainRev;
3744         } while (a != NULL);
3745         if (pos != 1)
3746                 fprintf(f, "\n");
3747 }
3748
3749 /*
3750  * dumparc - dump one outarc in readable form, including prefixing tab
3751  */
3752 static void
3753 dumparc(struct arc *a,
3754                 struct state *s,
3755                 FILE *f)
3756 {
3757         struct arc *aa;
3758
3759         fprintf(f, "\t");
3760         switch (a->type)
3761         {
3762                 case PLAIN:
3763                         if (a->co == RAINBOW)
3764                                 fprintf(f, "[*]");
3765                         else
3766                                 fprintf(f, "[%ld]", (long) a->co);
3767                         break;
3768                 case AHEAD:
3769                         if (a->co == RAINBOW)
3770                                 fprintf(f, ">*>");
3771                         else
3772                                 fprintf(f, ">%ld>", (long) a->co);
3773                         break;
3774                 case BEHIND:
3775                         if (a->co == RAINBOW)
3776                                 fprintf(f, "<*<");
3777                         else
3778                                 fprintf(f, "<%ld<", (long) a->co);
3779                         break;
3780                 case LACON:
3781                         fprintf(f, ":%ld:", (long) a->co);
3782                         break;
3783                 case '^':
3784                 case '$':
3785                         fprintf(f, "%c%d", a->type, (int) a->co);
3786                         break;
3787                 case EMPTY:
3788                         break;
3789                 case CANTMATCH:
3790                         fprintf(f, "X");
3791                         break;
3792                 default:
3793                         fprintf(f, "0x%x/0%lo", a->type, (long) a->co);
3794                         break;
3795         }
3796         if (a->from != s)
3797                 fprintf(f, "?%d?", a->from->no);
3798         for (aa = a->from->outs; aa != NULL; aa = aa->outchain)
3799                 if (aa == a)
3800                         break;                          /* NOTE BREAK OUT */
3801         if (aa == NULL)
3802                 fprintf(f, "?!?");              /* missing from out-chain */
3803         fprintf(f, "->");
3804         if (a->to == NULL)
3805         {
3806                 fprintf(f, "NULL");
3807                 return;
3808         }
3809         fprintf(f, "%d", a->to->no);
3810         for (aa = a->to->ins; aa != NULL; aa = aa->inchain)
3811                 if (aa == a)
3812                         break;                          /* NOTE BREAK OUT */
3813         if (aa == NULL)
3814                 fprintf(f, "?!?");              /* missing from in-chain */
3815 }
3816 #endif                                                  /* REG_DEBUG */
3817
3818 /*
3819  * dumpcnfa - dump a compacted NFA in human-readable form
3820  */
3821 #ifdef REG_DEBUG
3822 static void
3823 dumpcnfa(struct cnfa *cnfa,
3824                  FILE *f)
3825 {
3826         int                     st;
3827
3828         fprintf(f, "pre %d, post %d", cnfa->pre, cnfa->post);
3829         if (cnfa->bos[0] != COLORLESS)
3830                 fprintf(f, ", bos [%ld]", (long) cnfa->bos[0]);
3831         if (cnfa->bos[1] != COLORLESS)
3832                 fprintf(f, ", bol [%ld]", (long) cnfa->bos[1]);
3833         if (cnfa->eos[0] != COLORLESS)
3834                 fprintf(f, ", eos [%ld]", (long) cnfa->eos[0]);
3835         if (cnfa->eos[1] != COLORLESS)
3836                 fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]);
3837         if (cnfa->flags & HASLACONS)
3838                 fprintf(f, ", haslacons");
3839         if (cnfa->flags & MATCHALL)
3840         {
3841                 fprintf(f, ", minmatchall %d", cnfa->minmatchall);
3842                 if (cnfa->maxmatchall == DUPINF)
3843                         fprintf(f, ", maxmatchall inf");
3844                 else
3845                         fprintf(f, ", maxmatchall %d", cnfa->maxmatchall);
3846         }
3847         fprintf(f, "\n");
3848         for (st = 0; st < cnfa->nstates; st++)
3849                 dumpcstate(st, cnfa, f);
3850         fflush(f);
3851 }
3852 #endif
3853
3854 #ifdef REG_DEBUG                                /* subordinates of dumpcnfa */
3855
3856 /*
3857  * dumpcstate - dump a compacted-NFA state in human-readable form
3858  */
3859 static void
3860 dumpcstate(int st,
3861                    struct cnfa *cnfa,
3862                    FILE *f)
3863 {
3864         struct carc *ca;
3865         int                     pos;
3866
3867         fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : ".");
3868         pos = 1;
3869         for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
3870         {
3871                 if (ca->co == RAINBOW)
3872                         fprintf(f, "\t[*]->%d", ca->to);
3873                 else if (ca->co < cnfa->ncolors)
3874                         fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to);
3875                 else
3876                         fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to);
3877                 if (pos == 5)
3878                 {
3879                         fprintf(f, "\n");
3880                         pos = 1;
3881                 }
3882                 else
3883                         pos++;
3884         }
3885         if (ca == cnfa->states[st] || pos != 1)
3886                 fprintf(f, "\n");
3887         fflush(f);
3888 }
3889
3890 #endif                                                  /* REG_DEBUG */