lib/libc/regex/regcomp.c

   1 /*      $NetBSD: regcomp.c,v 1.33 2012/03/13 21:13:43 christos Exp $    */
   2
   3 /*-
   4  * Copyright (c) 1992, 1993, 1994
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * Henry Spencer.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)regcomp.c   8.5 (Berkeley) 3/20/94
  35  */
  36
  37 /*-
  38  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
  39  *
  40  * This code is derived from software contributed to Berkeley by
  41  * Henry Spencer.
  42  *
  43  * Redistribution and use in source and binary forms, with or without
  44  * modification, are permitted provided that the following conditions
  45  * are met:
  46  * 1. Redistributions of source code must retain the above copyright
  47  *    notice, this list of conditions and the following disclaimer.
  48  * 2. Redistributions in binary form must reproduce the above copyright
  49  *    notice, this list of conditions and the following disclaimer in the
  50  *    documentation and/or other materials provided with the distribution.
  51  * 3. All advertising materials mentioning features or use of this software
  52  *    must display the following acknowledgement:
  53  *      This product includes software developed by the University of
  54  *      California, Berkeley and its contributors.
  55  * 4. Neither the name of the University nor the names of its contributors
  56  *    may be used to endorse or promote products derived from this software
  57  *    without specific prior written permission.
  58  *
  59  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  60  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  61  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  62  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  63  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  64  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  65  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  66  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  67  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  68  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  69  * SUCH DAMAGE.
  70  *
  71  *      @(#)regcomp.c   8.5 (Berkeley) 3/20/94
  72  */
  73
  74 #include <sys/cdefs.h>
  75 #if defined(LIBC_SCCS) && !defined(lint)
  76 #if 0
  77 static char sccsid[] = "@(#)regcomp.c   8.5 (Berkeley) 3/20/94";
  78 #else
  79 __RCSID("$NetBSD: regcomp.c,v 1.33 2012/03/13 21:13:43 christos Exp $");
  80 #endif
  81 #endif /* LIBC_SCCS and not lint */
  82
  83 #include "namespace.h"
  84 #include <sys/types.h>
  85
  86 #include <assert.h>
  87 #include <ctype.h>
  88 #include <limits.h>
  89 #include <stdio.h>
  90 #include <stdlib.h>
  91 #include <string.h>
  92 #include <regex.h>
  93
  94 #ifdef __weak_alias
  95 __weak_alias(regcomp,_regcomp)
  96 #endif
  97
  98 #include "utils.h"
  99 #include "regex2.h"
 100
 101 #include "cclass.h"
 102 #include "cname.h"
 103
 104 /*
 105  * parse structure, passed up and down to avoid global variables and
 106  * other clumsinesses
 107  */
 108 struct parse {
 109         const char *next;       /* next character in RE */
 110         const char *end;        /* end of string (-> NUL normally) */
 111         int error;              /* has an error been seen? */
 112         sop *strip;             /* malloced strip */
 113         sopno ssize;            /* malloced strip size (allocated) */
 114         sopno slen;             /* malloced strip length (used) */
 115         size_t ncsalloc;        /* number of csets allocated */
 116         struct re_guts *g;
 117 #       define  NPAREN  10      /* we need to remember () 1-9 for back refs */
 118         sopno pbegin[NPAREN];   /* -> ( ([0] unused) */
 119         sopno pend[NPAREN];     /* -> ) ([0] unused) */
 120 };
 121
 122 /* ========= begin header generated by ./mkh ========= */
 123 #ifdef __cplusplus
 124 extern "C" {
 125 #endif
 126
 127 /* === regcomp.c === */
 128 static void p_ere(struct parse *p, int stop, size_t reclimit);
 129 static void p_ere_exp(struct parse *p, size_t reclimit);
 130 static void p_str(struct parse *p);
 131 static void p_bre(struct parse *p, int end1, int end2, size_t reclimit);
 132 static int p_simp_re(struct parse *p, int starordinary, size_t reclimit);
 133 static int p_count(struct parse *p);
 134 static void p_bracket(struct parse *p);
 135 static void p_b_term(struct parse *p, cset *cs);
 136 static void p_b_cclass(struct parse *p, cset *cs);
 137 static void p_b_eclass(struct parse *p, cset *cs);
 138 static char p_b_symbol(struct parse *p);
 139 static char p_b_coll_elem(struct parse *p, int endc);
 140 static int othercase(int ch);
 141 static void bothcases(struct parse *p, int ch);
 142 static void ordinary(struct parse *p, int ch);
 143 static void nonnewline(struct parse *p);
 144 static void repeat(struct parse *p, sopno start, int from, int to, size_t reclimit);
 145 static int seterr(struct parse *p, int e);
 146 static cset *allocset(struct parse *p);
 147 static void freeset(struct parse *p, cset *cs);
 148 static sopno freezeset(struct parse *p, cset *cs);
 149 static int firstch(struct parse *p, cset *cs);
 150 static int nch(struct parse *p, cset *cs);
 151 static void mcadd(struct parse *p, cset *cs, const char *cp);
 152 #if 0
 153 static void mcsub(cset *cs, char *cp);
 154 static int mcin(cset *cs, char *cp);
 155 static char *mcfind(cset *cs, char *cp);
 156 #endif
 157 static void mcinvert(struct parse *p, cset *cs);
 158 static void mccase(struct parse *p, cset *cs);
 159 static int isinsets(struct re_guts *g, int c);
 160 static int samesets(struct re_guts *g, int c1, int c2);
 161 static void categorize(struct parse *p, struct re_guts *g);
 162 static sopno dupl(struct parse *p, sopno start, sopno finish);
 163 static void doemit(struct parse *p, sop op, sopno opnd);
 164 static void doinsert(struct parse *p, sop op, sopno opnd, sopno pos);
 165 static void dofwd(struct parse *p, sopno pos, sopno value);
 166 static int enlarge(struct parse *p, sopno size);
 167 static void stripsnug(struct parse *p, struct re_guts *g);
 168 static void findmust(struct parse *p, struct re_guts *g);
 169 static sopno pluscount(struct parse *p, struct re_guts *g);
 170
 171 #ifdef __cplusplus
 172 }
 173 #endif
 174 /* ========= end header generated by ./mkh ========= */
 175
 176 static char nuls[10];           /* place to point scanner in event of error */
 177
 178 /*
 179  * macros for use with parse structure
 180  * BEWARE:  these know that the parse structure is named `p' !!!
 181  */
 182 #define PEEK()  (*p->next)
 183 #define PEEK2() (*(p->next+1))
 184 #define MORE()  (p->next < p->end)
 185 #define MORE2() (p->next+1 < p->end)
 186 #define SEE(c)  (MORE() && PEEK() == (c))
 187 #define SEETWO(a, b)    (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
 188 #define EAT(c)  ((SEE(c)) ? (NEXT(), 1) : 0)
 189 #define EATTWO(a, b)    ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
 190 #define NEXT()  (p->next++)
 191 #define NEXT2() (p->next += 2)
 192 #define NEXTn(n)        (p->next += (n))
 193 #define GETNEXT()       (*p->next++)
 194 #define SETERROR(e)     seterr(p, (e))
 195 #define REQUIRE(co, e)  (void) ((co) || SETERROR(e))
 196 #define MUSTSEE(c, e)   (REQUIRE(MORE() && PEEK() == (c), e))
 197 #define MUSTEAT(c, e)   (void) (REQUIRE(MORE() && GETNEXT() == (c), e))
 198 #define MUSTNOTSEE(c, e)        (REQUIRE(!MORE() || PEEK() != (c), e))
 199 #define EMIT(op, sopnd) doemit(p, (sop)(op), sopnd)
 200 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
 201 #define AHEAD(pos)              dofwd(p, pos, HERE()-(pos))
 202 #define ASTERN(sop, pos)        EMIT(sop, HERE()-pos)
 203 #define HERE()          (p->slen)
 204 #define THERE()         (p->slen - 1)
 205 #define THERETHERE()    (p->slen - 2)
 206 #define DROP(n) (p->slen -= (n))
 207
 208 #ifndef NDEBUG
 209 static int never = 0;           /* for use in asserts; shuts lint up */
 210 #else
 211 #define never   0               /* some <assert.h>s have bugs too */
 212 #endif
 213
 214 #define MEMLIMIT        0x8000000
 215 #define MEMSIZE(p) \
 216         ((p)->ncsalloc / CHAR_BIT * (p)->g->csetsize + \
 217         (p)->ncsalloc * sizeof(cset) + \
 218         (p)->ssize * sizeof(sop))
 219 #define RECLIMIT        256
 220
 221 /*
 222  - regcomp - interface for parser and compilation
 223  = extern int regcomp(regex_t *, const char *, int);
 224  = #define      REG_BASIC       0000
 225  = #define      REG_EXTENDED    0001
 226  = #define      REG_ICASE       0002
 227  = #define      REG_NOSUB       0004
 228  = #define      REG_NEWLINE     0010
 229  = #define      REG_NOSPEC      0020
 230  = #define      REG_PEND        0040
 231  = #define      REG_DUMP        0200
 232  */
 233 int                             /* 0 success, otherwise REG_something */
 234 regcomp(
 235     regex_t *preg,
 236     const char *pattern,
 237     int cflags)
 238 {
 239         struct parse pa;
 240         struct re_guts *g;
 241         struct parse *p = &pa;
 242         int i;
 243         size_t len;
 244 #ifdef REDEBUG
 245 #       define  GOODFLAGS(f)    (f)
 246 #else
 247 #       define  GOODFLAGS(f)    ((f)&~REG_DUMP)
 248 #endif
 249
 250         _DIAGASSERT(preg != NULL);
 251         _DIAGASSERT(pattern != NULL);
 252
 253         cflags = GOODFLAGS(cflags);
 254         if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
 255                 return(REG_INVARG);
 256
 257         if (cflags&REG_PEND) {
 258                 if (preg->re_endp < pattern)
 259                         return(REG_INVARG);
 260                 len = preg->re_endp - pattern;
 261         } else
 262                 len = strlen(pattern);
 263
 264         /* do the mallocs early so failure handling is easy */
 265         g = (struct re_guts *)malloc(sizeof(struct re_guts) +
 266                                                         (NC-1)*sizeof(cat_t));
 267         if (g == NULL)
 268                 return(REG_ESPACE);
 269         p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
 270         p->strip = malloc(p->ssize * sizeof(sop));
 271         p->slen = 0;
 272         if (p->strip == NULL) {
 273                 free(g);
 274                 return(REG_ESPACE);
 275         }
 276
 277         /* set things up */
 278         p->g = g;
 279         p->next = pattern;
 280         p->end = p->next + len;
 281         p->error = 0;
 282         p->ncsalloc = 0;
 283         for (i = 0; i < NPAREN; i++) {
 284                 p->pbegin[i] = 0;
 285                 p->pend[i] = 0;
 286         }
 287         g->csetsize = NC;
 288         g->sets = NULL;
 289         g->setbits = NULL;
 290         g->ncsets = 0;
 291         g->cflags = cflags;
 292         g->iflags = 0;
 293         g->nbol = 0;
 294         g->neol = 0;
 295         g->must = NULL;
 296         g->mlen = 0;
 297         g->nsub = 0;
 298         g->ncategories = 1;     /* category 0 is "everything else" */
 299         g->categories = &g->catspace[-(CHAR_MIN)];
 300         (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
 301         g->backrefs = 0;
 302
 303         /* do it */
 304         EMIT(OEND, 0);
 305         g->firststate = THERE();
 306         if (cflags&REG_EXTENDED)
 307                 p_ere(p, OUT, 0);
 308         else if (cflags&REG_NOSPEC)
 309                 p_str(p);
 310         else
 311                 p_bre(p, OUT, OUT, 0);
 312         EMIT(OEND, 0);
 313         g->laststate = THERE();
 314
 315         /* tidy up loose ends and fill things in */
 316         categorize(p, g);
 317         stripsnug(p, g);
 318         findmust(p, g);
 319         g->nplus = pluscount(p, g);
 320         g->magic = MAGIC2;
 321         preg->re_nsub = g->nsub;
 322         preg->re_g = g;
 323         preg->re_magic = MAGIC1;
 324 #ifndef REDEBUG
 325         /* not debugging, so can't rely on the assert() in regexec() */
 326         if (g->iflags&BAD)
 327                 SETERROR(REG_ASSERT);
 328 #endif
 329
 330         /* win or lose, we're done */
 331         if (p->error != 0)      /* lose */
 332                 regfree(preg);
 333         return(p->error);
 334 }
 335
 336 /*
 337  - p_ere - ERE parser top level, concatenation and alternation
 338  == static void p_ere(struct parse *p, int stop, size_t reclimit);
 339  */
 340 static void
 341 p_ere(
 342     struct parse *p,
 343     int stop,                   /* character this ERE should end at */
 344     size_t reclimit)
 345 {
 346         char c;
 347         sopno prevback = 0;     /* pacify gcc */
 348         sopno prevfwd = 0;      /* pacify gcc */
 349         sopno conc;
 350         int first = 1;          /* is this the first alternative? */
 351
 352         _DIAGASSERT(p != NULL);
 353
 354         if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) {
 355                 p->error = REG_ESPACE;
 356                 return;
 357         }
 358
 359         for (;;) {
 360                 /* do a bunch of concatenated expressions */
 361                 conc = HERE();
 362                 while (MORE() && (c = PEEK()) != '|' && c != stop)
 363                         p_ere_exp(p, reclimit);
 364                 REQUIRE(HERE() != conc, REG_EMPTY);     /* require nonempty */
 365
 366                 if (!EAT('|'))
 367                         break;          /* NOTE BREAK OUT */
 368
 369                 if (first) {
 370                         INSERT(OCH_, conc);     /* offset is wrong */
 371                         prevfwd = conc;
 372                         prevback = conc;
 373                         first = 0;
 374                 }
 375                 ASTERN(OOR1, prevback);
 376                 prevback = THERE();
 377                 AHEAD(prevfwd);                 /* fix previous offset */
 378                 prevfwd = HERE();
 379                 EMIT(OOR2, 0);                  /* offset is very wrong */
 380         }
 381
 382         if (!first) {           /* tail-end fixups */
 383                 AHEAD(prevfwd);
 384                 ASTERN(O_CH, prevback);
 385         }
 386
 387         assert(!MORE() || SEE(stop));
 388 }
 389
 390 /*
 391  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
 392  == static void p_ere_exp(struct parse *p, size_t reclimit);
 393  */
 394 static void
 395 p_ere_exp(
 396     struct parse *p,
 397     size_t reclimit)
 398 {
 399         char c;
 400         sopno pos;
 401         int count;
 402         int count2;
 403         sopno subno;
 404         int wascaret = 0;
 405
 406         _DIAGASSERT(p != NULL);
 407
 408         assert(MORE());         /* caller should have ensured this */
 409         c = GETNEXT();
 410
 411         pos = HERE();
 412         switch (c) {
 413         case '(':
 414                 REQUIRE(MORE(), REG_EPAREN);
 415                 p->g->nsub++;
 416                 subno = p->g->nsub;
 417                 if (subno < NPAREN)
 418                         p->pbegin[subno] = HERE();
 419                 EMIT(OLPAREN, subno);
 420                 if (!SEE(')'))
 421                         p_ere(p, ')', reclimit);
 422                 if (subno < NPAREN) {
 423                         p->pend[subno] = HERE();
 424                         assert(p->pend[subno] != 0);
 425                 }
 426                 EMIT(ORPAREN, subno);
 427                 MUSTEAT(')', REG_EPAREN);
 428                 break;
 429 #ifndef POSIX_MISTAKE
 430         case ')':               /* happens only if no current unmatched ( */
 431                 /*
 432                  * You may ask, why the ifndef?  Because I didn't notice
 433                  * this until slightly too late for 1003.2, and none of the
 434                  * other 1003.2 regular-expression reviewers noticed it at
 435                  * all.  So an unmatched ) is legal POSIX, at least until
 436                  * we can get it fixed.
 437                  */
 438                 SETERROR(REG_EPAREN);
 439                 break;
 440 #endif
 441         case '^':
 442                 EMIT(OBOL, 0);
 443                 p->g->iflags |= USEBOL;
 444                 p->g->nbol++;
 445                 wascaret = 1;
 446                 break;
 447         case '$':
 448                 EMIT(OEOL, 0);
 449                 p->g->iflags |= USEEOL;
 450                 p->g->neol++;
 451                 break;
 452         case '|':
 453                 SETERROR(REG_EMPTY);
 454                 break;
 455         case '*':
 456         case '+':
 457         case '?':
 458                 SETERROR(REG_BADRPT);
 459                 break;
 460         case '.':
 461                 if (p->g->cflags&REG_NEWLINE)
 462                         nonnewline(p);
 463                 else
 464                         EMIT(OANY, 0);
 465                 break;
 466         case '[':
 467                 p_bracket(p);
 468                 break;
 469         case '\\':
 470                 REQUIRE(MORE(), REG_EESCAPE);
 471                 c = GETNEXT();
 472                 ordinary(p, c);
 473                 break;
 474         case '{':               /* okay as ordinary except if digit follows */
 475                 REQUIRE(!MORE() || !isdigit((unsigned char)PEEK()), REG_BADRPT);
 476                 /* FALLTHROUGH */
 477         default:
 478                 ordinary(p, c);
 479                 break;
 480         }
 481
 482         if (!MORE())
 483                 return;
 484         c = PEEK();
 485         /* we call { a repetition if followed by a digit */
 486         if (!( c == '*' || c == '+' || c == '?' ||
 487             (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) ))
 488                 return;         /* no repetition, we're done */
 489         NEXT();
 490
 491         REQUIRE(!wascaret, REG_BADRPT);
 492         switch (c) {
 493         case '*':       /* implemented as +? */
 494                 /* this case does not require the (y|) trick, noKLUDGE */
 495                 INSERT(OPLUS_, pos);
 496                 ASTERN(O_PLUS, pos);
 497                 INSERT(OQUEST_, pos);
 498                 ASTERN(O_QUEST, pos);
 499                 break;
 500         case '+':
 501                 INSERT(OPLUS_, pos);
 502                 ASTERN(O_PLUS, pos);
 503                 break;
 504         case '?':
 505                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
 506                 INSERT(OCH_, pos);              /* offset slightly wrong */
 507                 ASTERN(OOR1, pos);              /* this one's right */
 508                 AHEAD(pos);                     /* fix the OCH_ */
 509                 EMIT(OOR2, 0);                  /* offset very wrong... */
 510                 AHEAD(THERE());                 /* ...so fix it */
 511                 ASTERN(O_CH, THERETHERE());
 512                 break;
 513         case '{':
 514                 count = p_count(p);
 515                 if (EAT(',')) {
 516                         if (isdigit((unsigned char)PEEK())) {
 517                                 count2 = p_count(p);
 518                                 REQUIRE(count <= count2, REG_BADBR);
 519                         } else          /* single number with comma */
 520                                 count2 = INFINITY;
 521                 } else          /* just a single number */
 522                         count2 = count;
 523                 repeat(p, pos, count, count2, 0);
 524                 if (!EAT('}')) {        /* error heuristics */
 525                         while (MORE() && PEEK() != '}')
 526                                 NEXT();
 527                         REQUIRE(MORE(), REG_EBRACE);
 528                         SETERROR(REG_BADBR);
 529                 }
 530                 break;
 531         }
 532
 533         if (!MORE())
 534                 return;
 535         c = PEEK();
 536         if (!( c == '*' || c == '+' || c == '?' ||
 537             (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) ) )
 538                 return;
 539         SETERROR(REG_BADRPT);
 540 }
 541
 542 /*
 543  - p_str - string (no metacharacters) "parser"
 544  == static void p_str(struct parse *p);
 545  */
 546 static void
 547 p_str(
 548     struct parse *p)
 549 {
 550
 551         _DIAGASSERT(p != NULL);
 552
 553         REQUIRE(MORE(), REG_EMPTY);
 554         while (MORE())
 555                 ordinary(p, GETNEXT());
 556 }
 557
 558 /*
 559  - p_bre - BRE parser top level, anchoring and concatenation
 560  == static void p_bre(struct parse *p, int end1, \
 561  ==     int end2, size_t reclimit);
 562  * Giving end1 as OUT essentially eliminates the end1/end2 check.
 563  *
 564  * This implementation is a bit of a kludge, in that a trailing $ is first
 565  * taken as an ordinary character and then revised to be an anchor.  The
 566  * only undesirable side effect is that '$' gets included as a character
 567  * category in such cases.  This is fairly harmless; not worth fixing.
 568  * The amount of lookahead needed to avoid this kludge is excessive.
 569  */
 570 static void
 571 p_bre(
 572     struct parse *p,
 573     int end1,           /* first terminating character */
 574     int end2,           /* second terminating character */
 575     size_t reclimit)
 576 {
 577         sopno start;
 578         int first = 1;                  /* first subexpression? */
 579         int wasdollar = 0;
 580
 581         _DIAGASSERT(p != NULL);
 582
 583         if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) {
 584                 p->error = REG_ESPACE;
 585                 return;
 586         }
 587
 588         start = HERE();
 589
 590         if (EAT('^')) {
 591                 EMIT(OBOL, 0);
 592                 p->g->iflags |= USEBOL;
 593                 p->g->nbol++;
 594         }
 595         while (MORE() && !SEETWO(end1, end2)) {
 596                 wasdollar = p_simp_re(p, first, reclimit);
 597                 first = 0;
 598         }
 599         if (wasdollar) {        /* oops, that was a trailing anchor */
 600                 DROP(1);
 601                 EMIT(OEOL, 0);
 602                 p->g->iflags |= USEEOL;
 603                 p->g->neol++;
 604         }
 605
 606         REQUIRE(HERE() != start, REG_EMPTY);    /* require nonempty */
 607 }
 608
 609 /*
 610  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
 611  == static int p_simp_re(struct parse *p, int starordinary, size_t reclimit);
 612  */
 613 static int                      /* was the simple RE an unbackslashed $? */
 614 p_simp_re(
 615     struct parse *p,
 616     int starordinary,           /* is a leading * an ordinary character? */
 617     size_t reclimit)
 618 {
 619         int c;
 620         int count;
 621         int count2;
 622         sopno pos, i;
 623         sopno subno;
 624 #       define  BACKSL  (1<<CHAR_BIT)
 625
 626         _DIAGASSERT(p != NULL);
 627
 628         pos = HERE();           /* repetion op, if any, covers from here */
 629
 630         assert(MORE());         /* caller should have ensured this */
 631         c = GETNEXT();
 632         if (c == '\\') {
 633                 REQUIRE(MORE(), REG_EESCAPE);
 634                 c = BACKSL | (unsigned char)GETNEXT();
 635         }
 636         switch (c) {
 637         case '.':
 638                 if (p->g->cflags&REG_NEWLINE)
 639                         nonnewline(p);
 640                 else
 641                         EMIT(OANY, 0);
 642                 break;
 643         case '[':
 644                 p_bracket(p);
 645                 break;
 646         case BACKSL|'{':
 647                 SETERROR(REG_BADRPT);
 648                 break;
 649         case BACKSL|'(':
 650                 p->g->nsub++;
 651                 subno = p->g->nsub;
 652                 if (subno < NPAREN)
 653                         p->pbegin[subno] = HERE();
 654                 EMIT(OLPAREN, subno);
 655                 /* the MORE here is an error heuristic */
 656                 if (MORE() && !SEETWO('\\', ')'))
 657                         p_bre(p, '\\', ')', reclimit);
 658                 if (subno < NPAREN) {
 659                         p->pend[subno] = HERE();
 660                         assert(p->pend[subno] != 0);
 661                 }
 662                 EMIT(ORPAREN, subno);
 663                 REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
 664                 break;
 665         case BACKSL|')':        /* should not get here -- must be user */
 666         case BACKSL|'}':
 667                 SETERROR(REG_EPAREN);
 668                 break;
 669         case BACKSL|'1':
 670         case BACKSL|'2':
 671         case BACKSL|'3':
 672         case BACKSL|'4':
 673         case BACKSL|'5':
 674         case BACKSL|'6':
 675         case BACKSL|'7':
 676         case BACKSL|'8':
 677         case BACKSL|'9':
 678                 i = (c&~BACKSL) - '0';
 679                 assert(i < NPAREN);
 680                 if (p->pend[i] != 0) {
 681                         assert(i <= p->g->nsub);
 682                         EMIT(OBACK_, i);
 683                         assert(p->pbegin[i] != 0);
 684                         assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
 685                         assert(OP(p->strip[p->pend[i]]) == ORPAREN);
 686                         (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
 687                         EMIT(O_BACK, i);
 688                 } else
 689                         SETERROR(REG_ESUBREG);
 690                 p->g->backrefs = 1;
 691                 break;
 692         case '*':
 693                 REQUIRE(starordinary, REG_BADRPT);
 694                 /* FALLTHROUGH */
 695         default:
 696                 ordinary(p, c &~ BACKSL);
 697                 break;
 698         }
 699
 700         if (EAT('*')) {         /* implemented as +? */
 701                 /* this case does not require the (y|) trick, noKLUDGE */
 702                 INSERT(OPLUS_, pos);
 703                 ASTERN(O_PLUS, pos);
 704                 INSERT(OQUEST_, pos);
 705                 ASTERN(O_QUEST, pos);
 706         } else if (EATTWO('\\', '{')) {
 707                 count = p_count(p);
 708                 if (EAT(',')) {
 709                         if (MORE() && isdigit((unsigned char)PEEK())) {
 710                                 count2 = p_count(p);
 711                                 REQUIRE(count <= count2, REG_BADBR);
 712                         } else          /* single number with comma */
 713                                 count2 = INFINITY;
 714                 } else          /* just a single number */
 715                         count2 = count;
 716                 repeat(p, pos, count, count2, 0);
 717                 if (!EATTWO('\\', '}')) {       /* error heuristics */
 718                         while (MORE() && !SEETWO('\\', '}'))
 719                                 NEXT();
 720                         REQUIRE(MORE(), REG_EBRACE);
 721                         SETERROR(REG_BADBR);
 722                 }
 723         } else if (c == (unsigned char)'$')     /* $ (but not \$) ends it */
 724                 return(1);
 725
 726         return(0);
 727 }
 728
 729 /*
 730  - p_count - parse a repetition count
 731  == static int p_count(struct parse *p);
 732  */
 733 static int                      /* the value */
 734 p_count(
 735     struct parse *p)
 736 {
 737         int count = 0;
 738         int ndigits = 0;
 739
 740         _DIAGASSERT(p != NULL);
 741
 742         while (MORE() && isdigit((unsigned char)PEEK()) && count <= DUPMAX) {
 743                 count = count*10 + (GETNEXT() - '0');
 744                 ndigits++;
 745         }
 746
 747         REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
 748         return(count);
 749 }
 750
 751 /*
 752  - p_bracket - parse a bracketed character list
 753  == static void p_bracket(struct parse *p);
 754  *
 755  * Note a significant property of this code:  if the allocset() did SETERROR,
 756  * no set operations are done.
 757  */
 758 static void
 759 p_bracket(
 760     struct parse *p)
 761 {
 762         cset *cs;
 763         int invert = 0;
 764         _DIAGASSERT(p != NULL);
 765
 766         cs = allocset(p);
 767         if (cs == NULL)
 768                 return;
 769
 770         /* Dept of Truly Sickening Special-Case Kludges */
 771         if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]",
 772                                             (size_t)6) == 0) {
 773                 EMIT(OBOW, 0);
 774                 NEXTn(6);
 775                 return;
 776         }
 777         if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]",
 778                                             (size_t)6) == 0) {
 779                 EMIT(OEOW, 0);
 780                 NEXTn(6);
 781                 return;
 782         }
 783
 784         if (EAT('^'))
 785                 invert++;       /* make note to invert set at end */
 786         if (EAT(']'))
 787                 CHadd(cs, ']');
 788         else if (EAT('-'))
 789                 CHadd(cs, '-');
 790         while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
 791                 p_b_term(p, cs);
 792         if (EAT('-'))
 793                 CHadd(cs, '-');
 794         MUSTEAT(']', REG_EBRACK);
 795
 796         if (p->error != 0)      /* don't mess things up further */
 797                 return;
 798
 799         if (p->g->cflags&REG_ICASE) {
 800                 ssize_t i;
 801                 int ci;
 802
 803                 for (i = p->g->csetsize - 1; i >= 0; i--)
 804                         if (CHIN(cs, i) && isalpha(i)) {
 805                                 ci = othercase((int)i);
 806                                 if (ci != i)
 807                                         CHadd(cs, ci);
 808                         }
 809                 if (cs->multis != NULL)
 810                         mccase(p, cs);
 811         }
 812         if (invert) {
 813                 ssize_t i;
 814
 815                 for (i = p->g->csetsize - 1; i >= 0; i--)
 816                         if (CHIN(cs, i))
 817                                 CHsub(cs, (int)i);
 818                         else
 819                                 CHadd(cs, (int)i);
 820                 if (p->g->cflags&REG_NEWLINE)
 821                         CHsub(cs, '\n');
 822                 if (cs->multis != NULL)
 823                         mcinvert(p, cs);
 824         }
 825
 826         assert(cs->multis == NULL);             /* xxx */
 827
 828         if (nch(p, cs) == 1) {          /* optimize singleton sets */
 829                 ordinary(p, firstch(p, cs));
 830                 freeset(p, cs);
 831         } else
 832                 EMIT(OANYOF, freezeset(p, cs));
 833 }
 834
 835 /*
 836  - p_b_term - parse one term of a bracketed character list
 837  == static void p_b_term(struct parse *p, cset *cs);
 838  */
 839 static void
 840 p_b_term(
 841     struct parse *p,
 842     cset *cs)
 843 {
 844         char c;
 845         char start, finish;
 846         int i;
 847
 848         _DIAGASSERT(p != NULL);
 849         _DIAGASSERT(cs != NULL);
 850
 851         /* classify what we've got */
 852         switch ((MORE()) ? PEEK() : '\0') {
 853         case '[':
 854                 c = (MORE2()) ? PEEK2() : '\0';
 855                 break;
 856
 857         case '-':
 858                 SETERROR(REG_ERANGE);
 859                 return;                 /* NOTE RETURN */
 860
 861         default:
 862                 c = '\0';
 863                 break;
 864         }
 865
 866         switch (c) {
 867         case ':':               /* character class */
 868                 NEXT2();
 869                 REQUIRE(MORE(), REG_EBRACK);
 870                 c = PEEK();
 871                 REQUIRE(c != '-' && c != ']', REG_ECTYPE);
 872                 p_b_cclass(p, cs);
 873                 REQUIRE(MORE(), REG_EBRACK);
 874                 REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
 875                 break;
 876         case '=':               /* equivalence class */
 877                 NEXT2();
 878                 REQUIRE(MORE(), REG_EBRACK);
 879                 c = PEEK();
 880                 REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
 881                 p_b_eclass(p, cs);
 882                 REQUIRE(MORE(), REG_EBRACK);
 883                 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
 884                 break;
 885         default:                /* symbol, ordinary character, or range */
 886 /* xxx revision needed for multichar stuff */
 887                 start = p_b_symbol(p);
 888                 if (SEE('-') && MORE2() && PEEK2() != ']') {
 889                         /* range */
 890                         NEXT();
 891                         if (EAT('-'))
 892                                 finish = '-';
 893                         else
 894                                 finish = p_b_symbol(p);
 895                 } else
 896                         finish = start;
 897 /* xxx what about signed chars here... */
 898                 REQUIRE(start <= finish, REG_ERANGE);
 899                 for (i = start; i <= finish; i++)
 900                         CHadd(cs, i);
 901                 break;
 902         }
 903 }
 904
 905 /*
 906  - p_b_cclass - parse a character-class name and deal with it
 907  == static void p_b_cclass(struct parse *p, cset *cs);
 908  */
 909 static void
 910 p_b_cclass(
 911     struct parse *p,
 912     cset *cs)
 913 {
 914         const char *sp;
 915         const struct cclass *cp;
 916         size_t len;
 917         const char *u;
 918         char c;
 919
 920         _DIAGASSERT(p != NULL);
 921         _DIAGASSERT(cs != NULL);
 922
 923         sp = p->next;
 924
 925         while (MORE() && isalpha((unsigned char)PEEK()))
 926                 NEXT();
 927         len = p->next - sp;
 928         for (cp = cclasses; cp->name != NULL; cp++)
 929                 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
 930                         break;
 931         if (cp->name == NULL) {
 932                 /* oops, didn't find it */
 933                 SETERROR(REG_ECTYPE);
 934                 return;
 935         }
 936
 937         u = cp->chars;
 938         while ((c = *u++) != '\0')
 939                 CHadd(cs, c);
 940         for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
 941                 MCadd(p, cs, u);
 942 }
 943
 944 /*
 945  - p_b_eclass - parse an equivalence-class name and deal with it
 946  == static void p_b_eclass(struct parse *p, cset *cs);
 947  *
 948  * This implementation is incomplete. xxx
 949  */
 950 static void
 951 p_b_eclass(
 952     struct parse *p,
 953     cset *cs)
 954 {
 955         char c;
 956
 957         _DIAGASSERT(p != NULL);
 958         _DIAGASSERT(cs != NULL);
 959
 960         c = p_b_coll_elem(p, '=');
 961         CHadd(cs, c);
 962 }
 963
 964 /*
 965  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
 966  == static char p_b_symbol(struct parse *p);
 967  */
 968 static char                     /* value of symbol */
 969 p_b_symbol(
 970     struct parse *p)
 971 {
 972         char value;
 973
 974         _DIAGASSERT(p != NULL);
 975
 976         REQUIRE(MORE(), REG_EBRACK);
 977         if (!EATTWO('[', '.'))
 978                 return(GETNEXT());
 979
 980         /* collating symbol */
 981         value = p_b_coll_elem(p, '.');
 982         REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
 983         return(value);
 984 }
 985
 986 /*
 987  - p_b_coll_elem - parse a collating-element name and look it up
 988  == static char p_b_coll_elem(struct parse *p, int endc);
 989  */
 990 static char                     /* value of collating element */
 991 p_b_coll_elem(
 992     struct parse *p,
 993     int endc)                   /* name ended by endc,']' */
 994 {
 995         const char *sp;
 996         const struct cname *cp;
 997         size_t len;
 998
 999         _DIAGASSERT(p != NULL);
1000
1001         sp = p->next;
1002
1003         while (MORE() && !SEETWO(endc, ']'))
1004                 NEXT();
1005         if (!MORE()) {
1006                 SETERROR(REG_EBRACK);
1007                 return(0);
1008         }
1009         len = p->next - sp;
1010         for (cp = cnames; cp->name != NULL; cp++)
1011                 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
1012                         return(cp->code);       /* known name */
1013         if (len == 1)
1014                 return(*sp);    /* single character */
1015         SETERROR(REG_ECOLLATE);                 /* neither */
1016         return(0);
1017 }
1018
1019 /*
1020  - othercase - return the case counterpart of an alphabetic
1021  == static int othercase(int ch);
1022  */
1023 static int                      /* if no counterpart, return ch */
1024 othercase(
1025     int ch)
1026 {
1027         assert(isalpha(ch));
1028         if (isupper(ch))
1029                 return(tolower(ch));
1030         else if (islower(ch))
1031                 return(toupper(ch));
1032         else                    /* peculiar, but could happen */
1033                 return(ch);
1034 }
1035
1036 /*
1037  - bothcases - emit a dualcase version of a two-case character
1038  == static void bothcases(struct parse *p, int ch);
1039  *
1040  * Boy, is this implementation ever a kludge...
1041  */
1042 static void
1043 bothcases(
1044     struct parse *p,
1045     int ch)
1046 {
1047         const char *oldnext;
1048         const char *oldend;
1049         char bracket[3];
1050
1051         _DIAGASSERT(p != NULL);
1052
1053         oldnext = p->next;
1054         oldend = p->end;
1055
1056         assert(othercase(ch) != ch);    /* p_bracket() would recurse */
1057         p->next = bracket;
1058         p->end = bracket+2;
1059         bracket[0] = ch;
1060         bracket[1] = ']';
1061         bracket[2] = '\0';
1062         p_bracket(p);
1063         assert(p->next == bracket+2);
1064         p->next = oldnext;
1065         p->end = oldend;
1066 }
1067
1068 /*
1069  - ordinary - emit an ordinary character
1070  == static void ordinary(struct parse *p, int ch);
1071  */
1072 static void
1073 ordinary(
1074     struct parse *p,
1075     int ch)
1076 {
1077         cat_t *cap;
1078
1079         _DIAGASSERT(p != NULL);
1080
1081         cap = p->g->categories;
1082         if ((p->g->cflags&REG_ICASE) && isalpha((unsigned char) ch)
1083             && othercase((unsigned char) ch) != (unsigned char) ch)
1084                 bothcases(p, (unsigned char) ch);
1085         else {
1086                 EMIT(OCHAR, (sopno)(unsigned char)ch);
1087                 if (cap[ch] == 0) {
1088                         _DIAGASSERT(__type_fit(unsigned char,
1089                             p->g->ncategories + 1));
1090                         cap[ch] = (unsigned char)p->g->ncategories++;
1091                 }
1092         }
1093 }
1094
1095 /*
1096  - nonnewline - emit REG_NEWLINE version of OANY
1097  == static void nonnewline(struct parse *p);
1098  *
1099  * Boy, is this implementation ever a kludge...
1100  */
1101 static void
1102 nonnewline(
1103     struct parse *p)
1104 {
1105         const char *oldnext;
1106         const char *oldend;
1107         char bracket[4];
1108
1109         _DIAGASSERT(p != NULL);
1110
1111         oldnext = p->next;
1112         oldend = p->end;
1113
1114         p->next = bracket;
1115         p->end = bracket+3;
1116         bracket[0] = '^';
1117         bracket[1] = '\n';
1118         bracket[2] = ']';
1119         bracket[3] = '\0';
1120         p_bracket(p);
1121         assert(p->next == bracket+3);
1122         p->next = oldnext;
1123         p->end = oldend;
1124 }
1125
1126 /*
1127  - repeat - generate code for a bounded repetition, recursively if needed
1128  == static void repeat(struct parse *p, sopno start, int from, int to,
1129  == size_t reclimit);
1130  */
1131 static void
1132 repeat(
1133     struct parse *p,
1134     sopno start,                /* operand from here to end of strip */
1135     int from,                   /* repeated from this number */
1136     int to,                     /* to this number of times (maybe INFINITY) */
1137     size_t reclimit)
1138 {
1139         sopno finish;
1140 #       define  N       2
1141 #       define  INF     3
1142 #       define  REP(f, t)       ((f)*8 + (t))
1143 #       define  MAP(n)  (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1144         sopno copy;
1145
1146         _DIAGASSERT(p != NULL);
1147
1148         if (reclimit++ > RECLIMIT)
1149                 p->error = REG_ESPACE;
1150         if (p->error)
1151                 return;
1152
1153         finish = HERE();
1154
1155         assert(from <= to);
1156
1157         switch (REP(MAP(from), MAP(to))) {
1158         case REP(0, 0):                 /* must be user doing this */
1159                 DROP(finish-start);     /* drop the operand */
1160                 break;
1161         case REP(0, 1):                 /* as x{1,1}? */
1162         case REP(0, N):                 /* as x{1,n}? */
1163         case REP(0, INF):               /* as x{1,}? */
1164                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1165                 INSERT(OCH_, start);            /* offset is wrong... */
1166                 repeat(p, start+1, 1, to, reclimit);
1167                 ASTERN(OOR1, start);
1168                 AHEAD(start);                   /* ... fix it */
1169                 EMIT(OOR2, 0);
1170                 AHEAD(THERE());
1171                 ASTERN(O_CH, THERETHERE());
1172                 break;
1173         case REP(1, 1):                 /* trivial case */
1174                 /* done */
1175                 break;
1176         case REP(1, N):                 /* as x?x{1,n-1} */
1177                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1178                 INSERT(OCH_, start);
1179                 ASTERN(OOR1, start);
1180                 AHEAD(start);
1181                 EMIT(OOR2, 0);                  /* offset very wrong... */
1182                 AHEAD(THERE());                 /* ...so fix it */
1183                 ASTERN(O_CH, THERETHERE());
1184                 copy = dupl(p, start+1, finish+1);
1185                 assert(copy == finish+4);
1186                 repeat(p, copy, 1, to-1, reclimit);
1187                 break;
1188         case REP(1, INF):               /* as x+ */
1189                 INSERT(OPLUS_, start);
1190                 ASTERN(O_PLUS, start);
1191                 break;
1192         case REP(N, N):                 /* as xx{m-1,n-1} */
1193                 copy = dupl(p, start, finish);
1194                 repeat(p, copy, from-1, to-1, reclimit);
1195                 break;
1196         case REP(N, INF):               /* as xx{n-1,INF} */
1197                 copy = dupl(p, start, finish);
1198                 repeat(p, copy, from-1, to, reclimit);
1199                 break;
1200         default:                        /* "can't happen" */
1201                 SETERROR(REG_ASSERT);   /* just in case */
1202                 break;
1203         }
1204 }
1205
1206 /*
1207  - seterr - set an error condition
1208  == static int seterr(struct parse *p, int e);
1209  */
1210 static int                      /* useless but makes type checking happy */
1211 seterr(
1212     struct parse *p,
1213     int e)
1214 {
1215
1216         _DIAGASSERT(p != NULL);
1217
1218         if (p->error == 0)      /* keep earliest error condition */
1219                 p->error = e;
1220         p->next = nuls;         /* try to bring things to a halt */
1221         p->end = nuls;
1222         return(0);              /* make the return value well-defined */
1223 }
1224
1225 /*
1226  - allocset - allocate a set of characters for []
1227  == static cset *allocset(struct parse *p);
1228  */
1229 static cset *
1230 allocset(
1231     struct parse *p)
1232 {
1233         size_t no;
1234         size_t nc;
1235         size_t nbytes;
1236         cset *cs;
1237         size_t css;
1238         size_t i;
1239
1240         _DIAGASSERT(p != NULL);
1241
1242         no = p->g->ncsets++;
1243         css = (size_t)p->g->csetsize;
1244         if (no >= p->ncsalloc) {        /* need another column of space */
1245                 p->ncsalloc += CHAR_BIT;
1246                 nc = p->ncsalloc;
1247                 assert(nc % CHAR_BIT == 0);
1248                 nbytes = nc / CHAR_BIT * css;
1249                 if (MEMSIZE(p) > MEMLIMIT)
1250                         goto oomem;
1251                 if (p->g->sets == NULL)
1252                         p->g->sets = malloc(nc * sizeof(cset));
1253                 else
1254                         p->g->sets = realloc(p->g->sets, nc * sizeof(cset));
1255                 if (p->g->setbits == NULL)
1256                         p->g->setbits = malloc(nbytes);
1257                 else {
1258                         p->g->setbits = realloc(p->g->setbits, nbytes);
1259                         /* xxx this isn't right if setbits is now NULL */
1260                         for (i = 0; i < no; i++)
1261                                 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1262                 }
1263                 if (p->g->sets != NULL && p->g->setbits != NULL)
1264                         (void) memset((char *)p->g->setbits + (nbytes - css),
1265                                                                 0, css);
1266                 else {
1267 oomem:
1268                         no = 0;
1269                         SETERROR(REG_ESPACE);
1270                         /* caller's responsibility not to do set ops */
1271                         return NULL;
1272                 }
1273         }
1274
1275         cs = &p->g->sets[no];
1276         cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1277         cs->mask = 1 << (unsigned int)((no) % CHAR_BIT);
1278         cs->hash = 0;
1279         cs->smultis = 0;
1280         cs->multis = NULL;
1281
1282         return(cs);
1283 }
1284
1285 /*
1286  - freeset - free a now-unused set
1287  == static void freeset(struct parse *p, cset *cs);
1288  */
1289 static void
1290 freeset(
1291     struct parse *p,
1292     cset *cs)
1293 {
1294         size_t i;
1295         cset *top;
1296         size_t css;
1297
1298         _DIAGASSERT(p != NULL);
1299         _DIAGASSERT(cs != NULL);
1300
1301         top = &p->g->sets[p->g->ncsets];
1302         css = (size_t)p->g->csetsize;
1303
1304         for (i = 0; i < css; i++)
1305                 CHsub(cs, (int)i);
1306         if (cs == top-1)        /* recover only the easy case */
1307                 p->g->ncsets--;
1308 }
1309
1310 /*
1311  - freezeset - final processing on a set of characters
1312  == static int freezeset(struct parse *p, cset *cs);
1313  *
1314  * The main task here is merging identical sets.  This is usually a waste
1315  * of time (although the hash code minimizes the overhead), but can win
1316  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1317  * is done using addition rather than xor -- all ASCII [aA] sets xor to
1318  * the same value!
1319  */
1320 static sopno                    /* set number */
1321 freezeset(
1322     struct parse *p,
1323     cset *cs)
1324 {
1325         uch h;
1326         size_t i;
1327         cset *top;
1328         cset *cs2;
1329         size_t css;
1330
1331         _DIAGASSERT(p != NULL);
1332         _DIAGASSERT(cs != NULL);
1333
1334         h = cs->hash;
1335         top = &p->g->sets[p->g->ncsets];
1336         css = (size_t)p->g->csetsize;
1337
1338         /* look for an earlier one which is the same */
1339         for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1340                 if (cs2->hash == h && cs2 != cs) {
1341                         /* maybe */
1342                         for (i = 0; i < css; i++)
1343                                 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1344                                         break;          /* no */
1345                         if (i == css)
1346                                 break;                  /* yes */
1347                 }
1348
1349         if (cs2 < top) {        /* found one */
1350                 freeset(p, cs);
1351                 cs = cs2;
1352         }
1353
1354         return (sopno)(cs - p->g->sets);
1355 }
1356
1357 /*
1358  - firstch - return first character in a set (which must have at least one)
1359  == static int firstch(struct parse *p, cset *cs);
1360  */
1361 static int                      /* character; there is no "none" value */
1362 firstch(
1363     struct parse *p,
1364     cset *cs)
1365 {
1366         size_t i;
1367         size_t css;
1368
1369         _DIAGASSERT(p != NULL);
1370         _DIAGASSERT(cs != NULL);
1371
1372         css = (size_t)p->g->csetsize;
1373
1374         for (i = 0; i < css; i++)
1375                 if (CHIN(cs, i))
1376                         return((char)i);
1377         assert(never);
1378         return(0);              /* arbitrary */
1379 }
1380
1381 /*
1382  - nch - number of characters in a set
1383  == static int nch(struct parse *p, cset *cs);
1384  */
1385 static int
1386 nch(
1387     struct parse *p,
1388     cset *cs)
1389 {
1390         size_t i;
1391         size_t css;
1392         int n = 0;
1393
1394         _DIAGASSERT(p != NULL);
1395         _DIAGASSERT(cs != NULL);
1396
1397         css = (size_t)p->g->csetsize;
1398
1399         for (i = 0; i < css; i++)
1400                 if (CHIN(cs, i))
1401                         n++;
1402         return(n);
1403 }
1404
1405 /*
1406  - mcadd - add a collating element to a cset
1407  == static void mcadd(struct parse *p, cset *cs, \
1408  ==     char *cp);
1409  */
1410 static void
1411 mcadd(
1412     struct parse *p,
1413     cset *cs,
1414     const char *cp)
1415 {
1416         size_t oldend;
1417
1418         _DIAGASSERT(p != NULL);
1419         _DIAGASSERT(cs != NULL);
1420         _DIAGASSERT(cp != NULL);
1421
1422         oldend = cs->smultis;
1423
1424         cs->smultis += strlen(cp) + 1;
1425         if (cs->multis == NULL)
1426                 cs->multis = malloc(cs->smultis);
1427         else
1428                 cs->multis = realloc(cs->multis, cs->smultis);
1429         if (cs->multis == NULL) {
1430                 SETERROR(REG_ESPACE);
1431                 return;
1432         }
1433
1434         (void) strcpy(cs->multis + oldend - 1, cp);
1435         cs->multis[cs->smultis - 1] = '\0';
1436 }
1437
1438 #if 0
1439 /*
1440  - mcsub - subtract a collating element from a cset
1441  == static void mcsub(cset *cs, char *cp);
1442  */
1443 static void
1444 mcsub(
1445     cset *cs,
1446     char *cp)
1447 {
1448         char *fp;
1449         size_t len;
1450
1451         _DIAGASSERT(cs != NULL);
1452         _DIAGASSERT(cp != NULL);
1453
1454         fp = mcfind(cs, cp);
1455         len = strlen(fp);
1456
1457         assert(fp != NULL);
1458         (void) memmove(fp, fp + len + 1,
1459                                 cs->smultis - (fp + len + 1 - cs->multis));
1460         cs->smultis -= len;
1461
1462         if (cs->smultis == 0) {
1463                 free(cs->multis);
1464                 cs->multis = NULL;
1465                 return;
1466         }
1467
1468         cs->multis = realloc(cs->multis, cs->smultis);
1469         assert(cs->multis != NULL);
1470 }
1471
1472 /*
1473  - mcin - is a collating element in a cset?
1474  == static int mcin(cset *cs, char *cp);
1475  */
1476 static int
1477 mcin(
1478     cset *cs,
1479     char *cp)
1480 {
1481
1482         _DIAGASSERT(cs != NULL);
1483         _DIAGASSERT(cp != NULL);
1484
1485         return(mcfind(cs, cp) != NULL);
1486 }
1487
1488 /*
1489  - mcfind - find a collating element in a cset
1490  == static char *mcfind(cset *cs, char *cp);
1491  */
1492 static char *
1493 mcfind(
1494     cset *cs,
1495     char *cp)
1496 {
1497         char *p;
1498
1499         _DIAGASSERT(cs != NULL);
1500         _DIAGASSERT(cp != NULL);
1501
1502         if (cs->multis == NULL)
1503                 return(NULL);
1504         for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1505                 if (strcmp(cp, p) == 0)
1506                         return(p);
1507         return(NULL);
1508 }
1509 #endif
1510
1511 /*
1512  - mcinvert - invert the list of collating elements in a cset
1513  == static void mcinvert(struct parse *p, cset *cs);
1514  *
1515  * This would have to know the set of possibilities.  Implementation
1516  * is deferred.
1517  */
1518 /* ARGSUSED */
1519 static void
1520 mcinvert(
1521     struct parse *p,
1522     cset *cs)
1523 {
1524
1525         _DIAGASSERT(p != NULL);
1526         _DIAGASSERT(cs != NULL);
1527
1528         assert(cs->multis == NULL);     /* xxx */
1529 }
1530
1531 /*
1532  - mccase - add case counterparts of the list of collating elements in a cset
1533  == static void mccase(struct parse *p, cset *cs);
1534  *
1535  * This would have to know the set of possibilities.  Implementation
1536  * is deferred.
1537  */
1538 /* ARGSUSED */
1539 static void
1540 mccase(
1541     struct parse *p,
1542     cset *cs)
1543 {
1544
1545         _DIAGASSERT(p != NULL);
1546         _DIAGASSERT(cs != NULL);
1547
1548         assert(cs->multis == NULL);     /* xxx */
1549 }
1550
1551 /*
1552  - isinsets - is this character in any sets?
1553  == static int isinsets(struct re_guts *g, int c);
1554  */
1555 static int                      /* predicate */
1556 isinsets(
1557     struct re_guts *g,
1558     int c)
1559 {
1560         uch *col;
1561         size_t i;
1562         size_t ncols;
1563         unsigned uc = (unsigned char)c;
1564
1565         _DIAGASSERT(g != NULL);
1566
1567         if (g->setbits == NULL)
1568                 return 0;
1569
1570         ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1571
1572         for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1573                 if (col[uc] != 0)
1574                         return(1);
1575         return(0);
1576 }
1577
1578 /*
1579  - samesets - are these two characters in exactly the same sets?
1580  == static int samesets(struct re_guts *g, int c1, int c2);
1581  */
1582 static int                      /* predicate */
1583 samesets(
1584     struct re_guts *g,
1585     int c1,
1586     int c2)
1587 {
1588         uch *col;
1589         size_t i;
1590         size_t ncols;
1591         unsigned uc1 = (unsigned char)c1;
1592         unsigned uc2 = (unsigned char)c2;
1593
1594         _DIAGASSERT(g != NULL);
1595
1596         ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1597
1598         for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1599                 if (col[uc1] != col[uc2])
1600                         return(0);
1601         return(1);
1602 }
1603
1604 /*
1605  - categorize - sort out character categories
1606  == static void categorize(struct parse *p, struct re_guts *g);
1607  */
1608 static void
1609 categorize(
1610     struct parse *p,
1611     struct re_guts *g)
1612 {
1613         cat_t *cats;
1614         int c;
1615         int c2;
1616         cat_t cat;
1617
1618         _DIAGASSERT(p != NULL);
1619         _DIAGASSERT(g != NULL);
1620
1621         cats = g->categories;
1622
1623         /* avoid making error situations worse */
1624         if (p->error != 0)
1625                 return;
1626
1627         for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1628                 if (cats[c] == 0 && isinsets(g, c)) {
1629                         _DIAGASSERT(__type_fit(unsigned char,
1630                             g->ncategories + 1));
1631                         cat = g->ncategories++;
1632                         cats[c] = cat;
1633                         for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1634                                 if (cats[c2] == 0 && samesets(g, c, c2))
1635                                         cats[c2] = cat;
1636                 }
1637 }
1638
1639 /*
1640  - dupl - emit a duplicate of a bunch of sops
1641  == static sopno dupl(struct parse *p, sopno start, sopno finish);
1642  */
1643 static sopno                    /* start of duplicate */
1644 dupl(
1645     struct parse *p,
1646     sopno start,                        /* from here */
1647     sopno finish)                       /* to this less one */
1648 {
1649         sopno ret;
1650         sopno len = finish - start;
1651
1652         _DIAGASSERT(p != NULL);
1653
1654         ret = HERE();
1655
1656         assert(finish >= start);
1657         if (len == 0)
1658                 return(ret);
1659         if (!enlarge(p, p->ssize + len))/* this many unexpected additions */
1660                 return ret;
1661         (void)memcpy(p->strip + p->slen, p->strip + start,
1662             (size_t)len * sizeof(sop));
1663         p->slen += len;
1664         return(ret);
1665 }
1666
1667 /*
1668  - doemit - emit a strip operator
1669  == static void doemit(struct parse *p, sop op, size_t opnd);
1670  *
1671  * It might seem better to implement this as a macro with a function as
1672  * hard-case backup, but it's just too big and messy unless there are
1673  * some changes to the data structures.  Maybe later.
1674  */
1675 static void
1676 doemit(
1677     struct parse *p,
1678     sop op,
1679     sopno opnd)
1680 {
1681         _DIAGASSERT(p != NULL);
1682
1683         /* avoid making error situations worse */
1684         if (p->error != 0)
1685                 return;
1686
1687         /* deal with oversize operands ("can't happen", more or less) */
1688         assert(opnd < 1<<OPSHIFT);
1689
1690         /* deal with undersized strip */
1691         if (p->slen >= p->ssize)
1692                 if (!enlarge(p, (p->ssize+1) / 2 * 3))  /* +50% */
1693                         return;
1694
1695         /* finally, it's all reduced to the easy case */
1696         p->strip[p->slen++] = (sop)SOP(op, opnd);
1697 }
1698
1699 /*
1700  - doinsert - insert a sop into the strip
1701  == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1702  */
1703 static void
1704 doinsert(
1705     struct parse *p,
1706     sop op,
1707     sopno opnd,
1708     sopno pos)
1709 {
1710         sopno sn;
1711         sop s;
1712         int i;
1713
1714         _DIAGASSERT(p != NULL);
1715
1716         /* avoid making error situations worse */
1717         if (p->error != 0)
1718                 return;
1719
1720         sn = HERE();
1721         EMIT(op, opnd);         /* do checks, ensure space */
1722         assert(HERE() == sn+1);
1723         s = p->strip[sn];
1724
1725         /* adjust paren pointers */
1726         assert(pos > 0);
1727         for (i = 1; i < NPAREN; i++) {
1728                 if (p->pbegin[i] >= pos) {
1729                         p->pbegin[i]++;
1730                 }
1731                 if (p->pend[i] >= pos) {
1732                         p->pend[i]++;
1733                 }
1734         }
1735
1736         memmove(&p->strip[pos+1], &p->strip[pos], (HERE()-pos-1)*sizeof(sop));
1737         p->strip[pos] = s;
1738 }
1739
1740 /*
1741  - dofwd - complete a forward reference
1742  == static void dofwd(struct parse *p, sopno pos, sop value);
1743  */
1744 static void
1745 dofwd(
1746     struct parse *p,
1747     sopno pos,
1748     sopno value)
1749 {
1750
1751         _DIAGASSERT(p != NULL);
1752
1753         /* avoid making error situations worse */
1754         if (p->error != 0)
1755                 return;
1756
1757         assert(value < 1<<OPSHIFT);
1758         p->strip[pos] = (sop)(OP(p->strip[pos]) | value);
1759 }
1760
1761 /*
1762  - enlarge - enlarge the strip
1763  == static void enlarge(struct parse *p, sopno size);
1764  */
1765 static int
1766 enlarge(
1767     struct parse *p,
1768     sopno size)
1769 {
1770         sop *sp;
1771         sopno osize;
1772
1773         _DIAGASSERT(p != NULL);
1774
1775         if (p->ssize >= size)
1776                 return 1;
1777
1778         osize = p->ssize;
1779         p->ssize = size;
1780         if (MEMSIZE(p) > MEMLIMIT)
1781                 goto oomem;
1782         sp = realloc(p->strip, p->ssize * sizeof(sop));
1783         if (sp == NULL) {
1784 oomem:
1785                 p->ssize = osize;
1786                 SETERROR(REG_ESPACE);
1787                 return 0;
1788         }
1789         p->strip = sp;
1790         return 1;
1791 }
1792
1793 /*
1794  - stripsnug - compact the strip
1795  == static void stripsnug(struct parse *p, struct re_guts *g);
1796  */
1797 static void
1798 stripsnug(
1799     struct parse *p,
1800     struct re_guts *g)
1801 {
1802
1803         _DIAGASSERT(p != NULL);
1804         _DIAGASSERT(g != NULL);
1805
1806         g->nstates = p->slen;
1807         g->strip = realloc(p->strip, p->slen * sizeof(sop));
1808         if (g->strip == NULL) {
1809                 SETERROR(REG_ESPACE);
1810                 g->strip = p->strip;
1811         }
1812 }
1813
1814 /*
1815  - findmust - fill in must and mlen with longest mandatory literal string
1816  == static void findmust(struct parse *p, struct re_guts *g);
1817  *
1818  * This algorithm could do fancy things like analyzing the operands of |
1819  * for common subsequences.  Someday.  This code is simple and finds most
1820  * of the interesting cases.
1821  *
1822  * Note that must and mlen got initialized during setup.
1823  */
1824 static void
1825 findmust(
1826     struct parse *p,
1827     struct re_guts *g)
1828 {
1829         sop *scan;
1830         sop *start = NULL;
1831         sop *newstart = NULL;
1832         sopno newlen;
1833         sop s;
1834         char *cp;
1835         sopno i;
1836
1837         _DIAGASSERT(p != NULL);
1838         _DIAGASSERT(g != NULL);
1839
1840         /* avoid making error situations worse */
1841         if (p->error != 0)
1842                 return;
1843
1844         /* find the longest OCHAR sequence in strip */
1845         newlen = 0;
1846         scan = g->strip + 1;
1847         do {
1848                 s = *scan++;
1849                 switch (OP(s)) {
1850                 case OCHAR:             /* sequence member */
1851                         if (newlen == 0)                /* new sequence */
1852                                 newstart = scan - 1;
1853                         newlen++;
1854                         break;
1855                 case OPLUS_:            /* things that don't break one */
1856                 case OLPAREN:
1857                 case ORPAREN:
1858                         break;
1859                 case OQUEST_:           /* things that must be skipped */
1860                 case OCH_:
1861                         scan--;
1862                         do {
1863                                 scan += OPND(s);
1864                                 s = *scan;
1865                                 /* assert() interferes w debug printouts */
1866                                 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1867                                                         OP(s) != OOR2) {
1868                                         g->iflags |= BAD;
1869                                         return;
1870                                 }
1871                         } while (OP(s) != O_QUEST && OP(s) != O_CH);
1872                         /* FALLTHROUGH */
1873                 default:                /* things that break a sequence */
1874                         if (newlen > g->mlen) {         /* ends one */
1875                                 start = newstart;
1876                                 g->mlen = newlen;
1877                         }
1878                         newlen = 0;
1879                         break;
1880                 }
1881         } while (OP(s) != OEND);
1882
1883         if (start == NULL)
1884                 g->mlen = 0;
1885
1886         if (g->mlen == 0)       /* there isn't one */
1887                 return;
1888
1889         /* turn it into a character string */
1890         g->must = malloc((size_t)g->mlen + 1);
1891         if (g->must == NULL) {          /* argh; just forget it */
1892                 g->mlen = 0;
1893                 return;
1894         }
1895         cp = g->must;
1896         scan = start;
1897         for (i = g->mlen; i > 0; i--) {
1898                 while (OP(s = *scan++) != OCHAR)
1899                         continue;
1900                 assert(cp < g->must + g->mlen);
1901                 *cp++ = (char)OPND(s);
1902         }
1903         assert(cp == g->must + g->mlen);
1904         *cp++ = '\0';           /* just on general principles */
1905 }
1906
1907 /*
1908  - pluscount - count + nesting
1909  == static sopno pluscount(struct parse *p, struct re_guts *g);
1910  */
1911 static sopno                    /* nesting depth */
1912 pluscount(
1913     struct parse *p,
1914     struct re_guts *g)
1915 {
1916         sop *scan;
1917         sop s;
1918         sopno plusnest = 0;
1919         sopno maxnest = 0;
1920
1921         _DIAGASSERT(p != NULL);
1922         _DIAGASSERT(g != NULL);
1923
1924         if (p->error != 0)
1925                 return(0);      /* there may not be an OEND */
1926
1927         scan = g->strip + 1;
1928         do {
1929                 s = *scan++;
1930                 switch (OP(s)) {
1931                 case OPLUS_:
1932                         plusnest++;
1933                         break;
1934                 case O_PLUS:
1935                         if (plusnest > maxnest)
1936                                 maxnest = plusnest;
1937                         plusnest--;
1938                         break;
1939                 }
1940         } while (OP(s) != OEND);
1941         if (plusnest != 0)
1942                 g->iflags |= BAD;
1943         return(maxnest);
1944 }