lib/Support/regcomp.c

   1 /*-
   2  * This code is derived from OpenBSD's libc/regex, original license follows:
   3  *
   4  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
   5  * Copyright (c) 1992, 1993, 1994
   6  *      The Regents of the University of California.  All rights reserved.
   7  *
   8  * This code is derived from software contributed to Berkeley by
   9  * Henry Spencer.
  10  *
  11  * Redistribution and use in source and binary forms, with or without
  12  * modification, are permitted provided that the following conditions
  13  * are met:
  14  * 1. Redistributions of source code must retain the above copyright
  15  *    notice, this list of conditions and the following disclaimer.
  16  * 2. Redistributions in binary form must reproduce the above copyright
  17  *    notice, this list of conditions and the following disclaimer in the
  18  *    documentation and/or other materials provided with the distribution.
  19  * 3. Neither the name of the University nor the names of its contributors
  20  *    may be used to endorse or promote products derived from this software
  21  *    without specific prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33  * SUCH DAMAGE.
  34  *
  35  *      @(#)regcomp.c   8.5 (Berkeley) 3/20/94
  36  */
  37
  38 #include <sys/types.h>
  39 #include <stdint.h>
  40 #include <stdio.h>
  41 #include <string.h>
  42 #include <ctype.h>
  43 #include <limits.h>
  44 #include <stdlib.h>
  45 #include "regex_impl.h"
  46
  47 #include "regutils.h"
  48 #include "regex2.h"
  49
  50 #include "llvm/Config/config.h"
  51 #include "llvm/Support/Compiler.h"
  52
  53 /* character-class table */
  54 static struct cclass {
  55         const char *name;
  56         const char *chars;
  57         const char *multis;
  58 } cclasses[] = {
  59         { "alnum",      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
  60 0123456789",                            ""} ,
  61         { "alpha",      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
  62                                         ""} ,
  63         { "blank",      " \t",          ""} ,
  64         { "cntrl",      "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\
  65 \25\26\27\30\31\32\33\34\35\36\37\177", ""} ,
  66         { "digit",      "0123456789",   ""} ,
  67         { "graph",      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
  68 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
  69                                         ""} ,
  70         { "lower",      "abcdefghijklmnopqrstuvwxyz",
  71                                         ""} ,
  72         { "print",      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
  73 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ",
  74                                         ""} ,
  75         { "punct",      "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
  76                                         ""} ,
  77         { "space",      "\t\n\v\f\r ",  ""} ,
  78         { "upper",      "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
  79                                         ""} ,
  80         { "xdigit",     "0123456789ABCDEFabcdef",
  81                                         ""} ,
  82         { NULL,         0,              "" }
  83 };
  84
  85 /* character-name table */
  86 static struct cname {
  87         const char *name;
  88         char code;
  89 } cnames[] = {
  90         { "NUL",                        '\0' },
  91         { "SOH",                        '\001' },
  92         { "STX",                        '\002' },
  93         { "ETX",                        '\003' },
  94         { "EOT",                        '\004' },
  95         { "ENQ",                        '\005' },
  96         { "ACK",                        '\006' },
  97         { "BEL",                        '\007' },
  98         { "alert",                      '\007' },
  99         { "BS",                         '\010' },
 100         { "backspace",                  '\b' },
 101         { "HT",                         '\011' },
 102         { "tab",                        '\t' },
 103         { "LF",                         '\012' },
 104         { "newline",                    '\n' },
 105         { "VT",                         '\013' },
 106         { "vertical-tab",               '\v' },
 107         { "FF",                         '\014' },
 108         { "form-feed",                  '\f' },
 109         { "CR",                         '\015' },
 110         { "carriage-return",            '\r' },
 111         { "SO",                         '\016' },
 112         { "SI",                         '\017' },
 113         { "DLE",                        '\020' },
 114         { "DC1",                        '\021' },
 115         { "DC2",                        '\022' },
 116         { "DC3",                        '\023' },
 117         { "DC4",                        '\024' },
 118         { "NAK",                        '\025' },
 119         { "SYN",                        '\026' },
 120         { "ETB",                        '\027' },
 121         { "CAN",                        '\030' },
 122         { "EM",                         '\031' },
 123         { "SUB",                        '\032' },
 124         { "ESC",                        '\033' },
 125         { "IS4",                        '\034' },
 126         { "FS",                         '\034' },
 127         { "IS3",                        '\035' },
 128         { "GS",                         '\035' },
 129         { "IS2",                        '\036' },
 130         { "RS",                         '\036' },
 131         { "IS1",                        '\037' },
 132         { "US",                         '\037' },
 133         { "space",                      ' ' },
 134         { "exclamation-mark",           '!' },
 135         { "quotation-mark",             '"' },
 136         { "number-sign",                '#' },
 137         { "dollar-sign",                '$' },
 138         { "percent-sign",               '%' },
 139         { "ampersand",                  '&' },
 140         { "apostrophe",                 '\'' },
 141         { "left-parenthesis",           '(' },
 142         { "right-parenthesis",          ')' },
 143         { "asterisk",                   '*' },
 144         { "plus-sign",                  '+' },
 145         { "comma",                      ',' },
 146         { "hyphen",                     '-' },
 147         { "hyphen-minus",               '-' },
 148         { "period",                     '.' },
 149         { "full-stop",                  '.' },
 150         { "slash",                      '/' },
 151         { "solidus",                    '/' },
 152         { "zero",                       '0' },
 153         { "one",                        '1' },
 154         { "two",                        '2' },
 155         { "three",                      '3' },
 156         { "four",                       '4' },
 157         { "five",                       '5' },
 158         { "six",                        '6' },
 159         { "seven",                      '7' },
 160         { "eight",                      '8' },
 161         { "nine",                       '9' },
 162         { "colon",                      ':' },
 163         { "semicolon",                  ';' },
 164         { "less-than-sign",             '<' },
 165         { "equals-sign",                '=' },
 166         { "greater-than-sign",          '>' },
 167         { "question-mark",              '?' },
 168         { "commercial-at",              '@' },
 169         { "left-square-bracket",        '[' },
 170         { "backslash",                  '\\' },
 171         { "reverse-solidus",            '\\' },
 172         { "right-square-bracket",       ']' },
 173         { "circumflex",                 '^' },
 174         { "circumflex-accent",          '^' },
 175         { "underscore",                 '_' },
 176         { "low-line",                   '_' },
 177         { "grave-accent",               '`' },
 178         { "left-brace",                 '{' },
 179         { "left-curly-bracket",         '{' },
 180         { "vertical-line",              '|' },
 181         { "right-brace",                '}' },
 182         { "right-curly-bracket",        '}' },
 183         { "tilde",                      '~' },
 184         { "DEL",                        '\177' },
 185         { NULL,                         0 }
 186 };
 187
 188 /*
 189  * parse structure, passed up and down to avoid global variables and
 190  * other clumsinesses
 191  */
 192 struct parse {
 193         char *next;             /* next character in RE */
 194         char *end;              /* end of string (-> NUL normally) */
 195         int error;              /* has an error been seen? */
 196         sop *strip;             /* malloced strip */
 197         sopno ssize;            /* malloced strip size (allocated) */
 198         sopno slen;             /* malloced strip length (used) */
 199         int ncsalloc;           /* number of csets allocated */
 200         struct re_guts *g;
 201 #       define  NPAREN  10      /* we need to remember () 1-9 for back refs */
 202         sopno pbegin[NPAREN];   /* -> ( ([0] unused) */
 203         sopno pend[NPAREN];     /* -> ) ([0] unused) */
 204 };
 205
 206 static void p_ere(struct parse *, int);
 207 static void p_ere_exp(struct parse *);
 208 static void p_str(struct parse *);
 209 static void p_bre(struct parse *, int, int);
 210 static int p_simp_re(struct parse *, int);
 211 static int p_count(struct parse *);
 212 static void p_bracket(struct parse *);
 213 static void p_b_term(struct parse *, cset *);
 214 static void p_b_cclass(struct parse *, cset *);
 215 static void p_b_eclass(struct parse *, cset *);
 216 static char p_b_symbol(struct parse *);
 217 static char p_b_coll_elem(struct parse *, int);
 218 static char othercase(int);
 219 static void bothcases(struct parse *, int);
 220 static void ordinary(struct parse *, int);
 221 static void nonnewline(struct parse *);
 222 static void repeat(struct parse *, sopno, int, int);
 223 static int seterr(struct parse *, int);
 224 static cset *allocset(struct parse *);
 225 static void freeset(struct parse *, cset *);
 226 static int freezeset(struct parse *, cset *);
 227 static int firstch(struct parse *, cset *);
 228 static int nch(struct parse *, cset *);
 229 static void mcadd(struct parse *, cset *, const char *);
 230 static void mcinvert(struct parse *, cset *);
 231 static void mccase(struct parse *, cset *);
 232 static int isinsets(struct re_guts *, int);
 233 static int samesets(struct re_guts *, int, int);
 234 static void categorize(struct parse *, struct re_guts *);
 235 static sopno dupl(struct parse *, sopno, sopno);
 236 static void doemit(struct parse *, sop, size_t);
 237 static void doinsert(struct parse *, sop, size_t, sopno);
 238 static void dofwd(struct parse *, sopno, sop);
 239 static void enlarge(struct parse *, sopno);
 240 static void stripsnug(struct parse *, struct re_guts *);
 241 static void findmust(struct parse *, struct re_guts *);
 242 static sopno pluscount(struct parse *, struct re_guts *);
 243
 244 static char nuls[10];           /* place to point scanner in event of error */
 245
 246 /*
 247  * macros for use with parse structure
 248  * BEWARE:  these know that the parse structure is named `p' !!!
 249  */
 250 #define PEEK()  (*p->next)
 251 #define PEEK2() (*(p->next+1))
 252 #define MORE()  (p->next < p->end)
 253 #define MORE2() (p->next+1 < p->end)
 254 #define SEE(c)  (MORE() && PEEK() == (c))
 255 #define SEETWO(a, b)    (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
 256 #define EAT(c)  ((SEE(c)) ? (NEXT(), 1) : 0)
 257 #define EATTWO(a, b)    ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
 258 #define NEXT()  (p->next++)
 259 #define NEXT2() (p->next += 2)
 260 #define NEXTn(n)        (p->next += (n))
 261 #define GETNEXT()       (*p->next++)
 262 #define SETERROR(e)     seterr(p, (e))
 263 #define REQUIRE(co, e)  (void)((co) || SETERROR(e))
 264 #define MUSTSEE(c, e)   (REQUIRE(MORE() && PEEK() == (c), e))
 265 #define MUSTEAT(c, e)   (REQUIRE(MORE() && GETNEXT() == (c), e))
 266 #define MUSTNOTSEE(c, e)        (REQUIRE(!MORE() || PEEK() != (c), e))
 267 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
 268 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
 269 #define AHEAD(pos)              dofwd(p, pos, HERE()-(pos))
 270 #define ASTERN(sop, pos)        EMIT(sop, HERE()-pos)
 271 #define HERE()          (p->slen)
 272 #define THERE()         (p->slen - 1)
 273 #define THERETHERE()    (p->slen - 2)
 274 #define DROP(n) (p->slen -= (n))
 275
 276 #ifdef  _POSIX2_RE_DUP_MAX
 277 #define DUPMAX  _POSIX2_RE_DUP_MAX
 278 #else
 279 #define DUPMAX  255
 280 #endif
 281 #define INFINITY        (DUPMAX + 1)
 282
 283 #ifndef NDEBUG
 284 static int never = 0;           /* for use in asserts; shuts lint up */
 285 #else
 286 #define never   0               /* some <assert.h>s have bugs too */
 287 #endif
 288
 289 /*
 290  - llvm_regcomp - interface for parser and compilation
 291  */
 292 int                             /* 0 success, otherwise REG_something */
 293 llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
 294 {
 295         struct parse pa;
 296         struct re_guts *g;
 297         struct parse *p = &pa;
 298         int i;
 299         size_t len;
 300 #ifdef REDEBUG
 301 #       define  GOODFLAGS(f)    (f)
 302 #else
 303 #       define  GOODFLAGS(f)    ((f)&~REG_DUMP)
 304 #endif
 305
 306         cflags = GOODFLAGS(cflags);
 307         if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
 308                 return(REG_INVARG);
 309
 310         if (cflags&REG_PEND) {
 311                 if (preg->re_endp < pattern)
 312                         return(REG_INVARG);
 313                 len = preg->re_endp - pattern;
 314         } else
 315                 len = strlen((const char *)pattern);
 316
 317         /* do the mallocs early so failure handling is easy */
 318         g = (struct re_guts *)malloc(sizeof(struct re_guts) +
 319                                                         (NC-1)*sizeof(cat_t));
 320         if (g == NULL)
 321                 return(REG_ESPACE);
 322         p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
 323         p->strip = (sop *)calloc(p->ssize, sizeof(sop));
 324         p->slen = 0;
 325         if (p->strip == NULL) {
 326                 free((char *)g);
 327                 return(REG_ESPACE);
 328         }
 329
 330         /* set things up */
 331         p->g = g;
 332         p->next = (char *)pattern;      /* convenience; we do not modify it */
 333         p->end = p->next + len;
 334         p->error = 0;
 335         p->ncsalloc = 0;
 336         for (i = 0; i < NPAREN; i++) {
 337                 p->pbegin[i] = 0;
 338                 p->pend[i] = 0;
 339         }
 340         g->csetsize = NC;
 341         g->sets = NULL;
 342         g->setbits = NULL;
 343         g->ncsets = 0;
 344         g->cflags = cflags;
 345         g->iflags = 0;
 346         g->nbol = 0;
 347         g->neol = 0;
 348         g->must = NULL;
 349         g->mlen = 0;
 350         g->nsub = 0;
 351         g->ncategories = 1;     /* category 0 is "everything else" */
 352         g->categories = &g->catspace[-(CHAR_MIN)];
 353         (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
 354         g->backrefs = 0;
 355
 356         /* do it */
 357         EMIT(OEND, 0);
 358         g->firststate = THERE();
 359         if (cflags&REG_EXTENDED)
 360                 p_ere(p, OUT);
 361         else if (cflags&REG_NOSPEC)
 362                 p_str(p);
 363         else
 364                 p_bre(p, OUT, OUT);
 365         EMIT(OEND, 0);
 366         g->laststate = THERE();
 367
 368         /* tidy up loose ends and fill things in */
 369         categorize(p, g);
 370         stripsnug(p, g);
 371         findmust(p, g);
 372         g->nplus = pluscount(p, g);
 373         g->magic = MAGIC2;
 374         preg->re_nsub = g->nsub;
 375         preg->re_g = g;
 376         preg->re_magic = MAGIC1;
 377 #ifndef REDEBUG
 378         /* not debugging, so can't rely on the assert() in llvm_regexec() */
 379         if (g->iflags&REGEX_BAD)
 380                 SETERROR(REG_ASSERT);
 381 #endif
 382
 383         /* win or lose, we're done */
 384         if (p->error != 0)      /* lose */
 385                 llvm_regfree(preg);
 386         return(p->error);
 387 }
 388
 389 /*
 390  - p_ere - ERE parser top level, concatenation and alternation
 391  */
 392 static void
 393 p_ere(struct parse *p, int stop)        /* character this ERE should end at */
 394 {
 395         char c;
 396         sopno prevback = 0;
 397         sopno prevfwd = 0;
 398         sopno conc;
 399         int first = 1;          /* is this the first alternative? */
 400
 401         for (;;) {
 402                 /* do a bunch of concatenated expressions */
 403                 conc = HERE();
 404                 while (MORE() && (c = PEEK()) != '|' && c != stop)
 405                         p_ere_exp(p);
 406                 REQUIRE(HERE() != conc, REG_EMPTY);     /* require nonempty */
 407
 408                 if (!EAT('|'))
 409                         break;          /* NOTE BREAK OUT */
 410
 411                 if (first) {
 412                         INSERT(OCH_, conc);     /* offset is wrong */
 413                         prevfwd = conc;
 414                         prevback = conc;
 415                         first = 0;
 416                 }
 417                 ASTERN(OOR1, prevback);
 418                 prevback = THERE();
 419                 AHEAD(prevfwd);                 /* fix previous offset */
 420                 prevfwd = HERE();
 421                 EMIT(OOR2, 0);                  /* offset is very wrong */
 422         }
 423
 424         if (!first) {           /* tail-end fixups */
 425                 AHEAD(prevfwd);
 426                 ASTERN(O_CH, prevback);
 427         }
 428
 429         assert(!MORE() || SEE(stop));
 430 }
 431
 432 /*
 433  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
 434  */
 435 static void
 436 p_ere_exp(struct parse *p)
 437 {
 438         char c;
 439         sopno pos;
 440         int count;
 441         int count2;
 442         int backrefnum;
 443         sopno subno;
 444         int wascaret = 0;
 445
 446         assert(MORE());         /* caller should have ensured this */
 447         c = GETNEXT();
 448
 449         pos = HERE();
 450         switch (c) {
 451         case '(':
 452                 REQUIRE(MORE(), REG_EPAREN);
 453                 p->g->nsub++;
 454                 subno = p->g->nsub;
 455                 if (subno < NPAREN)
 456                         p->pbegin[subno] = HERE();
 457                 EMIT(OLPAREN, subno);
 458                 if (!SEE(')'))
 459                         p_ere(p, ')');
 460                 if (subno < NPAREN) {
 461                         p->pend[subno] = HERE();
 462                         assert(p->pend[subno] != 0);
 463                 }
 464                 EMIT(ORPAREN, subno);
 465                 MUSTEAT(')', REG_EPAREN);
 466                 break;
 467 #ifndef POSIX_MISTAKE
 468         case ')':               /* happens only if no current unmatched ( */
 469                 /*
 470                  * You may ask, why the ifndef?  Because I didn't notice
 471                  * this until slightly too late for 1003.2, and none of the
 472                  * other 1003.2 regular-expression reviewers noticed it at
 473                  * all.  So an unmatched ) is legal POSIX, at least until
 474                  * we can get it fixed.
 475                  */
 476                 SETERROR(REG_EPAREN);
 477                 break;
 478 #endif
 479         case '^':
 480                 EMIT(OBOL, 0);
 481                 p->g->iflags |= USEBOL;
 482                 p->g->nbol++;
 483                 wascaret = 1;
 484                 break;
 485         case '$':
 486                 EMIT(OEOL, 0);
 487                 p->g->iflags |= USEEOL;
 488                 p->g->neol++;
 489                 break;
 490         case '|':
 491                 SETERROR(REG_EMPTY);
 492                 break;
 493         case '*':
 494         case '+':
 495         case '?':
 496                 SETERROR(REG_BADRPT);
 497                 break;
 498         case '.':
 499                 if (p->g->cflags&REG_NEWLINE)
 500                         nonnewline(p);
 501                 else
 502                         EMIT(OANY, 0);
 503                 break;
 504         case '[':
 505                 p_bracket(p);
 506                 break;
 507         case '\\':
 508                 REQUIRE(MORE(), REG_EESCAPE);
 509                 c = GETNEXT();
 510                 if (c >= '1' && c <= '9') {
 511                         /* \[0-9] is taken to be a back-reference to a previously specified
 512                          * matching group. backrefnum will hold the number. The matching
 513                          * group must exist (i.e. if \4 is found there must have been at
 514                          * least 4 matching groups specified in the pattern previously).
 515                          */
 516                         backrefnum = c - '0';
 517                         if (p->pend[backrefnum] == 0) {
 518                                 SETERROR(REG_ESUBREG);
 519                                 break;
 520                         }
 521
 522                         /* Make sure everything checks out and emit the sequence
 523                          * that marks a back-reference to the parse structure.
 524                          */
 525                         assert(backrefnum <= p->g->nsub);
 526                         EMIT(OBACK_, backrefnum);
 527                         assert(p->pbegin[backrefnum] != 0);
 528                         assert(OP(p->strip[p->pbegin[backrefnum]]) != OLPAREN);
 529                         assert(OP(p->strip[p->pend[backrefnum]]) != ORPAREN);
 530                         (void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]);
 531                         EMIT(O_BACK, backrefnum);
 532                         p->g->backrefs = 1;
 533                 } else {
 534                         /* Other chars are simply themselves when escaped with a backslash.
 535                          */
 536                         ordinary(p, c);
 537                 }
 538                 break;
 539         case '{':               /* okay as ordinary except if digit follows */
 540                 REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
 541                 LLVM_FALLTHROUGH;
 542         default:
 543                 ordinary(p, c);
 544                 break;
 545         }
 546
 547         if (!MORE())
 548                 return;
 549         c = PEEK();
 550         /* we call { a repetition if followed by a digit */
 551         if (!( c == '*' || c == '+' || c == '?' ||
 552                                 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
 553                 return;         /* no repetition, we're done */
 554         NEXT();
 555
 556         REQUIRE(!wascaret, REG_BADRPT);
 557         switch (c) {
 558         case '*':       /* implemented as +? */
 559                 /* this case does not require the (y|) trick, noKLUDGE */
 560                 INSERT(OPLUS_, pos);
 561                 ASTERN(O_PLUS, pos);
 562                 INSERT(OQUEST_, pos);
 563                 ASTERN(O_QUEST, pos);
 564                 break;
 565         case '+':
 566                 INSERT(OPLUS_, pos);
 567                 ASTERN(O_PLUS, pos);
 568                 break;
 569         case '?':
 570                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
 571                 INSERT(OCH_, pos);              /* offset slightly wrong */
 572                 ASTERN(OOR1, pos);              /* this one's right */
 573                 AHEAD(pos);                     /* fix the OCH_ */
 574                 EMIT(OOR2, 0);                  /* offset very wrong... */
 575                 AHEAD(THERE());                 /* ...so fix it */
 576                 ASTERN(O_CH, THERETHERE());
 577                 break;
 578         case '{':
 579                 count = p_count(p);
 580                 if (EAT(',')) {
 581                         if (isdigit((uch)PEEK())) {
 582                                 count2 = p_count(p);
 583                                 REQUIRE(count <= count2, REG_BADBR);
 584                         } else          /* single number with comma */
 585                                 count2 = INFINITY;
 586                 } else          /* just a single number */
 587                         count2 = count;
 588                 repeat(p, pos, count, count2);
 589                 if (!EAT('}')) {        /* error heuristics */
 590                         while (MORE() && PEEK() != '}')
 591                                 NEXT();
 592                         REQUIRE(MORE(), REG_EBRACE);
 593                         SETERROR(REG_BADBR);
 594                 }
 595                 break;
 596         }
 597
 598         if (!MORE())
 599                 return;
 600         c = PEEK();
 601         if (!( c == '*' || c == '+' || c == '?' ||
 602                                 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
 603                 return;
 604         SETERROR(REG_BADRPT);
 605 }
 606
 607 /*
 608  - p_str - string (no metacharacters) "parser"
 609  */
 610 static void
 611 p_str(struct parse *p)
 612 {
 613         REQUIRE(MORE(), REG_EMPTY);
 614         while (MORE())
 615                 ordinary(p, GETNEXT());
 616 }
 617
 618 /*
 619  - p_bre - BRE parser top level, anchoring and concatenation
 620  * Giving end1 as OUT essentially eliminates the end1/end2 check.
 621  *
 622  * This implementation is a bit of a kludge, in that a trailing $ is first
 623  * taken as an ordinary character and then revised to be an anchor.  The
 624  * only undesirable side effect is that '$' gets included as a character
 625  * category in such cases.  This is fairly harmless; not worth fixing.
 626  * The amount of lookahead needed to avoid this kludge is excessive.
 627  */
 628 static void
 629 p_bre(struct parse *p,
 630     int end1,           /* first terminating character */
 631     int end2)           /* second terminating character */
 632 {
 633         sopno start = HERE();
 634         int first = 1;                  /* first subexpression? */
 635         int wasdollar = 0;
 636
 637         if (EAT('^')) {
 638                 EMIT(OBOL, 0);
 639                 p->g->iflags |= USEBOL;
 640                 p->g->nbol++;
 641         }
 642         while (MORE() && !SEETWO(end1, end2)) {
 643                 wasdollar = p_simp_re(p, first);
 644                 first = 0;
 645         }
 646         if (wasdollar) {        /* oops, that was a trailing anchor */
 647                 DROP(1);
 648                 EMIT(OEOL, 0);
 649                 p->g->iflags |= USEEOL;
 650                 p->g->neol++;
 651         }
 652
 653         REQUIRE(HERE() != start, REG_EMPTY);    /* require nonempty */
 654 }
 655
 656 /*
 657  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
 658  */
 659 static int                      /* was the simple RE an unbackslashed $? */
 660 p_simp_re(struct parse *p,
 661     int starordinary)           /* is a leading * an ordinary character? */
 662 {
 663         int c;
 664         int count;
 665         int count2;
 666         sopno pos;
 667         int i;
 668         sopno subno;
 669 #       define  BACKSL  (1<<CHAR_BIT)
 670
 671         pos = HERE(); /* repetition op, if any, covers from here */
 672
 673         assert(MORE()); /* caller should have ensured this */
 674         c = GETNEXT();
 675         if (c == '\\') {
 676                 REQUIRE(MORE(), REG_EESCAPE);
 677                 c = BACKSL | GETNEXT();
 678         }
 679         switch (c) {
 680         case '.':
 681                 if (p->g->cflags&REG_NEWLINE)
 682                         nonnewline(p);
 683                 else
 684                         EMIT(OANY, 0);
 685                 break;
 686         case '[':
 687                 p_bracket(p);
 688                 break;
 689         case BACKSL|'{':
 690                 SETERROR(REG_BADRPT);
 691                 break;
 692         case BACKSL|'(':
 693                 p->g->nsub++;
 694                 subno = p->g->nsub;
 695                 if (subno < NPAREN)
 696                         p->pbegin[subno] = HERE();
 697                 EMIT(OLPAREN, subno);
 698                 /* the MORE here is an error heuristic */
 699                 if (MORE() && !SEETWO('\\', ')'))
 700                         p_bre(p, '\\', ')');
 701                 if (subno < NPAREN) {
 702                         p->pend[subno] = HERE();
 703                         assert(p->pend[subno] != 0);
 704                 }
 705                 EMIT(ORPAREN, subno);
 706                 REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
 707                 break;
 708         case BACKSL|')':        /* should not get here -- must be user */
 709         case BACKSL|'}':
 710                 SETERROR(REG_EPAREN);
 711                 break;
 712         case BACKSL|'1':
 713         case BACKSL|'2':
 714         case BACKSL|'3':
 715         case BACKSL|'4':
 716         case BACKSL|'5':
 717         case BACKSL|'6':
 718         case BACKSL|'7':
 719         case BACKSL|'8':
 720         case BACKSL|'9':
 721                 i = (c&~BACKSL) - '0';
 722                 assert(i < NPAREN);
 723                 if (p->pend[i] != 0) {
 724                         assert(i <= p->g->nsub);
 725                         EMIT(OBACK_, i);
 726                         assert(p->pbegin[i] != 0);
 727                         assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
 728                         assert(OP(p->strip[p->pend[i]]) == ORPAREN);
 729                         (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
 730                         EMIT(O_BACK, i);
 731                 } else
 732                         SETERROR(REG_ESUBREG);
 733                 p->g->backrefs = 1;
 734                 break;
 735         case '*':
 736                 REQUIRE(starordinary, REG_BADRPT);
 737                 LLVM_FALLTHROUGH;
 738         default:
 739                 ordinary(p, (char)c);
 740                 break;
 741         }
 742
 743         if (EAT('*')) {         /* implemented as +? */
 744                 /* this case does not require the (y|) trick, noKLUDGE */
 745                 INSERT(OPLUS_, pos);
 746                 ASTERN(O_PLUS, pos);
 747                 INSERT(OQUEST_, pos);
 748                 ASTERN(O_QUEST, pos);
 749         } else if (EATTWO('\\', '{')) {
 750                 count = p_count(p);
 751                 if (EAT(',')) {
 752                         if (MORE() && isdigit((uch)PEEK())) {
 753                                 count2 = p_count(p);
 754                                 REQUIRE(count <= count2, REG_BADBR);
 755                         } else          /* single number with comma */
 756                                 count2 = INFINITY;
 757                 } else          /* just a single number */
 758                         count2 = count;
 759                 repeat(p, pos, count, count2);
 760                 if (!EATTWO('\\', '}')) {       /* error heuristics */
 761                         while (MORE() && !SEETWO('\\', '}'))
 762                                 NEXT();
 763                         REQUIRE(MORE(), REG_EBRACE);
 764                         SETERROR(REG_BADBR);
 765                 }
 766         } else if (c == '$')    /* $ (but not \$) ends it */
 767                 return(1);
 768
 769         return(0);
 770 }
 771
 772 /*
 773  - p_count - parse a repetition count
 774  */
 775 static int                      /* the value */
 776 p_count(struct parse *p)
 777 {
 778         int count = 0;
 779         int ndigits = 0;
 780
 781         while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
 782                 count = count*10 + (GETNEXT() - '0');
 783                 ndigits++;
 784         }
 785
 786         REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
 787         return(count);
 788 }
 789
 790 /*
 791  - p_bracket - parse a bracketed character list
 792  *
 793  * Note a significant property of this code:  if the allocset() did SETERROR,
 794  * no set operations are done.
 795  */
 796 static void
 797 p_bracket(struct parse *p)
 798 {
 799         cset *cs;
 800         int invert = 0;
 801
 802         /* Dept of Truly Sickening Special-Case Kludges */
 803         if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
 804                 EMIT(OBOW, 0);
 805                 NEXTn(6);
 806                 return;
 807         }
 808         if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
 809                 EMIT(OEOW, 0);
 810                 NEXTn(6);
 811                 return;
 812         }
 813
 814         if ((cs = allocset(p)) == NULL) {
 815                 /* allocset did set error status in p */
 816                 return;
 817         }
 818
 819         if (EAT('^'))
 820                 invert++;       /* make note to invert set at end */
 821         if (EAT(']'))
 822                 CHadd(cs, ']');
 823         else if (EAT('-'))
 824                 CHadd(cs, '-');
 825         while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
 826                 p_b_term(p, cs);
 827         if (EAT('-'))
 828                 CHadd(cs, '-');
 829         MUSTEAT(']', REG_EBRACK);
 830
 831         if (p->error != 0) {    /* don't mess things up further */
 832                 freeset(p, cs);
 833                 return;
 834         }
 835
 836         if (p->g->cflags&REG_ICASE) {
 837                 int i;
 838                 int ci;
 839
 840                 for (i = p->g->csetsize - 1; i >= 0; i--)
 841                         if (CHIN(cs, i) && isalpha(i)) {
 842                                 ci = othercase(i);
 843                                 if (ci != i)
 844                                         CHadd(cs, ci);
 845                         }
 846                 if (cs->multis != NULL)
 847                         mccase(p, cs);
 848         }
 849         if (invert) {
 850                 int i;
 851
 852                 for (i = p->g->csetsize - 1; i >= 0; i--)
 853                         if (CHIN(cs, i))
 854                                 CHsub(cs, i);
 855                         else
 856                                 CHadd(cs, i);
 857                 if (p->g->cflags&REG_NEWLINE)
 858                         CHsub(cs, '\n');
 859                 if (cs->multis != NULL)
 860                         mcinvert(p, cs);
 861         }
 862
 863         assert(cs->multis == NULL);             /* xxx */
 864
 865         if (nch(p, cs) == 1) {          /* optimize singleton sets */
 866                 ordinary(p, firstch(p, cs));
 867                 freeset(p, cs);
 868         } else
 869                 EMIT(OANYOF, freezeset(p, cs));
 870 }
 871
 872 /*
 873  - p_b_term - parse one term of a bracketed character list
 874  */
 875 static void
 876 p_b_term(struct parse *p, cset *cs)
 877 {
 878         char c;
 879         char start, finish;
 880         int i;
 881
 882         /* classify what we've got */
 883         switch ((MORE()) ? PEEK() : '\0') {
 884         case '[':
 885                 c = (MORE2()) ? PEEK2() : '\0';
 886                 break;
 887         case '-':
 888                 SETERROR(REG_ERANGE);
 889                 return;                 /* NOTE RETURN */
 890                 break;
 891         default:
 892                 c = '\0';
 893                 break;
 894         }
 895
 896         switch (c) {
 897         case ':':               /* character class */
 898                 NEXT2();
 899                 REQUIRE(MORE(), REG_EBRACK);
 900                 c = PEEK();
 901                 REQUIRE(c != '-' && c != ']', REG_ECTYPE);
 902                 p_b_cclass(p, cs);
 903                 REQUIRE(MORE(), REG_EBRACK);
 904                 REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
 905                 break;
 906         case '=':               /* equivalence class */
 907                 NEXT2();
 908                 REQUIRE(MORE(), REG_EBRACK);
 909                 c = PEEK();
 910                 REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
 911                 p_b_eclass(p, cs);
 912                 REQUIRE(MORE(), REG_EBRACK);
 913                 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
 914                 break;
 915         default:                /* symbol, ordinary character, or range */
 916 /* xxx revision needed for multichar stuff */
 917                 start = p_b_symbol(p);
 918                 if (SEE('-') && MORE2() && PEEK2() != ']') {
 919                         /* range */
 920                         NEXT();
 921                         if (EAT('-'))
 922                                 finish = '-';
 923                         else
 924                                 finish = p_b_symbol(p);
 925                 } else
 926                         finish = start;
 927 /* xxx what about signed chars here... */
 928                 REQUIRE(start <= finish, REG_ERANGE);
 929                 for (i = start; i <= finish; i++)
 930                         CHadd(cs, i);
 931                 break;
 932         }
 933 }
 934
 935 /*
 936  - p_b_cclass - parse a character-class name and deal with it
 937  */
 938 static void
 939 p_b_cclass(struct parse *p, cset *cs)
 940 {
 941         char *sp = p->next;
 942         struct cclass *cp;
 943         size_t len;
 944         const char *u;
 945         char c;
 946
 947         while (MORE() && isalpha((uch)PEEK()))
 948                 NEXT();
 949         len = p->next - sp;
 950         for (cp = cclasses; cp->name != NULL; cp++)
 951                 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
 952                         break;
 953         if (cp->name == NULL) {
 954                 /* oops, didn't find it */
 955                 SETERROR(REG_ECTYPE);
 956                 return;
 957         }
 958
 959         u = cp->chars;
 960         while ((c = *u++) != '\0')
 961                 CHadd(cs, c);
 962         for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
 963                 MCadd(p, cs, u);
 964 }
 965
 966 /*
 967  - p_b_eclass - parse an equivalence-class name and deal with it
 968  *
 969  * This implementation is incomplete. xxx
 970  */
 971 static void
 972 p_b_eclass(struct parse *p, cset *cs)
 973 {
 974         char c;
 975
 976         c = p_b_coll_elem(p, '=');
 977         CHadd(cs, c);
 978 }
 979
 980 /*
 981  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
 982  */
 983 static char                     /* value of symbol */
 984 p_b_symbol(struct parse *p)
 985 {
 986         char value;
 987
 988         REQUIRE(MORE(), REG_EBRACK);
 989         if (!EATTWO('[', '.'))
 990                 return(GETNEXT());
 991
 992         /* collating symbol */
 993         value = p_b_coll_elem(p, '.');
 994         REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
 995         return(value);
 996 }
 997
 998 /*
 999  - p_b_coll_elem - parse a collating-element name and look it up
1000  */
1001 static char                     /* value of collating element */
1002 p_b_coll_elem(struct parse *p,
1003     int endc)                   /* name ended by endc,']' */
1004 {
1005         char *sp = p->next;
1006         struct cname *cp;
1007         size_t len;
1008
1009         while (MORE() && !SEETWO(endc, ']'))
1010                 NEXT();
1011         if (!MORE()) {
1012                 SETERROR(REG_EBRACK);
1013                 return(0);
1014         }
1015         len = p->next - sp;
1016         for (cp = cnames; cp->name != NULL; cp++)
1017                 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1018                         return(cp->code);       /* known name */
1019         if (len == 1)
1020                 return(*sp);    /* single character */
1021         SETERROR(REG_ECOLLATE);                 /* neither */
1022         return(0);
1023 }
1024
1025 /*
1026  - othercase - return the case counterpart of an alphabetic
1027  */
1028 static char                     /* if no counterpart, return ch */
1029 othercase(int ch)
1030 {
1031         ch = (uch)ch;
1032         assert(isalpha(ch));
1033         if (isupper(ch))
1034                 return ((uch)tolower(ch));
1035         else if (islower(ch))
1036                 return ((uch)toupper(ch));
1037         else                    /* peculiar, but could happen */
1038                 return(ch);
1039 }
1040
1041 /*
1042  - bothcases - emit a dualcase version of a two-case character
1043  *
1044  * Boy, is this implementation ever a kludge...
1045  */
1046 static void
1047 bothcases(struct parse *p, int ch)
1048 {
1049         char *oldnext = p->next;
1050         char *oldend = p->end;
1051         char bracket[3];
1052
1053         ch = (uch)ch;
1054         assert(othercase(ch) != ch);    /* p_bracket() would recurse */
1055         p->next = bracket;
1056         p->end = bracket+2;
1057         bracket[0] = ch;
1058         bracket[1] = ']';
1059         bracket[2] = '\0';
1060         p_bracket(p);
1061         assert(p->next == bracket+2);
1062         p->next = oldnext;
1063         p->end = oldend;
1064 }
1065
1066 /*
1067  - ordinary - emit an ordinary character
1068  */
1069 static void
1070 ordinary(struct parse *p, int ch)
1071 {
1072         cat_t *cap = p->g->categories;
1073
1074         if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
1075                 bothcases(p, ch);
1076         else {
1077                 EMIT(OCHAR, (uch)ch);
1078                 if (cap[ch] == 0)
1079                         cap[ch] = p->g->ncategories++;
1080         }
1081 }
1082
1083 /*
1084  - nonnewline - emit REG_NEWLINE version of OANY
1085  *
1086  * Boy, is this implementation ever a kludge...
1087  */
1088 static void
1089 nonnewline(struct parse *p)
1090 {
1091         char *oldnext = p->next;
1092         char *oldend = p->end;
1093         char bracket[4];
1094
1095         p->next = bracket;
1096         p->end = bracket+3;
1097         bracket[0] = '^';
1098         bracket[1] = '\n';
1099         bracket[2] = ']';
1100         bracket[3] = '\0';
1101         p_bracket(p);
1102         assert(p->next == bracket+3);
1103         p->next = oldnext;
1104         p->end = oldend;
1105 }
1106
1107 /*
1108  - repeat - generate code for a bounded repetition, recursively if needed
1109  */
1110 static void
1111 repeat(struct parse *p,
1112     sopno start,                /* operand from here to end of strip */
1113     int from,                   /* repeated from this number */
1114     int to)                     /* to this number of times (maybe INFINITY) */
1115 {
1116         sopno finish = HERE();
1117 #       define  N       2
1118 #       define  INF     3
1119 #       define  REP(f, t)       ((f)*8 + (t))
1120 #       define  MAP(n)  (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1121         sopno copy;
1122
1123         if (p->error != 0)      /* head off possible runaway recursion */
1124                 return;
1125
1126         assert(from <= to);
1127
1128         switch (REP(MAP(from), MAP(to))) {
1129         case REP(0, 0):                 /* must be user doing this */
1130                 DROP(finish-start);     /* drop the operand */
1131                 break;
1132         case REP(0, 1):                 /* as x{1,1}? */
1133         case REP(0, N):                 /* as x{1,n}? */
1134         case REP(0, INF):               /* as x{1,}? */
1135                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1136                 INSERT(OCH_, start);            /* offset is wrong... */
1137                 repeat(p, start+1, 1, to);
1138                 ASTERN(OOR1, start);
1139                 AHEAD(start);                   /* ... fix it */
1140                 EMIT(OOR2, 0);
1141                 AHEAD(THERE());
1142                 ASTERN(O_CH, THERETHERE());
1143                 break;
1144         case REP(1, 1):                 /* trivial case */
1145                 /* done */
1146                 break;
1147         case REP(1, N):                 /* as x?x{1,n-1} */
1148                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1149                 INSERT(OCH_, start);
1150                 ASTERN(OOR1, start);
1151                 AHEAD(start);
1152                 EMIT(OOR2, 0);                  /* offset very wrong... */
1153                 AHEAD(THERE());                 /* ...so fix it */
1154                 ASTERN(O_CH, THERETHERE());
1155                 copy = dupl(p, start+1, finish+1);
1156                 assert(copy == finish+4);
1157                 repeat(p, copy, 1, to-1);
1158                 break;
1159         case REP(1, INF):               /* as x+ */
1160                 INSERT(OPLUS_, start);
1161                 ASTERN(O_PLUS, start);
1162                 break;
1163         case REP(N, N):                 /* as xx{m-1,n-1} */
1164                 copy = dupl(p, start, finish);
1165                 repeat(p, copy, from-1, to-1);
1166                 break;
1167         case REP(N, INF):               /* as xx{n-1,INF} */
1168                 copy = dupl(p, start, finish);
1169                 repeat(p, copy, from-1, to);
1170                 break;
1171         default:                        /* "can't happen" */
1172                 SETERROR(REG_ASSERT);   /* just in case */
1173                 break;
1174         }
1175 }
1176
1177 /*
1178  - seterr - set an error condition
1179  */
1180 static int                      /* useless but makes type checking happy */
1181 seterr(struct parse *p, int e)
1182 {
1183         if (p->error == 0)      /* keep earliest error condition */
1184                 p->error = e;
1185         p->next = nuls;         /* try to bring things to a halt */
1186         p->end = nuls;
1187         return(0);              /* make the return value well-defined */
1188 }
1189
1190 /*
1191  - allocset - allocate a set of characters for []
1192  */
1193 static cset *
1194 allocset(struct parse *p)
1195 {
1196         int no = p->g->ncsets++;
1197         size_t nc;
1198         size_t nbytes;
1199         cset *cs;
1200         size_t css = (size_t)p->g->csetsize;
1201         int i;
1202
1203         if (no >= p->ncsalloc) {        /* need another column of space */
1204                 void *ptr;
1205
1206                 p->ncsalloc += CHAR_BIT;
1207                 nc = p->ncsalloc;
1208                 if (nc > SIZE_MAX / sizeof(cset))
1209                         goto nomem;
1210                 assert(nc % CHAR_BIT == 0);
1211                 nbytes = nc / CHAR_BIT * css;
1212
1213                 ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset));
1214                 if (ptr == NULL)
1215                         goto nomem;
1216                 p->g->sets = ptr;
1217
1218                 ptr = (uch *)realloc((char *)p->g->setbits, nbytes);
1219                 if (ptr == NULL)
1220                         goto nomem;
1221                 p->g->setbits = ptr;
1222
1223                 for (i = 0; i < no; i++)
1224                         p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1225
1226                 (void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
1227         }
1228         /* XXX should not happen */
1229         if (p->g->sets == NULL || p->g->setbits == NULL)
1230                 goto nomem;
1231
1232         cs = &p->g->sets[no];
1233         cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1234         cs->mask = 1 << ((no) % CHAR_BIT);
1235         cs->hash = 0;
1236         cs->smultis = 0;
1237         cs->multis = NULL;
1238
1239         return(cs);
1240 nomem:
1241         free(p->g->sets);
1242         p->g->sets = NULL;
1243         free(p->g->setbits);
1244         p->g->setbits = NULL;
1245
1246         SETERROR(REG_ESPACE);
1247         /* caller's responsibility not to do set ops */
1248         return(NULL);
1249 }
1250
1251 /*
1252  - freeset - free a now-unused set
1253  */
1254 static void
1255 freeset(struct parse *p, cset *cs)
1256 {
1257         size_t i;
1258         cset *top = &p->g->sets[p->g->ncsets];
1259         size_t css = (size_t)p->g->csetsize;
1260
1261         for (i = 0; i < css; i++)
1262                 CHsub(cs, i);
1263         if (cs == top-1)        /* recover only the easy case */
1264                 p->g->ncsets--;
1265 }
1266
1267 /*
1268  - freezeset - final processing on a set of characters
1269  *
1270  * The main task here is merging identical sets.  This is usually a waste
1271  * of time (although the hash code minimizes the overhead), but can win
1272  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1273  * is done using addition rather than xor -- all ASCII [aA] sets xor to
1274  * the same value!
1275  */
1276 static int                      /* set number */
1277 freezeset(struct parse *p, cset *cs)
1278 {
1279         uch h = cs->hash;
1280         size_t i;
1281         cset *top = &p->g->sets[p->g->ncsets];
1282         cset *cs2;
1283         size_t css = (size_t)p->g->csetsize;
1284
1285         /* look for an earlier one which is the same */
1286         for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1287                 if (cs2->hash == h && cs2 != cs) {
1288                         /* maybe */
1289                         for (i = 0; i < css; i++)
1290                                 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1291                                         break;          /* no */
1292                         if (i == css)
1293                                 break;                  /* yes */
1294                 }
1295
1296         if (cs2 < top) {        /* found one */
1297                 freeset(p, cs);
1298                 cs = cs2;
1299         }
1300
1301         return((int)(cs - p->g->sets));
1302 }
1303
1304 /*
1305  - firstch - return first character in a set (which must have at least one)
1306  */
1307 static int                      /* character; there is no "none" value */
1308 firstch(struct parse *p, cset *cs)
1309 {
1310         size_t i;
1311         size_t css = (size_t)p->g->csetsize;
1312
1313         for (i = 0; i < css; i++)
1314                 if (CHIN(cs, i))
1315                         return((char)i);
1316         assert(never);
1317         return(0);              /* arbitrary */
1318 }
1319
1320 /*
1321  - nch - number of characters in a set
1322  */
1323 static int
1324 nch(struct parse *p, cset *cs)
1325 {
1326         size_t i;
1327         size_t css = (size_t)p->g->csetsize;
1328         int n = 0;
1329
1330         for (i = 0; i < css; i++)
1331                 if (CHIN(cs, i))
1332                         n++;
1333         return(n);
1334 }
1335
1336 /*
1337  - mcadd - add a collating element to a cset
1338  */
1339 static void
1340 mcadd( struct parse *p, cset *cs, const char *cp)
1341 {
1342         size_t oldend = cs->smultis;
1343         void *np;
1344
1345         cs->smultis += strlen(cp) + 1;
1346         np = realloc(cs->multis, cs->smultis);
1347         if (np == NULL) {
1348                 if (cs->multis)
1349                         free(cs->multis);
1350                 cs->multis = NULL;
1351                 SETERROR(REG_ESPACE);
1352                 return;
1353         }
1354         cs->multis = np;
1355
1356         llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
1357 }
1358
1359 /*
1360  - mcinvert - invert the list of collating elements in a cset
1361  *
1362  * This would have to know the set of possibilities.  Implementation
1363  * is deferred.
1364  */
1365 /* ARGSUSED */
1366 static void
1367 mcinvert(struct parse *p, cset *cs)
1368 {
1369         assert(cs->multis == NULL);     /* xxx */
1370 }
1371
1372 /*
1373  - mccase - add case counterparts of the list of collating elements in a cset
1374  *
1375  * This would have to know the set of possibilities.  Implementation
1376  * is deferred.
1377  */
1378 /* ARGSUSED */
1379 static void
1380 mccase(struct parse *p, cset *cs)
1381 {
1382         assert(cs->multis == NULL);     /* xxx */
1383 }
1384
1385 /*
1386  - isinsets - is this character in any sets?
1387  */
1388 static int                      /* predicate */
1389 isinsets(struct re_guts *g, int c)
1390 {
1391         uch *col;
1392         int i;
1393         int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1394         unsigned uc = (uch)c;
1395
1396         for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1397                 if (col[uc] != 0)
1398                         return(1);
1399         return(0);
1400 }
1401
1402 /*
1403  - samesets - are these two characters in exactly the same sets?
1404  */
1405 static int                      /* predicate */
1406 samesets(struct re_guts *g, int c1, int c2)
1407 {
1408         uch *col;
1409         int i;
1410         int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1411         unsigned uc1 = (uch)c1;
1412         unsigned uc2 = (uch)c2;
1413
1414         for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1415                 if (col[uc1] != col[uc2])
1416                         return(0);
1417         return(1);
1418 }
1419
1420 /*
1421  - categorize - sort out character categories
1422  */
1423 static void
1424 categorize(struct parse *p, struct re_guts *g)
1425 {
1426         cat_t *cats = g->categories;
1427         int c;
1428         int c2;
1429         cat_t cat;
1430
1431         /* avoid making error situations worse */
1432         if (p->error != 0)
1433                 return;
1434
1435         for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1436                 if (cats[c] == 0 && isinsets(g, c)) {
1437                         cat = g->ncategories++;
1438                         cats[c] = cat;
1439                         for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1440                                 if (cats[c2] == 0 && samesets(g, c, c2))
1441                                         cats[c2] = cat;
1442                 }
1443 }
1444
1445 /*
1446  - dupl - emit a duplicate of a bunch of sops
1447  */
1448 static sopno                    /* start of duplicate */
1449 dupl(struct parse *p,
1450     sopno start,                /* from here */
1451     sopno finish)               /* to this less one */
1452 {
1453         sopno ret = HERE();
1454         sopno len = finish - start;
1455
1456         assert(finish >= start);
1457         if (len == 0)
1458                 return(ret);
1459         enlarge(p, p->ssize + len);     /* this many unexpected additions */
1460         assert(p->ssize >= p->slen + len);
1461         (void) memmove((char *)(p->strip + p->slen),
1462                 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1463         p->slen += len;
1464         return(ret);
1465 }
1466
1467 /*
1468  - doemit - emit a strip operator
1469  *
1470  * It might seem better to implement this as a macro with a function as
1471  * hard-case backup, but it's just too big and messy unless there are
1472  * some changes to the data structures.  Maybe later.
1473  */
1474 static void
1475 doemit(struct parse *p, sop op, size_t opnd)
1476 {
1477         /* avoid making error situations worse */
1478         if (p->error != 0)
1479                 return;
1480
1481         /* deal with oversize operands ("can't happen", more or less) */
1482         assert(opnd < 1<<OPSHIFT);
1483
1484         /* deal with undersized strip */
1485         if (p->slen >= p->ssize)
1486                 enlarge(p, (p->ssize+1) / 2 * 3);       /* +50% */
1487         assert(p->slen < p->ssize);
1488
1489         /* finally, it's all reduced to the easy case */
1490         p->strip[p->slen++] = SOP(op, opnd);
1491 }
1492
1493 /*
1494  - doinsert - insert a sop into the strip
1495  */
1496 static void
1497 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1498 {
1499         sopno sn;
1500         sop s;
1501         int i;
1502
1503         /* avoid making error situations worse */
1504         if (p->error != 0)
1505                 return;
1506
1507         sn = HERE();
1508         EMIT(op, opnd);         /* do checks, ensure space */
1509         assert(HERE() == sn+1);
1510         s = p->strip[sn];
1511
1512         /* adjust paren pointers */
1513         assert(pos > 0);
1514         for (i = 1; i < NPAREN; i++) {
1515                 if (p->pbegin[i] >= pos) {
1516                         p->pbegin[i]++;
1517                 }
1518                 if (p->pend[i] >= pos) {
1519                         p->pend[i]++;
1520                 }
1521         }
1522
1523         memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1524                                                 (HERE()-pos-1)*sizeof(sop));
1525         p->strip[pos] = s;
1526 }
1527
1528 /*
1529  - dofwd - complete a forward reference
1530  */
1531 static void
1532 dofwd(struct parse *p, sopno pos, sop value)
1533 {
1534         /* avoid making error situations worse */
1535         if (p->error != 0)
1536                 return;
1537
1538         assert(value < 1<<OPSHIFT);
1539         p->strip[pos] = OP(p->strip[pos]) | value;
1540 }
1541
1542 /*
1543  - enlarge - enlarge the strip
1544  */
1545 static void
1546 enlarge(struct parse *p, sopno size)
1547 {
1548         sop *sp;
1549
1550         if (p->ssize >= size)
1551                 return;
1552
1553         if ((uintptr_t)size > SIZE_MAX / sizeof(sop)) {
1554                 SETERROR(REG_ESPACE);
1555                 return;
1556         }
1557
1558         sp = (sop *)realloc(p->strip, size*sizeof(sop));
1559         if (sp == NULL) {
1560                 SETERROR(REG_ESPACE);
1561                 return;
1562         }
1563         p->strip = sp;
1564         p->ssize = size;
1565 }
1566
1567 /*
1568  - stripsnug - compact the strip
1569  */
1570 static void
1571 stripsnug(struct parse *p, struct re_guts *g)
1572 {
1573         g->nstates = p->slen;
1574         if ((uintptr_t)p->slen > SIZE_MAX / sizeof(sop)) {
1575                 g->strip = p->strip;
1576                 SETERROR(REG_ESPACE);
1577                 return;
1578         }
1579
1580         g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1581         if (g->strip == NULL) {
1582                 SETERROR(REG_ESPACE);
1583                 g->strip = p->strip;
1584         }
1585 }
1586
1587 /*
1588  - findmust - fill in must and mlen with longest mandatory literal string
1589  *
1590  * This algorithm could do fancy things like analyzing the operands of |
1591  * for common subsequences.  Someday.  This code is simple and finds most
1592  * of the interesting cases.
1593  *
1594  * Note that must and mlen got initialized during setup.
1595  */
1596 static void
1597 findmust(struct parse *p, struct re_guts *g)
1598 {
1599         sop *scan;
1600         sop *start = 0; /* start initialized in the default case, after that */
1601         sop *newstart = 0; /* newstart was initialized in the OCHAR case */
1602         sopno newlen;
1603         sop s;
1604         char *cp;
1605         sopno i;
1606
1607         /* avoid making error situations worse */
1608         if (p->error != 0)
1609                 return;
1610
1611         /* find the longest OCHAR sequence in strip */
1612         newlen = 0;
1613         scan = g->strip + 1;
1614         do {
1615                 s = *scan++;
1616                 switch (OP(s)) {
1617                 case OCHAR:             /* sequence member */
1618                         if (newlen == 0)                /* new sequence */
1619                                 newstart = scan - 1;
1620                         newlen++;
1621                         break;
1622                 case OPLUS_:            /* things that don't break one */
1623                 case OLPAREN:
1624                 case ORPAREN:
1625                         break;
1626                 case OQUEST_:           /* things that must be skipped */
1627                 case OCH_:
1628                         scan--;
1629                         do {
1630                                 scan += OPND(s);
1631                                 s = *scan;
1632                                 /* assert() interferes w debug printouts */
1633                                 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1634                                                         OP(s) != OOR2) {
1635                                         g->iflags |= REGEX_BAD;
1636                                         return;
1637                                 }
1638                         } while (OP(s) != O_QUEST && OP(s) != O_CH);
1639                         LLVM_FALLTHROUGH;
1640                 default:                /* things that break a sequence */
1641                         if (newlen > g->mlen) {         /* ends one */
1642                                 start = newstart;
1643                                 g->mlen = newlen;
1644                         }
1645                         newlen = 0;
1646                         break;
1647                 }
1648         } while (OP(s) != OEND);
1649
1650         if (g->mlen == 0)               /* there isn't one */
1651                 return;
1652
1653         /* turn it into a character string */
1654         g->must = malloc((size_t)g->mlen + 1);
1655         if (g->must == NULL) {          /* argh; just forget it */
1656                 g->mlen = 0;
1657                 return;
1658         }
1659         cp = g->must;
1660         scan = start;
1661         for (i = g->mlen; i > 0; i--) {
1662                 while (OP(s = *scan++) != OCHAR)
1663                         continue;
1664                 assert(cp < g->must + g->mlen);
1665                 *cp++ = (char)OPND(s);
1666         }
1667         assert(cp == g->must + g->mlen);
1668         *cp++ = '\0';           /* just on general principles */
1669 }
1670
1671 /*
1672  - pluscount - count + nesting
1673  */
1674 static sopno                    /* nesting depth */
1675 pluscount(struct parse *p, struct re_guts *g)
1676 {
1677         sop *scan;
1678         sop s;
1679         sopno plusnest = 0;
1680         sopno maxnest = 0;
1681
1682         if (p->error != 0)
1683                 return(0);      /* there may not be an OEND */
1684
1685         scan = g->strip + 1;
1686         do {
1687                 s = *scan++;
1688                 switch (OP(s)) {
1689                 case OPLUS_:
1690                         plusnest++;
1691                         break;
1692                 case O_PLUS:
1693                         if (plusnest > maxnest)
1694                                 maxnest = plusnest;
1695                         plusnest--;
1696                         break;
1697                 }
1698         } while (OP(s) != OEND);
1699         if (plusnest != 0)
1700                 g->iflags |= REGEX_BAD;
1701         return(maxnest);
1702 }