src/dfa.c

   1 /* dfa.c - deterministic extended regexp routines for GNU
   2    Copyright (C) 1988, 1998, 2000, 2002, 2004, 2005, 2007-2010 Free Software
   3    Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc.,
  18    51 Franklin Street - Fifth Floor, Boston, MA  02110-1301, USA */
  19
  20 /* Written June, 1988 by Mike Haertel
  21    Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
  22
  23 #include <config.h>
  24 #include <assert.h>
  25 #include <ctype.h>
  26 #include <stdio.h>
  27 #include <sys/types.h>
  28 #include <stddef.h>
  29 #include <stdlib.h>
  30 #include <limits.h>
  31 #include <string.h>
  32 #include <locale.h>
  33
  34 #ifndef isgraph
  35 #define isgraph(C) (isprint(C) && !isspace(C))
  36 #endif
  37
  38 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
  39 #define ISALPHA(C) isalpha(C)
  40 #define ISUPPER(C) isupper(C)
  41 #define ISLOWER(C) islower(C)
  42 #define ISDIGIT(C) isdigit(C)
  43 #define ISXDIGIT(C) isxdigit(C)
  44 #define ISSPACE(C) isspace(C)
  45 #define ISPUNCT(C) ispunct(C)
  46 #define ISALNUM(C) isalnum(C)
  47 #define ISPRINT(C) isprint(C)
  48 #define ISGRAPH(C) isgraph(C)
  49 #define ISCNTRL(C) iscntrl(C)
  50 #else
  51 #define ISALPHA(C) (isascii(C) && isalpha(C))
  52 #define ISUPPER(C) (isascii(C) && isupper(C))
  53 #define ISLOWER(C) (isascii(C) && islower(C))
  54 #define ISDIGIT(C) (isascii(C) && isdigit(C))
  55 #define ISXDIGIT(C) (isascii(C) && isxdigit(C))
  56 #define ISSPACE(C) (isascii(C) && isspace(C))
  57 #define ISPUNCT(C) (isascii(C) && ispunct(C))
  58 #define ISALNUM(C) (isascii(C) && isalnum(C))
  59 #define ISPRINT(C) (isascii(C) && isprint(C))
  60 #define ISGRAPH(C) (isascii(C) && isgraph(C))
  61 #define ISCNTRL(C) (isascii(C) && iscntrl(C))
  62 #endif
  63
  64 /* ISASCIIDIGIT differs from ISDIGIT, as follows:
  65    - Its arg may be any int or unsigned int; it need not be an unsigned char.
  66    - It's guaranteed to evaluate its argument exactly once.
  67    - It's typically faster.
  68    Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
  69    only '0' through '9' are digits.  Prefer ISASCIIDIGIT to ISDIGIT unless
  70    it's important to use the locale's definition of `digit' even when the
  71    host does not conform to Posix.  */
  72 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
  73
  74 /* gettext.h ensures that we don't use gettext if ENABLE_NLS is not defined */
  75 #include "gettext.h"
  76 #define _(str) gettext (str)
  77
  78 #include "mbsupport.h"  /* defines MBS_SUPPORT if appropriate */
  79 #ifdef MBS_SUPPORT
  80 /* We can handle multibyte strings. */
  81 # include <wchar.h>
  82 # include <wctype.h>
  83 # include <langinfo.h>
  84 #endif
  85
  86 #include "regex.h"
  87 #include "dfa.h"
  88 #include "hard-locale.h"
  89 #include "xalloc.h"
  90
  91 /* HPUX, define those as macros in sys/param.h */
  92 #ifdef setbit
  93 # undef setbit
  94 #endif
  95 #ifdef clrbit
  96 # undef clrbit
  97 #endif
  98
  99 static void dfamust (struct dfa *dfa);
 100 static void regexp (int toplevel);
 101
 102 #define CALLOC(p, t, n) ((p) = xcalloc((size_t)(n), sizeof (t)))
 103 #define MALLOC(p, t, n) ((p) = xmalloc((n) * sizeof (t)))
 104 #define REALLOC(p, t, n) ((p) = xrealloc((p), (n) * sizeof (t)))
 105
 106 /* Reallocate an array of type t if nalloc is too small for index. */
 107 #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
 108   if ((index) >= (nalloc))                        \
 109     {                                             \
 110       do                                          \
 111         (nalloc) *= 2;                            \
 112       while ((index) >= (nalloc));                \
 113       REALLOC(p, t, nalloc);                      \
 114     }
 115
 116 #ifdef DEBUG
 117
 118 static void
 119 prtok (token t)
 120 {
 121   char const *s;
 122
 123   if (t < 0)
 124     fprintf(stderr, "END");
 125   else if (t < NOTCHAR)
 126     fprintf(stderr, "%c", t);
 127   else
 128     {
 129       switch (t)
 130         {
 131         case EMPTY: s = "EMPTY"; break;
 132         case BACKREF: s = "BACKREF"; break;
 133         case BEGLINE: s = "BEGLINE"; break;
 134         case ENDLINE: s = "ENDLINE"; break;
 135         case BEGWORD: s = "BEGWORD"; break;
 136         case ENDWORD: s = "ENDWORD"; break;
 137         case LIMWORD: s = "LIMWORD"; break;
 138         case NOTLIMWORD: s = "NOTLIMWORD"; break;
 139         case QMARK: s = "QMARK"; break;
 140         case STAR: s = "STAR"; break;
 141         case PLUS: s = "PLUS"; break;
 142         case CAT: s = "CAT"; break;
 143         case OR: s = "OR"; break;
 144         case ORTOP: s = "ORTOP"; break;
 145         case LPAREN: s = "LPAREN"; break;
 146         case RPAREN: s = "RPAREN"; break;
 147 #ifdef MBS_SUPPORT
 148         case ANYCHAR: s = "ANYCHAR"; break;
 149         case MBCSET: s = "MBCSET"; break;
 150 #endif /* MBS_SUPPORT */
 151         default: s = "CSET"; break;
 152         }
 153       fprintf(stderr, "%s", s);
 154     }
 155 }
 156 #endif /* DEBUG */
 157
 158 /* Stuff pertaining to charclasses. */
 159
 160 static int
 161 tstbit (unsigned int b, charclass c)
 162 {
 163   return c[b / INTBITS] & 1 << b % INTBITS;
 164 }
 165
 166 static void
 167 setbit (unsigned int b, charclass c)
 168 {
 169   c[b / INTBITS] |= 1 << b % INTBITS;
 170 }
 171
 172 static void
 173 clrbit (unsigned int b, charclass c)
 174 {
 175   c[b / INTBITS] &= ~(1 << b % INTBITS);
 176 }
 177
 178 static void
 179 copyset (charclass src, charclass dst)
 180 {
 181   memcpy (dst, src, sizeof (charclass));
 182 }
 183
 184 static void
 185 zeroset (charclass s)
 186 {
 187   memset (s, 0, sizeof (charclass));
 188 }
 189
 190 static void
 191 notset (charclass s)
 192 {
 193   int i;
 194
 195   for (i = 0; i < CHARCLASS_INTS; ++i)
 196     s[i] = ~s[i];
 197 }
 198
 199 static int
 200 equal (charclass s1, charclass s2)
 201 {
 202   return memcmp (s1, s2, sizeof (charclass)) == 0;
 203 }
 204
 205 /* A pointer to the current dfa is kept here during parsing. */
 206 static struct dfa *dfa;
 207
 208 /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
 209 static int
 210 charclass_index (charclass s)
 211 {
 212   int i;
 213
 214   for (i = 0; i < dfa->cindex; ++i)
 215     if (equal(s, dfa->charclasses[i]))
 216       return i;
 217   REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
 218   ++dfa->cindex;
 219   copyset(s, dfa->charclasses[i]);
 220   return i;
 221 }
 222
 223 /* Syntax bits controlling the behavior of the lexical analyzer. */
 224 static reg_syntax_t syntax_bits, syntax_bits_set;
 225
 226 /* Flag for case-folding letters into sets. */
 227 static int case_fold;
 228
 229 /* End-of-line byte in data.  */
 230 static unsigned char eolbyte;
 231
 232 /* Entry point to set syntax options. */
 233 void
 234 dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
 235 {
 236   syntax_bits_set = 1;
 237   syntax_bits = bits;
 238   case_fold = fold;
 239   eolbyte = eol;
 240 }
 241
 242 /* Like setbit, but if case is folded, set both cases of a letter.
 243    For MB_CUR_MAX > 1, one or both of the two cases may not be set,
 244    so the resulting charset may only be used as an optimization.  */
 245 static void
 246 setbit_case_fold (
 247 #ifdef MBS_SUPPORT
 248                   wint_t b,
 249 #else
 250                   unsigned int b,
 251 #endif
 252                   charclass c)
 253 {
 254   if (case_fold)
 255     {
 256 #ifdef MBS_SUPPORT
 257       if (MB_CUR_MAX > 1)
 258         {
 259           wint_t b1 = iswupper(b) ? towlower(b) : b;
 260           wint_t b2 = iswlower(b) ? towupper(b) : b;
 261           if (wctob ((unsigned char)b1) == b1)
 262             setbit (b1, c);
 263           if (b2 != b1 && wctob ((unsigned char)b2) == b2)
 264             setbit (b2, c);
 265         }
 266       else
 267 #endif
 268         {
 269           unsigned char b1 = ISUPPER(b) ? tolower(b) : b;
 270           unsigned char b2 = ISLOWER(b) ? toupper(b) : b;
 271           setbit (b1, c);
 272           if (b2 != b1)
 273             setbit (b2, c);
 274         }
 275     }
 276   else
 277     {
 278 #ifdef MBS_SUPPORT
 279       if (wctob ((unsigned char)b) == b)
 280 #endif
 281         setbit (b, c);
 282     }
 283 }
 284
 285
 286 /* UTF-8 encoding allows some optimizations that we can't otherwise
 287    assume in a multibyte encoding. */
 288 static inline int
 289 using_utf8 (void)
 290 {
 291   static int utf8 = -1;
 292   if (utf8 == -1)
 293     {
 294 #if defined HAVE_LANGINFO_CODESET && defined MBS_SUPPORT
 295       utf8 = (strcmp (nl_langinfo (CODESET), "UTF-8") == 0);
 296 #else
 297       utf8 = 0;
 298 #endif
 299     }
 300
 301   return utf8;
 302 }
 303
 304 /* Lexical analyzer.  All the dross that deals with the obnoxious
 305    GNU Regex syntax bits is located here.  The poor, suffering
 306    reader is referred to the GNU Regex documentation for the
 307    meaning of the @#%!@#%^!@ syntax bits. */
 308
 309 static char const *lexptr;      /* Pointer to next input character. */
 310 static int lexleft;             /* Number of characters remaining. */
 311 static token lasttok;           /* Previous token returned; initially END. */
 312 static int laststart;           /* True if we're separated from beginning or (, |
 313                                    only by zero-width characters. */
 314 static int parens;              /* Count of outstanding left parens. */
 315 static int minrep, maxrep;      /* Repeat counts for {m,n}. */
 316 static int hard_LC_COLLATE;     /* Nonzero if LC_COLLATE is hard.  */
 317
 318 static int cur_mb_len = 1;      /* Length of the multibyte representation of
 319                                    wctok.  */
 320 #ifdef MBS_SUPPORT
 321 /* These variables are used only if (MB_CUR_MAX > 1).  */
 322 static mbstate_t mbs;           /* Mbstate for mbrlen().  */
 323 static wchar_t wctok;           /* Wide character representation of the current
 324                                    multibyte character.  */
 325 static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec().
 326                                    Each element store the amount of remain
 327                                    byte of corresponding multibyte character
 328                                    in the input string.  A element's value
 329                                    is 0 if corresponding character is a
 330                                    single byte chracter.
 331                                    e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
 332                                     mblen_buf :   0,       3,       2,       1
 333                                 */
 334 static wchar_t *inputwcs;       /* Wide character representation of input
 335                                    string in dfaexec().
 336                                    The length of this array is same as
 337                                    the length of input string(char array).
 338                                    inputstring[i] is a single-byte char,
 339                                    or 1st byte of a multibyte char.
 340                                    And inputwcs[i] is the codepoint.  */
 341 static unsigned char const *buf_begin;  /* reference to begin in dfaexec().  */
 342 static unsigned char const *buf_end;    /* reference to end in dfaexec().  */
 343 #endif /* MBS_SUPPORT  */
 344
 345
 346 #ifdef MBS_SUPPORT
 347 /* Note that characters become unsigned here. */
 348 # define FETCH_WC(c, wc, eoferr)                \
 349   do {                                          \
 350     if (! lexleft)                              \
 351       {                                         \
 352         if ((eoferr) != 0)                      \
 353           dfaerror (eoferr);                    \
 354         else                                    \
 355           return lasttok = END;                 \
 356       }                                         \
 357     else                                        \
 358       {                                         \
 359         wchar_t _wc;                            \
 360         cur_mb_len = mbrtowc(&_wc, lexptr, lexleft, &mbs); \
 361         if (cur_mb_len <= 0)                    \
 362           {                                     \
 363             cur_mb_len = 1;                     \
 364             --lexleft;                          \
 365             (wc) = (c) = (unsigned char) *lexptr++; \
 366           }                                     \
 367         else                                    \
 368           {                                     \
 369             lexptr += cur_mb_len;               \
 370             lexleft -= cur_mb_len;              \
 371             (wc) = _wc;                         \
 372             (c) = wctob(wc);                    \
 373           }                                     \
 374       }                                         \
 375   } while(0)
 376
 377 # define FETCH(c, eoferr)                       \
 378   do {                                          \
 379     wint_t wc;                                  \
 380     FETCH_WC(c, wc, eoferr);                    \
 381   } while(0)
 382
 383 #else
 384 /* Note that characters become unsigned here. */
 385 # define FETCH(c, eoferr)             \
 386   do {                                \
 387     if (! lexleft)                    \
 388       {                               \
 389         if ((eoferr) != 0)            \
 390           dfaerror (eoferr);          \
 391         else                          \
 392           return lasttok = END;       \
 393       }                               \
 394     (c) = (unsigned char) *lexptr++;  \
 395     --lexleft;                        \
 396   } while(0)
 397
 398 # define FETCH_WC(c, unused, eoferr) FETCH (c, eoferr)
 399
 400 #endif /* MBS_SUPPORT */
 401
 402 static int
 403 in_coll_range (char ch, char from, char to)
 404 {
 405   char c[6] = { from, 0, ch, 0, to, 0 };
 406   return strcoll (&c[0], &c[2]) <= 0 && strcoll (&c[2], &c[4]) <= 0;
 407 }
 408
 409 static int is_alpha(int c) { return ISALPHA(c); }
 410 static int is_upper(int c) { return ISUPPER(c); }
 411 static int is_lower(int c) { return ISLOWER(c); }
 412 static int is_digit(int c) { return ISDIGIT(c); }
 413 static int is_xdigit(int c) { return ISXDIGIT(c); }
 414 static int is_space(int c) { return ISSPACE(c); }
 415 static int is_punct(int c) { return ISPUNCT(c); }
 416 static int is_alnum(int c) { return ISALNUM(c); }
 417 static int is_print(int c) { return ISPRINT(c); }
 418 static int is_graph(int c) { return ISGRAPH(c); }
 419 static int is_cntrl(int c) { return ISCNTRL(c); }
 420
 421 static int
 422 is_blank (int c)
 423 {
 424    return (c == ' ' || c == '\t');
 425 }
 426
 427 typedef int predicate (int);
 428
 429 /* The following list maps the names of the Posix named character classes
 430    to predicate functions that determine whether a given character is in
 431    the class.  The leading [ has already been eaten by the lexical analyzer. */
 432 static struct {
 433   const char *name;
 434   predicate *pred;
 435 } const prednames[] = {
 436   { "alpha", is_alpha },
 437   { "upper", is_upper },
 438   { "lower", is_lower },
 439   { "digit", is_digit },
 440   { "xdigit", is_xdigit },
 441   { "space", is_space },
 442   { "punct", is_punct },
 443   { "alnum", is_alnum },
 444   { "print", is_print },
 445   { "graph", is_graph },
 446   { "cntrl", is_cntrl },
 447   { "blank", is_blank },
 448   { NULL, NULL }
 449 };
 450
 451 static predicate *
 452 find_pred (const char *str)
 453 {
 454   unsigned int i;
 455   for (i = 0; prednames[i].name; ++i)
 456     if (!strcmp(str, prednames[i].name))
 457       break;
 458
 459   return prednames[i].pred;
 460 }
 461
 462 /* Multibyte character handling sub-routine for lex.
 463    This function  parse a bracket expression and build a struct
 464    mb_char_classes.  */
 465 static token
 466 parse_bracket_exp (void)
 467 {
 468   int invert;
 469   int c, c1, c2;
 470   charclass ccl;
 471
 472 #ifdef MBS_SUPPORT
 473   wint_t wc, wc1, wc2;
 474
 475   /* Work area to build a mb_char_classes.  */
 476   struct mb_char_classes *work_mbc;
 477   int chars_al, range_sts_al, range_ends_al, ch_classes_al,
 478     equivs_al, coll_elems_al;
 479
 480   chars_al = 1;
 481   range_sts_al = range_ends_al = 0;
 482   ch_classes_al = equivs_al = coll_elems_al = 0;
 483   if (MB_CUR_MAX > 1)
 484     {
 485       REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
 486                            dfa->mbcsets_alloc, dfa->nmbcsets + 1);
 487
 488       /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
 489          We will update dfa->multibyte_prop[] in addtok(), because we can't
 490          decide the index in dfa->tokens[].  */
 491
 492       /* Initialize work area.  */
 493       work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
 494       memset (work_mbc, 0, sizeof *work_mbc);
 495     }
 496   else
 497     work_mbc = NULL;
 498 #endif
 499
 500   memset (ccl, 0, sizeof(ccl));
 501   FETCH_WC (c, wc, _("unbalanced ["));
 502   if (c == '^')
 503     {
 504       FETCH_WC (c, wc, _("unbalanced ["));
 505       invert = 1;
 506     }
 507   else
 508     invert = 0;
 509
 510   do
 511     {
 512       c1 = EOF; /* mark c1 is not initialized".  */
 513
 514       /* Note that if we're looking at some other [:...:] construct,
 515          we just treat it as a bunch of ordinary characters.  We can do
 516          this because we assume regex has checked for syntax errors before
 517          dfa is ever called. */
 518       if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
 519         {
 520 #define BRACKET_BUFFER_SIZE 128
 521           char str[BRACKET_BUFFER_SIZE];
 522           FETCH_WC (c1, wc1, _("unbalanced ["));
 523
 524           /* If pattern contains `[[:', `[[.', or `[[='.  */
 525           if (c1 == ':'
 526 #ifdef MBS_SUPPORT
 527               /* TODO: handle `[[.' and `[[=' also for MB_CUR_MAX == 1.  */
 528               || (MB_CUR_MAX > 1 && (c1 == '.' || c1 == '='))
 529 #endif
 530               )
 531             {
 532               int len = 0;
 533               for (;;)
 534                 {
 535                   FETCH_WC (c, wc, _("unbalanced ["));
 536                   if ((c == c1 && *lexptr == ']') || lexleft == 0)
 537                     break;
 538                   if (len < BRACKET_BUFFER_SIZE)
 539                     str[len++] = c;
 540                   else
 541                     /* This is in any case an invalid class name.  */
 542                     str[0] = '\0';
 543                 }
 544               str[len] = '\0';
 545
 546               /* Fetch bracket.  */
 547               FETCH_WC (c, wc, _("unbalanced ["));
 548               if (c1 == ':')
 549                 /* build character class.  */
 550                 {
 551                   char const *class
 552                     = (case_fold && (!strcmp (str, "upper")
 553                                      || !strcmp (str, "lower"))
 554                                        ? "alpha"
 555                                        : str);
 556 #ifdef MBS_SUPPORT
 557                   if (MB_CUR_MAX > 1)
 558                     {
 559                       /* Store the character class as wctype_t.  */
 560                       wctype_t wt = wctype (class);
 561
 562                       if (ch_classes_al == 0)
 563                         MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al);
 564                       REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
 565                                            ch_classes_al,
 566                                            work_mbc->nch_classes + 1);
 567                       work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
 568                     }
 569 #endif
 570
 571                   {
 572                     predicate *pred = find_pred (class);
 573                     if (!pred)
 574                       dfaerror(_("invalid character class"));
 575                     for (c2 = 0; c2 < NOTCHAR; ++c2)
 576                       if ((*pred)(c2))
 577                         setbit_case_fold (c2, ccl);
 578                   }
 579                 }
 580
 581 #ifdef MBS_SUPPORT
 582               else if (c1 == '=' || c1 == '.')
 583                 {
 584                   char *elem;
 585                   MALLOC(elem, char, len + 1);
 586                   strncpy(elem, str, len + 1);
 587
 588                   if (c1 == '=')
 589                     /* build equivalent class.  */
 590                     {
 591                       if (equivs_al == 0)
 592                         MALLOC(work_mbc->equivs, char*, ++equivs_al);
 593                       REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
 594                                            equivs_al,
 595                                            work_mbc->nequivs + 1);
 596                       work_mbc->equivs[work_mbc->nequivs++] = elem;
 597                     }
 598
 599                   if (c1 == '.')
 600                     /* build collating element.  */
 601                     {
 602                       if (coll_elems_al == 0)
 603                         MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
 604                       REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
 605                                            coll_elems_al,
 606                                            work_mbc->ncoll_elems + 1);
 607                       work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
 608                     }
 609                 }
 610 #endif
 611
 612               /* Fetch new lookahead character.  */
 613               FETCH_WC (c1, wc1, _("unbalanced ["));
 614               continue;
 615             }
 616
 617           /* We treat '[' as a normal character here.  c/c1/wc/wc1
 618              are already set up.  */
 619         }
 620
 621       if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
 622         FETCH_WC(c, wc, _("unbalanced ["));
 623
 624       if (c1 == EOF)
 625         FETCH_WC(c1, wc1, _("unbalanced ["));
 626
 627       if (c1 == '-')
 628         /* build range characters.  */
 629         {
 630           FETCH_WC(c2, wc2, _("unbalanced ["));
 631           if (c2 == ']')
 632             {
 633               /* In the case [x-], the - is an ordinary hyphen,
 634                  which is left in c1, the lookahead character. */
 635               lexptr -= cur_mb_len;
 636               lexleft += cur_mb_len;
 637             }
 638         }
 639
 640       if (c1 == '-' && c2 != ']')
 641         {
 642           if (c2 == '\\'
 643               && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
 644             FETCH_WC(c2, wc2, _("unbalanced ["));
 645
 646 #ifdef MBS_SUPPORT
 647           if (MB_CUR_MAX > 1)
 648             {
 649               /* When case folding map a range, say [m-z] (or even [M-z])
 650                  to the pair of ranges, [m-z] [M-Z].  */
 651               if (range_sts_al == 0)
 652                 {
 653                   MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
 654                   MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
 655                 }
 656               REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
 657                                    range_sts_al, work_mbc->nranges + 1);
 658               REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
 659                                    range_ends_al, work_mbc->nranges + 1);
 660               work_mbc->range_sts[work_mbc->nranges] =
 661                 case_fold ? towlower(wc) : (wchar_t)wc;
 662               work_mbc->range_ends[work_mbc->nranges++] =
 663                 case_fold ? towlower(wc2) : (wchar_t)wc2;
 664
 665 #ifndef GREP
 666               if (case_fold && (iswalpha(wc) || iswalpha(wc2)))
 667                 {
 668                   REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
 669                                        range_sts_al, work_mbc->nranges + 1);
 670                   work_mbc->range_sts[work_mbc->nranges] = towupper(wc);
 671                   REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
 672                                        range_ends_al, work_mbc->nranges + 1);
 673                   work_mbc->range_ends[work_mbc->nranges++] = towupper(wc2);
 674                 }
 675 #endif
 676             }
 677           else
 678 #endif
 679             {
 680               c1 = c;
 681               if (case_fold)
 682                 {
 683                   c1 = tolower (c1);
 684                   c2 = tolower (c2);
 685                 }
 686               if (!hard_LC_COLLATE)
 687                 for (c = c1; c <= c2; c++)
 688                   setbit_case_fold (c, ccl);
 689               else
 690                 for (c = 0; c < NOTCHAR; ++c)
 691                   if (!(case_fold && ISUPPER (c))
 692                       && in_coll_range (c, c1, c2))
 693                     setbit_case_fold (c, ccl);
 694             }
 695
 696           FETCH_WC(c1, wc1, _("unbalanced ["));
 697           continue;
 698         }
 699
 700 #ifdef MBS_SUPPORT
 701       /* Build normal characters.  */
 702       setbit_case_fold (wc, ccl);
 703       if (MB_CUR_MAX > 1)
 704         {
 705           if (case_fold && iswalpha(wc))
 706             {
 707               wc = towlower(wc);
 708               c = wctob(wc);
 709               if (c == EOF || (wint_t)c != (wint_t)wc)
 710                 {
 711                   REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
 712                                        work_mbc->nchars + 1);
 713                   work_mbc->chars[work_mbc->nchars++] = wc;
 714                 }
 715 #ifdef GREP
 716               continue;
 717 #else
 718               wc = towupper(wc);
 719               c = wctob(wc);
 720 #endif
 721             }
 722           if (c == EOF || (wint_t)c != (wint_t)wc)
 723             {
 724               REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
 725                                    work_mbc->nchars + 1);
 726               work_mbc->chars[work_mbc->nchars++] = wc;
 727             }
 728         }
 729 #else
 730       setbit_case_fold (c, ccl);
 731 #endif
 732     }
 733   while ((
 734 #ifdef MBS_SUPPORT
 735          wc = wc1,
 736 #endif
 737          (c = c1) != ']'));
 738
 739 #ifdef MBS_SUPPORT
 740   if (MB_CUR_MAX > 1
 741       && (!using_utf8()
 742           || invert
 743           || work_mbc->nchars != 0
 744           || work_mbc->nch_classes != 0
 745           || work_mbc->nranges != 0
 746           || work_mbc->nequivs != 0
 747           || work_mbc->ncoll_elems != 0))
 748     {
 749       static charclass zeroclass;
 750       work_mbc->invert = invert;
 751       work_mbc->cset = equal(ccl, zeroclass) ? -1 : charclass_index(ccl);
 752       return MBCSET;
 753     }
 754 #endif
 755
 756   if (invert)
 757     {
 758 #ifdef MBS_SUPPORT
 759       assert(MB_CUR_MAX == 1);
 760 #endif
 761       notset(ccl);
 762       if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
 763         clrbit(eolbyte, ccl);
 764     }
 765
 766   return CSET + charclass_index(ccl);
 767 }
 768
 769 /* Return non-zero if C is a `word-constituent' byte; zero otherwise.  */
 770 #define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
 771
 772 static token
 773 lex (void)
 774 {
 775   unsigned int c, c2;
 776   int backslash = 0;
 777   charclass ccl;
 778   int i;
 779
 780   /* Basic plan: We fetch a character.  If it's a backslash,
 781      we set the backslash flag and go through the loop again.
 782      On the plus side, this avoids having a duplicate of the
 783      main switch inside the backslash case.  On the minus side,
 784      it means that just about every case begins with
 785      "if (backslash) ...".  */
 786   for (i = 0; i < 2; ++i)
 787     {
 788 #ifdef MBS_SUPPORT
 789       if (MB_CUR_MAX > 1)
 790         {
 791           FETCH_WC (c, wctok, NULL);
 792           if ((int)c == EOF)
 793             goto normal_char;
 794         }
 795       else
 796 #endif /* MBS_SUPPORT  */
 797         FETCH(c, NULL);
 798
 799       switch (c)
 800         {
 801         case '\\':
 802           if (backslash)
 803             goto normal_char;
 804           if (lexleft == 0)
 805             dfaerror(_("unfinished \\ escape"));
 806           backslash = 1;
 807           break;
 808
 809         case '^':
 810           if (backslash)
 811             goto normal_char;
 812           if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
 813               || lasttok == END
 814               || lasttok == LPAREN
 815               || lasttok == OR)
 816             return lasttok = BEGLINE;
 817           goto normal_char;
 818
 819         case '$':
 820           if (backslash)
 821             goto normal_char;
 822           if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
 823               || lexleft == 0
 824               || (syntax_bits & RE_NO_BK_PARENS
 825                   ? lexleft > 0 && *lexptr == ')'
 826                   : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
 827               || (syntax_bits & RE_NO_BK_VBAR
 828                   ? lexleft > 0 && *lexptr == '|'
 829                   : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
 830               || ((syntax_bits & RE_NEWLINE_ALT)
 831                   && lexleft > 0 && *lexptr == '\n'))
 832             return lasttok = ENDLINE;
 833           goto normal_char;
 834
 835         case '1':
 836         case '2':
 837         case '3':
 838         case '4':
 839         case '5':
 840         case '6':
 841         case '7':
 842         case '8':
 843         case '9':
 844           if (backslash && !(syntax_bits & RE_NO_BK_REFS))
 845             {
 846               laststart = 0;
 847               return lasttok = BACKREF;
 848             }
 849           goto normal_char;
 850
 851         case '`':
 852           if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
 853             return lasttok = BEGLINE;   /* FIXME: should be beginning of string */
 854           goto normal_char;
 855
 856         case '\'':
 857           if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
 858             return lasttok = ENDLINE;   /* FIXME: should be end of string */
 859           goto normal_char;
 860
 861         case '<':
 862           if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
 863             return lasttok = BEGWORD;
 864           goto normal_char;
 865
 866         case '>':
 867           if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
 868             return lasttok = ENDWORD;
 869           goto normal_char;
 870
 871         case 'b':
 872           if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
 873             return lasttok = LIMWORD;
 874           goto normal_char;
 875
 876         case 'B':
 877           if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
 878             return lasttok = NOTLIMWORD;
 879           goto normal_char;
 880
 881         case '?':
 882           if (syntax_bits & RE_LIMITED_OPS)
 883             goto normal_char;
 884           if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
 885             goto normal_char;
 886           if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
 887             goto normal_char;
 888           return lasttok = QMARK;
 889
 890         case '*':
 891           if (backslash)
 892             goto normal_char;
 893           if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
 894             goto normal_char;
 895           return lasttok = STAR;
 896
 897         case '+':
 898           if (syntax_bits & RE_LIMITED_OPS)
 899             goto normal_char;
 900           if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
 901             goto normal_char;
 902           if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
 903             goto normal_char;
 904           return lasttok = PLUS;
 905
 906         case '{':
 907           if (!(syntax_bits & RE_INTERVALS))
 908             goto normal_char;
 909           if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
 910             goto normal_char;
 911           if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
 912             goto normal_char;
 913
 914           if (syntax_bits & RE_NO_BK_BRACES)
 915             {
 916               /* Scan ahead for a valid interval; if it's not valid,
 917                  treat it as a literal '{'.  */
 918               int lo = -1, hi = -1;
 919               char const *p = lexptr;
 920               char const *lim = p + lexleft;
 921               for (;  p != lim && ISASCIIDIGIT (*p);  p++)
 922                 lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
 923               if (p != lim && *p == ',')
 924                 while (++p != lim && ISASCIIDIGIT (*p))
 925                   hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
 926               else
 927                 hi = lo;
 928               if (p == lim || *p != '}'
 929                   || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
 930                 goto normal_char;
 931             }
 932
 933           minrep = 0;
 934           /* Cases:
 935              {M} - exact count
 936              {M,} - minimum count, maximum is infinity
 937              {M,N} - M through N */
 938           FETCH(c, _("unfinished repeat count"));
 939           if (ISASCIIDIGIT (c))
 940             {
 941               minrep = c - '0';
 942               for (;;)
 943                 {
 944                   FETCH(c, _("unfinished repeat count"));
 945                   if (! ISASCIIDIGIT (c))
 946                     break;
 947                   minrep = 10 * minrep + c - '0';
 948                 }
 949             }
 950           else
 951             dfaerror(_("malformed repeat count"));
 952           if (c == ',')
 953             {
 954               FETCH (c, _("unfinished repeat count"));
 955               if (! ISASCIIDIGIT (c))
 956                 maxrep = -1;
 957               else
 958                 {
 959                   maxrep = c - '0';
 960                   for (;;)
 961                     {
 962                       FETCH (c, _("unfinished repeat count"));
 963                       if (! ISASCIIDIGIT (c))
 964                         break;
 965                       maxrep = 10 * maxrep + c - '0';
 966                     }
 967                   if (0 <= maxrep && maxrep < minrep)
 968                     dfaerror (_("malformed repeat count"));
 969                 }
 970             }
 971           else
 972             maxrep = minrep;
 973           if (!(syntax_bits & RE_NO_BK_BRACES))
 974             {
 975               if (c != '\\')
 976                 dfaerror(_("malformed repeat count"));
 977               FETCH(c, _("unfinished repeat count"));
 978             }
 979           if (c != '}')
 980             dfaerror(_("malformed repeat count"));
 981           laststart = 0;
 982 #ifdef GAWK
 983           dfa->broken = (minrep == maxrep && minrep == 0);
 984 #endif
 985           return lasttok = REPMN;
 986
 987         case '|':
 988           if (syntax_bits & RE_LIMITED_OPS)
 989             goto normal_char;
 990           if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
 991             goto normal_char;
 992           laststart = 1;
 993           return lasttok = OR;
 994
 995         case '\n':
 996           if (syntax_bits & RE_LIMITED_OPS
 997               || backslash
 998               || !(syntax_bits & RE_NEWLINE_ALT))
 999             goto normal_char;
1000           laststart = 1;
1001           return lasttok = OR;
1002
1003         case '(':
1004           if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1005             goto normal_char;
1006           ++parens;
1007           laststart = 1;
1008           return lasttok = LPAREN;
1009
1010         case ')':
1011           if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1012             goto normal_char;
1013           if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1014             goto normal_char;
1015           --parens;
1016           laststart = 0;
1017           return lasttok = RPAREN;
1018
1019         case '.':
1020           if (backslash)
1021             goto normal_char;
1022 #ifdef MBS_SUPPORT
1023           if (MB_CUR_MAX > 1)
1024             {
1025               /* In multibyte environment period must match with a single
1026                  character not a byte.  So we use ANYCHAR.  */
1027               laststart = 0;
1028               return lasttok = ANYCHAR;
1029             }
1030 #endif /* MBS_SUPPORT */
1031           zeroset(ccl);
1032           notset(ccl);
1033           if (!(syntax_bits & RE_DOT_NEWLINE))
1034             clrbit(eolbyte, ccl);
1035           if (syntax_bits & RE_DOT_NOT_NULL)
1036             clrbit('\0', ccl);
1037           laststart = 0;
1038           return lasttok = CSET + charclass_index(ccl);
1039
1040 #ifndef GAWK
1041         case 's':
1042         case 'S':
1043           if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
1044             goto normal_char;
1045           zeroset(ccl);
1046           for (c2 = 0; c2 < NOTCHAR; ++c2)
1047             if (ISSPACE(c2))
1048               setbit(c2, ccl);
1049           if (c == 'S')
1050             notset(ccl);
1051           laststart = 0;
1052           return lasttok = CSET + charclass_index(ccl);
1053 #endif
1054
1055         case 'w':
1056         case 'W':
1057           if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
1058             goto normal_char;
1059           zeroset(ccl);
1060           for (c2 = 0; c2 < NOTCHAR; ++c2)
1061             if (IS_WORD_CONSTITUENT(c2))
1062               setbit(c2, ccl);
1063           if (c == 'W')
1064             notset(ccl);
1065           laststart = 0;
1066           return lasttok = CSET + charclass_index(ccl);
1067
1068         case '[':
1069           if (backslash)
1070             goto normal_char;
1071           laststart = 0;
1072           return lasttok = parse_bracket_exp();
1073
1074         default:
1075         normal_char:
1076           laststart = 0;
1077 #ifdef MBS_SUPPORT
1078           /* For multibyte character sets, folding is done in atom.  Always
1079              return WCHAR.  */
1080           if (MB_CUR_MAX > 1)
1081             return lasttok = WCHAR;
1082 #endif
1083
1084           if (case_fold && ISALPHA(c))
1085             {
1086               zeroset(ccl);
1087               setbit_case_fold (c, ccl);
1088               return lasttok = CSET + charclass_index(ccl);
1089             }
1090
1091           return lasttok = c;
1092         }
1093     }
1094
1095   /* The above loop should consume at most a backslash
1096      and some other character. */
1097   abort();
1098   return END;   /* keeps pedantic compilers happy. */
1099 }
1100
1101 /* Recursive descent parser for regular expressions. */
1102
1103 static token tok;               /* Lookahead token. */
1104 static int depth;               /* Current depth of a hypothetical stack
1105                                    holding deferred productions.  This is
1106                                    used to determine the depth that will be
1107                                    required of the real stack later on in
1108                                    dfaanalyze(). */
1109
1110 static void
1111 addtok_mb (token t, int mbprop)
1112 {
1113 #ifdef MBS_SUPPORT
1114   if (MB_CUR_MAX > 1)
1115     {
1116       REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
1117                            dfa->tindex);
1118       dfa->multibyte_prop[dfa->tindex] = mbprop;
1119     }
1120 #else
1121   (void) mbprop;
1122 #endif
1123
1124   REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
1125   dfa->tokens[dfa->tindex++] = t;
1126
1127   switch (t)
1128     {
1129     case QMARK:
1130     case STAR:
1131     case PLUS:
1132       break;
1133
1134     case CAT:
1135     case OR:
1136     case ORTOP:
1137       --depth;
1138       break;
1139
1140     default:
1141       ++dfa->nleaves;
1142     case EMPTY:
1143       ++depth;
1144       break;
1145     }
1146   if (depth > dfa->depth)
1147     dfa->depth = depth;
1148 }
1149
1150 /* Add the given token to the parse tree, maintaining the depth count and
1151    updating the maximum depth if necessary. */
1152 static void
1153 addtok (token t)
1154 {
1155 #ifdef MBS_SUPPORT
1156   if (MB_CUR_MAX > 1 && t == MBCSET)
1157     addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
1158   else
1159 #endif
1160     addtok_mb (t, 3);
1161 }
1162
1163 #ifdef MBS_SUPPORT
1164 /* We treat a multibyte character as a single atom, so that DFA
1165    can treat a multibyte character as a single expression.
1166
1167    e.g. We construct following tree from "<mb1><mb2>".
1168    <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1169    <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1170 static void
1171 addtok_wc (wint_t wc)
1172 {
1173   unsigned char buf[MB_LEN_MAX];
1174   mbstate_t s;
1175   int i;
1176   memset (&s, 0, sizeof(s));
1177   cur_mb_len = wcrtomb ((char *) buf, wc, &s);
1178   addtok_mb(buf[0], cur_mb_len == 1 ? 3 : 1);
1179   for (i = 1; i < cur_mb_len; i++)
1180     {
1181       addtok_mb(buf[i], i == cur_mb_len - 1 ? 2 : 0);
1182       addtok(CAT);
1183     }
1184 }
1185 #endif
1186
1187 /* The grammar understood by the parser is as follows.
1188
1189    regexp:
1190      regexp OR branch
1191      branch
1192
1193    branch:
1194      branch closure
1195      closure
1196
1197    closure:
1198      closure QMARK
1199      closure STAR
1200      closure PLUS
1201      closure REPMN
1202      atom
1203
1204    atom:
1205      <normal character>
1206      <multibyte character>
1207      ANYCHAR
1208      MBCSET
1209      CSET
1210      BACKREF
1211      BEGLINE
1212      ENDLINE
1213      BEGWORD
1214      ENDWORD
1215      LIMWORD
1216      NOTLIMWORD
1217      LPAREN regexp RPAREN
1218      <empty>
1219
1220    The parser builds a parse tree in postfix form in an array of tokens. */
1221
1222 static void
1223 atom (void)
1224 {
1225 #ifdef MBS_SUPPORT
1226   if (tok == WCHAR)
1227     {
1228       addtok_wc (case_fold ? towlower(wctok) : wctok);
1229 #ifndef GREP
1230       if (case_fold && iswalpha(wctok))
1231         {
1232           addtok_wc (towupper(wctok));
1233           addtok (OR);
1234         }
1235 #endif
1236
1237       tok = lex();
1238       return;
1239     }
1240 #endif /* MBS_SUPPORT  */
1241
1242   if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
1243       || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
1244 #ifdef MBS_SUPPORT
1245       || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
1246 #endif /* MBS_SUPPORT */
1247       || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
1248     {
1249       addtok(tok);
1250       tok = lex();
1251     }
1252   else if (tok == LPAREN)
1253     {
1254       tok = lex();
1255       regexp(0);
1256       if (tok != RPAREN)
1257         dfaerror(_("unbalanced ("));
1258       tok = lex();
1259     }
1260   else
1261     addtok(EMPTY);
1262 }
1263
1264 /* Return the number of tokens in the given subexpression. */
1265 static int
1266 nsubtoks (int tindex)
1267 {
1268   int ntoks1;
1269
1270   switch (dfa->tokens[tindex - 1])
1271     {
1272     default:
1273       return 1;
1274     case QMARK:
1275     case STAR:
1276     case PLUS:
1277       return 1 + nsubtoks(tindex - 1);
1278     case CAT:
1279     case OR:
1280     case ORTOP:
1281       ntoks1 = nsubtoks(tindex - 1);
1282       return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
1283     }
1284 }
1285
1286 /* Copy the given subexpression to the top of the tree. */
1287 static void
1288 copytoks (int tindex, int ntokens)
1289 {
1290   int i;
1291
1292   for (i = 0; i < ntokens; ++i)
1293     {
1294       addtok(dfa->tokens[tindex + i]);
1295 #ifdef MBS_SUPPORT
1296       /* Update index into multibyte csets.  */
1297       if (MB_CUR_MAX > 1 && dfa->tokens[tindex + i] == MBCSET)
1298         dfa->multibyte_prop[dfa->tindex - 1] = dfa->multibyte_prop[tindex + i];
1299 #endif
1300     }
1301 }
1302
1303 static void
1304 closure (void)
1305 {
1306   int tindex, ntokens, i;
1307
1308   atom();
1309   while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
1310     if (tok == REPMN)
1311       {
1312         ntokens = nsubtoks(dfa->tindex);
1313         tindex = dfa->tindex - ntokens;
1314         if (maxrep < 0)
1315           addtok(PLUS);
1316         if (minrep == 0)
1317           addtok(QMARK);
1318         for (i = 1; i < minrep; ++i)
1319           {
1320             copytoks(tindex, ntokens);
1321             addtok(CAT);
1322           }
1323         for (; i < maxrep; ++i)
1324           {
1325             copytoks(tindex, ntokens);
1326             addtok(QMARK);
1327             addtok(CAT);
1328           }
1329         tok = lex();
1330       }
1331     else
1332       {
1333         addtok(tok);
1334         tok = lex();
1335       }
1336 }
1337
1338 static void
1339 branch (void)
1340 {
1341   closure();
1342   while (tok != RPAREN && tok != OR && tok >= 0)
1343     {
1344       closure();
1345       addtok(CAT);
1346     }
1347 }
1348
1349 static void
1350 regexp (int toplevel)
1351 {
1352   branch();
1353   while (tok == OR)
1354     {
1355       tok = lex();
1356       branch();
1357       if (toplevel)
1358         addtok(ORTOP);
1359       else
1360         addtok(OR);
1361     }
1362 }
1363
1364 /* Main entry point for the parser.  S is a string to be parsed, len is the
1365    length of the string, so s can include NUL characters.  D is a pointer to
1366    the struct dfa to parse into. */
1367 void
1368 dfaparse (char const *s, size_t len, struct dfa *d)
1369 {
1370   dfa = d;
1371   lexptr = s;
1372   lexleft = len;
1373   lasttok = END;
1374   laststart = 1;
1375   parens = 0;
1376 #ifdef LC_COLLATE
1377   hard_LC_COLLATE = hard_locale (LC_COLLATE);
1378 #endif
1379 #ifdef MBS_SUPPORT
1380   if (MB_CUR_MAX > 1)
1381     {
1382       cur_mb_len = 0;
1383       memset(&mbs, 0, sizeof(mbstate_t));
1384     }
1385 #endif /* MBS_SUPPORT  */
1386
1387   if (! syntax_bits_set)
1388     dfaerror(_("no syntax specified"));
1389
1390   tok = lex();
1391   depth = d->depth;
1392
1393   regexp(1);
1394
1395   if (tok != END)
1396     dfaerror(_("unbalanced )"));
1397
1398   addtok(END - d->nregexps);
1399   addtok(CAT);
1400
1401   if (d->nregexps)
1402     addtok(ORTOP);
1403
1404   ++d->nregexps;
1405 }
1406
1407 /* Some primitives for operating on sets of positions. */
1408
1409 /* Copy one set to another; the destination must be large enough. */
1410 static void
1411 copy (position_set const *src, position_set *dst)
1412 {
1413   int i;
1414
1415   for (i = 0; i < src->nelem; ++i)
1416     dst->elems[i] = src->elems[i];
1417   dst->nelem = src->nelem;
1418 }
1419
1420 /* Insert a position in a set.  Position sets are maintained in sorted
1421    order according to index.  If position already exists in the set with
1422    the same index then their constraints are logically or'd together.
1423    S->elems must point to an array large enough to hold the resulting set. */
1424 static void
1425 insert (position p, position_set *s)
1426 {
1427   int count = s->nelem;
1428   int lo = 0, hi = count;
1429   while (lo < hi)
1430     {
1431       int mid = ((unsigned) lo + (unsigned) hi) >> 1;
1432       if (s->elems[mid].index < p.index)
1433         lo = mid + 1;
1434       else
1435         hi = mid;
1436     }
1437
1438   if (lo < count && p.index == s->elems[lo].index)
1439     s->elems[lo].constraint |= p.constraint;
1440   else
1441     {
1442       int i;
1443       for (i = count; i > lo; i--)
1444         s->elems[i] = s->elems[i - 1];
1445       s->elems[lo] = p;
1446       ++s->nelem;
1447     }
1448 }
1449
1450 /* Merge two sets of positions into a third.  The result is exactly as if
1451    the positions of both sets were inserted into an initially empty set. */
1452 static void
1453 merge (position_set const *s1, position_set const *s2, position_set *m)
1454 {
1455   int i = 0, j = 0;
1456
1457   m->nelem = 0;
1458   while (i < s1->nelem && j < s2->nelem)
1459     if (s1->elems[i].index > s2->elems[j].index)
1460       m->elems[m->nelem++] = s1->elems[i++];
1461     else if (s1->elems[i].index < s2->elems[j].index)
1462       m->elems[m->nelem++] = s2->elems[j++];
1463     else
1464       {
1465         m->elems[m->nelem] = s1->elems[i++];
1466         m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1467       }
1468   while (i < s1->nelem)
1469     m->elems[m->nelem++] = s1->elems[i++];
1470   while (j < s2->nelem)
1471     m->elems[m->nelem++] = s2->elems[j++];
1472 }
1473
1474 /* Delete a position from a set. */
1475 static void
1476 delete (position p, position_set *s)
1477 {
1478   int i;
1479
1480   for (i = 0; i < s->nelem; ++i)
1481     if (p.index == s->elems[i].index)
1482       break;
1483   if (i < s->nelem)
1484     for (--s->nelem; i < s->nelem; ++i)
1485       s->elems[i] = s->elems[i + 1];
1486 }
1487
1488 /* Find the index of the state corresponding to the given position set with
1489    the given preceding context, or create a new state if there is no such
1490    state.  Newline and letter tell whether we got here on a newline or
1491    letter, respectively. */
1492 static int
1493 state_index (struct dfa *d, position_set const *s, int newline, int letter)
1494 {
1495   int hash = 0;
1496   int constraint;
1497   int i, j;
1498
1499   newline = newline ? 1 : 0;
1500   letter = letter ? 1 : 0;
1501
1502   for (i = 0; i < s->nelem; ++i)
1503     hash ^= s->elems[i].index + s->elems[i].constraint;
1504
1505   /* Try to find a state that exactly matches the proposed one. */
1506   for (i = 0; i < d->sindex; ++i)
1507     {
1508       if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1509           || newline != d->states[i].newline || letter != d->states[i].letter)
1510         continue;
1511       for (j = 0; j < s->nelem; ++j)
1512         if (s->elems[j].constraint
1513             != d->states[i].elems.elems[j].constraint
1514             || s->elems[j].index != d->states[i].elems.elems[j].index)
1515           break;
1516       if (j == s->nelem)
1517         return i;
1518     }
1519
1520   /* We'll have to create a new state. */
1521   REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1522   d->states[i].hash = hash;
1523   MALLOC(d->states[i].elems.elems, position, s->nelem);
1524   copy(s, &d->states[i].elems);
1525   d->states[i].newline = newline;
1526   d->states[i].letter = letter;
1527   d->states[i].backref = 0;
1528   d->states[i].constraint = 0;
1529   d->states[i].first_end = 0;
1530 #ifdef MBS_SUPPORT
1531   d->states[i].mbps.nelem = 0;
1532   d->states[i].mbps.elems = NULL;
1533 #endif
1534   for (j = 0; j < s->nelem; ++j)
1535     if (d->tokens[s->elems[j].index] < 0)
1536       {
1537         constraint = s->elems[j].constraint;
1538         if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1539             || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1540             || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1541             || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1542           d->states[i].constraint |= constraint;
1543         if (! d->states[i].first_end)
1544           d->states[i].first_end = d->tokens[s->elems[j].index];
1545       }
1546     else if (d->tokens[s->elems[j].index] == BACKREF)
1547       {
1548         d->states[i].constraint = NO_CONSTRAINT;
1549         d->states[i].backref = 1;
1550       }
1551
1552   ++d->sindex;
1553
1554   return i;
1555 }
1556
1557 /* Find the epsilon closure of a set of positions.  If any position of the set
1558    contains a symbol that matches the empty string in some context, replace
1559    that position with the elements of its follow labeled with an appropriate
1560    constraint.  Repeat exhaustively until no funny positions are left.
1561    S->elems must be large enough to hold the result. */
1562 static void
1563 epsclosure (position_set *s, struct dfa const *d)
1564 {
1565   int i, j;
1566   char *visited;        /* array of booleans, enough to use char, not int */
1567   position p, old;
1568
1569   CALLOC(visited, char, d->tindex);
1570
1571   for (i = 0; i < s->nelem; ++i)
1572     if (d->tokens[s->elems[i].index] >= NOTCHAR
1573         && d->tokens[s->elems[i].index] != BACKREF
1574 #ifdef MBS_SUPPORT
1575         && d->tokens[s->elems[i].index] != ANYCHAR
1576         && d->tokens[s->elems[i].index] != MBCSET
1577 #endif
1578         && d->tokens[s->elems[i].index] < CSET)
1579       {
1580         old = s->elems[i];
1581         p.constraint = old.constraint;
1582         delete(s->elems[i], s);
1583         if (visited[old.index])
1584           {
1585             --i;
1586             continue;
1587           }
1588         visited[old.index] = 1;
1589         switch (d->tokens[old.index])
1590           {
1591           case BEGLINE:
1592             p.constraint &= BEGLINE_CONSTRAINT;
1593             break;
1594           case ENDLINE:
1595             p.constraint &= ENDLINE_CONSTRAINT;
1596             break;
1597           case BEGWORD:
1598             p.constraint &= BEGWORD_CONSTRAINT;
1599             break;
1600           case ENDWORD:
1601             p.constraint &= ENDWORD_CONSTRAINT;
1602             break;
1603           case LIMWORD:
1604             p.constraint &= LIMWORD_CONSTRAINT;
1605             break;
1606           case NOTLIMWORD:
1607             p.constraint &= NOTLIMWORD_CONSTRAINT;
1608             break;
1609           default:
1610             break;
1611           }
1612         for (j = 0; j < d->follows[old.index].nelem; ++j)
1613           {
1614             p.index = d->follows[old.index].elems[j].index;
1615             insert(p, s);
1616           }
1617         /* Force rescan to start at the beginning. */
1618         i = -1;
1619       }
1620
1621   free(visited);
1622 }
1623
1624 /* Perform bottom-up analysis on the parse tree, computing various functions.
1625    Note that at this point, we're pretending constructs like \< are real
1626    characters rather than constraints on what can follow them.
1627
1628    Nullable:  A node is nullable if it is at the root of a regexp that can
1629    match the empty string.
1630    *  EMPTY leaves are nullable.
1631    * No other leaf is nullable.
1632    * A QMARK or STAR node is nullable.
1633    * A PLUS node is nullable if its argument is nullable.
1634    * A CAT node is nullable if both its arguments are nullable.
1635    * An OR node is nullable if either argument is nullable.
1636
1637    Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
1638    that could correspond to the first character of a string matching the
1639    regexp rooted at the given node.
1640    * EMPTY leaves have empty firstpos.
1641    * The firstpos of a nonempty leaf is that leaf itself.
1642    * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1643      argument.
1644    * The firstpos of a CAT node is the firstpos of the left argument, union
1645      the firstpos of the right if the left argument is nullable.
1646    * The firstpos of an OR node is the union of firstpos of each argument.
1647
1648    Lastpos:  The lastpos of a node is the set of positions that could
1649    correspond to the last character of a string matching the regexp at
1650    the given node.
1651    * EMPTY leaves have empty lastpos.
1652    * The lastpos of a nonempty leaf is that leaf itself.
1653    * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1654      argument.
1655    * The lastpos of a CAT node is the lastpos of its right argument, union
1656      the lastpos of the left if the right argument is nullable.
1657    * The lastpos of an OR node is the union of the lastpos of each argument.
1658
1659    Follow:  The follow of a position is the set of positions that could
1660    correspond to the character following a character matching the node in
1661    a string matching the regexp.  At this point we consider special symbols
1662    that match the empty string in some context to be just normal characters.
1663    Later, if we find that a special symbol is in a follow set, we will
1664    replace it with the elements of its follow, labeled with an appropriate
1665    constraint.
1666    * Every node in the firstpos of the argument of a STAR or PLUS node is in
1667      the follow of every node in the lastpos.
1668    * Every node in the firstpos of the second argument of a CAT node is in
1669      the follow of every node in the lastpos of the first argument.
1670
1671    Because of the postfix representation of the parse tree, the depth-first
1672    analysis is conveniently done by a linear scan with the aid of a stack.
1673    Sets are stored as arrays of the elements, obeying a stack-like allocation
1674    scheme; the number of elements in each set deeper in the stack can be
1675    used to determine the address of a particular set's array. */
1676 void
1677 dfaanalyze (struct dfa *d, int searchflag)
1678 {
1679   int *nullable;                /* Nullable stack. */
1680   int *nfirstpos;               /* Element count stack for firstpos sets. */
1681   position *firstpos;           /* Array where firstpos elements are stored. */
1682   int *nlastpos;                /* Element count stack for lastpos sets. */
1683   position *lastpos;            /* Array where lastpos elements are stored. */
1684   int *nalloc;                  /* Sizes of arrays allocated to follow sets. */
1685   position_set tmp;             /* Temporary set for merging sets. */
1686   position_set merged;          /* Result of merging sets. */
1687   int wants_newline;            /* True if some position wants newline info. */
1688   int *o_nullable;
1689   int *o_nfirst, *o_nlast;
1690   position *o_firstpos, *o_lastpos;
1691   int i, j;
1692   position *pos;
1693
1694 #ifdef DEBUG
1695   fprintf(stderr, "dfaanalyze:\n");
1696   for (i = 0; i < d->tindex; ++i)
1697     {
1698       fprintf(stderr, " %d:", i);
1699       prtok(d->tokens[i]);
1700     }
1701   putc('\n', stderr);
1702 #endif
1703
1704   d->searchflag = searchflag;
1705
1706   MALLOC(nullable, int, d->depth);
1707   o_nullable = nullable;
1708   MALLOC(nfirstpos, int, d->depth);
1709   o_nfirst = nfirstpos;
1710   MALLOC(firstpos, position, d->nleaves);
1711   o_firstpos = firstpos, firstpos += d->nleaves;
1712   MALLOC(nlastpos, int, d->depth);
1713   o_nlast = nlastpos;
1714   MALLOC(lastpos, position, d->nleaves);
1715   o_lastpos = lastpos, lastpos += d->nleaves;
1716   CALLOC(nalloc, int, d->tindex);
1717   MALLOC(merged.elems, position, d->nleaves);
1718
1719   CALLOC(d->follows, position_set, d->tindex);
1720
1721   for (i = 0; i < d->tindex; ++i)
1722 #ifdef DEBUG
1723     {                           /* Nonsyntactic #ifdef goo... */
1724 #endif
1725     switch (d->tokens[i])
1726       {
1727       case EMPTY:
1728         /* The empty set is nullable. */
1729         *nullable++ = 1;
1730
1731         /* The firstpos and lastpos of the empty leaf are both empty. */
1732         *nfirstpos++ = *nlastpos++ = 0;
1733         break;
1734
1735       case STAR:
1736       case PLUS:
1737         /* Every element in the firstpos of the argument is in the follow
1738            of every element in the lastpos. */
1739         tmp.nelem = nfirstpos[-1];
1740         tmp.elems = firstpos;
1741         pos = lastpos;
1742         for (j = 0; j < nlastpos[-1]; ++j)
1743           {
1744             merge(&tmp, &d->follows[pos[j].index], &merged);
1745             REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1746                                  nalloc[pos[j].index], merged.nelem - 1);
1747             copy(&merged, &d->follows[pos[j].index]);
1748           }
1749
1750       case QMARK:
1751         /* A QMARK or STAR node is automatically nullable. */
1752         if (d->tokens[i] != PLUS)
1753           nullable[-1] = 1;
1754         break;
1755
1756       case CAT:
1757         /* Every element in the firstpos of the second argument is in the
1758            follow of every element in the lastpos of the first argument. */
1759         tmp.nelem = nfirstpos[-1];
1760         tmp.elems = firstpos;
1761         pos = lastpos + nlastpos[-1];
1762         for (j = 0; j < nlastpos[-2]; ++j)
1763           {
1764             merge(&tmp, &d->follows[pos[j].index], &merged);
1765             REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1766                                  nalloc[pos[j].index], merged.nelem - 1);
1767             copy(&merged, &d->follows[pos[j].index]);
1768           }
1769
1770         /* The firstpos of a CAT node is the firstpos of the first argument,
1771            union that of the second argument if the first is nullable. */
1772         if (nullable[-2])
1773           nfirstpos[-2] += nfirstpos[-1];
1774         else
1775           firstpos += nfirstpos[-1];
1776         --nfirstpos;
1777
1778         /* The lastpos of a CAT node is the lastpos of the second argument,
1779            union that of the first argument if the second is nullable. */
1780         if (nullable[-1])
1781           nlastpos[-2] += nlastpos[-1];
1782         else
1783           {
1784             pos = lastpos + nlastpos[-2];
1785             for (j = nlastpos[-1] - 1; j >= 0; --j)
1786               pos[j] = lastpos[j];
1787             lastpos += nlastpos[-2];
1788             nlastpos[-2] = nlastpos[-1];
1789           }
1790         --nlastpos;
1791
1792         /* A CAT node is nullable if both arguments are nullable. */
1793         nullable[-2] = nullable[-1] && nullable[-2];
1794         --nullable;
1795         break;
1796
1797       case OR:
1798       case ORTOP:
1799         /* The firstpos is the union of the firstpos of each argument. */
1800         nfirstpos[-2] += nfirstpos[-1];
1801         --nfirstpos;
1802
1803         /* The lastpos is the union of the lastpos of each argument. */
1804         nlastpos[-2] += nlastpos[-1];
1805         --nlastpos;
1806
1807         /* An OR node is nullable if either argument is nullable. */
1808         nullable[-2] = nullable[-1] || nullable[-2];
1809         --nullable;
1810         break;
1811
1812       default:
1813         /* Anything else is a nonempty position.  (Note that special
1814            constructs like \< are treated as nonempty strings here;
1815            an "epsilon closure" effectively makes them nullable later.
1816            Backreferences have to get a real position so we can detect
1817            transitions on them later.  But they are nullable. */
1818         *nullable++ = d->tokens[i] == BACKREF;
1819
1820         /* This position is in its own firstpos and lastpos. */
1821         *nfirstpos++ = *nlastpos++ = 1;
1822         --firstpos, --lastpos;
1823         firstpos->index = lastpos->index = i;
1824         firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1825
1826         /* Allocate the follow set for this position. */
1827         nalloc[i] = 1;
1828         MALLOC(d->follows[i].elems, position, nalloc[i]);
1829         break;
1830       }
1831 #ifdef DEBUG
1832     /* ... balance the above nonsyntactic #ifdef goo... */
1833       fprintf(stderr, "node %d:", i);
1834       prtok(d->tokens[i]);
1835       putc('\n', stderr);
1836       fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1837       fprintf(stderr, " firstpos:");
1838       for (j = nfirstpos[-1] - 1; j >= 0; --j)
1839         {
1840           fprintf(stderr, " %d:", firstpos[j].index);
1841           prtok(d->tokens[firstpos[j].index]);
1842         }
1843       fprintf(stderr, "\n lastpos:");
1844       for (j = nlastpos[-1] - 1; j >= 0; --j)
1845         {
1846           fprintf(stderr, " %d:", lastpos[j].index);
1847           prtok(d->tokens[lastpos[j].index]);
1848         }
1849       putc('\n', stderr);
1850     }
1851 #endif
1852
1853   /* For each follow set that is the follow set of a real position, replace
1854      it with its epsilon closure. */
1855   for (i = 0; i < d->tindex; ++i)
1856     if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1857 #ifdef MBS_SUPPORT
1858         || d->tokens[i] == ANYCHAR
1859         || d->tokens[i] == MBCSET
1860 #endif
1861         || d->tokens[i] >= CSET)
1862       {
1863 #ifdef DEBUG
1864         fprintf(stderr, "follows(%d:", i);
1865         prtok(d->tokens[i]);
1866         fprintf(stderr, "):");
1867         for (j = d->follows[i].nelem - 1; j >= 0; --j)
1868           {
1869             fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1870             prtok(d->tokens[d->follows[i].elems[j].index]);
1871           }
1872         putc('\n', stderr);
1873 #endif
1874         copy(&d->follows[i], &merged);
1875         epsclosure(&merged, d);
1876         if (d->follows[i].nelem < merged.nelem)
1877           REALLOC(d->follows[i].elems, position, merged.nelem);
1878         copy(&merged, &d->follows[i]);
1879       }
1880
1881   /* Get the epsilon closure of the firstpos of the regexp.  The result will
1882      be the set of positions of state 0. */
1883   merged.nelem = 0;
1884   for (i = 0; i < nfirstpos[-1]; ++i)
1885     insert(firstpos[i], &merged);
1886   epsclosure(&merged, d);
1887
1888   /* Check if any of the positions of state 0 will want newline context. */
1889   wants_newline = 0;
1890   for (i = 0; i < merged.nelem; ++i)
1891     if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1892       wants_newline = 1;
1893
1894   /* Build the initial state. */
1895   d->salloc = 1;
1896   d->sindex = 0;
1897   MALLOC(d->states, dfa_state, d->salloc);
1898   state_index(d, &merged, wants_newline, 0);
1899
1900   free(o_nullable);
1901   free(o_nfirst);
1902   free(o_firstpos);
1903   free(o_nlast);
1904   free(o_lastpos);
1905   free(nalloc);
1906   free(merged.elems);
1907 }
1908
1909 /* Find, for each character, the transition out of state s of d, and store
1910    it in the appropriate slot of trans.
1911
1912    We divide the positions of s into groups (positions can appear in more
1913    than one group).  Each group is labeled with a set of characters that
1914    every position in the group matches (taking into account, if necessary,
1915    preceding context information of s).  For each group, find the union
1916    of the its elements' follows.  This set is the set of positions of the
1917    new state.  For each character in the group's label, set the transition
1918    on this character to be to a state corresponding to the set's positions,
1919    and its associated backward context information, if necessary.
1920
1921    If we are building a searching matcher, we include the positions of state
1922    0 in every state.
1923
1924    The collection of groups is constructed by building an equivalence-class
1925    partition of the positions of s.
1926
1927    For each position, find the set of characters C that it matches.  Eliminate
1928    any characters from C that fail on grounds of backward context.
1929
1930    Search through the groups, looking for a group whose label L has nonempty
1931    intersection with C.  If L - C is nonempty, create a new group labeled
1932    L - C and having the same positions as the current group, and set L to
1933    the intersection of L and C.  Insert the position in this group, set
1934    C = C - L, and resume scanning.
1935
1936    If after comparing with every group there are characters remaining in C,
1937    create a new group labeled with the characters of C and insert this
1938    position in that group. */
1939 void
1940 dfastate (int s, struct dfa *d, int trans[])
1941 {
1942   position_set grps[NOTCHAR];   /* As many as will ever be needed. */
1943   charclass labels[NOTCHAR];    /* Labels corresponding to the groups. */
1944   int ngrps = 0;                /* Number of groups actually used. */
1945   position pos;                 /* Current position being considered. */
1946   charclass matches;            /* Set of matching characters. */
1947   int matchesf;                 /* True if matches is nonempty. */
1948   charclass intersect;          /* Intersection with some label set. */
1949   int intersectf;               /* True if intersect is nonempty. */
1950   charclass leftovers;          /* Stuff in the label that didn't match. */
1951   int leftoversf;               /* True if leftovers is nonempty. */
1952   static charclass letters;     /* Set of characters considered letters. */
1953   static charclass newline;     /* Set of characters that aren't newline. */
1954   position_set follows;         /* Union of the follows of some group. */
1955   position_set tmp;             /* Temporary space for merging sets. */
1956   int state;                    /* New state. */
1957   int wants_newline;            /* New state wants to know newline context. */
1958   int state_newline;            /* New state on a newline transition. */
1959   int wants_letter;             /* New state wants to know letter context. */
1960   int state_letter;             /* New state on a letter transition. */
1961   static int initialized;       /* Flag for static initialization. */
1962 #ifdef MBS_SUPPORT
1963   int next_isnt_1st_byte = 0;   /* Flag if we can't add state0.  */
1964 #endif
1965   int i, j, k;
1966
1967   /* Initialize the set of letters, if necessary. */
1968   if (! initialized)
1969     {
1970       initialized = 1;
1971       for (i = 0; i < NOTCHAR; ++i)
1972         if (IS_WORD_CONSTITUENT(i))
1973           setbit(i, letters);
1974       setbit(eolbyte, newline);
1975     }
1976
1977   zeroset(matches);
1978
1979   for (i = 0; i < d->states[s].elems.nelem; ++i)
1980     {
1981       pos = d->states[s].elems.elems[i];
1982       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1983         setbit(d->tokens[pos.index], matches);
1984       else if (d->tokens[pos.index] >= CSET)
1985         copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1986 #ifdef MBS_SUPPORT
1987       else if (d->tokens[pos.index] == ANYCHAR
1988                || d->tokens[pos.index] == MBCSET)
1989       /* MB_CUR_MAX > 1  */
1990         {
1991           /* ANYCHAR and MBCSET must match with a single character, so we
1992              must put it to d->states[s].mbps, which contains the positions
1993              which can match with a single character not a byte.  */
1994           if (d->states[s].mbps.nelem == 0)
1995             {
1996               MALLOC(d->states[s].mbps.elems, position,
1997                      d->states[s].elems.nelem);
1998             }
1999           insert(pos, &(d->states[s].mbps));
2000           continue;
2001         }
2002 #endif /* MBS_SUPPORT */
2003       else
2004         continue;
2005
2006       /* Some characters may need to be eliminated from matches because
2007          they fail in the current context. */
2008       if (pos.constraint != 0xFF)
2009         {
2010           if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2011                                          d->states[s].newline, 1))
2012             clrbit(eolbyte, matches);
2013           if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2014                                          d->states[s].newline, 0))
2015             for (j = 0; j < CHARCLASS_INTS; ++j)
2016               matches[j] &= newline[j];
2017           if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2018                                         d->states[s].letter, 1))
2019             for (j = 0; j < CHARCLASS_INTS; ++j)
2020               matches[j] &= ~letters[j];
2021           if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2022                                         d->states[s].letter, 0))
2023             for (j = 0; j < CHARCLASS_INTS; ++j)
2024               matches[j] &= letters[j];
2025
2026           /* If there are no characters left, there's no point in going on. */
2027           for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
2028             continue;
2029           if (j == CHARCLASS_INTS)
2030             continue;
2031         }
2032
2033       for (j = 0; j < ngrps; ++j)
2034         {
2035           /* If matches contains a single character only, and the current
2036              group's label doesn't contain that character, go on to the
2037              next group. */
2038           if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2039               && !tstbit(d->tokens[pos.index], labels[j]))
2040             continue;
2041
2042           /* Check if this group's label has a nonempty intersection with
2043              matches. */
2044           intersectf = 0;
2045           for (k = 0; k < CHARCLASS_INTS; ++k)
2046             (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2047           if (! intersectf)
2048             continue;
2049
2050           /* It does; now find the set differences both ways. */
2051           leftoversf = matchesf = 0;
2052           for (k = 0; k < CHARCLASS_INTS; ++k)
2053             {
2054               /* Even an optimizing compiler can't know this for sure. */
2055               int match = matches[k], label = labels[j][k];
2056
2057               (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2058               (matches[k] = match & ~label) ? (matchesf = 1) : 0;
2059             }
2060
2061           /* If there were leftovers, create a new group labeled with them. */
2062           if (leftoversf)
2063             {
2064               copyset(leftovers, labels[ngrps]);
2065               copyset(intersect, labels[j]);
2066               MALLOC(grps[ngrps].elems, position, d->nleaves);
2067               copy(&grps[j], &grps[ngrps]);
2068               ++ngrps;
2069             }
2070
2071           /* Put the position in the current group.  Note that there is no
2072              reason to call insert() here. */
2073           grps[j].elems[grps[j].nelem++] = pos;
2074
2075           /* If every character matching the current position has been
2076              accounted for, we're done. */
2077           if (! matchesf)
2078             break;
2079         }
2080
2081       /* If we've passed the last group, and there are still characters
2082          unaccounted for, then we'll have to create a new group. */
2083       if (j == ngrps)
2084         {
2085           copyset(matches, labels[ngrps]);
2086           zeroset(matches);
2087           MALLOC(grps[ngrps].elems, position, d->nleaves);
2088           grps[ngrps].nelem = 1;
2089           grps[ngrps].elems[0] = pos;
2090           ++ngrps;
2091         }
2092     }
2093
2094   MALLOC(follows.elems, position, d->nleaves);
2095   MALLOC(tmp.elems, position, d->nleaves);
2096
2097   /* If we are a searching matcher, the default transition is to a state
2098      containing the positions of state 0, otherwise the default transition
2099      is to fail miserably. */
2100   if (d->searchflag)
2101     {
2102       wants_newline = 0;
2103       wants_letter = 0;
2104       for (i = 0; i < d->states[0].elems.nelem; ++i)
2105         {
2106           if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
2107             wants_newline = 1;
2108           if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
2109             wants_letter = 1;
2110         }
2111       copy(&d->states[0].elems, &follows);
2112       state = state_index(d, &follows, 0, 0);
2113       if (wants_newline)
2114         state_newline = state_index(d, &follows, 1, 0);
2115       else
2116         state_newline = state;
2117       if (wants_letter)
2118         state_letter = state_index(d, &follows, 0, 1);
2119       else
2120         state_letter = state;
2121       for (i = 0; i < NOTCHAR; ++i)
2122         trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
2123       trans[eolbyte] = state_newline;
2124     }
2125   else
2126     for (i = 0; i < NOTCHAR; ++i)
2127       trans[i] = -1;
2128
2129   for (i = 0; i < ngrps; ++i)
2130     {
2131       follows.nelem = 0;
2132
2133       /* Find the union of the follows of the positions of the group.
2134          This is a hideously inefficient loop.  Fix it someday. */
2135       for (j = 0; j < grps[i].nelem; ++j)
2136         for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
2137           insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
2138
2139 #ifdef MBS_SUPPORT
2140       if (d->mb_cur_max > 1)
2141         {
2142           /* If a token in follows.elems is not 1st byte of a multibyte
2143              character, or the states of follows must accept the bytes
2144              which are not 1st byte of the multibyte character.
2145              Then, if a state of follows encounter a byte, it must not be
2146              a 1st byte of a multibyte character nor single byte character.
2147              We cansel to add state[0].follows to next state, because
2148              state[0] must accept 1st-byte
2149
2150              For example, we assume <sb a> is a certain single byte
2151              character, <mb A> is a certain multibyte character, and the
2152              codepoint of <sb a> equals the 2nd byte of the codepoint of
2153              <mb A>.
2154              When state[0] accepts <sb a>, state[i] transit to state[i+1]
2155              by accepting accepts 1st byte of <mb A>, and state[i+1]
2156              accepts 2nd byte of <mb A>, if state[i+1] encounter the
2157              codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2158              <mb A>, so we can not add state[0].  */
2159
2160           next_isnt_1st_byte = 0;
2161           for (j = 0; j < follows.nelem; ++j)
2162             {
2163               if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2164                 {
2165                   next_isnt_1st_byte = 1;
2166                   break;
2167                 }
2168             }
2169         }
2170 #endif
2171
2172       /* If we are building a searching matcher, throw in the positions
2173          of state 0 as well. */
2174 #ifdef MBS_SUPPORT
2175       if (d->searchflag && (d->mb_cur_max == 1 || !next_isnt_1st_byte))
2176 #else
2177       if (d->searchflag)
2178 #endif
2179         for (j = 0; j < d->states[0].elems.nelem; ++j)
2180           insert(d->states[0].elems.elems[j], &follows);
2181
2182       /* Find out if the new state will want any context information. */
2183       wants_newline = 0;
2184       if (tstbit(eolbyte, labels[i]))
2185         for (j = 0; j < follows.nelem; ++j)
2186           if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
2187             wants_newline = 1;
2188
2189       wants_letter = 0;
2190       for (j = 0; j < CHARCLASS_INTS; ++j)
2191         if (labels[i][j] & letters[j])
2192           break;
2193       if (j < CHARCLASS_INTS)
2194         for (j = 0; j < follows.nelem; ++j)
2195           if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
2196             wants_letter = 1;
2197
2198       /* Find the state(s) corresponding to the union of the follows. */
2199       state = state_index(d, &follows, 0, 0);
2200       if (wants_newline)
2201         state_newline = state_index(d, &follows, 1, 0);
2202       else
2203         state_newline = state;
2204       if (wants_letter)
2205         state_letter = state_index(d, &follows, 0, 1);
2206       else
2207         state_letter = state;
2208
2209       /* Set the transitions for each character in the current label. */
2210       for (j = 0; j < CHARCLASS_INTS; ++j)
2211         for (k = 0; k < INTBITS; ++k)
2212           if (labels[i][j] & 1 << k)
2213             {
2214               int c = j * INTBITS + k;
2215
2216               if (c == eolbyte)
2217                 trans[c] = state_newline;
2218               else if (IS_WORD_CONSTITUENT(c))
2219                 trans[c] = state_letter;
2220               else if (c < NOTCHAR)
2221                 trans[c] = state;
2222             }
2223     }
2224
2225   for (i = 0; i < ngrps; ++i)
2226     free(grps[i].elems);
2227   free(follows.elems);
2228   free(tmp.elems);
2229 }
2230
2231 /* Some routines for manipulating a compiled dfa's transition tables.
2232    Each state may or may not have a transition table; if it does, and it
2233    is a non-accepting state, then d->trans[state] points to its table.
2234    If it is an accepting state then d->fails[state] points to its table.
2235    If it has no table at all, then d->trans[state] is NULL.
2236    TODO: Improve this comment, get rid of the unnecessary redundancy. */
2237
2238 static void
2239 build_state (int s, struct dfa *d)
2240 {
2241   int *trans;                   /* The new transition table. */
2242   int i;
2243
2244   /* Set an upper limit on the number of transition tables that will ever
2245      exist at once.  1024 is arbitrary.  The idea is that the frequently
2246      used transition tables will be quickly rebuilt, whereas the ones that
2247      were only needed once or twice will be cleared away. */
2248   if (d->trcount >= 1024)
2249     {
2250       for (i = 0; i < d->tralloc; ++i)
2251         {
2252           free(d->trans[i]);
2253           free(d->fails[i]);
2254           d->trans[i] = d->fails[i] = NULL;
2255         }
2256       d->trcount = 0;
2257     }
2258
2259   ++d->trcount;
2260
2261   /* Set up the success bits for this state. */
2262   d->success[s] = 0;
2263   if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
2264       s, *d))
2265     d->success[s] |= 4;
2266   if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
2267       s, *d))
2268     d->success[s] |= 2;
2269   if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
2270       s, *d))
2271     d->success[s] |= 1;
2272
2273   MALLOC(trans, int, NOTCHAR);
2274   dfastate(s, d, trans);
2275
2276   /* Now go through the new transition table, and make sure that the trans
2277      and fail arrays are allocated large enough to hold a pointer for the
2278      largest state mentioned in the table. */
2279   for (i = 0; i < NOTCHAR; ++i)
2280     if (trans[i] >= d->tralloc)
2281       {
2282         int oldalloc = d->tralloc;
2283
2284         while (trans[i] >= d->tralloc)
2285           d->tralloc *= 2;
2286         REALLOC(d->realtrans, int *, d->tralloc + 1);
2287         d->trans = d->realtrans + 1;
2288         REALLOC(d->fails, int *, d->tralloc);
2289         REALLOC(d->success, int, d->tralloc);
2290         REALLOC(d->newlines, int, d->tralloc);
2291         while (oldalloc < d->tralloc)
2292           {
2293             d->trans[oldalloc] = NULL;
2294             d->fails[oldalloc++] = NULL;
2295           }
2296       }
2297
2298   /* Keep the newline transition in a special place so we can use it as
2299      a sentinel. */
2300   d->newlines[s] = trans[eolbyte];
2301   trans[eolbyte] = -1;
2302
2303   if (ACCEPTING(s, *d))
2304     d->fails[s] = trans;
2305   else
2306     d->trans[s] = trans;
2307 }
2308
2309 static void
2310 build_state_zero (struct dfa *d)
2311 {
2312   d->tralloc = 1;
2313   d->trcount = 0;
2314   CALLOC(d->realtrans, int *, d->tralloc + 1);
2315   d->trans = d->realtrans + 1;
2316   CALLOC(d->fails, int *, d->tralloc);
2317   MALLOC(d->success, int, d->tralloc);
2318   MALLOC(d->newlines, int, d->tralloc);
2319   build_state(0, d);
2320 }
2321
2322 #ifdef MBS_SUPPORT
2323 /* Multibyte character handling sub-routines for dfaexec.  */
2324
2325 /* Initial state may encounter the byte which is not a single byte character
2326    nor 1st byte of a multibyte character.  But it is incorrect for initial
2327    state to accept such a byte.
2328    For example, in sjis encoding the regular expression like "\\" accepts
2329    the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2330    0x815c. Then Initial state must skip the bytes which are not a single byte
2331    character nor 1st byte of a multibyte character.  */
2332 #define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p)          \
2333   if (s == 0)                                           \
2334     {                                                   \
2335       while (inputwcs[p - buf_begin] == 0               \
2336             && mblen_buf[p - buf_begin] > 0             \
2337             && (unsigned char const *) p < buf_end)     \
2338         ++p;                                            \
2339       if ((char *) p >= end)                            \
2340         {                                               \
2341           free(mblen_buf);                              \
2342           free(inputwcs);                               \
2343           *end = saved_end;                             \
2344           return NULL;                                  \
2345         }                                               \
2346     }
2347
2348 static void
2349 realloc_trans_if_necessary(struct dfa *d, int new_state)
2350 {
2351   /* Make sure that the trans and fail arrays are allocated large enough
2352      to hold a pointer for the new state. */
2353   if (new_state >= d->tralloc)
2354     {
2355       int oldalloc = d->tralloc;
2356
2357       while (new_state >= d->tralloc)
2358         d->tralloc *= 2;
2359       REALLOC(d->realtrans, int *, d->tralloc + 1);
2360       d->trans = d->realtrans + 1;
2361       REALLOC(d->fails, int *, d->tralloc);
2362       REALLOC(d->success, int, d->tralloc);
2363       REALLOC(d->newlines, int, d->tralloc);
2364       while (oldalloc < d->tralloc)
2365         {
2366           d->trans[oldalloc] = NULL;
2367           d->fails[oldalloc++] = NULL;
2368         }
2369     }
2370 }
2371
2372 /* Return values of transit_state_singlebyte(), and
2373    transit_state_consume_1char.  */
2374 typedef enum
2375 {
2376   TRANSIT_STATE_IN_PROGRESS,    /* State transition has not finished.  */
2377   TRANSIT_STATE_DONE,           /* State transition has finished.  */
2378   TRANSIT_STATE_END_BUFFER      /* Reach the end of the buffer.  */
2379 } status_transit_state;
2380
2381 /* Consume a single byte and transit state from 's' to '*next_state'.
2382    This function is almost same as the state transition routin in dfaexec().
2383    But state transition is done just once, otherwise matching succeed or
2384    reach the end of the buffer.  */
2385 static status_transit_state
2386 transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p,
2387                                   int *next_state)
2388 {
2389   int *t;
2390   int works = s;
2391
2392   status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2393
2394   while (rval == TRANSIT_STATE_IN_PROGRESS)
2395     {
2396       if ((t = d->trans[works]) != NULL)
2397         {
2398           works = t[*p];
2399           rval = TRANSIT_STATE_DONE;
2400           if (works < 0)
2401             works = 0;
2402         }
2403       else if (works < 0)
2404         {
2405           if (p == buf_end)
2406             {
2407               /* At the moment, it must not happen.  */
2408               abort ();
2409             }
2410           works = 0;
2411         }
2412       else if (d->fails[works])
2413         {
2414           works = d->fails[works][*p];
2415           rval = TRANSIT_STATE_DONE;
2416         }
2417       else
2418         {
2419           build_state(works, d);
2420         }
2421     }
2422   *next_state = works;
2423   return rval;
2424 }
2425
2426 /* Check whether period can match or not in the current context.  If it can,
2427    return the amount of the bytes with which period can match, otherwise
2428    return 0.
2429    `pos' is the position of the period.  `idx' is the index from the
2430    buf_begin, and it is the current position in the buffer.  */
2431 static int
2432 match_anychar (struct dfa *d, int s, position pos, int idx)
2433 {
2434   int newline = 0;
2435   int letter = 0;
2436   wchar_t wc;
2437   int mbclen;
2438
2439   wc = inputwcs[idx];
2440   mbclen = (mblen_buf[idx] == 0)? 1 : mblen_buf[idx];
2441
2442   /* Check context.  */
2443   if (wc == (wchar_t)eolbyte)
2444     {
2445       if (!(syntax_bits & RE_DOT_NEWLINE))
2446         return 0;
2447       newline = 1;
2448     }
2449   else if (wc == (wchar_t)'\0')
2450     {
2451       if (syntax_bits & RE_DOT_NOT_NULL)
2452         return 0;
2453       newline = 1;
2454     }
2455
2456   if (iswalnum(wc) || wc == L'_')
2457     letter = 1;
2458
2459   if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2460                            newline, d->states[s].letter, letter))
2461     return 0;
2462
2463   return mbclen;
2464 }
2465
2466 /* Check whether bracket expression can match or not in the current context.
2467    If it can, return the amount of the bytes with which expression can match,
2468    otherwise return 0.
2469    `pos' is the position of the bracket expression.  `idx' is the index
2470    from the buf_begin, and it is the current position in the buffer.  */
2471 static int
2472 match_mb_charset (struct dfa *d, int s, position pos, int idx)
2473 {
2474   int i;
2475   int match;            /* Flag which represent that matching succeed.  */
2476   int match_len;        /* Length of the character (or collating element)
2477                            with which this operator match.  */
2478   int op_len;           /* Length of the operator.  */
2479   char buffer[128];
2480   wchar_t wcbuf[6];
2481
2482   /* Pointer to the structure to which we are currently refering.  */
2483   struct mb_char_classes *work_mbc;
2484
2485   int newline = 0;
2486   int letter = 0;
2487   wchar_t wc;           /* Current refering character.  */
2488
2489   wc = inputwcs[idx];
2490
2491   /* Check context.  */
2492   if (wc == (wchar_t)eolbyte)
2493     {
2494       if (!(syntax_bits & RE_DOT_NEWLINE))
2495         return 0;
2496       newline = 1;
2497     }
2498   else if (wc == (wchar_t)'\0')
2499     {
2500       if (syntax_bits & RE_DOT_NOT_NULL)
2501         return 0;
2502       newline = 1;
2503     }
2504   if (iswalnum(wc) || wc == L'_')
2505     letter = 1;
2506   if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2507                            newline, d->states[s].letter, letter))
2508     return 0;
2509
2510   /* Assign the current refering operator to work_mbc.  */
2511   work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2512   match = !work_mbc->invert;
2513   match_len = (mblen_buf[idx] == 0)? 1 : mblen_buf[idx];
2514
2515   /* Match in range 0-255?  */
2516   if (wc < NOTCHAR && work_mbc->cset != -1
2517       && tstbit((unsigned char)wc, d->charclasses[work_mbc->cset]))
2518     goto charset_matched;
2519
2520   /* match with a character class?  */
2521   for (i = 0; i<work_mbc->nch_classes; i++)
2522     {
2523       if (iswctype((wint_t)wc, work_mbc->ch_classes[i]))
2524         goto charset_matched;
2525     }
2526
2527   strncpy(buffer, (char const *) buf_begin + idx, match_len);
2528   buffer[match_len] = '\0';
2529
2530   /* match with an equivalent class?  */
2531   for (i = 0; i<work_mbc->nequivs; i++)
2532     {
2533       op_len = strlen(work_mbc->equivs[i]);
2534       strncpy(buffer, (char const *) buf_begin + idx, op_len);
2535       buffer[op_len] = '\0';
2536       if (strcoll(work_mbc->equivs[i], buffer) == 0)
2537         {
2538           match_len = op_len;
2539           goto charset_matched;
2540         }
2541     }
2542
2543   /* match with a collating element?  */
2544   for (i = 0; i<work_mbc->ncoll_elems; i++)
2545     {
2546       op_len = strlen(work_mbc->coll_elems[i]);
2547       strncpy(buffer, (char const *) buf_begin + idx, op_len);
2548       buffer[op_len] = '\0';
2549
2550       if (strcoll(work_mbc->coll_elems[i], buffer) == 0)
2551         {
2552           match_len = op_len;
2553           goto charset_matched;
2554         }
2555     }
2556
2557   wcbuf[0] = wc;
2558   wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
2559
2560   /* match with a range?  */
2561   for (i = 0; i<work_mbc->nranges; i++)
2562     {
2563       wcbuf[2] = work_mbc->range_sts[i];
2564       wcbuf[4] = work_mbc->range_ends[i];
2565
2566       if (wcscoll(wcbuf, wcbuf+2) >= 0 &&
2567           wcscoll(wcbuf+4, wcbuf) >= 0)
2568         goto charset_matched;
2569     }
2570
2571   /* match with a character?  */
2572   for (i = 0; i<work_mbc->nchars; i++)
2573     {
2574       if (wc == work_mbc->chars[i])
2575         goto charset_matched;
2576     }
2577
2578   match = !match;
2579
2580  charset_matched:
2581   return match ? match_len : 0;
2582 }
2583
2584 /* Check each of `d->states[s].mbps.elem' can match or not. Then return the
2585    array which corresponds to `d->states[s].mbps.elem' and each element of
2586    the array contains the amount of the bytes with which the element can
2587    match.
2588    `idx' is the index from the buf_begin, and it is the current position
2589    in the buffer.
2590    Caller MUST free the array which this function return.  */
2591 static int*
2592 check_matching_with_multibyte_ops (struct dfa *d, int s, int idx)
2593 {
2594   int i;
2595   int* rarray;
2596
2597   MALLOC(rarray, int, d->states[s].mbps.nelem);
2598   for (i = 0; i < d->states[s].mbps.nelem; ++i)
2599     {
2600       position pos = d->states[s].mbps.elems[i];
2601       switch(d->tokens[pos.index])
2602         {
2603         case ANYCHAR:
2604           rarray[i] = match_anychar(d, s, pos, idx);
2605           break;
2606         case MBCSET:
2607           rarray[i] = match_mb_charset(d, s, pos, idx);
2608           break;
2609         default:
2610           break; /* can not happen.  */
2611         }
2612     }
2613   return rarray;
2614 }
2615
2616 /* Consume a single character and enumerate all of the positions which can
2617    be next position from the state `s'.
2618    `match_lens' is the input. It can be NULL, but it can also be the output
2619    of check_matching_with_multibyte_ops() for optimization.
2620    `mbclen' and `pps' are the output.  `mbclen' is the length of the
2621    character consumed, and `pps' is the set this function enumerate.  */
2622 static status_transit_state
2623 transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp,
2624                              int *match_lens, int *mbclen, position_set *pps)
2625 {
2626   int i, j;
2627   int s1, s2;
2628   int* work_mbls;
2629   status_transit_state rs = TRANSIT_STATE_DONE;
2630
2631   /* Calculate the length of the (single/multi byte) character
2632      to which p points.  */
2633   *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1
2634     : mblen_buf[*pp - buf_begin];
2635
2636   /* Calculate the state which can be reached from the state `s' by
2637      consuming `*mbclen' single bytes from the buffer.  */
2638   s1 = s;
2639   for (i = 0; i < *mbclen; i++)
2640     {
2641       s2 = s1;
2642       rs = transit_state_singlebyte(d, s2, (*pp)++, &s1);
2643     }
2644   /* Copy the positions contained by `s1' to the set `pps'.  */
2645   copy(&(d->states[s1].elems), pps);
2646
2647   /* Check (inputed)match_lens, and initialize if it is NULL.  */
2648   if (match_lens == NULL && d->states[s].mbps.nelem != 0)
2649     work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2650   else
2651     work_mbls = match_lens;
2652
2653   /* Add all of the positions which can be reached from `s' by consuming
2654      a single character.  */
2655   for (i = 0; i < d->states[s].mbps.nelem ; i++)
2656    {
2657       if (work_mbls[i] == *mbclen)
2658         for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
2659              j++)
2660           insert(d->follows[d->states[s].mbps.elems[i].index].elems[j],
2661                  pps);
2662     }
2663
2664   if (match_lens == NULL && work_mbls != NULL)
2665     free(work_mbls);
2666   return rs;
2667 }
2668
2669 /* Transit state from s, then return new state and update the pointer of the
2670    buffer.  This function is for some operator which can match with a multi-
2671    byte character or a collating element (which may be multi characters).  */
2672 static int
2673 transit_state (struct dfa *d, int s, unsigned char const **pp)
2674 {
2675   int s1;
2676   int mbclen;           /* The length of current input multibyte character. */
2677   int maxlen = 0;
2678   int i, j;
2679   int *match_lens = NULL;
2680   int nelem = d->states[s].mbps.nelem; /* Just a alias.  */
2681   position_set follows;
2682   unsigned char const *p1 = *pp;
2683   wchar_t wc;
2684
2685   if (nelem > 0)
2686     /* This state has (a) multibyte operator(s).
2687        We check whether each of them can match or not.  */
2688     {
2689       /* Note: caller must free the return value of this function.  */
2690       match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2691
2692       for (i = 0; i < nelem; i++)
2693         /* Search the operator which match the longest string,
2694            in this state.  */
2695         {
2696           if (match_lens[i] > maxlen)
2697             maxlen = match_lens[i];
2698         }
2699     }
2700
2701   if (nelem == 0 || maxlen == 0)
2702     /* This state has no multibyte operator which can match.
2703        We need to check only one single byte character.  */
2704     {
2705       status_transit_state rs;
2706       rs = transit_state_singlebyte(d, s, *pp, &s1);
2707
2708       /* We must update the pointer if state transition succeeded.  */
2709       if (rs == TRANSIT_STATE_DONE)
2710         ++*pp;
2711
2712       free(match_lens);
2713       return s1;
2714     }
2715
2716   /* This state has some operators which can match a multibyte character.  */
2717   follows.nelem = 0;
2718   MALLOC(follows.elems, position, d->nleaves);
2719
2720   /* `maxlen' may be longer than the length of a character, because it may
2721      not be a character but a (multi character) collating element.
2722      We enumerate all of the positions which `s' can reach by consuming
2723      `maxlen' bytes.  */
2724   transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
2725
2726   wc = inputwcs[*pp - mbclen - buf_begin];
2727   s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2728   realloc_trans_if_necessary(d, s1);
2729
2730   while (*pp - p1 < maxlen)
2731     {
2732       follows.nelem = 0;
2733       transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
2734
2735       for (i = 0; i < nelem ; i++)
2736         {
2737           if (match_lens[i] == *pp - p1)
2738             for (j = 0;
2739                  j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
2740               insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j],
2741                      &follows);
2742         }
2743
2744       wc = inputwcs[*pp - mbclen - buf_begin];
2745       s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2746       realloc_trans_if_necessary(d, s1);
2747     }
2748   free(match_lens);
2749   free(follows.elems);
2750   return s1;
2751 }
2752
2753 #endif /* MBS_SUPPORT */
2754
2755 /* Search through a buffer looking for a match to the given struct dfa.
2756    Find the first occurrence of a string matching the regexp in the
2757    buffer, and the shortest possible version thereof.  Return a pointer to
2758    the first character after the match, or NULL if none is found.  BEGIN
2759    points to the beginning of the buffer, and END points to the first byte
2760    after its end.  Note however that we store a sentinel byte (usually
2761    newline) in *END, so the actual buffer must be one byte longer.
2762    When NEWLINE is nonzero, newlines may appear in the matching string.
2763    If COUNT is non-NULL, increment *COUNT once for each newline processed.
2764    Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we
2765    encountered a back-reference (1) or not (0).  The caller may use this
2766    to decide whether to fall back on a backtracking matcher. */
2767 char *
2768 dfaexec (struct dfa *d, char const *begin, char *end,
2769          int newline, int *count, int *backref)
2770 {
2771   int s, s1, tmp;       /* Current state. */
2772   unsigned char const *p; /* Current input character. */
2773   int **trans, *t;      /* Copy of d->trans so it can be optimized
2774                                    into a register. */
2775   unsigned char eol = eolbyte;  /* Likewise for eolbyte.  */
2776   static int sbit[NOTCHAR];     /* Table for anding with d->success. */
2777   static int sbit_init;
2778
2779   if (! sbit_init)
2780     {
2781       int i;
2782
2783       sbit_init = 1;
2784       for (i = 0; i < NOTCHAR; ++i)
2785         sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
2786       sbit[eol] = 4;
2787     }
2788
2789   if (! d->tralloc)
2790     build_state_zero(d);
2791
2792   s = s1 = 0;
2793   p = (unsigned char const *) begin;
2794   trans = d->trans;
2795   unsigned char saved_end = *(unsigned char *) end;
2796   *end = eol;
2797
2798 #ifdef MBS_SUPPORT
2799   if (d->mb_cur_max > 1)
2800     {
2801       int remain_bytes, i;
2802       buf_begin = (unsigned char *) begin;
2803       buf_end = (unsigned char *) end;
2804
2805       /* initialize mblen_buf, and inputwcs.  */
2806       MALLOC(mblen_buf, unsigned char, end - begin + 2);
2807       MALLOC(inputwcs, wchar_t, end - begin + 2);
2808       memset(&mbs, 0, sizeof(mbstate_t));
2809       remain_bytes = 0;
2810       for (i = 0; i < end - begin + 1; i++)
2811         {
2812           if (remain_bytes == 0)
2813             {
2814               remain_bytes
2815                 = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
2816               if (remain_bytes < 1
2817                 || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
2818                 {
2819                   remain_bytes = 0;
2820                   inputwcs[i] = (wchar_t)begin[i];
2821                   mblen_buf[i] = 0;
2822                 }
2823               else
2824                 {
2825                   mblen_buf[i] = remain_bytes;
2826                   remain_bytes--;
2827                 }
2828             }
2829           else
2830             {
2831               mblen_buf[i] = remain_bytes;
2832               inputwcs[i] = 0;
2833               remain_bytes--;
2834             }
2835         }
2836       mblen_buf[i] = 0;
2837       inputwcs[i] = 0; /* sentinel */
2838     }
2839 #endif /* MBS_SUPPORT */
2840
2841   for (;;)
2842     {
2843 #ifdef MBS_SUPPORT
2844       if (d->mb_cur_max > 1)
2845         while ((t = trans[s]))
2846           {
2847             if ((char *) p > end)
2848               break;
2849             s1 = s;
2850             SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2851
2852             if (d->states[s].mbps.nelem == 0)
2853               {
2854                 s = t[*p++];
2855                 continue;
2856               }
2857
2858             /* Can match with a multibyte character (and multi character
2859                collating element).  Transition table might be updated.  */
2860             s = transit_state(d, s, &p);
2861             trans = d->trans;
2862           }
2863       else
2864 #endif /* MBS_SUPPORT */
2865       while ((t = trans[s]) != 0) { /* hand-optimized loop */
2866         s1 = t[*p++];
2867         if ((t = trans[s1]) == 0) {
2868           tmp = s ; s = s1 ; s1 = tmp ; /* swap */
2869           break;
2870         }
2871         s = t[*p++];
2872       }
2873
2874       if (s >= 0 && (char *) p <= end && d->fails[s])
2875         {
2876           if (d->success[s] & sbit[*p])
2877             {
2878               if (backref)
2879                 *backref = (d->states[s].backref != 0);
2880 #ifdef MBS_SUPPORT
2881               if (d->mb_cur_max > 1)
2882                 {
2883                   free(mblen_buf);
2884                   free(inputwcs);
2885                 }
2886 #endif /* MBS_SUPPORT */
2887               *end = saved_end;
2888               return (char *) p;
2889             }
2890
2891           s1 = s;
2892 #ifdef MBS_SUPPORT
2893           if (d->mb_cur_max > 1)
2894             {
2895               /* Can match with a multibyte character (and multicharacter
2896                  collating element).  Transition table might be updated.  */
2897               s = transit_state(d, s, &p);
2898               trans = d->trans;
2899             }
2900           else
2901 #endif /* MBS_SUPPORT */
2902           s = d->fails[s][*p++];
2903           continue;
2904         }
2905
2906       /* If the previous character was a newline, count it. */
2907       if (count && (char *) p <= end && p[-1] == eol)
2908         ++*count;
2909
2910       /* Check if we've run off the end of the buffer. */
2911       if ((char *) p > end)
2912         {
2913 #ifdef MBS_SUPPORT
2914           if (d->mb_cur_max > 1)
2915             {
2916               free(mblen_buf);
2917               free(inputwcs);
2918             }
2919 #endif /* MBS_SUPPORT */
2920           *end = saved_end;
2921           return NULL;
2922         }
2923
2924       if (s >= 0)
2925         {
2926           build_state(s, d);
2927           trans = d->trans;
2928           continue;
2929         }
2930
2931       if (p[-1] == eol && newline)
2932         {
2933           s = d->newlines[s1];
2934           continue;
2935         }
2936
2937       s = 0;
2938     }
2939 }
2940
2941 #ifdef MBS_SUPPORT
2942 static void
2943 free_mbdata (struct dfa *d)
2944 {
2945   unsigned int i;
2946
2947   free(d->multibyte_prop);
2948   d->multibyte_prop = NULL;
2949
2950   for (i = 0; i < d->nmbcsets; ++i)
2951     {
2952       unsigned int j;
2953       struct mb_char_classes *p = &(d->mbcsets[i]);
2954       free(p->chars);
2955       free(p->ch_classes);
2956       free(p->range_sts);
2957       free(p->range_ends);
2958
2959       for (j = 0; j < p->nequivs; ++j)
2960         free(p->equivs[j]);
2961       free(p->equivs);
2962
2963       for (j = 0; j < p->ncoll_elems; ++j)
2964         free(p->coll_elems[j]);
2965       free(p->coll_elems);
2966     }
2967
2968   free(d->mbcsets);
2969   d->mbcsets = NULL;
2970   d->nmbcsets = 0;
2971 }
2972 #endif
2973
2974 /* Initialize the components of a dfa that the other routines don't
2975    initialize for themselves. */
2976 void
2977 dfainit (struct dfa *d)
2978 {
2979   d->calloc = 1;
2980   MALLOC(d->charclasses, charclass, d->calloc);
2981   d->cindex = 0;
2982
2983   d->talloc = 1;
2984   MALLOC(d->tokens, token, d->talloc);
2985   d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2986
2987 #ifdef MBS_SUPPORT
2988   d->mb_cur_max = MB_CUR_MAX;
2989   if (d->mb_cur_max > 1)
2990     {
2991       d->nmultibyte_prop = 1;
2992       MALLOC(d->multibyte_prop, int, d->nmultibyte_prop);
2993       d->nmbcsets = 0;
2994       d->mbcsets_alloc = 1;
2995       MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc);
2996     }
2997 #endif
2998
2999   d->searchflag = 0;
3000   d->tralloc = 0;
3001
3002   d->musts = 0;
3003   d->realtrans = 0;
3004   d->fails = 0;
3005   d->newlines = 0;
3006   d->success = 0;
3007 #ifdef GAWK
3008   d->broken = 0;
3009 #endif
3010 }
3011
3012 #ifdef MBS_SUPPORT
3013 static void
3014 dfaoptimize (struct dfa *d)
3015 {
3016   unsigned int i;
3017   if (!using_utf8())
3018     return;
3019
3020   for (i = 0; i < d->tindex; ++i)
3021     {
3022       switch(d->tokens[i])
3023         {
3024         case ANYCHAR:
3025         case MBCSET:
3026           /* Requires multi-byte algorithm.  */
3027           return;
3028         default:
3029           break;
3030         }
3031     }
3032
3033   free_mbdata (d);
3034   d->mb_cur_max = 1;
3035 }
3036 #endif
3037
3038 /* Parse and analyze a single string of the given length. */
3039 void
3040 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
3041 {
3042   dfainit(d);
3043   dfaparse(s, len, d);
3044   dfamust(d);
3045 #ifdef MBS_SUPPORT
3046   dfaoptimize(d);
3047 #endif
3048   dfaanalyze(d, searchflag);
3049 }
3050
3051 /* Free the storage held by the components of a dfa. */
3052 void
3053 dfafree (struct dfa *d)
3054 {
3055   int i;
3056   struct dfamust *dm, *ndm;
3057
3058   free(d->charclasses);
3059   free(d->tokens);
3060
3061 #ifdef MBS_SUPPORT
3062   if (d->mb_cur_max > 1)
3063     free_mbdata(d);
3064 #endif /* MBS_SUPPORT */
3065
3066   for (i = 0; i < d->sindex; ++i) {
3067     free(d->states[i].elems.elems);
3068 #ifdef MBS_SUPPORT
3069     free(d->states[i].mbps.elems);
3070 #endif /* MBS_SUPPORT */
3071   }
3072   free(d->states);
3073   for (i = 0; i < d->tindex; ++i)
3074     free(d->follows[i].elems);
3075   free(d->follows);
3076   for (i = 0; i < d->tralloc; ++i)
3077     {
3078       free(d->trans[i]);
3079       free(d->fails[i]);
3080     }
3081   free(d->realtrans);
3082   free(d->fails);
3083   free(d->newlines);
3084   free(d->success);
3085   for (dm = d->musts; dm; dm = ndm)
3086     {
3087       ndm = dm->next;
3088       free(dm->must);
3089       free(dm);
3090     }
3091 }
3092
3093 /* Having found the postfix representation of the regular expression,
3094    try to find a long sequence of characters that must appear in any line
3095    containing the r.e.
3096    Finding a "longest" sequence is beyond the scope here;
3097    we take an easy way out and hope for the best.
3098    (Take "(ab|a)b"--please.)
3099
3100    We do a bottom-up calculation of sequences of characters that must appear
3101    in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3102    representation:
3103         sequences that must appear at the left of the match ("left")
3104         sequences that must appear at the right of the match ("right")
3105         lists of sequences that must appear somewhere in the match ("in")
3106         sequences that must constitute the match ("is")
3107
3108    When we get to the root of the tree, we use one of the longest of its
3109    calculated "in" sequences as our answer.  The sequence we find is returned in
3110    d->must (where "d" is the single argument passed to "dfamust");
3111    the length of the sequence is returned in d->mustn.
3112
3113    The sequences calculated for the various types of node (in pseudo ANSI c)
3114    are shown below.  "p" is the operand of unary operators (and the left-hand
3115    operand of binary operators); "q" is the right-hand operand of binary
3116    operators.
3117
3118    "ZERO" means "a zero-length sequence" below.
3119
3120         Type    left            right           is              in
3121         ----    ----            -----           --              --
3122         char c  # c             # c             # c             # c
3123
3124         ANYCHAR ZERO            ZERO            ZERO            ZERO
3125
3126         MBCSET  ZERO            ZERO            ZERO            ZERO
3127
3128         CSET    ZERO            ZERO            ZERO            ZERO
3129
3130         STAR    ZERO            ZERO            ZERO            ZERO
3131
3132         QMARK   ZERO            ZERO            ZERO            ZERO
3133
3134         PLUS    p->left         p->right        ZERO            p->in
3135
3136         CAT     (p->is==ZERO)?  (q->is==ZERO)?  (p->is!=ZERO && p->in plus
3137                 p->left :       q->right :      q->is!=ZERO) ?  q->in plus
3138                 p->is##q->left  p->right##q->is p->is##q->is :  p->right##q->left
3139                                                 ZERO
3140
3141         OR      longest common  longest common  (do p->is and   substrings common to
3142                 leading         trailing        q->is have same p->in and q->in
3143                 (sub)sequence   (sub)sequence   length and
3144                 of p->left      of p->right     content) ?
3145                 and q->left     and q->right    p->is : NULL
3146
3147    If there's anything else we recognize in the tree, all four sequences get set
3148    to zero-length sequences.  If there's something we don't recognize in the tree,
3149    we just return a zero-length sequence.
3150
3151    Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3152    'aaa')?
3153
3154    And. . .is it here or someplace that we might ponder "optimizations" such as
3155         egrep 'psi|epsilon'     ->      egrep 'psi'
3156         egrep 'pepsi|epsilon'   ->      egrep 'epsi'
3157                                         (Yes, we now find "epsi" as a "string
3158                                         that must occur", but we might also
3159                                         simplify the *entire* r.e. being sought)
3160         grep '[c]'              ->      grep 'c'
3161         grep '(ab|a)b'          ->      grep 'ab'
3162         grep 'ab*'              ->      grep 'a'
3163         grep 'a*b'              ->      grep 'b'
3164
3165    There are several issues:
3166
3167    Is optimization easy (enough)?
3168
3169    Does optimization actually accomplish anything,
3170    or is the automaton you get from "psi|epsilon" (for example)
3171    the same as the one you get from "psi" (for example)?
3172
3173    Are optimizable r.e.'s likely to be used in real-life situations
3174    (something like 'ab*' is probably unlikely; something like is
3175    'psi|epsilon' is likelier)? */
3176
3177 static char *
3178 icatalloc (char *old, char *new)
3179 {
3180   char *result;
3181   size_t oldsize, newsize;
3182
3183   newsize = (new == NULL) ? 0 : strlen(new);
3184   if (old == NULL)
3185     oldsize = 0;
3186   else if (newsize == 0)
3187     return old;
3188   else  oldsize = strlen(old);
3189   if (old == NULL)
3190     result = (char *) malloc(newsize + 1);
3191   else
3192     result = (char *) realloc((void *) old, oldsize + newsize + 1);
3193   if (result != NULL && new != NULL)
3194     (void) strcpy(result + oldsize, new);
3195   return result;
3196 }
3197
3198 static char *
3199 icpyalloc (char *string)
3200 {
3201   return icatalloc((char *) NULL, string);
3202 }
3203
3204 static char *
3205 istrstr (char *lookin, char *lookfor)
3206 {
3207   char *cp;
3208   size_t len;
3209
3210   len = strlen(lookfor);
3211   for (cp = lookin; *cp != '\0'; ++cp)
3212     if (strncmp(cp, lookfor, len) == 0)
3213       return cp;
3214   return NULL;
3215 }
3216
3217 static void
3218 freelist (char **cpp)
3219 {
3220   int i;
3221
3222   if (cpp == NULL)
3223     return;
3224   for (i = 0; cpp[i] != NULL; ++i)
3225     {
3226       free(cpp[i]);
3227       cpp[i] = NULL;
3228     }
3229 }
3230
3231 static char **
3232 enlist (char **cpp, char *new, size_t len)
3233 {
3234   int i, j;
3235
3236   if (cpp == NULL)
3237     return NULL;
3238   if ((new = icpyalloc(new)) == NULL)
3239     {
3240       freelist(cpp);
3241       return NULL;
3242     }
3243   new[len] = '\0';
3244   /* Is there already something in the list that's new (or longer)? */
3245   for (i = 0; cpp[i] != NULL; ++i)
3246     if (istrstr(cpp[i], new) != NULL)
3247       {
3248         free(new);
3249         return cpp;
3250       }
3251   /* Eliminate any obsoleted strings. */
3252   j = 0;
3253   while (cpp[j] != NULL)
3254     if (istrstr(new, cpp[j]) == NULL)
3255       ++j;
3256     else
3257       {
3258         free(cpp[j]);
3259         if (--i == j)
3260           break;
3261         cpp[j] = cpp[i];
3262         cpp[i] = NULL;
3263       }
3264   /* Add the new string. */
3265   cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
3266   if (cpp == NULL)
3267     return NULL;
3268   cpp[i] = new;
3269   cpp[i + 1] = NULL;
3270   return cpp;
3271 }
3272
3273 /* Given pointers to two strings, return a pointer to an allocated
3274    list of their distinct common substrings. Return NULL if something
3275    seems wild. */
3276 static char **
3277 comsubs (char *left, char *right)
3278 {
3279   char **cpp;
3280   char *lcp;
3281   char *rcp;
3282   size_t i, len;
3283
3284   if (left == NULL || right == NULL)
3285     return NULL;
3286   cpp = (char **) malloc(sizeof *cpp);
3287   if (cpp == NULL)
3288     return NULL;
3289   cpp[0] = NULL;
3290   for (lcp = left; *lcp != '\0'; ++lcp)
3291     {
3292       len = 0;
3293       rcp = strchr (right, *lcp);
3294       while (rcp != NULL)
3295         {
3296           for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3297             continue;
3298           if (i > len)
3299             len = i;
3300           rcp = strchr (rcp + 1, *lcp);
3301         }
3302       if (len == 0)
3303         continue;
3304       if ((cpp = enlist(cpp, lcp, len)) == NULL)
3305         break;
3306     }
3307   return cpp;
3308 }
3309
3310 static char **
3311 addlists (char **old, char **new)
3312 {
3313   int i;
3314
3315   if (old == NULL || new == NULL)
3316     return NULL;
3317   for (i = 0; new[i] != NULL; ++i)
3318     {
3319       old = enlist(old, new[i], strlen(new[i]));
3320       if (old == NULL)
3321         break;
3322     }
3323   return old;
3324 }
3325
3326 /* Given two lists of substrings, return a new list giving substrings
3327    common to both. */
3328 static char **
3329 inboth (char **left, char **right)
3330 {
3331   char **both;
3332   char **temp;
3333   int lnum, rnum;
3334
3335   if (left == NULL || right == NULL)
3336     return NULL;
3337   both = (char **) malloc(sizeof *both);
3338   if (both == NULL)
3339     return NULL;
3340   both[0] = NULL;
3341   for (lnum = 0; left[lnum] != NULL; ++lnum)
3342     {
3343       for (rnum = 0; right[rnum] != NULL; ++rnum)
3344         {
3345           temp = comsubs(left[lnum], right[rnum]);
3346           if (temp == NULL)
3347             {
3348               freelist(both);
3349               return NULL;
3350             }
3351           both = addlists(both, temp);
3352           freelist(temp);
3353           free(temp);
3354           if (both == NULL)
3355             return NULL;
3356         }
3357     }
3358   return both;
3359 }
3360
3361 typedef struct
3362 {
3363   char **in;
3364   char *left;
3365   char *right;
3366   char *is;
3367 } must;
3368
3369 static void
3370 resetmust (must *mp)
3371 {
3372   mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3373   freelist(mp->in);
3374 }
3375
3376 static void
3377 dfamust (struct dfa *d)
3378 {
3379   must *musts;
3380   must *mp;
3381   char *result;
3382   int ri;
3383   int i;
3384   int exact;
3385   token t;
3386   static must must0;
3387   struct dfamust *dm;
3388   static char empty_string[] = "";
3389
3390   result = empty_string;
3391   exact = 0;
3392   musts = (must *) malloc((d->tindex + 1) * sizeof *musts);
3393   if (musts == NULL)
3394     return;
3395   mp = musts;
3396   for (i = 0; i <= d->tindex; ++i)
3397     mp[i] = must0;
3398   for (i = 0; i <= d->tindex; ++i)
3399     {
3400       mp[i].in = (char **) malloc(sizeof *mp[i].in);
3401       mp[i].left = malloc(2);
3402       mp[i].right = malloc(2);
3403       mp[i].is = malloc(2);
3404       if (mp[i].in == NULL || mp[i].left == NULL ||
3405           mp[i].right == NULL || mp[i].is == NULL)
3406         goto done;
3407       mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
3408       mp[i].in[0] = NULL;
3409     }
3410 #ifdef DEBUG
3411   fprintf(stderr, "dfamust:\n");
3412   for (i = 0; i < d->tindex; ++i)
3413     {
3414       fprintf(stderr, " %d:", i);
3415       prtok(d->tokens[i]);
3416     }
3417   putc('\n', stderr);
3418 #endif
3419   for (ri = 0; ri < d->tindex; ++ri)
3420     {
3421       switch (t = d->tokens[ri])
3422         {
3423         case LPAREN:
3424         case RPAREN:
3425           goto done;            /* "cannot happen" */
3426         case EMPTY:
3427         case BEGLINE:
3428         case ENDLINE:
3429         case BEGWORD:
3430         case ENDWORD:
3431         case LIMWORD:
3432         case NOTLIMWORD:
3433         case BACKREF:
3434           resetmust(mp);
3435           break;
3436         case STAR:
3437         case QMARK:
3438           if (mp <= musts)
3439             goto done;          /* "cannot happen" */
3440           --mp;
3441           resetmust(mp);
3442           break;
3443         case OR:
3444         case ORTOP:
3445           if (mp < &musts[2])
3446             goto done;          /* "cannot happen" */
3447           {
3448             char **new;
3449             must *lmp;
3450             must *rmp;
3451             int j, ln, rn, n;
3452
3453             rmp = --mp;
3454             lmp = --mp;
3455             /* Guaranteed to be.  Unlikely, but. . . */
3456             if (strcmp(lmp->is, rmp->is) != 0)
3457               lmp->is[0] = '\0';
3458             /* Left side--easy */
3459             i = 0;
3460             while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3461               ++i;
3462             lmp->left[i] = '\0';
3463             /* Right side */
3464             ln = strlen(lmp->right);
3465             rn = strlen(rmp->right);
3466             n = ln;
3467             if (n > rn)
3468               n = rn;
3469             for (i = 0; i < n; ++i)
3470               if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3471                 break;
3472             for (j = 0; j < i; ++j)
3473               lmp->right[j] = lmp->right[(ln - i) + j];
3474             lmp->right[j] = '\0';
3475             new = inboth(lmp->in, rmp->in);
3476             if (new == NULL)
3477               goto done;
3478             freelist(lmp->in);
3479             free(lmp->in);
3480             lmp->in = new;
3481           }
3482           break;
3483         case PLUS:
3484           if (mp <= musts)
3485             goto done;          /* "cannot happen" */
3486           --mp;
3487           mp->is[0] = '\0';
3488           break;
3489         case END:
3490           if (mp != &musts[1])
3491             goto done;          /* "cannot happen" */
3492           for (i = 0; musts[0].in[i] != NULL; ++i)
3493             if (strlen(musts[0].in[i]) > strlen(result))
3494               result = musts[0].in[i];
3495           if (strcmp(result, musts[0].is) == 0)
3496             exact = 1;
3497           goto done;
3498         case CAT:
3499           if (mp < &musts[2])
3500             goto done;          /* "cannot happen" */
3501           {
3502             must *lmp;
3503             must *rmp;
3504
3505             rmp = --mp;
3506             lmp = --mp;
3507             /* In.  Everything in left, plus everything in
3508                right, plus catenation of
3509                left's right and right's left. */
3510             lmp->in = addlists(lmp->in, rmp->in);
3511             if (lmp->in == NULL)
3512               goto done;
3513             if (lmp->right[0] != '\0' &&
3514                 rmp->left[0] != '\0')
3515               {
3516                 char *tp;
3517
3518                 tp = icpyalloc(lmp->right);
3519                 if (tp == NULL)
3520                   goto done;
3521                 tp = icatalloc(tp, rmp->left);
3522                 if (tp == NULL)
3523                   goto done;
3524                 lmp->in = enlist(lmp->in, tp,
3525                                  strlen(tp));
3526                 free(tp);
3527                 if (lmp->in == NULL)
3528                   goto done;
3529               }
3530             /* Left-hand */
3531             if (lmp->is[0] != '\0')
3532               {
3533                 lmp->left = icatalloc(lmp->left,
3534                                       rmp->left);
3535                 if (lmp->left == NULL)
3536                   goto done;
3537               }
3538             /* Right-hand */
3539             if (rmp->is[0] == '\0')
3540               lmp->right[0] = '\0';
3541             lmp->right = icatalloc(lmp->right, rmp->right);
3542             if (lmp->right == NULL)
3543               goto done;
3544             /* Guaranteed to be */
3545             if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
3546               {
3547                 lmp->is = icatalloc(lmp->is, rmp->is);
3548                 if (lmp->is == NULL)
3549                   goto done;
3550               }
3551             else
3552               lmp->is[0] = '\0';
3553           }
3554           break;
3555         default:
3556           if (t < END)
3557             {
3558               /* "cannot happen" */
3559               goto done;
3560             }
3561           else if (t == '\0')
3562             {
3563               /* not on *my* shift */
3564               goto done;
3565             }
3566           else if (t >= CSET
3567 #ifdef MBS_SUPPORT
3568                    || t == ANYCHAR
3569                    || t == MBCSET
3570 #endif /* MBS_SUPPORT */
3571                    )
3572             {
3573               /* easy enough */
3574               resetmust(mp);
3575             }
3576           else
3577             {
3578               /* plain character */
3579               resetmust(mp);
3580               mp->is[0] = mp->left[0] = mp->right[0] = t;
3581               mp->is[1] = mp->left[1] = mp->right[1] = '\0';
3582               mp->in = enlist(mp->in, mp->is, (size_t)1);
3583               if (mp->in == NULL)
3584                 goto done;
3585             }
3586           break;
3587         }
3588 #ifdef DEBUG
3589       fprintf(stderr, " node: %d:", ri);
3590       prtok(d->tokens[ri]);
3591       fprintf(stderr, "\n  in:");
3592       for (i = 0; mp->in[i]; ++i)
3593         fprintf(stderr, " \"%s\"", mp->in[i]);
3594       fprintf(stderr, "\n  is: \"%s\"\n", mp->is);
3595       fprintf(stderr, "  left: \"%s\"\n", mp->left);
3596       fprintf(stderr, "  right: \"%s\"\n", mp->right);
3597 #endif
3598       ++mp;
3599     }
3600  done:
3601   if (strlen(result))
3602     {
3603       MALLOC(dm, struct dfamust, 1);
3604       dm->exact = exact;
3605       MALLOC(dm->must, char, strlen(result) + 1);
3606       strcpy(dm->must, result);
3607       dm->next = d->musts;
3608       d->musts = dm;
3609     }
3610   mp = musts;
3611   for (i = 0; i <= d->tindex; ++i)
3612     {
3613       freelist(mp[i].in);
3614       free(mp[i].in);
3615       free(mp[i].left);
3616       free(mp[i].right);
3617       free(mp[i].is);
3618     }
3619   free(mp);
3620 }
3621 /* vim:set shiftwidth=2: */