gnu/dist/grep/src/dfa.c

   1 /*      $NetBSD: dfa.c,v 1.1.1.3 2003/01/26 23:15:29 wiz Exp $  */
   2
   3 /* dfa.c - deterministic extended regexp routines for GNU
   4    Copyright 1988, 1998, 2000 Free Software Foundation, Inc.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program; if not, write to the Free Software
  18    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA */
  19
  20 /* Written June, 1988 by Mike Haertel
  21    Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
  22
  23 #ifdef HAVE_CONFIG_H
  24 #include <config.h>
  25 #endif
  26
  27 #include <assert.h>
  28 #include <ctype.h>
  29 #include <stdio.h>
  30
  31 #include <sys/types.h>
  32 #ifdef STDC_HEADERS
  33 #include <stdlib.h>
  34 #else
  35 extern char *calloc(), *malloc(), *realloc();
  36 extern void free();
  37 #endif
  38
  39 #if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
  40 #include <string.h>
  41 #else
  42 #include <strings.h>
  43 #endif
  44
  45 #if HAVE_SETLOCALE
  46 # include <locale.h>
  47 #endif
  48
  49 #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC && defined HAVE_WCTYPE
  50 /* We can handle multibyte string.  */
  51 # define MBS_SUPPORT
  52 #endif
  53
  54 #ifdef MBS_SUPPORT
  55 # include <wchar.h>
  56 # include <wctype.h>
  57 #endif
  58
  59 #ifndef DEBUG   /* use the same approach as regex.c */
  60 #undef assert
  61 #define assert(e)
  62 #endif /* DEBUG */
  63
  64 #ifndef isgraph
  65 #define isgraph(C) (isprint(C) && !isspace(C))
  66 #endif
  67
  68 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
  69 #define ISALPHA(C) isalpha(C)
  70 #define ISUPPER(C) isupper(C)
  71 #define ISLOWER(C) islower(C)
  72 #define ISDIGIT(C) isdigit(C)
  73 #define ISXDIGIT(C) isxdigit(C)
  74 #define ISSPACE(C) isspace(C)
  75 #define ISPUNCT(C) ispunct(C)
  76 #define ISALNUM(C) isalnum(C)
  77 #define ISPRINT(C) isprint(C)
  78 #define ISGRAPH(C) isgraph(C)
  79 #define ISCNTRL(C) iscntrl(C)
  80 #else
  81 #define ISALPHA(C) (isascii(C) && isalpha(C))
  82 #define ISUPPER(C) (isascii(C) && isupper(C))
  83 #define ISLOWER(C) (isascii(C) && islower(C))
  84 #define ISDIGIT(C) (isascii(C) && isdigit(C))
  85 #define ISXDIGIT(C) (isascii(C) && isxdigit(C))
  86 #define ISSPACE(C) (isascii(C) && isspace(C))
  87 #define ISPUNCT(C) (isascii(C) && ispunct(C))
  88 #define ISALNUM(C) (isascii(C) && isalnum(C))
  89 #define ISPRINT(C) (isascii(C) && isprint(C))
  90 #define ISGRAPH(C) (isascii(C) && isgraph(C))
  91 #define ISCNTRL(C) (isascii(C) && iscntrl(C))
  92 #endif
  93
  94 /* ISASCIIDIGIT differs from ISDIGIT, as follows:
  95    - Its arg may be any int or unsigned int; it need not be an unsigned char.
  96    - It's guaranteed to evaluate its argument exactly once.
  97    - It's typically faster.
  98    Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
  99    only '0' through '9' are digits.  Prefer ISASCIIDIGIT to ISDIGIT unless
 100    it's important to use the locale's definition of `digit' even when the
 101    host does not conform to Posix.  */
 102 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
 103
 104 /* If we (don't) have I18N.  */
 105 /* glibc defines _ */
 106 #ifndef _
 107 # ifdef HAVE_LIBINTL_H
 108 #  include <libintl.h>
 109 #  ifndef _
 110 #   define _(Str) gettext (Str)
 111 #  endif
 112 # else
 113 #  define _(Str) (Str)
 114 # endif
 115 #endif
 116
 117 #include "regex.h"
 118 #include "dfa.h"
 119 #include "hard-locale.h"
 120
 121 /* HPUX, define those as macros in sys/param.h */
 122 #ifdef setbit
 123 # undef setbit
 124 #endif
 125 #ifdef clrbit
 126 # undef clrbit
 127 #endif
 128
 129 static void dfamust PARAMS ((struct dfa *dfa));
 130 static void regexp PARAMS ((int toplevel));
 131
 132 static ptr_t
 133 xcalloc (size_t n, size_t s)
 134 {
 135   ptr_t r = calloc(n, s);
 136
 137   if (!r)
 138     dfaerror(_("Memory exhausted"));
 139   return r;
 140 }
 141
 142 static ptr_t
 143 xmalloc (size_t n)
 144 {
 145   ptr_t r = malloc(n);
 146
 147   assert(n != 0);
 148   if (!r)
 149     dfaerror(_("Memory exhausted"));
 150   return r;
 151 }
 152
 153 static ptr_t
 154 xrealloc (ptr_t p, size_t n)
 155 {
 156   ptr_t r = realloc(p, n);
 157
 158   assert(n != 0);
 159   if (!r)
 160     dfaerror(_("Memory exhausted"));
 161   return r;
 162 }
 163
 164 #define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
 165 #define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
 166 #define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
 167
 168 /* Reallocate an array of type t if nalloc is too small for index. */
 169 #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
 170   if ((index) >= (nalloc))                        \
 171     {                                             \
 172       do                                          \
 173         (nalloc) *= 2;                            \
 174       while ((index) >= (nalloc));                \
 175       REALLOC(p, t, nalloc);                      \
 176     }
 177
 178 #ifdef DEBUG
 179
 180 static void
 181 prtok (token t)
 182 {
 183   char const *s;
 184
 185   if (t < 0)
 186     fprintf(stderr, "END");
 187   else if (t < NOTCHAR)
 188     fprintf(stderr, "%c", t);
 189   else
 190     {
 191       switch (t)
 192         {
 193         case EMPTY: s = "EMPTY"; break;
 194         case BACKREF: s = "BACKREF"; break;
 195         case BEGLINE: s = "BEGLINE"; break;
 196         case ENDLINE: s = "ENDLINE"; break;
 197         case BEGWORD: s = "BEGWORD"; break;
 198         case ENDWORD: s = "ENDWORD"; break;
 199         case LIMWORD: s = "LIMWORD"; break;
 200         case NOTLIMWORD: s = "NOTLIMWORD"; break;
 201         case QMARK: s = "QMARK"; break;
 202         case STAR: s = "STAR"; break;
 203         case PLUS: s = "PLUS"; break;
 204         case CAT: s = "CAT"; break;
 205         case OR: s = "OR"; break;
 206         case ORTOP: s = "ORTOP"; break;
 207         case LPAREN: s = "LPAREN"; break;
 208         case RPAREN: s = "RPAREN"; break;
 209         case CRANGE: s = "CRANGE"; break;
 210 #ifdef MBS_SUPPORT
 211         case ANYCHAR: s = "ANYCHAR"; break;
 212         case MBCSET: s = "MBCSET"; break;
 213 #endif /* MBS_SUPPORT */
 214         default: s = "CSET"; break;
 215         }
 216       fprintf(stderr, "%s", s);
 217     }
 218 }
 219 #endif /* DEBUG */
 220
 221 /* Stuff pertaining to charclasses. */
 222
 223 static int
 224 tstbit (unsigned b, charclass c)
 225 {
 226   return c[b / INTBITS] & 1 << b % INTBITS;
 227 }
 228
 229 static void
 230 setbit (unsigned b, charclass c)
 231 {
 232   c[b / INTBITS] |= 1 << b % INTBITS;
 233 }
 234
 235 static void
 236 clrbit (unsigned b, charclass c)
 237 {
 238   c[b / INTBITS] &= ~(1 << b % INTBITS);
 239 }
 240
 241 static void
 242 copyset (charclass src, charclass dst)
 243 {
 244   memcpy (dst, src, sizeof (charclass));
 245 }
 246
 247 static void
 248 zeroset (charclass s)
 249 {
 250   memset (s, 0, sizeof (charclass));
 251 }
 252
 253 static void
 254 notset (charclass s)
 255 {
 256   int i;
 257
 258   for (i = 0; i < CHARCLASS_INTS; ++i)
 259     s[i] = ~s[i];
 260 }
 261
 262 static int
 263 equal (charclass s1, charclass s2)
 264 {
 265   return memcmp (s1, s2, sizeof (charclass)) == 0;
 266 }
 267
 268 /* A pointer to the current dfa is kept here during parsing. */
 269 static struct dfa *dfa;
 270
 271 /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
 272 static int
 273 charclass_index (charclass s)
 274 {
 275   int i;
 276
 277   for (i = 0; i < dfa->cindex; ++i)
 278     if (equal(s, dfa->charclasses[i]))
 279       return i;
 280   REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
 281   ++dfa->cindex;
 282   copyset(s, dfa->charclasses[i]);
 283   return i;
 284 }
 285
 286 /* Syntax bits controlling the behavior of the lexical analyzer. */
 287 static reg_syntax_t syntax_bits, syntax_bits_set;
 288
 289 /* Flag for case-folding letters into sets. */
 290 static int case_fold;
 291
 292 /* End-of-line byte in data.  */
 293 static unsigned char eolbyte;
 294
 295 /* Entry point to set syntax options. */
 296 void
 297 dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
 298 {
 299   syntax_bits_set = 1;
 300   syntax_bits = bits;
 301   case_fold = fold;
 302   eolbyte = eol;
 303 }
 304
 305 /* Like setbit, but if case is folded, set both cases of a letter.  */
 306 static void
 307 setbit_case_fold (unsigned b, charclass c)
 308 {
 309   setbit (b, c);
 310   if (case_fold)
 311     {
 312       if (ISUPPER (b))
 313         setbit (tolower (b), c);
 314       else if (ISLOWER (b))
 315         setbit (toupper (b), c);
 316     }
 317 }
 318
 319 /* Lexical analyzer.  All the dross that deals with the obnoxious
 320    GNU Regex syntax bits is located here.  The poor, suffering
 321    reader is referred to the GNU Regex documentation for the
 322    meaning of the @#%!@#%^!@ syntax bits. */
 323
 324 static char const *lexstart;    /* Pointer to beginning of input string. */
 325 static char const *lexptr;      /* Pointer to next input character. */
 326 static int lexleft;             /* Number of characters remaining. */
 327 static token lasttok;           /* Previous token returned; initially END. */
 328 static int laststart;           /* True if we're separated from beginning or (, |
 329                                    only by zero-width characters. */
 330 static int parens;              /* Count of outstanding left parens. */
 331 static int minrep, maxrep;      /* Repeat counts for {m,n}. */
 332 static int hard_LC_COLLATE;     /* Nonzero if LC_COLLATE is hard.  */
 333
 334 #ifdef MBS_SUPPORT
 335 /* These variables are used only if (MB_CUR_MAX > 1).  */
 336 static mbstate_t mbs;           /* Mbstate for mbrlen().  */
 337 static int cur_mb_len;          /* Byte length of the current scanning
 338                                    multibyte character.  */
 339 static int cur_mb_index;        /* Byte index of the current scanning multibyte
 340                                    character.
 341
 342                                    singlebyte character : cur_mb_index = 0
 343                                    multibyte character
 344                                        1st byte : cur_mb_index = 1
 345                                        2nd byte : cur_mb_index = 2
 346                                          ...
 347                                        nth byte : cur_mb_index = n  */
 348 static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec().
 349                                   Each element store the amount of remain
 350                                   byte of corresponding multibyte character
 351                                   in the input string.  A element's value
 352                                   is 0 if corresponding character is a
 353                                   singlebyte chracter.
 354                                   e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
 355                                    mblen_buf :   0,       3,       2,       1
 356                                */
 357 static wchar_t *inputwcs;       /* Wide character representation of input
 358                                    string in dfaexec().
 359                                    The length of this array is same as
 360                                    the length of input string(char array).
 361                                    inputstring[i] is a single-byte char,
 362                                    or 1st byte of a multibyte char.
 363                                    And inputwcs[i] is the codepoint.  */
 364 static unsigned char const *buf_begin;/* refference to begin in dfaexec().  */
 365 static unsigned char const *buf_end;    /* refference to end in dfaexec().  */
 366 #endif /* MBS_SUPPORT  */
 367
 368 #ifdef MBS_SUPPORT
 369 /* This function update cur_mb_len, and cur_mb_index.
 370    p points current lexptr, len is the remaining buffer length.  */
 371 static void
 372 update_mb_len_index (unsigned char const *p, int len)
 373 {
 374   /* If last character is a part of a multibyte character,
 375      we update cur_mb_index.  */
 376   if (cur_mb_index)
 377     cur_mb_index = (cur_mb_index >= cur_mb_len)? 0
 378                         : cur_mb_index + 1;
 379
 380   /* If last character is a single byte character, or the
 381      last portion of a multibyte character, we check whether
 382      next character is a multibyte character or not.  */
 383   if (! cur_mb_index)
 384     {
 385       cur_mb_len = mbrlen(p, len, &mbs);
 386       if (cur_mb_len > 1)
 387         /* It is a multibyte character.
 388            cur_mb_len was already set by mbrlen().  */
 389         cur_mb_index = 1;
 390       else if (cur_mb_len < 1)
 391         /* Invalid sequence.  We treat it as a singlebyte character.
 392            cur_mb_index is aleady 0.  */
 393         cur_mb_len = 1;
 394       /* Otherwise, cur_mb_len == 1, it is a singlebyte character.
 395          cur_mb_index is aleady 0.  */
 396     }
 397 }
 398 #endif /* MBS_SUPPORT */
 399
 400 #ifdef MBS_SUPPORT
 401 /* Note that characters become unsigned here. */
 402 # define FETCH(c, eoferr)                       \
 403   {                                             \
 404     if (! lexleft)                              \
 405      {                                          \
 406         if (eoferr != 0)                        \
 407           dfaerror (eoferr);                    \
 408         else                                    \
 409           return lasttok = END;                 \
 410       }                                         \
 411     if (MB_CUR_MAX > 1)                         \
 412       update_mb_len_index(lexptr, lexleft);     \
 413     (c) = (unsigned char) *lexptr++;            \
 414     --lexleft;                                  \
 415   }
 416
 417 /* This function fetch a wide character, and update cur_mb_len,
 418    used only if the current locale is a multibyte environment.  */
 419 static wchar_t
 420 fetch_wc (char const *eoferr)
 421 {
 422   wchar_t wc;
 423   if (! lexleft)
 424     {
 425       if (eoferr != 0)
 426         dfaerror (eoferr);
 427       else
 428         return -1;
 429     }
 430
 431   cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
 432   if (cur_mb_len <= 0)
 433    {
 434       cur_mb_len = 1;
 435       wc = *lexptr;
 436     }
 437   lexptr += cur_mb_len;
 438   lexleft -= cur_mb_len;
 439   return wc;
 440 }
 441 #else
 442 /* Note that characters become unsigned here. */
 443 # define FETCH(c, eoferr)             \
 444   {                                   \
 445     if (! lexleft)                    \
 446       {                               \
 447         if (eoferr != 0)              \
 448           dfaerror (eoferr);          \
 449         else                          \
 450           return lasttok = END;       \
 451       }                               \
 452     (c) = (unsigned char) *lexptr++;  \
 453     --lexleft;                        \
 454   }
 455 #endif /* MBS_SUPPORT */
 456
 457 #ifdef MBS_SUPPORT
 458 /* Multibyte character handling sub-routin for lex.
 459    This function  parse a bracket expression and build a struct
 460    mb_char_classes.  */
 461 static void
 462 parse_bracket_exp_mb ()
 463 {
 464   wchar_t wc, wc1, wc2;
 465
 466   /* Work area to build a mb_char_classes.  */
 467   struct mb_char_classes *work_mbc;
 468   int chars_al, range_sts_al, range_ends_al, ch_classes_al,
 469     equivs_al, coll_elems_al;
 470
 471   REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
 472                        dfa->mbcsets_alloc, dfa->nmbcsets + 1);
 473   /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
 474      We will update dfa->multibyte_prop in addtok(), because we can't
 475      decide the index in dfa->tokens[].  */
 476
 477   /* Initialize work are */
 478   work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
 479
 480   chars_al = 1;
 481   range_sts_al = range_ends_al = 0;
 482   ch_classes_al = equivs_al = coll_elems_al = 0;
 483   MALLOC(work_mbc->chars, wchar_t, chars_al);
 484
 485   work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
 486   work_mbc->nequivs = work_mbc->ncoll_elems = 0;
 487   work_mbc->chars = work_mbc->ch_classes = NULL;
 488   work_mbc->range_sts = work_mbc->range_ends = NULL;
 489   work_mbc->equivs = work_mbc->coll_elems = NULL;
 490
 491   wc = fetch_wc(_("Unbalanced ["));
 492   if (wc == L'^')
 493     {
 494       wc = fetch_wc(_("Unbalanced ["));
 495       work_mbc->invert = 1;
 496     }
 497   else
 498     work_mbc->invert = 0;
 499   do
 500     {
 501       wc1 = -1; /* mark wc1 is not initialized".  */
 502
 503       /* Note that if we're looking at some other [:...:] construct,
 504          we just treat it as a bunch of ordinary characters.  We can do
 505          this because we assume regex has checked for syntax errors before
 506          dfa is ever called. */
 507       if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES))
 508         {
 509 #define BRACKET_BUFFER_SIZE 128
 510           char str[BRACKET_BUFFER_SIZE];
 511           wc1 = wc;
 512           wc = fetch_wc(_("Unbalanced ["));
 513
 514           /* If pattern contains `[[:', `[[.', or `[[='.  */
 515           if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'='))
 516             {
 517               unsigned char c;
 518               unsigned char delim = (unsigned char)wc;
 519               int len = 0;
 520               for (;;)
 521                 {
 522                   if (! lexleft)
 523                     dfaerror (_("Unbalanced ["));
 524                   c = (unsigned char) *lexptr++;
 525                   --lexleft;
 526
 527                   if ((c == delim && *lexptr == ']') || lexleft == 0)
 528                     break;
 529                   if (len < BRACKET_BUFFER_SIZE)
 530                     str[len++] = c;
 531                   else
 532                     /* This is in any case an invalid class name.  */
 533                     str[0] = '\0';
 534                 }
 535               str[len] = '\0';
 536
 537               if (lexleft == 0)
 538                 {
 539                   REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
 540                                        work_mbc->nchars + 2);
 541                   work_mbc->chars[work_mbc->nchars++] = L'[';
 542                   work_mbc->chars[work_mbc->nchars++] = delim;
 543                   break;
 544                 }
 545
 546               if (--lexleft, *lexptr++ != ']')
 547                 dfaerror (_("Unbalanced ["));
 548               if (delim == ':')
 549                 /* build character class.  */
 550                 {
 551                   wctype_t wt;
 552                   /* Query the character class as wctype_t.  */
 553                   wt = wctype (str);
 554
 555                   if (ch_classes_al == 0)
 556                     MALLOC(work_mbc->ch_classes, wchar_t, ++ch_classes_al);
 557                   REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
 558                                        ch_classes_al,
 559                                        work_mbc->nch_classes + 1);
 560                   work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
 561
 562                 }
 563               else if (delim == '=' || delim == '.')
 564                 {
 565                   char *elem;
 566                   MALLOC(elem, char, len + 1);
 567                   strncpy(elem, str, len + 1);
 568
 569                   if (delim == '=')
 570                     /* build equivalent class.  */
 571                     {
 572                       if (equivs_al == 0)
 573                         MALLOC(work_mbc->equivs, char*, ++equivs_al);
 574                       REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
 575                                            equivs_al,
 576                                            work_mbc->nequivs + 1);
 577                       work_mbc->equivs[work_mbc->nequivs++] = elem;
 578                     }
 579
 580                   if (delim == '.')
 581                     /* build collating element.  */
 582                     {
 583                       if (coll_elems_al == 0)
 584                         MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
 585                       REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
 586                                            coll_elems_al,
 587                                            work_mbc->ncoll_elems + 1);
 588                       work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
 589                     }
 590                 }
 591               wc = -1;
 592             }
 593           else
 594             /* We treat '[' as a normal character here.  */
 595             {
 596               wc2 = wc1; wc1 = wc; wc = wc2; /* swap */
 597             }
 598         }
 599       else
 600         {
 601           if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
 602             wc = fetch_wc(("Unbalanced ["));
 603         }
 604
 605       if (wc1 == -1)
 606         wc1 = fetch_wc(_("Unbalanced ["));
 607
 608       if (wc1 == L'-')
 609         /* build range characters.  */
 610         {
 611           wc2 = fetch_wc(_("Unbalanced ["));
 612           if (wc2 == L']')
 613             {
 614               /* In the case [x-], the - is an ordinary hyphen,
 615                  which is left in c1, the lookahead character. */
 616               lexptr -= cur_mb_len;
 617               lexleft += cur_mb_len;
 618               wc2 = wc;
 619             }
 620           else
 621             {
 622               if (wc2 == L'\\'
 623                   && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
 624                 wc2 = fetch_wc(_("Unbalanced ["));
 625               wc1 = fetch_wc(_("Unbalanced ["));
 626             }
 627
 628           if (range_sts_al == 0)
 629             {
 630               MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
 631               MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
 632             }
 633           REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
 634                                range_sts_al, work_mbc->nranges + 1);
 635           work_mbc->range_sts[work_mbc->nranges] = wc;
 636           REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
 637                                range_ends_al, work_mbc->nranges + 1);
 638           work_mbc->range_ends[work_mbc->nranges++] = wc2;
 639         }
 640       else if (wc != -1)
 641         /* build normal characters.  */
 642         {
 643           REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
 644                                work_mbc->nchars + 1);
 645           work_mbc->chars[work_mbc->nchars++] = wc;
 646         }
 647     }
 648   while ((wc = wc1) != L']');
 649 }
 650 #endif /* MBS_SUPPORT */
 651
 652 #ifdef __STDC__
 653 #define FUNC(F, P) static int F(int c) { return P(c); }
 654 #else
 655 #define FUNC(F, P) static int F(c) int c; { return P(c); }
 656 #endif
 657
 658 FUNC(is_alpha, ISALPHA)
 659 FUNC(is_upper, ISUPPER)
 660 FUNC(is_lower, ISLOWER)
 661 FUNC(is_digit, ISDIGIT)
 662 FUNC(is_xdigit, ISXDIGIT)
 663 FUNC(is_space, ISSPACE)
 664 FUNC(is_punct, ISPUNCT)
 665 FUNC(is_alnum, ISALNUM)
 666 FUNC(is_print, ISPRINT)
 667 FUNC(is_graph, ISGRAPH)
 668 FUNC(is_cntrl, ISCNTRL)
 669
 670 static int
 671 is_blank (int c)
 672 {
 673    return (c == ' ' || c == '\t');
 674 }
 675
 676 /* The following list maps the names of the Posix named character classes
 677    to predicate functions that determine whether a given character is in
 678    the class.  The leading [ has already been eaten by the lexical analyzer. */
 679 static struct {
 680   const char *name;
 681   int (*pred) PARAMS ((int));
 682 } const prednames[] = {
 683   { ":alpha:]", is_alpha },
 684   { ":upper:]", is_upper },
 685   { ":lower:]", is_lower },
 686   { ":digit:]", is_digit },
 687   { ":xdigit:]", is_xdigit },
 688   { ":space:]", is_space },
 689   { ":punct:]", is_punct },
 690   { ":alnum:]", is_alnum },
 691   { ":print:]", is_print },
 692   { ":graph:]", is_graph },
 693   { ":cntrl:]", is_cntrl },
 694   { ":blank:]", is_blank },
 695   { 0 }
 696 };
 697
 698 /* Return non-zero if C is a `word-constituent' byte; zero otherwise.  */
 699 #define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
 700
 701 static int
 702 looking_at (char const *s)
 703 {
 704   size_t len;
 705
 706   len = strlen(s);
 707   if (lexleft < len)
 708     return 0;
 709   return strncmp(s, lexptr, len) == 0;
 710 }
 711
 712 static token
 713 lex (void)
 714 {
 715   unsigned c, c1, c2;
 716   int backslash = 0, invert;
 717   charclass ccl;
 718   int i;
 719
 720   /* Basic plan: We fetch a character.  If it's a backslash,
 721      we set the backslash flag and go through the loop again.
 722      On the plus side, this avoids having a duplicate of the
 723      main switch inside the backslash case.  On the minus side,
 724      it means that just about every case begins with
 725      "if (backslash) ...".  */
 726   for (i = 0; i < 2; ++i)
 727     {
 728       FETCH(c, 0);
 729 #ifdef MBS_SUPPORT
 730       if (MB_CUR_MAX > 1 && cur_mb_index)
 731         /* If this is a part of a multi-byte character, we must treat
 732            this byte data as a normal character.
 733            e.g. In case of SJIS encoding, some character contains '\',
 734                 but they must not be backslash.  */
 735         goto normal_char;
 736 #endif /* MBS_SUPPORT  */
 737       switch (c)
 738         {
 739         case '\\':
 740           if (backslash)
 741             goto normal_char;
 742           if (lexleft == 0)
 743             dfaerror(_("Unfinished \\ escape"));
 744           backslash = 1;
 745           break;
 746
 747         case '^':
 748           if (backslash)
 749             goto normal_char;
 750           if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
 751               || lasttok == END
 752               || lasttok == LPAREN
 753               || lasttok == OR)
 754             return lasttok = BEGLINE;
 755           goto normal_char;
 756
 757         case '$':
 758           if (backslash)
 759             goto normal_char;
 760           if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
 761               || lexleft == 0
 762               || (syntax_bits & RE_NO_BK_PARENS
 763                   ? lexleft > 0 && *lexptr == ')'
 764                   : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
 765               || (syntax_bits & RE_NO_BK_VBAR
 766                   ? lexleft > 0 && *lexptr == '|'
 767                   : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
 768               || ((syntax_bits & RE_NEWLINE_ALT)
 769                   && lexleft > 0 && *lexptr == '\n'))
 770             return lasttok = ENDLINE;
 771           goto normal_char;
 772
 773         case '1':
 774         case '2':
 775         case '3':
 776         case '4':
 777         case '5':
 778         case '6':
 779         case '7':
 780         case '8':
 781         case '9':
 782           if (backslash && !(syntax_bits & RE_NO_BK_REFS))
 783             {
 784               laststart = 0;
 785               return lasttok = BACKREF;
 786             }
 787           goto normal_char;
 788
 789         case '`':
 790           if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
 791             return lasttok = BEGLINE;   /* FIXME: should be beginning of string */
 792           goto normal_char;
 793
 794         case '\'':
 795           if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
 796             return lasttok = ENDLINE;   /* FIXME: should be end of string */
 797           goto normal_char;
 798
 799         case '<':
 800           if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
 801             return lasttok = BEGWORD;
 802           goto normal_char;
 803
 804         case '>':
 805           if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
 806             return lasttok = ENDWORD;
 807           goto normal_char;
 808
 809         case 'b':
 810           if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
 811             return lasttok = LIMWORD;
 812           goto normal_char;
 813
 814         case 'B':
 815           if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
 816             return lasttok = NOTLIMWORD;
 817           goto normal_char;
 818
 819         case '?':
 820           if (syntax_bits & RE_LIMITED_OPS)
 821             goto normal_char;
 822           if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
 823             goto normal_char;
 824           if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
 825             goto normal_char;
 826           return lasttok = QMARK;
 827
 828         case '*':
 829           if (backslash)
 830             goto normal_char;
 831           if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
 832             goto normal_char;
 833           return lasttok = STAR;
 834
 835         case '+':
 836           if (syntax_bits & RE_LIMITED_OPS)
 837             goto normal_char;
 838           if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
 839             goto normal_char;
 840           if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
 841             goto normal_char;
 842           return lasttok = PLUS;
 843
 844         case '{':
 845           if (!(syntax_bits & RE_INTERVALS))
 846             goto normal_char;
 847           if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
 848             goto normal_char;
 849           if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
 850             goto normal_char;
 851
 852           if (syntax_bits & RE_NO_BK_BRACES)
 853             {
 854               /* Scan ahead for a valid interval; if it's not valid,
 855                  treat it as a literal '{'.  */
 856               int lo = -1, hi = -1;
 857               char const *p = lexptr;
 858               char const *lim = p + lexleft;
 859               for (;  p != lim && ISASCIIDIGIT (*p);  p++)
 860                 lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
 861               if (p != lim && *p == ',')
 862                 while (++p != lim && ISASCIIDIGIT (*p))
 863                   hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
 864               else
 865                 hi = lo;
 866               if (p == lim || *p != '}'
 867                   || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
 868                 goto normal_char;
 869             }
 870
 871           minrep = 0;
 872           /* Cases:
 873              {M} - exact count
 874              {M,} - minimum count, maximum is infinity
 875              {M,N} - M through N */
 876           FETCH(c, _("unfinished repeat count"));
 877           if (ISASCIIDIGIT (c))
 878             {
 879               minrep = c - '0';
 880               for (;;)
 881                 {
 882                   FETCH(c, _("unfinished repeat count"));
 883                   if (! ISASCIIDIGIT (c))
 884                     break;
 885                   minrep = 10 * minrep + c - '0';
 886                 }
 887             }
 888           else
 889             dfaerror(_("malformed repeat count"));
 890           if (c == ',')
 891             {
 892               FETCH (c, _("unfinished repeat count"));
 893               if (! ISASCIIDIGIT (c))
 894                 maxrep = -1;
 895               else
 896                 {
 897                   maxrep = c - '0';
 898                   for (;;)
 899                     {
 900                       FETCH (c, _("unfinished repeat count"));
 901                       if (! ISASCIIDIGIT (c))
 902                         break;
 903                       maxrep = 10 * maxrep + c - '0';
 904                     }
 905                   if (0 <= maxrep && maxrep < minrep)
 906                     dfaerror (_("malformed repeat count"));
 907                 }
 908             }
 909           else
 910             maxrep = minrep;
 911           if (!(syntax_bits & RE_NO_BK_BRACES))
 912             {
 913               if (c != '\\')
 914                 dfaerror(_("malformed repeat count"));
 915               FETCH(c, _("unfinished repeat count"));
 916             }
 917           if (c != '}')
 918             dfaerror(_("malformed repeat count"));
 919           laststart = 0;
 920           return lasttok = REPMN;
 921
 922         case '|':
 923           if (syntax_bits & RE_LIMITED_OPS)
 924             goto normal_char;
 925           if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
 926             goto normal_char;
 927           laststart = 1;
 928           return lasttok = OR;
 929
 930         case '\n':
 931           if (syntax_bits & RE_LIMITED_OPS
 932               || backslash
 933               || !(syntax_bits & RE_NEWLINE_ALT))
 934             goto normal_char;
 935           laststart = 1;
 936           return lasttok = OR;
 937
 938         case '(':
 939           if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
 940             goto normal_char;
 941           ++parens;
 942           laststart = 1;
 943           return lasttok = LPAREN;
 944
 945         case ')':
 946           if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
 947             goto normal_char;
 948           if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
 949             goto normal_char;
 950           --parens;
 951           laststart = 0;
 952           return lasttok = RPAREN;
 953
 954         case '.':
 955           if (backslash)
 956             goto normal_char;
 957 #ifdef MBS_SUPPORT
 958           if (MB_CUR_MAX > 1)
 959             {
 960               /* In multibyte environment period must match with a single
 961                  character not a byte.  So we use ANYCHAR.  */
 962               laststart = 0;
 963               return lasttok = ANYCHAR;
 964             }
 965 #endif /* MBS_SUPPORT */
 966           zeroset(ccl);
 967           notset(ccl);
 968           if (!(syntax_bits & RE_DOT_NEWLINE))
 969             clrbit(eolbyte, ccl);
 970           if (syntax_bits & RE_DOT_NOT_NULL)
 971             clrbit('\0', ccl);
 972           laststart = 0;
 973           return lasttok = CSET + charclass_index(ccl);
 974
 975         case 'w':
 976         case 'W':
 977           if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
 978             goto normal_char;
 979           zeroset(ccl);
 980           for (c2 = 0; c2 < NOTCHAR; ++c2)
 981             if (IS_WORD_CONSTITUENT(c2))
 982               setbit(c2, ccl);
 983           if (c == 'W')
 984             notset(ccl);
 985           laststart = 0;
 986           return lasttok = CSET + charclass_index(ccl);
 987
 988         case '[':
 989           if (backslash)
 990             goto normal_char;
 991           laststart = 0;
 992 #ifdef MBS_SUPPORT
 993           if (MB_CUR_MAX > 1)
 994             {
 995               /* In multibyte environment a bracket expression may contain
 996                  multibyte characters, which must be treated as characters
 997                  (not bytes).  So we parse it by parse_bracket_exp_mb().  */
 998               parse_bracket_exp_mb();
 999               return lasttok = MBCSET;
1000             }
1001 #endif
1002           zeroset(ccl);
1003           FETCH(c, _("Unbalanced ["));
1004           if (c == '^')
1005             {
1006               FETCH(c, _("Unbalanced ["));
1007               invert = 1;
1008             }
1009           else
1010             invert = 0;
1011           do
1012             {
1013               /* Nobody ever said this had to be fast. :-)
1014                  Note that if we're looking at some other [:...:]
1015                  construct, we just treat it as a bunch of ordinary
1016                  characters.  We can do this because we assume
1017                  regex has checked for syntax errors before
1018                  dfa is ever called. */
1019               if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
1020                 for (c1 = 0; prednames[c1].name; ++c1)
1021                   if (looking_at(prednames[c1].name))
1022                     {
1023                       int (*pred) PARAMS ((int)) = prednames[c1].pred;
1024
1025                       for (c2 = 0; c2 < NOTCHAR; ++c2)
1026                         if ((*pred)(c2))
1027                           setbit_case_fold (c2, ccl);
1028                       lexptr += strlen(prednames[c1].name);
1029                       lexleft -= strlen(prednames[c1].name);
1030                       FETCH(c1, _("Unbalanced ["));
1031                       goto skip;
1032                     }
1033               if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1034                 FETCH(c, _("Unbalanced ["));
1035               FETCH(c1, _("Unbalanced ["));
1036               if (c1 == '-')
1037                 {
1038                   FETCH(c2, _("Unbalanced ["));
1039                   if (c2 == ']')
1040                     {
1041                       /* In the case [x-], the - is an ordinary hyphen,
1042                          which is left in c1, the lookahead character. */
1043                       --lexptr;
1044                       ++lexleft;
1045                     }
1046                   else
1047                     {
1048                       if (c2 == '\\'
1049                           && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1050                         FETCH(c2, _("Unbalanced ["));
1051                       FETCH(c1, _("Unbalanced ["));
1052                       if (!hard_LC_COLLATE) {
1053                         for (; c <= c2; c++)
1054                           setbit_case_fold (c, ccl);
1055                       } else {
1056                         /* POSIX locales are painful - leave the decision to libc */
1057                         char expr[6] = { '[', c, '-', c2, ']', '\0' };
1058                         regex_t re;
1059                         if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) {
1060                           for (c = 0; c < NOTCHAR; ++c) {
1061                             char buf[2] = { c, '\0' };
1062                             regmatch_t mat;
1063                             if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR
1064                                && mat.rm_so == 0 && mat.rm_eo == 1)
1065                               setbit_case_fold (c, ccl);
1066                           }
1067                           regfree (&re);
1068                         }
1069                       }
1070                       continue;
1071                     }
1072                 }
1073
1074               setbit_case_fold (c, ccl);
1075
1076             skip:
1077               ;
1078             }
1079           while ((c = c1) != ']');
1080           if (invert)
1081             {
1082               notset(ccl);
1083               if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1084                 clrbit(eolbyte, ccl);
1085             }
1086           return lasttok = CSET + charclass_index(ccl);
1087
1088         default:
1089         normal_char:
1090           laststart = 0;
1091           if (case_fold && ISALPHA(c))
1092             {
1093               zeroset(ccl);
1094               setbit_case_fold (c, ccl);
1095               return lasttok = CSET + charclass_index(ccl);
1096             }
1097           return c;
1098         }
1099     }
1100
1101   /* The above loop should consume at most a backslash
1102      and some other character. */
1103   abort();
1104   return END;   /* keeps pedantic compilers happy. */
1105 }
1106
1107 /* Recursive descent parser for regular expressions. */
1108
1109 static token tok;               /* Lookahead token. */
1110 static int depth;               /* Current depth of a hypothetical stack
1111                                    holding deferred productions.  This is
1112                                    used to determine the depth that will be
1113                                    required of the real stack later on in
1114                                    dfaanalyze(). */
1115
1116 /* Add the given token to the parse tree, maintaining the depth count and
1117    updating the maximum depth if necessary. */
1118 static void
1119 addtok (token t)
1120 {
1121 #ifdef MBS_SUPPORT
1122   if (MB_CUR_MAX > 1)
1123     {
1124       REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
1125                            dfa->tindex);
1126       /* Set dfa->multibyte_prop.  See struct dfa in dfa.h.  */
1127       if (t == MBCSET)
1128         dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3;
1129       else if (t < NOTCHAR)
1130         dfa->multibyte_prop[dfa->tindex]
1131           = (cur_mb_len == 1)? 3 /* single-byte char */
1132           : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */
1133              + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */
1134       else
1135         /* It may be unnecesssary, but it is safer to treat other
1136            symbols as singlebyte characters.  */
1137         dfa->multibyte_prop[dfa->tindex] = 3;
1138     }
1139 #endif
1140
1141   REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
1142   dfa->tokens[dfa->tindex++] = t;
1143
1144   switch (t)
1145     {
1146     case QMARK:
1147     case STAR:
1148     case PLUS:
1149       break;
1150
1151     case CAT:
1152     case OR:
1153     case ORTOP:
1154       --depth;
1155       break;
1156
1157     default:
1158       ++dfa->nleaves;
1159     case EMPTY:
1160       ++depth;
1161       break;
1162     }
1163   if (depth > dfa->depth)
1164     dfa->depth = depth;
1165 }
1166
1167 /* The grammar understood by the parser is as follows.
1168
1169    regexp:
1170      regexp OR branch
1171      branch
1172
1173    branch:
1174      branch closure
1175      closure
1176
1177    closure:
1178      closure QMARK
1179      closure STAR
1180      closure PLUS
1181      closure REPMN
1182      atom
1183
1184    atom:
1185      <normal character>
1186      <multibyte character>
1187      ANYCHAR
1188      MBCSET
1189      CSET
1190      BACKREF
1191      BEGLINE
1192      ENDLINE
1193      BEGWORD
1194      ENDWORD
1195      LIMWORD
1196      NOTLIMWORD
1197      CRANGE
1198      LPAREN regexp RPAREN
1199      <empty>
1200
1201    The parser builds a parse tree in postfix form in an array of tokens. */
1202
1203 static void
1204 atom (void)
1205 {
1206   if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
1207       || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
1208 #ifdef MBS_SUPPORT
1209       || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
1210 #endif /* MBS_SUPPORT */
1211       || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
1212     {
1213       addtok(tok);
1214       tok = lex();
1215 #ifdef MBS_SUPPORT
1216       /* We treat a multibyte character as a single atom, so that DFA
1217          can treat a multibyte character as a single expression.
1218
1219          e.g. We construct following tree from "<mb1><mb2>".
1220               <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1221               <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT>
1222       */
1223       if (MB_CUR_MAX > 1)
1224         {
1225           while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR)
1226             {
1227               addtok(tok);
1228               addtok(CAT);
1229               tok = lex();
1230             }
1231         }
1232 #endif /* MBS_SUPPORT  */
1233     }
1234   else if (tok == CRANGE)
1235     {
1236       /* A character range like "[a-z]" in a locale other than "C" or
1237          "POSIX".  This range might any sequence of one or more
1238          characters.  Unfortunately the POSIX locale primitives give
1239          us no practical way to find what character sequences might be
1240          matched.  Treat this approximately like "(.\1)" -- i.e. match
1241          one character, and then punt to the full matcher.  */
1242       charclass ccl;
1243       zeroset (ccl);
1244       notset (ccl);
1245       addtok (CSET + charclass_index (ccl));
1246       addtok (BACKREF);
1247       addtok (CAT);
1248       tok = lex ();
1249     }
1250   else if (tok == LPAREN)
1251     {
1252       tok = lex();
1253       regexp(0);
1254       if (tok != RPAREN)
1255         dfaerror(_("Unbalanced ("));
1256       tok = lex();
1257     }
1258   else
1259     addtok(EMPTY);
1260 }
1261
1262 /* Return the number of tokens in the given subexpression. */
1263 static int
1264 nsubtoks (int tindex)
1265 {
1266   int ntoks1;
1267
1268   switch (dfa->tokens[tindex - 1])
1269     {
1270     default:
1271       return 1;
1272     case QMARK:
1273     case STAR:
1274     case PLUS:
1275       return 1 + nsubtoks(tindex - 1);
1276     case CAT:
1277     case OR:
1278     case ORTOP:
1279       ntoks1 = nsubtoks(tindex - 1);
1280       return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
1281     }
1282 }
1283
1284 /* Copy the given subexpression to the top of the tree. */
1285 static void
1286 copytoks (int tindex, int ntokens)
1287 {
1288   int i;
1289
1290   for (i = 0; i < ntokens; ++i)
1291     addtok(dfa->tokens[tindex + i]);
1292 }
1293
1294 static void
1295 closure (void)
1296 {
1297   int tindex, ntokens, i;
1298
1299   atom();
1300   while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
1301     if (tok == REPMN)
1302       {
1303         ntokens = nsubtoks(dfa->tindex);
1304         tindex = dfa->tindex - ntokens;
1305         if (maxrep < 0)
1306           addtok(PLUS);
1307         if (minrep == 0)
1308           addtok(QMARK);
1309         for (i = 1; i < minrep; ++i)
1310           {
1311             copytoks(tindex, ntokens);
1312             addtok(CAT);
1313           }
1314         for (; i < maxrep; ++i)
1315           {
1316             copytoks(tindex, ntokens);
1317             addtok(QMARK);
1318             addtok(CAT);
1319           }
1320         tok = lex();
1321       }
1322     else
1323       {
1324         addtok(tok);
1325         tok = lex();
1326       }
1327 }
1328
1329 static void
1330 branch (void)
1331 {
1332   closure();
1333   while (tok != RPAREN && tok != OR && tok >= 0)
1334     {
1335       closure();
1336       addtok(CAT);
1337     }
1338 }
1339
1340 static void
1341 regexp (int toplevel)
1342 {
1343   branch();
1344   while (tok == OR)
1345     {
1346       tok = lex();
1347       branch();
1348       if (toplevel)
1349         addtok(ORTOP);
1350       else
1351         addtok(OR);
1352     }
1353 }
1354
1355 /* Main entry point for the parser.  S is a string to be parsed, len is the
1356    length of the string, so s can include NUL characters.  D is a pointer to
1357    the struct dfa to parse into. */
1358 void
1359 dfaparse (char const *s, size_t len, struct dfa *d)
1360 {
1361   dfa = d;
1362   lexstart = lexptr = s;
1363   lexleft = len;
1364   lasttok = END;
1365   laststart = 1;
1366   parens = 0;
1367 #if ENABLE_NLS
1368   hard_LC_COLLATE = hard_locale (LC_COLLATE);
1369 #endif
1370 #ifdef MBS_SUPPORT
1371   if (MB_CUR_MAX > 1)
1372     {
1373       cur_mb_index = 0;
1374       cur_mb_len = 0;
1375       memset(&mbs, 0, sizeof(mbstate_t));
1376     }
1377 #endif /* MBS_SUPPORT  */
1378
1379   if (! syntax_bits_set)
1380     dfaerror(_("No syntax specified"));
1381
1382   tok = lex();
1383   depth = d->depth;
1384
1385   regexp(1);
1386
1387   if (tok != END)
1388     dfaerror(_("Unbalanced )"));
1389
1390   addtok(END - d->nregexps);
1391   addtok(CAT);
1392
1393   if (d->nregexps)
1394     addtok(ORTOP);
1395
1396   ++d->nregexps;
1397 }
1398
1399 /* Some primitives for operating on sets of positions. */
1400
1401 /* Copy one set to another; the destination must be large enough. */
1402 static void
1403 copy (position_set const *src, position_set *dst)
1404 {
1405   int i;
1406
1407   for (i = 0; i < src->nelem; ++i)
1408     dst->elems[i] = src->elems[i];
1409   dst->nelem = src->nelem;
1410 }
1411
1412 /* Insert a position in a set.  Position sets are maintained in sorted
1413    order according to index.  If position already exists in the set with
1414    the same index then their constraints are logically or'd together.
1415    S->elems must point to an array large enough to hold the resulting set. */
1416 static void
1417 insert (position p, position_set *s)
1418 {
1419   int i;
1420   position t1, t2;
1421
1422   for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1423     continue;
1424   if (i < s->nelem && p.index == s->elems[i].index)
1425     s->elems[i].constraint |= p.constraint;
1426   else
1427     {
1428       t1 = p;
1429       ++s->nelem;
1430       while (i < s->nelem)
1431         {
1432           t2 = s->elems[i];
1433           s->elems[i++] = t1;
1434           t1 = t2;
1435         }
1436     }
1437 }
1438
1439 /* Merge two sets of positions into a third.  The result is exactly as if
1440    the positions of both sets were inserted into an initially empty set. */
1441 static void
1442 merge (position_set const *s1, position_set const *s2, position_set *m)
1443 {
1444   int i = 0, j = 0;
1445
1446   m->nelem = 0;
1447   while (i < s1->nelem && j < s2->nelem)
1448     if (s1->elems[i].index > s2->elems[j].index)
1449       m->elems[m->nelem++] = s1->elems[i++];
1450     else if (s1->elems[i].index < s2->elems[j].index)
1451       m->elems[m->nelem++] = s2->elems[j++];
1452     else
1453       {
1454         m->elems[m->nelem] = s1->elems[i++];
1455         m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1456       }
1457   while (i < s1->nelem)
1458     m->elems[m->nelem++] = s1->elems[i++];
1459   while (j < s2->nelem)
1460     m->elems[m->nelem++] = s2->elems[j++];
1461 }
1462
1463 /* Delete a position from a set. */
1464 static void
1465 delete (position p, position_set *s)
1466 {
1467   int i;
1468
1469   for (i = 0; i < s->nelem; ++i)
1470     if (p.index == s->elems[i].index)
1471       break;
1472   if (i < s->nelem)
1473     for (--s->nelem; i < s->nelem; ++i)
1474       s->elems[i] = s->elems[i + 1];
1475 }
1476
1477 /* Find the index of the state corresponding to the given position set with
1478    the given preceding context, or create a new state if there is no such
1479    state.  Newline and letter tell whether we got here on a newline or
1480    letter, respectively. */
1481 static int
1482 state_index (struct dfa *d, position_set const *s, int newline, int letter)
1483 {
1484   int hash = 0;
1485   int constraint;
1486   int i, j;
1487
1488   newline = newline ? 1 : 0;
1489   letter = letter ? 1 : 0;
1490
1491   for (i = 0; i < s->nelem; ++i)
1492     hash ^= s->elems[i].index + s->elems[i].constraint;
1493
1494   /* Try to find a state that exactly matches the proposed one. */
1495   for (i = 0; i < d->sindex; ++i)
1496     {
1497       if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1498           || newline != d->states[i].newline || letter != d->states[i].letter)
1499         continue;
1500       for (j = 0; j < s->nelem; ++j)
1501         if (s->elems[j].constraint
1502             != d->states[i].elems.elems[j].constraint
1503             || s->elems[j].index != d->states[i].elems.elems[j].index)
1504           break;
1505       if (j == s->nelem)
1506         return i;
1507     }
1508
1509   /* We'll have to create a new state. */
1510   REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1511   d->states[i].hash = hash;
1512   MALLOC(d->states[i].elems.elems, position, s->nelem);
1513   copy(s, &d->states[i].elems);
1514   d->states[i].newline = newline;
1515   d->states[i].letter = letter;
1516   d->states[i].backref = 0;
1517   d->states[i].constraint = 0;
1518   d->states[i].first_end = 0;
1519 #ifdef MBS_SUPPORT
1520   if (MB_CUR_MAX > 1)
1521     d->states[i].mbps.nelem = 0;
1522 #endif
1523   for (j = 0; j < s->nelem; ++j)
1524     if (d->tokens[s->elems[j].index] < 0)
1525       {
1526         constraint = s->elems[j].constraint;
1527         if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1528             || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1529             || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1530             || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1531           d->states[i].constraint |= constraint;
1532         if (! d->states[i].first_end)
1533           d->states[i].first_end = d->tokens[s->elems[j].index];
1534       }
1535     else if (d->tokens[s->elems[j].index] == BACKREF)
1536       {
1537         d->states[i].constraint = NO_CONSTRAINT;
1538         d->states[i].backref = 1;
1539       }
1540
1541   ++d->sindex;
1542
1543   return i;
1544 }
1545
1546 /* Find the epsilon closure of a set of positions.  If any position of the set
1547    contains a symbol that matches the empty string in some context, replace
1548    that position with the elements of its follow labeled with an appropriate
1549    constraint.  Repeat exhaustively until no funny positions are left.
1550    S->elems must be large enough to hold the result. */
1551 static void
1552 epsclosure (position_set *s, struct dfa const *d)
1553 {
1554   int i, j;
1555   int *visited;
1556   position p, old;
1557
1558   MALLOC(visited, int, d->tindex);
1559   for (i = 0; i < d->tindex; ++i)
1560     visited[i] = 0;
1561
1562   for (i = 0; i < s->nelem; ++i)
1563     if (d->tokens[s->elems[i].index] >= NOTCHAR
1564         && d->tokens[s->elems[i].index] != BACKREF
1565 #ifdef MBS_SUPPORT
1566         && d->tokens[s->elems[i].index] != ANYCHAR
1567         && d->tokens[s->elems[i].index] != MBCSET
1568 #endif
1569         && d->tokens[s->elems[i].index] < CSET)
1570       {
1571         old = s->elems[i];
1572         p.constraint = old.constraint;
1573         delete(s->elems[i], s);
1574         if (visited[old.index])
1575           {
1576             --i;
1577             continue;
1578           }
1579         visited[old.index] = 1;
1580         switch (d->tokens[old.index])
1581           {
1582           case BEGLINE:
1583             p.constraint &= BEGLINE_CONSTRAINT;
1584             break;
1585           case ENDLINE:
1586             p.constraint &= ENDLINE_CONSTRAINT;
1587             break;
1588           case BEGWORD:
1589             p.constraint &= BEGWORD_CONSTRAINT;
1590             break;
1591           case ENDWORD:
1592             p.constraint &= ENDWORD_CONSTRAINT;
1593             break;
1594           case LIMWORD:
1595             p.constraint &= LIMWORD_CONSTRAINT;
1596             break;
1597           case NOTLIMWORD:
1598             p.constraint &= NOTLIMWORD_CONSTRAINT;
1599             break;
1600           default:
1601             break;
1602           }
1603         for (j = 0; j < d->follows[old.index].nelem; ++j)
1604           {
1605             p.index = d->follows[old.index].elems[j].index;
1606             insert(p, s);
1607           }
1608         /* Force rescan to start at the beginning. */
1609         i = -1;
1610       }
1611
1612   free(visited);
1613 }
1614
1615 /* Perform bottom-up analysis on the parse tree, computing various functions.
1616    Note that at this point, we're pretending constructs like \< are real
1617    characters rather than constraints on what can follow them.
1618
1619    Nullable:  A node is nullable if it is at the root of a regexp that can
1620    match the empty string.
1621    *  EMPTY leaves are nullable.
1622    * No other leaf is nullable.
1623    * A QMARK or STAR node is nullable.
1624    * A PLUS node is nullable if its argument is nullable.
1625    * A CAT node is nullable if both its arguments are nullable.
1626    * An OR node is nullable if either argument is nullable.
1627
1628    Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
1629    that could correspond to the first character of a string matching the
1630    regexp rooted at the given node.
1631    * EMPTY leaves have empty firstpos.
1632    * The firstpos of a nonempty leaf is that leaf itself.
1633    * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1634      argument.
1635    * The firstpos of a CAT node is the firstpos of the left argument, union
1636      the firstpos of the right if the left argument is nullable.
1637    * The firstpos of an OR node is the union of firstpos of each argument.
1638
1639    Lastpos:  The lastpos of a node is the set of positions that could
1640    correspond to the last character of a string matching the regexp at
1641    the given node.
1642    * EMPTY leaves have empty lastpos.
1643    * The lastpos of a nonempty leaf is that leaf itself.
1644    * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1645      argument.
1646    * The lastpos of a CAT node is the lastpos of its right argument, union
1647      the lastpos of the left if the right argument is nullable.
1648    * The lastpos of an OR node is the union of the lastpos of each argument.
1649
1650    Follow:  The follow of a position is the set of positions that could
1651    correspond to the character following a character matching the node in
1652    a string matching the regexp.  At this point we consider special symbols
1653    that match the empty string in some context to be just normal characters.
1654    Later, if we find that a special symbol is in a follow set, we will
1655    replace it with the elements of its follow, labeled with an appropriate
1656    constraint.
1657    * Every node in the firstpos of the argument of a STAR or PLUS node is in
1658      the follow of every node in the lastpos.
1659    * Every node in the firstpos of the second argument of a CAT node is in
1660      the follow of every node in the lastpos of the first argument.
1661
1662    Because of the postfix representation of the parse tree, the depth-first
1663    analysis is conveniently done by a linear scan with the aid of a stack.
1664    Sets are stored as arrays of the elements, obeying a stack-like allocation
1665    scheme; the number of elements in each set deeper in the stack can be
1666    used to determine the address of a particular set's array. */
1667 void
1668 dfaanalyze (struct dfa *d, int searchflag)
1669 {
1670   int *nullable;                /* Nullable stack. */
1671   int *nfirstpos;               /* Element count stack for firstpos sets. */
1672   position *firstpos;           /* Array where firstpos elements are stored. */
1673   int *nlastpos;                /* Element count stack for lastpos sets. */
1674   position *lastpos;            /* Array where lastpos elements are stored. */
1675   int *nalloc;                  /* Sizes of arrays allocated to follow sets. */
1676   position_set tmp;             /* Temporary set for merging sets. */
1677   position_set merged;          /* Result of merging sets. */
1678   int wants_newline;            /* True if some position wants newline info. */
1679   int *o_nullable;
1680   int *o_nfirst, *o_nlast;
1681   position *o_firstpos, *o_lastpos;
1682   int i, j;
1683   position *pos;
1684
1685 #ifdef DEBUG
1686   fprintf(stderr, "dfaanalyze:\n");
1687   for (i = 0; i < d->tindex; ++i)
1688     {
1689       fprintf(stderr, " %d:", i);
1690       prtok(d->tokens[i]);
1691     }
1692   putc('\n', stderr);
1693 #endif
1694
1695   d->searchflag = searchflag;
1696
1697   MALLOC(nullable, int, d->depth);
1698   o_nullable = nullable;
1699   MALLOC(nfirstpos, int, d->depth);
1700   o_nfirst = nfirstpos;
1701   MALLOC(firstpos, position, d->nleaves);
1702   o_firstpos = firstpos, firstpos += d->nleaves;
1703   MALLOC(nlastpos, int, d->depth);
1704   o_nlast = nlastpos;
1705   MALLOC(lastpos, position, d->nleaves);
1706   o_lastpos = lastpos, lastpos += d->nleaves;
1707   MALLOC(nalloc, int, d->tindex);
1708   for (i = 0; i < d->tindex; ++i)
1709     nalloc[i] = 0;
1710   MALLOC(merged.elems, position, d->nleaves);
1711
1712   CALLOC(d->follows, position_set, d->tindex);
1713
1714   for (i = 0; i < d->tindex; ++i)
1715 #ifdef DEBUG
1716     {                           /* Nonsyntactic #ifdef goo... */
1717 #endif
1718     switch (d->tokens[i])
1719       {
1720       case EMPTY:
1721         /* The empty set is nullable. */
1722         *nullable++ = 1;
1723
1724         /* The firstpos and lastpos of the empty leaf are both empty. */
1725         *nfirstpos++ = *nlastpos++ = 0;
1726         break;
1727
1728       case STAR:
1729       case PLUS:
1730         /* Every element in the firstpos of the argument is in the follow
1731            of every element in the lastpos. */
1732         tmp.nelem = nfirstpos[-1];
1733         tmp.elems = firstpos;
1734         pos = lastpos;
1735         for (j = 0; j < nlastpos[-1]; ++j)
1736           {
1737             merge(&tmp, &d->follows[pos[j].index], &merged);
1738             REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1739                                  nalloc[pos[j].index], merged.nelem - 1);
1740             copy(&merged, &d->follows[pos[j].index]);
1741           }
1742
1743       case QMARK:
1744         /* A QMARK or STAR node is automatically nullable. */
1745         if (d->tokens[i] != PLUS)
1746           nullable[-1] = 1;
1747         break;
1748
1749       case CAT:
1750         /* Every element in the firstpos of the second argument is in the
1751            follow of every element in the lastpos of the first argument. */
1752         tmp.nelem = nfirstpos[-1];
1753         tmp.elems = firstpos;
1754         pos = lastpos + nlastpos[-1];
1755         for (j = 0; j < nlastpos[-2]; ++j)
1756           {
1757             merge(&tmp, &d->follows[pos[j].index], &merged);
1758             REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1759                                  nalloc[pos[j].index], merged.nelem - 1);
1760             copy(&merged, &d->follows[pos[j].index]);
1761           }
1762
1763         /* The firstpos of a CAT node is the firstpos of the first argument,
1764            union that of the second argument if the first is nullable. */
1765         if (nullable[-2])
1766           nfirstpos[-2] += nfirstpos[-1];
1767         else
1768           firstpos += nfirstpos[-1];
1769         --nfirstpos;
1770
1771         /* The lastpos of a CAT node is the lastpos of the second argument,
1772            union that of the first argument if the second is nullable. */
1773         if (nullable[-1])
1774           nlastpos[-2] += nlastpos[-1];
1775         else
1776           {
1777             pos = lastpos + nlastpos[-2];
1778             for (j = nlastpos[-1] - 1; j >= 0; --j)
1779               pos[j] = lastpos[j];
1780             lastpos += nlastpos[-2];
1781             nlastpos[-2] = nlastpos[-1];
1782           }
1783         --nlastpos;
1784
1785         /* A CAT node is nullable if both arguments are nullable. */
1786         nullable[-2] = nullable[-1] && nullable[-2];
1787         --nullable;
1788         break;
1789
1790       case OR:
1791       case ORTOP:
1792         /* The firstpos is the union of the firstpos of each argument. */
1793         nfirstpos[-2] += nfirstpos[-1];
1794         --nfirstpos;
1795
1796         /* The lastpos is the union of the lastpos of each argument. */
1797         nlastpos[-2] += nlastpos[-1];
1798         --nlastpos;
1799
1800         /* An OR node is nullable if either argument is nullable. */
1801         nullable[-2] = nullable[-1] || nullable[-2];
1802         --nullable;
1803         break;
1804
1805       default:
1806         /* Anything else is a nonempty position.  (Note that special
1807            constructs like \< are treated as nonempty strings here;
1808            an "epsilon closure" effectively makes them nullable later.
1809            Backreferences have to get a real position so we can detect
1810            transitions on them later.  But they are nullable. */
1811         *nullable++ = d->tokens[i] == BACKREF;
1812
1813         /* This position is in its own firstpos and lastpos. */
1814         *nfirstpos++ = *nlastpos++ = 1;
1815         --firstpos, --lastpos;
1816         firstpos->index = lastpos->index = i;
1817         firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1818
1819         /* Allocate the follow set for this position. */
1820         nalloc[i] = 1;
1821         MALLOC(d->follows[i].elems, position, nalloc[i]);
1822         break;
1823       }
1824 #ifdef DEBUG
1825     /* ... balance the above nonsyntactic #ifdef goo... */
1826       fprintf(stderr, "node %d:", i);
1827       prtok(d->tokens[i]);
1828       putc('\n', stderr);
1829       fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1830       fprintf(stderr, " firstpos:");
1831       for (j = nfirstpos[-1] - 1; j >= 0; --j)
1832         {
1833           fprintf(stderr, " %d:", firstpos[j].index);
1834           prtok(d->tokens[firstpos[j].index]);
1835         }
1836       fprintf(stderr, "\n lastpos:");
1837       for (j = nlastpos[-1] - 1; j >= 0; --j)
1838         {
1839           fprintf(stderr, " %d:", lastpos[j].index);
1840           prtok(d->tokens[lastpos[j].index]);
1841         }
1842       putc('\n', stderr);
1843     }
1844 #endif
1845
1846   /* For each follow set that is the follow set of a real position, replace
1847      it with its epsilon closure. */
1848   for (i = 0; i < d->tindex; ++i)
1849     if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1850 #ifdef MBS_SUPPORT
1851         || d->tokens[i] == ANYCHAR
1852         || d->tokens[i] == MBCSET
1853 #endif
1854         || d->tokens[i] >= CSET)
1855       {
1856 #ifdef DEBUG
1857         fprintf(stderr, "follows(%d:", i);
1858         prtok(d->tokens[i]);
1859         fprintf(stderr, "):");
1860         for (j = d->follows[i].nelem - 1; j >= 0; --j)
1861           {
1862             fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1863             prtok(d->tokens[d->follows[i].elems[j].index]);
1864           }
1865         putc('\n', stderr);
1866 #endif
1867         copy(&d->follows[i], &merged);
1868         epsclosure(&merged, d);
1869         if (d->follows[i].nelem < merged.nelem)
1870           REALLOC(d->follows[i].elems, position, merged.nelem);
1871         copy(&merged, &d->follows[i]);
1872       }
1873
1874   /* Get the epsilon closure of the firstpos of the regexp.  The result will
1875      be the set of positions of state 0. */
1876   merged.nelem = 0;
1877   for (i = 0; i < nfirstpos[-1]; ++i)
1878     insert(firstpos[i], &merged);
1879   epsclosure(&merged, d);
1880
1881   /* Check if any of the positions of state 0 will want newline context. */
1882   wants_newline = 0;
1883   for (i = 0; i < merged.nelem; ++i)
1884     if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1885       wants_newline = 1;
1886
1887   /* Build the initial state. */
1888   d->salloc = 1;
1889   d->sindex = 0;
1890   MALLOC(d->states, dfa_state, d->salloc);
1891   state_index(d, &merged, wants_newline, 0);
1892
1893   free(o_nullable);
1894   free(o_nfirst);
1895   free(o_firstpos);
1896   free(o_nlast);
1897   free(o_lastpos);
1898   free(nalloc);
1899   free(merged.elems);
1900 }
1901
1902 /* Find, for each character, the transition out of state s of d, and store
1903    it in the appropriate slot of trans.
1904
1905    We divide the positions of s into groups (positions can appear in more
1906    than one group).  Each group is labeled with a set of characters that
1907    every position in the group matches (taking into account, if necessary,
1908    preceding context information of s).  For each group, find the union
1909    of the its elements' follows.  This set is the set of positions of the
1910    new state.  For each character in the group's label, set the transition
1911    on this character to be to a state corresponding to the set's positions,
1912    and its associated backward context information, if necessary.
1913
1914    If we are building a searching matcher, we include the positions of state
1915    0 in every state.
1916
1917    The collection of groups is constructed by building an equivalence-class
1918    partition of the positions of s.
1919
1920    For each position, find the set of characters C that it matches.  Eliminate
1921    any characters from C that fail on grounds of backward context.
1922
1923    Search through the groups, looking for a group whose label L has nonempty
1924    intersection with C.  If L - C is nonempty, create a new group labeled
1925    L - C and having the same positions as the current group, and set L to
1926    the intersection of L and C.  Insert the position in this group, set
1927    C = C - L, and resume scanning.
1928
1929    If after comparing with every group there are characters remaining in C,
1930    create a new group labeled with the characters of C and insert this
1931    position in that group. */
1932 void
1933 dfastate (int s, struct dfa *d, int trans[])
1934 {
1935   position_set grps[NOTCHAR];   /* As many as will ever be needed. */
1936   charclass labels[NOTCHAR];    /* Labels corresponding to the groups. */
1937   int ngrps = 0;                /* Number of groups actually used. */
1938   position pos;                 /* Current position being considered. */
1939   charclass matches;            /* Set of matching characters. */
1940   int matchesf;                 /* True if matches is nonempty. */
1941   charclass intersect;          /* Intersection with some label set. */
1942   int intersectf;               /* True if intersect is nonempty. */
1943   charclass leftovers;          /* Stuff in the label that didn't match. */
1944   int leftoversf;               /* True if leftovers is nonempty. */
1945   static charclass letters;     /* Set of characters considered letters. */
1946   static charclass newline;     /* Set of characters that aren't newline. */
1947   position_set follows;         /* Union of the follows of some group. */
1948   position_set tmp;             /* Temporary space for merging sets. */
1949   int state;                    /* New state. */
1950   int wants_newline;            /* New state wants to know newline context. */
1951   int state_newline;            /* New state on a newline transition. */
1952   int wants_letter;             /* New state wants to know letter context. */
1953   int state_letter;             /* New state on a letter transition. */
1954   static int initialized;       /* Flag for static initialization. */
1955 #ifdef MBS_SUPPORT
1956   int next_isnt_1st_byte = 0;   /* Flag If we can't add state0.  */
1957 #endif
1958   int i, j, k;
1959
1960   /* Initialize the set of letters, if necessary. */
1961   if (! initialized)
1962     {
1963       initialized = 1;
1964       for (i = 0; i < NOTCHAR; ++i)
1965         if (IS_WORD_CONSTITUENT(i))
1966           setbit(i, letters);
1967       setbit(eolbyte, newline);
1968     }
1969
1970   zeroset(matches);
1971
1972   for (i = 0; i < d->states[s].elems.nelem; ++i)
1973     {
1974       pos = d->states[s].elems.elems[i];
1975       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1976         setbit(d->tokens[pos.index], matches);
1977       else if (d->tokens[pos.index] >= CSET)
1978         copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1979 #ifdef MBS_SUPPORT
1980       else if (d->tokens[pos.index] == ANYCHAR
1981                || d->tokens[pos.index] == MBCSET)
1982       /* MB_CUR_MAX > 1  */
1983         {
1984           /* ANYCHAR and MBCSET must match with a single character, so we
1985              must put it to d->states[s].mbps, which contains the positions
1986              which can match with a single character not a byte.  */
1987           if (d->states[s].mbps.nelem == 0)
1988             {
1989               MALLOC(d->states[s].mbps.elems, position,
1990                      d->states[s].elems.nelem);
1991             }
1992           insert(pos, &(d->states[s].mbps));
1993           continue;
1994         }
1995 #endif /* MBS_SUPPORT */
1996       else
1997         continue;
1998
1999       /* Some characters may need to be eliminated from matches because
2000          they fail in the current context. */
2001       if (pos.constraint != 0xFF)
2002         {
2003           if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2004                                          d->states[s].newline, 1))
2005             clrbit(eolbyte, matches);
2006           if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2007                                          d->states[s].newline, 0))
2008             for (j = 0; j < CHARCLASS_INTS; ++j)
2009               matches[j] &= newline[j];
2010           if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2011                                         d->states[s].letter, 1))
2012             for (j = 0; j < CHARCLASS_INTS; ++j)
2013               matches[j] &= ~letters[j];
2014           if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2015                                         d->states[s].letter, 0))
2016             for (j = 0; j < CHARCLASS_INTS; ++j)
2017               matches[j] &= letters[j];
2018
2019           /* If there are no characters left, there's no point in going on. */
2020           for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
2021             continue;
2022           if (j == CHARCLASS_INTS)
2023             continue;
2024         }
2025
2026       for (j = 0; j < ngrps; ++j)
2027         {
2028           /* If matches contains a single character only, and the current
2029              group's label doesn't contain that character, go on to the
2030              next group. */
2031           if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2032               && !tstbit(d->tokens[pos.index], labels[j]))
2033             continue;
2034
2035           /* Check if this group's label has a nonempty intersection with
2036              matches. */
2037           intersectf = 0;
2038           for (k = 0; k < CHARCLASS_INTS; ++k)
2039             (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2040           if (! intersectf)
2041             continue;
2042
2043           /* It does; now find the set differences both ways. */
2044           leftoversf = matchesf = 0;
2045           for (k = 0; k < CHARCLASS_INTS; ++k)
2046             {
2047               /* Even an optimizing compiler can't know this for sure. */
2048               int match = matches[k], label = labels[j][k];
2049
2050               (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2051               (matches[k] = match & ~label) ? (matchesf = 1) : 0;
2052             }
2053
2054           /* If there were leftovers, create a new group labeled with them. */
2055           if (leftoversf)
2056             {
2057               copyset(leftovers, labels[ngrps]);
2058               copyset(intersect, labels[j]);
2059               MALLOC(grps[ngrps].elems, position, d->nleaves);
2060               copy(&grps[j], &grps[ngrps]);
2061               ++ngrps;
2062             }
2063
2064           /* Put the position in the current group.  Note that there is no
2065              reason to call insert() here. */
2066           grps[j].elems[grps[j].nelem++] = pos;
2067
2068           /* If every character matching the current position has been
2069              accounted for, we're done. */
2070           if (! matchesf)
2071             break;
2072         }
2073
2074       /* If we've passed the last group, and there are still characters
2075          unaccounted for, then we'll have to create a new group. */
2076       if (j == ngrps)
2077         {
2078           copyset(matches, labels[ngrps]);
2079           zeroset(matches);
2080           MALLOC(grps[ngrps].elems, position, d->nleaves);
2081           grps[ngrps].nelem = 1;
2082           grps[ngrps].elems[0] = pos;
2083           ++ngrps;
2084         }
2085     }
2086
2087   MALLOC(follows.elems, position, d->nleaves);
2088   MALLOC(tmp.elems, position, d->nleaves);
2089
2090   /* If we are a searching matcher, the default transition is to a state
2091      containing the positions of state 0, otherwise the default transition
2092      is to fail miserably. */
2093   if (d->searchflag)
2094     {
2095       wants_newline = 0;
2096       wants_letter = 0;
2097       for (i = 0; i < d->states[0].elems.nelem; ++i)
2098         {
2099           if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
2100             wants_newline = 1;
2101           if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
2102             wants_letter = 1;
2103         }
2104       copy(&d->states[0].elems, &follows);
2105       state = state_index(d, &follows, 0, 0);
2106       if (wants_newline)
2107         state_newline = state_index(d, &follows, 1, 0);
2108       else
2109         state_newline = state;
2110       if (wants_letter)
2111         state_letter = state_index(d, &follows, 0, 1);
2112       else
2113         state_letter = state;
2114       for (i = 0; i < NOTCHAR; ++i)
2115         trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
2116       trans[eolbyte] = state_newline;
2117     }
2118   else
2119     for (i = 0; i < NOTCHAR; ++i)
2120       trans[i] = -1;
2121
2122   for (i = 0; i < ngrps; ++i)
2123     {
2124       follows.nelem = 0;
2125
2126       /* Find the union of the follows of the positions of the group.
2127          This is a hideously inefficient loop.  Fix it someday. */
2128       for (j = 0; j < grps[i].nelem; ++j)
2129         for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
2130           insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
2131
2132 #ifdef MBS_SUPPORT
2133       if (MB_CUR_MAX > 1)
2134         {
2135           /* If a token in follows.elems is not 1st byte of a multibyte
2136              character, or the states of follows must accept the bytes
2137              which are not 1st byte of the multibyte character.
2138              Then, if a state of follows encounter a byte, it must not be
2139              a 1st byte of a multibyte character nor singlebyte character.
2140              We cansel to add state[0].follows to next state, because
2141              state[0] must accept 1st-byte
2142
2143              For example, we assume <sb a> is a certain singlebyte
2144              character, <mb A> is a certain multibyte character, and the
2145              codepoint of <sb a> equals the 2nd byte of the codepoint of
2146              <mb A>.
2147              When state[0] accepts <sb a>, state[i] transit to state[i+1]
2148              by accepting accepts 1st byte of <mb A>, and state[i+1]
2149              accepts 2nd byte of <mb A>, if state[i+1] encounter the
2150              codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2151              <mb A>, so we can not add state[0].  */
2152
2153           next_isnt_1st_byte = 0;
2154           for (j = 0; j < follows.nelem; ++j)
2155             {
2156               if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2157                 {
2158                   next_isnt_1st_byte = 1;
2159                   break;
2160                 }
2161             }
2162         }
2163 #endif
2164
2165       /* If we are building a searching matcher, throw in the positions
2166          of state 0 as well. */
2167 #ifdef MBS_SUPPORT
2168       if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte))
2169 #else
2170       if (d->searchflag)
2171 #endif
2172         for (j = 0; j < d->states[0].elems.nelem; ++j)
2173           insert(d->states[0].elems.elems[j], &follows);
2174
2175       /* Find out if the new state will want any context information. */
2176       wants_newline = 0;
2177       if (tstbit(eolbyte, labels[i]))
2178         for (j = 0; j < follows.nelem; ++j)
2179           if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
2180             wants_newline = 1;
2181
2182       wants_letter = 0;
2183       for (j = 0; j < CHARCLASS_INTS; ++j)
2184         if (labels[i][j] & letters[j])
2185           break;
2186       if (j < CHARCLASS_INTS)
2187         for (j = 0; j < follows.nelem; ++j)
2188           if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
2189             wants_letter = 1;
2190
2191       /* Find the state(s) corresponding to the union of the follows. */
2192       state = state_index(d, &follows, 0, 0);
2193       if (wants_newline)
2194         state_newline = state_index(d, &follows, 1, 0);
2195       else
2196         state_newline = state;
2197       if (wants_letter)
2198         state_letter = state_index(d, &follows, 0, 1);
2199       else
2200         state_letter = state;
2201
2202       /* Set the transitions for each character in the current label. */
2203       for (j = 0; j < CHARCLASS_INTS; ++j)
2204         for (k = 0; k < INTBITS; ++k)
2205           if (labels[i][j] & 1 << k)
2206             {
2207               int c = j * INTBITS + k;
2208
2209               if (c == eolbyte)
2210                 trans[c] = state_newline;
2211               else if (IS_WORD_CONSTITUENT(c))
2212                 trans[c] = state_letter;
2213               else if (c < NOTCHAR)
2214                 trans[c] = state;
2215             }
2216     }
2217
2218   for (i = 0; i < ngrps; ++i)
2219     free(grps[i].elems);
2220   free(follows.elems);
2221   free(tmp.elems);
2222 }
2223
2224 /* Some routines for manipulating a compiled dfa's transition tables.
2225    Each state may or may not have a transition table; if it does, and it
2226    is a non-accepting state, then d->trans[state] points to its table.
2227    If it is an accepting state then d->fails[state] points to its table.
2228    If it has no table at all, then d->trans[state] is NULL.
2229    TODO: Improve this comment, get rid of the unnecessary redundancy. */
2230
2231 static void
2232 build_state (int s, struct dfa *d)
2233 {
2234   int *trans;                   /* The new transition table. */
2235   int i;
2236
2237   /* Set an upper limit on the number of transition tables that will ever
2238      exist at once.  1024 is arbitrary.  The idea is that the frequently
2239      used transition tables will be quickly rebuilt, whereas the ones that
2240      were only needed once or twice will be cleared away. */
2241   if (d->trcount >= 1024)
2242     {
2243       for (i = 0; i < d->tralloc; ++i)
2244         if (d->trans[i])
2245           {
2246             free((ptr_t) d->trans[i]);
2247             d->trans[i] = NULL;
2248           }
2249         else if (d->fails[i])
2250           {
2251             free((ptr_t) d->fails[i]);
2252             d->fails[i] = NULL;
2253           }
2254       d->trcount = 0;
2255     }
2256
2257   ++d->trcount;
2258
2259   /* Set up the success bits for this state. */
2260   d->success[s] = 0;
2261   if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
2262       s, *d))
2263     d->success[s] |= 4;
2264   if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
2265       s, *d))
2266     d->success[s] |= 2;
2267   if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
2268       s, *d))
2269     d->success[s] |= 1;
2270
2271   MALLOC(trans, int, NOTCHAR);
2272   dfastate(s, d, trans);
2273
2274   /* Now go through the new transition table, and make sure that the trans
2275      and fail arrays are allocated large enough to hold a pointer for the
2276      largest state mentioned in the table. */
2277   for (i = 0; i < NOTCHAR; ++i)
2278     if (trans[i] >= d->tralloc)
2279       {
2280         int oldalloc = d->tralloc;
2281
2282         while (trans[i] >= d->tralloc)
2283           d->tralloc *= 2;
2284         REALLOC(d->realtrans, int *, d->tralloc + 1);
2285         d->trans = d->realtrans + 1;
2286         REALLOC(d->fails, int *, d->tralloc);
2287         REALLOC(d->success, int, d->tralloc);
2288         while (oldalloc < d->tralloc)
2289           {
2290             d->trans[oldalloc] = NULL;
2291             d->fails[oldalloc++] = NULL;
2292           }
2293       }
2294
2295   /* Newline is a sentinel.  */
2296   trans[eolbyte] = -1;
2297
2298   if (ACCEPTING(s, *d))
2299     d->fails[s] = trans;
2300   else
2301     d->trans[s] = trans;
2302 }
2303
2304 static void
2305 build_state_zero (struct dfa *d)
2306 {
2307   d->tralloc = 1;
2308   d->trcount = 0;
2309   CALLOC(d->realtrans, int *, d->tralloc + 1);
2310   d->trans = d->realtrans + 1;
2311   CALLOC(d->fails, int *, d->tralloc);
2312   MALLOC(d->success, int, d->tralloc);
2313   build_state(0, d);
2314 }
2315
2316 #ifdef MBS_SUPPORT
2317 /* Multibyte character handling sub-routins for dfaexec.  */
2318
2319 /* Initial state may encounter the byte which is not a singlebyte character
2320    nor 1st byte of a multibyte character.  But it is incorrect for initial
2321    state to accept such a byte.
2322    For example, in sjis encoding the regular expression like "\\" accepts
2323    the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2324    0x815c. Then Initial state must skip the bytes which are not a singlebyte
2325    character nor 1st byte of a multibyte character.  */
2326 #define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p)          \
2327   if (s == 0)                                           \
2328     {                                                   \
2329       while (inputwcs[p - buf_begin] == 0               \
2330             && mblen_buf[p - buf_begin] > 0             \
2331             && p < buf_end)                             \
2332         ++p;                                            \
2333       if (p >= end)                                     \
2334         {                                               \
2335           free(mblen_buf);                              \
2336           free(inputwcs);                               \
2337           return (size_t) -1;                           \
2338         }                                               \
2339     }
2340
2341 static void
2342 realloc_trans_if_necessary(struct dfa *d, int new_state)
2343 {
2344   /* Make sure that the trans and fail arrays are allocated large enough
2345      to hold a pointer for the new state. */
2346   if (new_state >= d->tralloc)
2347     {
2348       int oldalloc = d->tralloc;
2349
2350       while (new_state >= d->tralloc)
2351         d->tralloc *= 2;
2352       REALLOC(d->realtrans, int *, d->tralloc + 1);
2353       d->trans = d->realtrans + 1;
2354       REALLOC(d->fails, int *, d->tralloc);
2355       REALLOC(d->success, int, d->tralloc);
2356       while (oldalloc < d->tralloc)
2357         {
2358           d->trans[oldalloc] = NULL;
2359           d->fails[oldalloc++] = NULL;
2360         }
2361     }
2362 }
2363
2364 /* Return values of transit_state_singlebyte(), and
2365    transit_state_consume_1char.  */
2366 typedef enum
2367 {
2368   TRANSIT_STATE_IN_PROGRESS,    /* State transition has not finished.  */
2369   TRANSIT_STATE_DONE,           /* State transition has finished.  */
2370   TRANSIT_STATE_END_BUFFER      /* Reach the end of the buffer.  */
2371 } status_transit_state;
2372
2373 /* Consume a single byte and transit state from 's' to '*next_state'.
2374    This function is almost same as the state transition routin in dfaexec().
2375    But state transition is done just once, otherwise matching succeed or
2376    reach the end of the buffer.  */
2377 static status_transit_state
2378 transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p,
2379                                   int *next_state)
2380 {
2381   int *t;
2382   int works = s;
2383
2384   status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2385
2386   while (rval == TRANSIT_STATE_IN_PROGRESS)
2387     {
2388       if ((t = d->trans[works]) != NULL)
2389         {
2390           works = t[*p];
2391           rval = TRANSIT_STATE_DONE;
2392           if (works < 0)
2393             works = 0;
2394         }
2395       else if (works < 0)
2396         {
2397           if (p == buf_end)
2398             /* At the moment, it must not happen.  */
2399             return TRANSIT_STATE_END_BUFFER;
2400           works = 0;
2401         }
2402       else if (d->fails[works])
2403         {
2404           works = d->fails[works][*p];
2405           rval = TRANSIT_STATE_DONE;
2406         }
2407       else
2408         {
2409           build_state(works, d);
2410         }
2411     }
2412   *next_state = works;
2413   return rval;
2414 }
2415
2416 /* Check whether period can match or not in the current context.  If it can,
2417    return the amount of the bytes with which period can match, otherwise
2418    return 0.
2419    `pos' is the position of the period.  `index' is the index from the
2420    buf_begin, and it is the current position in the buffer.  */
2421 static int
2422 match_anychar (struct dfa *d, int s, position pos, int index)
2423 {
2424   int newline = 0;
2425   int letter = 0;
2426   wchar_t wc;
2427   int mbclen;
2428
2429   wc = inputwcs[index];
2430   mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2431
2432   /* Check context.  */
2433   if (wc == (wchar_t)eolbyte)
2434     {
2435       if (!(syntax_bits & RE_DOT_NEWLINE))
2436         return 0;
2437       newline = 1;
2438     }
2439   else if (wc == (wchar_t)'\0')
2440     {
2441       if (syntax_bits & RE_DOT_NOT_NULL)
2442         return 0;
2443       newline = 1;
2444     }
2445
2446   if (iswalnum(wc) || wc == L'_')
2447     letter = 1;
2448
2449   if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2450                            newline, d->states[s].letter, letter))
2451     return 0;
2452
2453   return mbclen;
2454 }
2455
2456 /* Check whether bracket expression can match or not in the current context.
2457    If it can, return the amount of the bytes with which expression can match,
2458    otherwise return 0.
2459    `pos' is the position of the bracket expression.  `index' is the index
2460    from the buf_begin, and it is the current position in the buffer.  */
2461 int
2462 match_mb_charset (struct dfa *d, int s, position pos, int index)
2463 {
2464   int i;
2465   int match;            /* Flag which represent that matching succeed.  */
2466   int match_len;        /* Length of the character (or collating element)
2467                            with which this operator match.  */
2468   int op_len;           /* Length of the operator.  */
2469   char buffer[128];
2470   wchar_t wcbuf[6];
2471
2472   /* Pointer to the structure to which we are currently reffering.  */
2473   struct mb_char_classes *work_mbc;
2474
2475   int newline = 0;
2476   int letter = 0;
2477   wchar_t wc;           /* Current reffering character.  */
2478
2479   wc = inputwcs[index];
2480
2481   /* Check context.  */
2482   if (wc == (wchar_t)eolbyte)
2483     {
2484       if (!(syntax_bits & RE_DOT_NEWLINE))
2485         return 0;
2486       newline = 1;
2487     }
2488   else if (wc == (wchar_t)'\0')
2489     {
2490       if (syntax_bits & RE_DOT_NOT_NULL)
2491         return 0;
2492       newline = 1;
2493     }
2494   if (iswalnum(wc) || wc == L'_')
2495     letter = 1;
2496   if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2497                            newline, d->states[s].letter, letter))
2498     return 0;
2499
2500   /* Assign the current reffering operator to work_mbc.  */
2501   work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2502   match = !work_mbc->invert;
2503   match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2504
2505   /* match with a character class?  */
2506   for (i = 0; i<work_mbc->nch_classes; i++)
2507     {
2508       if (iswctype((wint_t)wc, work_mbc->ch_classes[i]))
2509         goto charset_matched;
2510     }
2511
2512   strncpy(buffer, buf_begin + index, match_len);
2513   buffer[match_len] = '\0';
2514
2515   /* match with an equivalent class?  */
2516   for (i = 0; i<work_mbc->nequivs; i++)
2517     {
2518       op_len = strlen(work_mbc->equivs[i]);
2519       strncpy(buffer, buf_begin + index, op_len);
2520       buffer[op_len] = '\0';
2521       if (strcoll(work_mbc->equivs[i], buffer) == 0)
2522         {
2523           match_len = op_len;
2524           goto charset_matched;
2525         }
2526     }
2527
2528   /* match with a collating element?  */
2529   for (i = 0; i<work_mbc->ncoll_elems; i++)
2530     {
2531       op_len = strlen(work_mbc->coll_elems[i]);
2532       strncpy(buffer, buf_begin + index, op_len);
2533       buffer[op_len] = '\0';
2534
2535       if (strcoll(work_mbc->coll_elems[i], buffer) == 0)
2536         {
2537           match_len = op_len;
2538           goto charset_matched;
2539         }
2540     }
2541
2542   wcbuf[0] = wc;
2543   wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
2544
2545   /* match with a range?  */
2546   for (i = 0; i<work_mbc->nranges; i++)
2547     {
2548       wcbuf[2] = work_mbc->range_sts[i];
2549       wcbuf[4] = work_mbc->range_ends[i];
2550
2551       if (wcscoll(wcbuf, wcbuf+2) >= 0 &&
2552           wcscoll(wcbuf+4, wcbuf) >= 0)
2553         goto charset_matched;
2554     }
2555
2556   /* match with a character?  */
2557   for (i = 0; i<work_mbc->nchars; i++)
2558     {
2559       if (wc == work_mbc->chars[i])
2560         goto charset_matched;
2561     }
2562
2563   match = !match;
2564
2565  charset_matched:
2566   return match ? match_len : 0;
2567 }
2568
2569 /* Check each of `d->states[s].mbps.elem' can match or not. Then return the
2570    array which corresponds to `d->states[s].mbps.elem' and each element of
2571    the array contains the amount of the bytes with which the element can
2572    match.
2573    `index' is the index from the buf_begin, and it is the current position
2574    in the buffer.
2575    Caller MUST free the array which this function return.  */
2576 static int*
2577 check_matching_with_multibyte_ops (struct dfa *d, int s, int index)
2578 {
2579   int i;
2580   int* rarray;
2581
2582   MALLOC(rarray, int, d->states[s].mbps.nelem);
2583   for (i = 0; i < d->states[s].mbps.nelem; ++i)
2584     {
2585       position pos = d->states[s].mbps.elems[i];
2586       switch(d->tokens[pos.index])
2587         {
2588         case ANYCHAR:
2589           rarray[i] = match_anychar(d, s, pos, index);
2590           break;
2591         case MBCSET:
2592           rarray[i] = match_mb_charset(d, s, pos, index);
2593           break;
2594         default:
2595           break; /* can not happen.  */
2596         }
2597     }
2598   return rarray;
2599 }
2600
2601 /* Consume a single character and enumerate all of the positions which can
2602    be next position from the state `s'.
2603    `match_lens' is the input. It can be NULL, but it can also be the output
2604    of check_matching_with_multibyte_ops() for optimization.
2605    `mbclen' and `pps' are the output.  `mbclen' is the length of the
2606    character consumed, and `pps' is the set this function enumerate.  */
2607 static status_transit_state
2608 transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp,
2609                              int *match_lens, int *mbclen, position_set *pps)
2610 {
2611   int i, j;
2612   int s1, s2;
2613   int* work_mbls;
2614   status_transit_state rs = TRANSIT_STATE_DONE;
2615
2616   /* Calculate the length of the (single/multi byte) character
2617      to which p points.  */
2618   *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1
2619     : mblen_buf[*pp - buf_begin];
2620
2621   /* Calculate the state which can be reached from the state `s' by
2622      consuming `*mbclen' single bytes from the buffer.  */
2623   s1 = s;
2624   for (i = 0; i < *mbclen; i++)
2625     {
2626       s2 = s1;
2627       rs = transit_state_singlebyte(d, s2, (*pp)++, &s1);
2628     }
2629   /* Copy the positions contained by `s1' to the set `pps'.  */
2630   copy(&(d->states[s1].elems), pps);
2631
2632   /* Check (inputed)match_lens, and initialize if it is NULL.  */
2633   if (match_lens == NULL && d->states[s].mbps.nelem != 0)
2634     work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2635   else
2636     work_mbls = match_lens;
2637
2638   /* Add all of the positions which can be reached from `s' by consuming
2639      a single character.  */
2640   for (i = 0; i < d->states[s].mbps.nelem ; i++)
2641    {
2642       if (work_mbls[i] == *mbclen)
2643         for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
2644              j++)
2645           insert(d->follows[d->states[s].mbps.elems[i].index].elems[j],
2646                  pps);
2647     }
2648
2649   if (match_lens == NULL && work_mbls != NULL)
2650     free(work_mbls);
2651   return rs;
2652 }
2653
2654 /* Transit state from s, then return new state and update the pointer of the
2655    buffer.  This function is for some operator which can match with a multi-
2656    byte character or a collating element(which may be multi characters).  */
2657 static int
2658 transit_state (struct dfa *d, int s, unsigned char const **pp)
2659 {
2660   int s1;
2661   int mbclen;           /* The length of current input multibyte character. */
2662   int maxlen = 0;
2663   int i, j;
2664   int *match_lens = NULL;
2665   int nelem = d->states[s].mbps.nelem; /* Just a alias.  */
2666   position_set follows;
2667   unsigned char const *p1 = *pp;
2668   status_transit_state rs;
2669   wchar_t wc;
2670
2671   if (nelem > 0)
2672     /* This state has (a) multibyte operator(s).
2673        We check whether each of them can match or not.  */
2674     {
2675       /* Note: caller must free the return value of this function.  */
2676       match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2677
2678       for (i = 0; i < nelem; i++)
2679         /* Search the operator which match the longest string,
2680            in this state.  */
2681         {
2682           if (match_lens[i] > maxlen)
2683             maxlen = match_lens[i];
2684         }
2685     }
2686
2687   if (nelem == 0 || maxlen == 0)
2688     /* This state has no multibyte operator which can match.
2689        We need to  check only one singlebyte character.  */
2690     {
2691       status_transit_state rs;
2692       rs = transit_state_singlebyte(d, s, *pp, &s1);
2693
2694       /* We must update the pointer if state transition succeeded.  */
2695       if (rs == TRANSIT_STATE_DONE)
2696         ++*pp;
2697
2698       if (match_lens != NULL)
2699         free(match_lens);
2700       return s1;
2701     }
2702
2703   /* This state has some operators which can match a multibyte character.  */
2704   follows.nelem = 0;
2705   MALLOC(follows.elems, position, d->nleaves);
2706
2707   /* `maxlen' may be longer than the length of a character, because it may
2708      not be a character but a (multi character) collating element.
2709      We enumerate all of the positions which `s' can reach by consuming
2710      `maxlen' bytes.  */
2711   rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
2712
2713   wc = inputwcs[*pp - mbclen - buf_begin];
2714   s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2715   realloc_trans_if_necessary(d, s1);
2716
2717   while (*pp - p1 < maxlen)
2718     {
2719       follows.nelem = 0;
2720       rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
2721
2722       for (i = 0; i < nelem ; i++)
2723         {
2724           if (match_lens[i] == *pp - p1)
2725             for (j = 0;
2726                  j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
2727               insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j],
2728                      &follows);
2729         }
2730
2731       wc = inputwcs[*pp - mbclen - buf_begin];
2732       s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2733       realloc_trans_if_necessary(d, s1);
2734     }
2735   free(match_lens);
2736   free(follows.elems);
2737   return s1;
2738 }
2739
2740 #endif
2741
2742 /* Search through a buffer looking for a match to the given struct dfa.
2743    Find the first occurrence of a string matching the regexp in the buffer,
2744    and the shortest possible version thereof.  Return the offset of the first
2745    character after the match, or (size_t) -1 if none is found.  BEGIN points to
2746    the beginning of the buffer, and SIZE is the size of the buffer.  If SIZE
2747    is nonzero, BEGIN[SIZE - 1] must be a newline.  BACKREF points to a place
2748    where we're supposed to store a 1 if backreferencing happened and the
2749    match needs to be verified by a backtracking matcher.  Otherwise
2750    we store a 0 in *backref. */
2751 size_t
2752 dfaexec (struct dfa *d, char const *begin, size_t size, int *backref)
2753 {
2754   register int s;       /* Current state. */
2755   register unsigned char const *p; /* Current input character. */
2756   register unsigned char const *end; /* One past the last input character.  */
2757   register int **trans, *t;     /* Copy of d->trans so it can be optimized
2758                                    into a register. */
2759   register unsigned char eol = eolbyte; /* Likewise for eolbyte.  */
2760   static int sbit[NOTCHAR];     /* Table for anding with d->success. */
2761   static int sbit_init;
2762
2763   if (! sbit_init)
2764     {
2765       int i;
2766
2767       sbit_init = 1;
2768       for (i = 0; i < NOTCHAR; ++i)
2769         sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
2770       sbit[eol] = 4;
2771     }
2772
2773   if (! d->tralloc)
2774     build_state_zero(d);
2775
2776   s = 0;
2777   p = (unsigned char const *) begin;
2778   end = p + size;
2779   trans = d->trans;
2780
2781 #ifdef MBS_SUPPORT
2782   if (MB_CUR_MAX > 1)
2783     {
2784       int remain_bytes, i;
2785       buf_begin = begin;
2786       buf_end = end;
2787
2788       /* initialize mblen_buf, and inputwcs.  */
2789       MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2);
2790       MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2);
2791       memset(&mbs, 0, sizeof(mbstate_t));
2792       remain_bytes = 0;
2793       for (i = 0; i < end - (unsigned char const *)begin + 1; i++)
2794         {
2795           if (remain_bytes == 0)
2796             {
2797               remain_bytes
2798                 = mbrtowc(inputwcs + i, begin + i,
2799                           end - (unsigned char const *)begin - i + 1, &mbs);
2800               if (remain_bytes <= 1)
2801                 {
2802                   remain_bytes = 0;
2803                   inputwcs[i] = (wchar_t)begin[i];
2804                   mblen_buf[i] = 0;
2805                 }
2806               else
2807                 {
2808                   mblen_buf[i] = remain_bytes;
2809                   remain_bytes--;
2810                 }
2811             }
2812           else
2813             {
2814               mblen_buf[i] = remain_bytes;
2815               inputwcs[i] = 0;
2816               remain_bytes--;
2817             }
2818         }
2819       mblen_buf[i] = 0;
2820       inputwcs[i] = 0; /* sentinel */
2821     }
2822 #endif /* MBS_SUPPORT */
2823
2824   for (;;)
2825     {
2826 #ifdef MBS_SUPPORT
2827       if (MB_CUR_MAX > 1)
2828         while ((t = trans[s]))
2829           {
2830             if (d->states[s].mbps.nelem != 0)
2831               {
2832                 /* Can match with a multibyte character( and multi character
2833                    collating element).  */
2834                 unsigned char const *nextp;
2835
2836                 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2837
2838                 nextp = p;
2839                 s = transit_state(d, s, &nextp);
2840                 p = nextp;
2841
2842                 /* Trans table might be updated.  */
2843                 trans = d->trans;
2844               }
2845             else
2846               {
2847                 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2848                 s = t[*p++];
2849               }
2850           }
2851       else
2852 #endif /* MBS_SUPPORT */
2853         while ((t = trans[s]))
2854           s = t[*p++];
2855
2856       if (s < 0)
2857         {
2858           if (p == end)
2859             {
2860 #ifdef MBS_SUPPORT
2861               if (MB_CUR_MAX > 1)
2862                 {
2863                   free(mblen_buf);
2864                   free(inputwcs);
2865                 }
2866 #endif /* MBS_SUPPORT */
2867               return (size_t) -1;
2868             }
2869           s = 0;
2870         }
2871       else if ((t = d->fails[s]))
2872         {
2873           if (d->success[s] & sbit[*p])
2874             {
2875               if (backref)
2876                 *backref = (d->states[s].backref != 0);
2877 #ifdef MBS_SUPPORT
2878               if (MB_CUR_MAX > 1)
2879                 {
2880                   free(mblen_buf);
2881                   free(inputwcs);
2882                 }
2883 #endif /* MBS_SUPPORT */
2884               return (char const *) p - begin;
2885             }
2886
2887 #ifdef MBS_SUPPORT
2888           if (MB_CUR_MAX > 1)
2889             {
2890                 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2891                 if (d->states[s].mbps.nelem != 0)
2892                   {
2893                     /* Can match with a multibyte character( and multi
2894                        character collating element).  */
2895                     unsigned char const *nextp;
2896                     nextp = p;
2897                     s = transit_state(d, s, &nextp);
2898                     p = nextp;
2899
2900                     /* Trans table might be updated.  */
2901                     trans = d->trans;
2902                   }
2903                 else
2904                 s = t[*p++];
2905             }
2906           else
2907 #endif /* MBS_SUPPORT */
2908           s = t[*p++];
2909         }
2910       else
2911         {
2912           build_state(s, d);
2913           trans = d->trans;
2914         }
2915     }
2916 }
2917
2918 /* Initialize the components of a dfa that the other routines don't
2919    initialize for themselves. */
2920 void
2921 dfainit (struct dfa *d)
2922 {
2923   d->calloc = 1;
2924   MALLOC(d->charclasses, charclass, d->calloc);
2925   d->cindex = 0;
2926
2927   d->talloc = 1;
2928   MALLOC(d->tokens, token, d->talloc);
2929   d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2930 #ifdef MBS_SUPPORT
2931   if (MB_CUR_MAX > 1)
2932     {
2933       d->nmultibyte_prop = 1;
2934       MALLOC(d->multibyte_prop, int, d->nmultibyte_prop);
2935       d->nmbcsets = 0;
2936       d->mbcsets_alloc = 1;
2937       MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc);
2938     }
2939 #endif
2940
2941   d->searchflag = 0;
2942   d->tralloc = 0;
2943
2944   d->musts = 0;
2945 }
2946
2947 /* Parse and analyze a single string of the given length. */
2948 void
2949 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
2950 {
2951   if (case_fold)        /* dummy folding in service of dfamust() */
2952     {
2953       char *lcopy;
2954       int i;
2955
2956       lcopy = malloc(len);
2957       if (!lcopy)
2958         dfaerror(_("out of memory"));
2959
2960       /* This is a kludge. */
2961       case_fold = 0;
2962       for (i = 0; i < len; ++i)
2963         if (ISUPPER ((unsigned char) s[i]))
2964           lcopy[i] = tolower ((unsigned char) s[i]);
2965         else
2966           lcopy[i] = s[i];
2967
2968       dfainit(d);
2969       dfaparse(lcopy, len, d);
2970       free(lcopy);
2971       dfamust(d);
2972       d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2973       case_fold = 1;
2974       dfaparse(s, len, d);
2975       dfaanalyze(d, searchflag);
2976     }
2977   else
2978     {
2979         dfainit(d);
2980         dfaparse(s, len, d);
2981         dfamust(d);
2982         dfaanalyze(d, searchflag);
2983     }
2984 }
2985
2986 /* Free the storage held by the components of a dfa. */
2987 void
2988 dfafree (struct dfa *d)
2989 {
2990   int i;
2991   struct dfamust *dm, *ndm;
2992
2993   free((ptr_t) d->charclasses);
2994   free((ptr_t) d->tokens);
2995
2996 #ifdef MBS_SUPPORT
2997   if (MB_CUR_MAX > 1)
2998     {
2999       free((ptr_t) d->multibyte_prop);
3000       for (i = 0; i < d->nmbcsets; ++i)
3001         {
3002           int j;
3003           struct mb_char_classes *p = &(d->mbcsets[i]);
3004           if (p->chars != NULL)
3005             free(p->chars);
3006           if (p->ch_classes != NULL)
3007             free(p->ch_classes);
3008           if (p->range_sts != NULL)
3009             free(p->range_sts);
3010           if (p->range_ends != NULL)
3011             free(p->range_ends);
3012
3013           for (j = 0; j < p->nequivs; ++j)
3014             free(p->equivs[j]);
3015           if (p->equivs != NULL)
3016             free(p->equivs);
3017
3018           for (j = 0; j < p->ncoll_elems; ++j)
3019             free(p->coll_elems[j]);
3020           if (p->coll_elems != NULL)
3021             free(p->coll_elems);
3022         }
3023       free((ptr_t) d->mbcsets);
3024     }
3025 #endif /* MBS_SUPPORT */
3026
3027   for (i = 0; i < d->sindex; ++i)
3028     free((ptr_t) d->states[i].elems.elems);
3029   free((ptr_t) d->states);
3030   for (i = 0; i < d->tindex; ++i)
3031     if (d->follows[i].elems)
3032       free((ptr_t) d->follows[i].elems);
3033   free((ptr_t) d->follows);
3034   for (i = 0; i < d->tralloc; ++i)
3035     if (d->trans[i])
3036       free((ptr_t) d->trans[i]);
3037     else if (d->fails[i])
3038       free((ptr_t) d->fails[i]);
3039   if (d->realtrans) free((ptr_t) d->realtrans);
3040   if (d->fails) free((ptr_t) d->fails);
3041   if (d->success) free((ptr_t) d->success);
3042   for (dm = d->musts; dm; dm = ndm)
3043     {
3044       ndm = dm->next;
3045       free(dm->must);
3046       free((ptr_t) dm);
3047     }
3048 }
3049
3050 /* Having found the postfix representation of the regular expression,
3051    try to find a long sequence of characters that must appear in any line
3052    containing the r.e.
3053    Finding a "longest" sequence is beyond the scope here;
3054    we take an easy way out and hope for the best.
3055    (Take "(ab|a)b"--please.)
3056
3057    We do a bottom-up calculation of sequences of characters that must appear
3058    in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3059    representation:
3060         sequences that must appear at the left of the match ("left")
3061         sequences that must appear at the right of the match ("right")
3062         lists of sequences that must appear somewhere in the match ("in")
3063         sequences that must constitute the match ("is")
3064
3065    When we get to the root of the tree, we use one of the longest of its
3066    calculated "in" sequences as our answer.  The sequence we find is returned in
3067    d->must (where "d" is the single argument passed to "dfamust");
3068    the length of the sequence is returned in d->mustn.
3069
3070    The sequences calculated for the various types of node (in pseudo ANSI c)
3071    are shown below.  "p" is the operand of unary operators (and the left-hand
3072    operand of binary operators); "q" is the right-hand operand of binary
3073    operators.
3074
3075    "ZERO" means "a zero-length sequence" below.
3076
3077         Type    left            right           is              in
3078         ----    ----            -----           --              --
3079         char c  # c             # c             # c             # c
3080
3081         ANYCHAR ZERO            ZERO            ZERO            ZERO
3082
3083         MBCSET  ZERO            ZERO            ZERO            ZERO
3084
3085         CSET    ZERO            ZERO            ZERO            ZERO
3086
3087         STAR    ZERO            ZERO            ZERO            ZERO
3088
3089         QMARK   ZERO            ZERO            ZERO            ZERO
3090
3091         PLUS    p->left         p->right        ZERO            p->in
3092
3093         CAT     (p->is==ZERO)?  (q->is==ZERO)?  (p->is!=ZERO && p->in plus
3094                 p->left :       q->right :      q->is!=ZERO) ?  q->in plus
3095                 p->is##q->left  p->right##q->is p->is##q->is :  p->right##q->left
3096                                                 ZERO
3097
3098         OR      longest common  longest common  (do p->is and   substrings common to
3099                 leading         trailing        q->is have same p->in and q->in
3100                 (sub)sequence   (sub)sequence   length and
3101                 of p->left      of p->right     content) ?
3102                 and q->left     and q->right    p->is : NULL
3103
3104    If there's anything else we recognize in the tree, all four sequences get set
3105    to zero-length sequences.  If there's something we don't recognize in the tree,
3106    we just return a zero-length sequence.
3107
3108    Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3109    'aaa')?
3110
3111    And. . .is it here or someplace that we might ponder "optimizations" such as
3112         egrep 'psi|epsilon'     ->      egrep 'psi'
3113         egrep 'pepsi|epsilon'   ->      egrep 'epsi'
3114                                         (Yes, we now find "epsi" as a "string
3115                                         that must occur", but we might also
3116                                         simplify the *entire* r.e. being sought)
3117         grep '[c]'              ->      grep 'c'
3118         grep '(ab|a)b'          ->      grep 'ab'
3119         grep 'ab*'              ->      grep 'a'
3120         grep 'a*b'              ->      grep 'b'
3121
3122    There are several issues:
3123
3124    Is optimization easy (enough)?
3125
3126    Does optimization actually accomplish anything,
3127    or is the automaton you get from "psi|epsilon" (for example)
3128    the same as the one you get from "psi" (for example)?
3129
3130    Are optimizable r.e.'s likely to be used in real-life situations
3131    (something like 'ab*' is probably unlikely; something like is
3132    'psi|epsilon' is likelier)? */
3133
3134 static char *
3135 icatalloc (char *old, char *new)
3136 {
3137   char *result;
3138   size_t oldsize, newsize;
3139
3140   newsize = (new == NULL) ? 0 : strlen(new);
3141   if (old == NULL)
3142     oldsize = 0;
3143   else if (newsize == 0)
3144     return old;
3145   else  oldsize = strlen(old);
3146   if (old == NULL)
3147     result = (char *) malloc(newsize + 1);
3148   else
3149     result = (char *) realloc((void *) old, oldsize + newsize + 1);
3150   if (result != NULL && new != NULL)
3151     (void) strcpy(result + oldsize, new);
3152   return result;
3153 }
3154
3155 static char *
3156 icpyalloc (char *string)
3157 {
3158   return icatalloc((char *) NULL, string);
3159 }
3160
3161 static char *
3162 istrstr (char *lookin, char *lookfor)
3163 {
3164   char *cp;
3165   size_t len;
3166
3167   len = strlen(lookfor);
3168   for (cp = lookin; *cp != '\0'; ++cp)
3169     if (strncmp(cp, lookfor, len) == 0)
3170       return cp;
3171   return NULL;
3172 }
3173
3174 static void
3175 ifree (char *cp)
3176 {
3177   if (cp != NULL)
3178     free(cp);
3179 }
3180
3181 static void
3182 freelist (char **cpp)
3183 {
3184   int i;
3185
3186   if (cpp == NULL)
3187     return;
3188   for (i = 0; cpp[i] != NULL; ++i)
3189     {
3190       free(cpp[i]);
3191       cpp[i] = NULL;
3192     }
3193 }
3194
3195 static char **
3196 enlist (char **cpp, char *new, size_t len)
3197 {
3198   int i, j;
3199
3200   if (cpp == NULL)
3201     return NULL;
3202   if ((new = icpyalloc(new)) == NULL)
3203     {
3204       freelist(cpp);
3205       return NULL;
3206     }
3207   new[len] = '\0';
3208   /* Is there already something in the list that's new (or longer)? */
3209   for (i = 0; cpp[i] != NULL; ++i)
3210     if (istrstr(cpp[i], new) != NULL)
3211       {
3212         free(new);
3213         return cpp;
3214       }
3215   /* Eliminate any obsoleted strings. */
3216   j = 0;
3217   while (cpp[j] != NULL)
3218     if (istrstr(new, cpp[j]) == NULL)
3219       ++j;
3220     else
3221       {
3222         free(cpp[j]);
3223         if (--i == j)
3224           break;
3225         cpp[j] = cpp[i];
3226         cpp[i] = NULL;
3227       }
3228   /* Add the new string. */
3229   cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
3230   if (cpp == NULL)
3231     return NULL;
3232   cpp[i] = new;
3233   cpp[i + 1] = NULL;
3234   return cpp;
3235 }
3236
3237 /* Given pointers to two strings, return a pointer to an allocated
3238    list of their distinct common substrings. Return NULL if something
3239    seems wild. */
3240 static char **
3241 comsubs (char *left, char *right)
3242 {
3243   char **cpp;
3244   char *lcp;
3245   char *rcp;
3246   size_t i, len;
3247
3248   if (left == NULL || right == NULL)
3249     return NULL;
3250   cpp = (char **) malloc(sizeof *cpp);
3251   if (cpp == NULL)
3252     return NULL;
3253   cpp[0] = NULL;
3254   for (lcp = left; *lcp != '\0'; ++lcp)
3255     {
3256       len = 0;
3257       rcp = strchr (right, *lcp);
3258       while (rcp != NULL)
3259         {
3260           for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3261             continue;
3262           if (i > len)
3263             len = i;
3264           rcp = strchr (rcp + 1, *lcp);
3265         }
3266       if (len == 0)
3267         continue;
3268       if ((cpp = enlist(cpp, lcp, len)) == NULL)
3269         break;
3270     }
3271   return cpp;
3272 }
3273
3274 static char **
3275 addlists (char **old, char **new)
3276 {
3277   int i;
3278
3279   if (old == NULL || new == NULL)
3280     return NULL;
3281   for (i = 0; new[i] != NULL; ++i)
3282     {
3283       old = enlist(old, new[i], strlen(new[i]));
3284       if (old == NULL)
3285         break;
3286     }
3287   return old;
3288 }
3289
3290 /* Given two lists of substrings, return a new list giving substrings
3291    common to both. */
3292 static char **
3293 inboth (char **left, char **right)
3294 {
3295   char **both;
3296   char **temp;
3297   int lnum, rnum;
3298
3299   if (left == NULL || right == NULL)
3300     return NULL;
3301   both = (char **) malloc(sizeof *both);
3302   if (both == NULL)
3303     return NULL;
3304   both[0] = NULL;
3305   for (lnum = 0; left[lnum] != NULL; ++lnum)
3306     {
3307       for (rnum = 0; right[rnum] != NULL; ++rnum)
3308         {
3309           temp = comsubs(left[lnum], right[rnum]);
3310           if (temp == NULL)
3311             {
3312               freelist(both);
3313               return NULL;
3314             }
3315           both = addlists(both, temp);
3316           freelist(temp);
3317           free(temp);
3318           if (both == NULL)
3319             return NULL;
3320         }
3321     }
3322   return both;
3323 }
3324
3325 typedef struct
3326 {
3327   char **in;
3328   char *left;
3329   char *right;
3330   char *is;
3331 } must;
3332
3333 static void
3334 resetmust (must *mp)
3335 {
3336   mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3337   freelist(mp->in);
3338 }
3339
3340 static void
3341 dfamust (struct dfa *dfa)
3342 {
3343   must *musts;
3344   must *mp;
3345   char *result;
3346   int ri;
3347   int i;
3348   int exact;
3349   token t;
3350   static must must0;
3351   struct dfamust *dm;
3352   static char empty_string[] = "";
3353
3354   result = empty_string;
3355   exact = 0;
3356   musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
3357   if (musts == NULL)
3358     return;
3359   mp = musts;
3360   for (i = 0; i <= dfa->tindex; ++i)
3361     mp[i] = must0;
3362   for (i = 0; i <= dfa->tindex; ++i)
3363     {
3364       mp[i].in = (char **) malloc(sizeof *mp[i].in);
3365       mp[i].left = malloc(2);
3366       mp[i].right = malloc(2);
3367       mp[i].is = malloc(2);
3368       if (mp[i].in == NULL || mp[i].left == NULL ||
3369           mp[i].right == NULL || mp[i].is == NULL)
3370         goto done;
3371       mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
3372       mp[i].in[0] = NULL;
3373     }
3374 #ifdef DEBUG
3375   fprintf(stderr, "dfamust:\n");
3376   for (i = 0; i < dfa->tindex; ++i)
3377     {
3378       fprintf(stderr, " %d:", i);
3379       prtok(dfa->tokens[i]);
3380     }
3381   putc('\n', stderr);
3382 #endif
3383   for (ri = 0; ri < dfa->tindex; ++ri)
3384     {
3385       switch (t = dfa->tokens[ri])
3386         {
3387         case LPAREN:
3388         case RPAREN:
3389           goto done;            /* "cannot happen" */
3390         case EMPTY:
3391         case BEGLINE:
3392         case ENDLINE:
3393         case BEGWORD:
3394         case ENDWORD:
3395         case LIMWORD:
3396         case NOTLIMWORD:
3397         case BACKREF:
3398           resetmust(mp);
3399           break;
3400         case STAR:
3401         case QMARK:
3402           if (mp <= musts)
3403             goto done;          /* "cannot happen" */
3404           --mp;
3405           resetmust(mp);
3406           break;
3407         case OR:
3408         case ORTOP:
3409           if (mp < &musts[2])
3410             goto done;          /* "cannot happen" */
3411           {
3412             char **new;
3413             must *lmp;
3414             must *rmp;
3415             int j, ln, rn, n;
3416
3417             rmp = --mp;
3418             lmp = --mp;
3419             /* Guaranteed to be.  Unlikely, but. . . */
3420             if (strcmp(lmp->is, rmp->is) != 0)
3421               lmp->is[0] = '\0';
3422             /* Left side--easy */
3423             i = 0;
3424             while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3425               ++i;
3426             lmp->left[i] = '\0';
3427             /* Right side */
3428             ln = strlen(lmp->right);
3429             rn = strlen(rmp->right);
3430             n = ln;
3431             if (n > rn)
3432               n = rn;
3433             for (i = 0; i < n; ++i)
3434               if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3435                 break;
3436             for (j = 0; j < i; ++j)
3437               lmp->right[j] = lmp->right[(ln - i) + j];
3438             lmp->right[j] = '\0';
3439             new = inboth(lmp->in, rmp->in);
3440             if (new == NULL)
3441               goto done;
3442             freelist(lmp->in);
3443             free((char *) lmp->in);
3444             lmp->in = new;
3445           }
3446           break;
3447         case PLUS:
3448           if (mp <= musts)
3449             goto done;          /* "cannot happen" */
3450           --mp;
3451           mp->is[0] = '\0';
3452           break;
3453         case END:
3454           if (mp != &musts[1])
3455             goto done;          /* "cannot happen" */
3456           for (i = 0; musts[0].in[i] != NULL; ++i)
3457             if (strlen(musts[0].in[i]) > strlen(result))
3458               result = musts[0].in[i];
3459           if (strcmp(result, musts[0].is) == 0)
3460             exact = 1;
3461           goto done;
3462         case CAT:
3463           if (mp < &musts[2])
3464             goto done;          /* "cannot happen" */
3465           {
3466             must *lmp;
3467             must *rmp;
3468
3469             rmp = --mp;
3470             lmp = --mp;
3471             /* In.  Everything in left, plus everything in
3472                right, plus catenation of
3473                left's right and right's left. */
3474             lmp->in = addlists(lmp->in, rmp->in);
3475             if (lmp->in == NULL)
3476               goto done;
3477             if (lmp->right[0] != '\0' &&
3478                 rmp->left[0] != '\0')
3479               {
3480                 char *tp;
3481
3482                 tp = icpyalloc(lmp->right);
3483                 if (tp == NULL)
3484                   goto done;
3485                 tp = icatalloc(tp, rmp->left);
3486                 if (tp == NULL)
3487                   goto done;
3488                 lmp->in = enlist(lmp->in, tp,
3489                                  strlen(tp));
3490                 free(tp);
3491                 if (lmp->in == NULL)
3492                   goto done;
3493               }
3494             /* Left-hand */
3495             if (lmp->is[0] != '\0')
3496               {
3497                 lmp->left = icatalloc(lmp->left,
3498                                       rmp->left);
3499                 if (lmp->left == NULL)
3500                   goto done;
3501               }
3502             /* Right-hand */
3503             if (rmp->is[0] == '\0')
3504               lmp->right[0] = '\0';
3505             lmp->right = icatalloc(lmp->right, rmp->right);
3506             if (lmp->right == NULL)
3507               goto done;
3508             /* Guaranteed to be */
3509             if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
3510               {
3511                 lmp->is = icatalloc(lmp->is, rmp->is);
3512                 if (lmp->is == NULL)
3513                   goto done;
3514               }
3515             else
3516               lmp->is[0] = '\0';
3517           }
3518           break;
3519         default:
3520           if (t < END)
3521             {
3522               /* "cannot happen" */
3523               goto done;
3524             }
3525           else if (t == '\0')
3526             {
3527               /* not on *my* shift */
3528               goto done;
3529             }
3530           else if (t >= CSET
3531 #ifdef MBS_SUPPORT
3532                    || t == ANYCHAR
3533                    || t == MBCSET
3534 #endif /* MBS_SUPPORT */
3535                    )
3536             {
3537               /* easy enough */
3538               resetmust(mp);
3539             }
3540           else
3541             {
3542               /* plain character */
3543               resetmust(mp);
3544               mp->is[0] = mp->left[0] = mp->right[0] = t;
3545               mp->is[1] = mp->left[1] = mp->right[1] = '\0';
3546               mp->in = enlist(mp->in, mp->is, (size_t)1);
3547               if (mp->in == NULL)
3548                 goto done;
3549             }
3550           break;
3551         }
3552 #ifdef DEBUG
3553       fprintf(stderr, " node: %d:", ri);
3554       prtok(dfa->tokens[ri]);
3555       fprintf(stderr, "\n  in:");
3556       for (i = 0; mp->in[i]; ++i)
3557         fprintf(stderr, " \"%s\"", mp->in[i]);
3558       fprintf(stderr, "\n  is: \"%s\"\n", mp->is);
3559       fprintf(stderr, "  left: \"%s\"\n", mp->left);
3560       fprintf(stderr, "  right: \"%s\"\n", mp->right);
3561 #endif
3562       ++mp;
3563     }
3564  done:
3565   if (strlen(result))
3566     {
3567       dm = (struct dfamust *) malloc(sizeof (struct dfamust));
3568       dm->exact = exact;
3569       dm->must = malloc(strlen(result) + 1);
3570       strcpy(dm->must, result);
3571       dm->next = dfa->musts;
3572       dfa->musts = dm;
3573     }
3574   mp = musts;
3575   for (i = 0; i <= dfa->tindex; ++i)
3576     {
3577       freelist(mp[i].in);
3578       ifree((char *) mp[i].in);
3579       ifree(mp[i].left);
3580       ifree(mp[i].right);
3581       ifree(mp[i].is);
3582     }
3583   free((char *) mp);
3584 }
3585 /* vim:set shiftwidth=2: */