lib/dfa.c

   1 /* dfa.c - deterministic extended regexp routines for GNU
   2    Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2025 Free Software
   3    Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation, either version 3, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  17
  18 /* Written June, 1988 by Mike Haertel
  19    Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
  20
  21 #include <config.h>
  22
  23 #include "dfa.h"
  24
  25 #include "flexmember.h"
  26 #include "idx.h"
  27 #include "verify.h"
  28
  29 #include <assert.h>
  30 #include <ctype.h>
  31 #include <stdint.h>
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <limits.h>
  35 #include <string.h>
  36 #include <wchar.h>
  37
  38 #include "xalloc.h"
  39 #include "localeinfo.h"
  40
  41 #include "gettext.h"
  42 #define _(msgid) dgettext ("gnulib", msgid)
  43
  44 #if GAWK
  45 /* Use ISO C 99 API.  */
  46 # include <wctype.h>
  47 # define char32_t wchar_t
  48 # define mbrtoc32 mbrtowc
  49 # define c32rtomb wcrtomb
  50 # define c32tob wctob
  51 # define c32isprint iswprint
  52 # define c32isspace iswspace
  53 # define mbszero(p) memset ((p), 0, sizeof (mbstate_t))
  54 #else
  55 /* Use ISO C 11 + gnulib API.  */
  56 # include <uchar.h>
  57 #endif
  58
  59 /* Pacify gcc -Wanalyzer-null-dereference in areas where GCC
  60    understandably cannot deduce that the input comes from a
  61    well-formed regular expression.  There's little point to the
  62    runtime overhead of 'assert' instead of 'assume_nonnull' when the
  63    MMU will check anyway.  */
  64 #define assume_nonnull(x) assume ((x) != NULL)
  65
  66 static bool
  67 str_eq (char const *a, char const *b)
  68 {
  69   return strcmp (a, b) == 0;
  70 }
  71
  72 static bool
  73 c_isdigit (char c)
  74 {
  75   return '0' <= c && c <= '9';
  76 }
  77
  78 #ifndef FALLTHROUGH
  79 # if 201710L < __STDC_VERSION__
  80 #  define FALLTHROUGH [[__fallthrough__]]
  81 # elif ((__GNUC__ >= 7 && !defined __clang__) \
  82         || (defined __apple_build_version__ \
  83             ? __apple_build_version__ >= 12000000 \
  84             : __clang_major__ >= 10))
  85 #  define FALLTHROUGH __attribute__ ((__fallthrough__))
  86 # else
  87 #  define FALLTHROUGH ((void) 0)
  88 # endif
  89 #endif
  90
  91 #ifndef MIN
  92 # define MIN(a,b) ((a) < (b) ? (a) : (b))
  93 #endif
  94
  95 /* HPUX defines these as macros in sys/param.h.  */
  96 #ifdef setbit
  97 # undef setbit
  98 #endif
  99 #ifdef clrbit
 100 # undef clrbit
 101 #endif
 102
 103 /* For code that does not use Gnulib’s isblank module.  */
 104 #if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
 105 # define isblank dfa_isblank
 106 static int
 107 isblank (int c)
 108 {
 109   return c == ' ' || c == '\t';
 110 }
 111 #endif
 112
 113 /* First integer value that is greater than any character code.  */
 114 enum { NOTCHAR = 1 << CHAR_BIT };
 115
 116 #ifdef UINT_LEAST64_MAX
 117
 118 /* Number of bits used in a charclass word.  */
 119 enum { CHARCLASS_WORD_BITS = 64 };
 120
 121 /* This represents part of a character class.  It must be unsigned and
 122    at least CHARCLASS_WORD_BITS wide.  Any excess bits are zero.  */
 123 typedef uint_least64_t charclass_word;
 124
 125 /* Part of a charclass initializer that represents 64 bits' worth of a
 126    charclass, where LO and HI are the low and high-order 32 bits of
 127    the 64-bit quantity.  */
 128 # define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
 129
 130 #else
 131 /* Fallbacks for pre-C99 hosts that lack 64-bit integers.  */
 132 enum { CHARCLASS_WORD_BITS = 32 };
 133 typedef unsigned long charclass_word;
 134 # define CHARCLASS_PAIR(lo, hi) lo, hi
 135 #endif
 136
 137 /* An initializer for a charclass whose 32-bit words are A through H.  */
 138 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h)          \
 139    {{                                                   \
 140       CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d),     \
 141       CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h)      \
 142    }}
 143
 144 /* The maximum useful value of a charclass_word; all used bits are 1.  */
 145 static charclass_word const CHARCLASS_WORD_MASK
 146   = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
 147
 148 /* Number of words required to hold a bit for every character.  */
 149 enum
 150 {
 151   CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
 152 };
 153
 154 /* Sets of unsigned characters are stored as bit vectors in arrays of ints.  */
 155 typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
 156
 157 /* Convert a possibly-signed character to an unsigned character.  This is
 158    a bit safer than casting to unsigned char, since it catches some type
 159    errors that the cast doesn't.  */
 160 static unsigned char
 161 to_uchar (char ch)
 162 {
 163   return ch;
 164 }
 165
 166 /* Contexts tell us whether a character is a newline or a word constituent.
 167    Word-constituent characters are those that satisfy iswalnum, plus '_'.
 168    Each character has a single CTX_* value; bitmasks of CTX_* values denote
 169    a particular character class.
 170
 171    A state also stores a context value, which is a bitmask of CTX_* values.
 172    A state's context represents a set of characters that the state's
 173    predecessors must match.  For example, a state whose context does not
 174    include CTX_LETTER will never have transitions where the previous
 175    character is a word constituent.  A state whose context is CTX_ANY
 176    might have transitions from any character.  */
 177
 178 enum
 179   {
 180     CTX_NONE = 1,
 181     CTX_LETTER = 2,
 182     CTX_NEWLINE = 4,
 183     CTX_ANY = 7
 184   };
 185
 186 /* Sometimes characters can only be matched depending on the surrounding
 187    context.  Such context decisions depend on what the previous character
 188    was, and the value of the current (lookahead) character.  Context
 189    dependent constraints are encoded as 9-bit integers.  Each bit that
 190    is set indicates that the constraint succeeds in the corresponding
 191    context.
 192
 193    bit 6-8  - valid contexts when next character is CTX_NEWLINE
 194    bit 3-5  - valid contexts when next character is CTX_LETTER
 195    bit 0-2  - valid contexts when next character is CTX_NONE
 196
 197    succeeds_in_context determines whether a given constraint
 198    succeeds in a particular context.  Prev is a bitmask of possible
 199    context values for the previous character, curr is the (single-bit)
 200    context value for the lookahead character.  */
 201 static int
 202 newline_constraint (int constraint)
 203 {
 204   return (constraint >> 6) & 7;
 205 }
 206 static int
 207 letter_constraint (int constraint)
 208 {
 209   return (constraint >> 3) & 7;
 210 }
 211 static int
 212 other_constraint (int constraint)
 213 {
 214   return constraint & 7;
 215 }
 216
 217 static bool
 218 succeeds_in_context (int constraint, int prev, int curr)
 219 {
 220   return !! (((curr & CTX_NONE      ? other_constraint (constraint) : 0) \
 221               | (curr & CTX_LETTER  ? letter_constraint (constraint) : 0) \
 222               | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
 223              & prev);
 224 }
 225
 226 /* The following describe what a constraint depends on.  */
 227 static bool
 228 prev_newline_dependent (int constraint)
 229 {
 230   return ((constraint ^ constraint >> 2) & 0111) != 0;
 231 }
 232 static bool
 233 prev_letter_dependent (int constraint)
 234 {
 235   return ((constraint ^ constraint >> 1) & 0111) != 0;
 236 }
 237
 238 /* Tokens that match the empty string subject to some constraint actually
 239    work by applying that constraint to determine what may follow them,
 240    taking into account what has gone before.  The following values are
 241    the constraints corresponding to the special tokens previously defined.  */
 242 enum
 243   {
 244     NO_CONSTRAINT = 0777,
 245     BEGLINE_CONSTRAINT = 0444,
 246     ENDLINE_CONSTRAINT = 0700,
 247     BEGWORD_CONSTRAINT = 0050,
 248     ENDWORD_CONSTRAINT = 0202,
 249     LIMWORD_CONSTRAINT = 0252,
 250     NOTLIMWORD_CONSTRAINT = 0525
 251   };
 252
 253 /* The regexp is parsed into an array of tokens in postfix form.  Some tokens
 254    are operators and others are terminal symbols.  Most (but not all) of these
 255    codes are returned by the lexical analyzer.  */
 256
 257 typedef ptrdiff_t token;
 258 static token const TOKEN_MAX = PTRDIFF_MAX;
 259
 260 /* States are indexed by state_num values.  These are normally
 261    nonnegative but -1 is used as a special value.  */
 262 typedef ptrdiff_t state_num;
 263
 264 /* Predefined token values.  */
 265 enum
 266 {
 267   END = -1,                     /* END is a terminal symbol that matches the
 268                                    end of input; any value of END or less in
 269                                    the parse tree is such a symbol.  Accepting
 270                                    states of the DFA are those that would have
 271                                    a transition on END.  This is -1, not some
 272                                    more-negative value, to tweak the speed of
 273                                    comparisons to END.  */
 274
 275   /* Ordinary character values are terminal symbols that match themselves.  */
 276
 277   /* CSET must come last in the following list of special tokens.  Otherwise,
 278      the list order matters only for performance.  Related special tokens
 279      should have nearby values so that code like (t == ANYCHAR || t == MBCSET
 280      || CSET <= t) can be done with a single machine-level comparison.  */
 281
 282   EMPTY = NOTCHAR,              /* EMPTY is a terminal symbol that matches
 283                                    the empty string.  */
 284
 285   QMARK,                        /* QMARK is an operator of one argument that
 286                                    matches zero or one occurrences of its
 287                                    argument.  */
 288
 289   STAR,                         /* STAR is an operator of one argument that
 290                                    matches the Kleene closure (zero or more
 291                                    occurrences) of its argument.  */
 292
 293   PLUS,                         /* PLUS is an operator of one argument that
 294                                    matches the positive closure (one or more
 295                                    occurrences) of its argument.  */
 296
 297   REPMN,                        /* REPMN is a lexical token corresponding
 298                                    to the {m,n} construct.  REPMN never
 299                                    appears in the compiled token vector.  */
 300
 301   CAT,                          /* CAT is an operator of two arguments that
 302                                    matches the concatenation of its
 303                                    arguments.  CAT is never returned by the
 304                                    lexical analyzer.  */
 305
 306   OR,                           /* OR is an operator of two arguments that
 307                                    matches either of its arguments.  */
 308
 309   LPAREN,                       /* LPAREN never appears in the parse tree,
 310                                    it is only a lexeme.  */
 311
 312   RPAREN,                       /* RPAREN never appears in the parse tree.  */
 313
 314   WCHAR,                        /* Only returned by lex.  wctok contains the
 315                                    32-bit wide character representation.  */
 316
 317   ANYCHAR,                      /* ANYCHAR is a terminal symbol that matches
 318                                    a valid multibyte (or single byte) character.
 319                                    It is used only if MB_CUR_MAX > 1.  */
 320
 321   BEG,                          /* BEG is an initial symbol that matches the
 322                                    beginning of input.  */
 323
 324   BEGLINE,                      /* BEGLINE is a terminal symbol that matches
 325                                    the empty string at the beginning of a
 326                                    line.  */
 327
 328   ENDLINE,                      /* ENDLINE is a terminal symbol that matches
 329                                    the empty string at the end of a line.  */
 330
 331   BEGWORD,                      /* BEGWORD is a terminal symbol that matches
 332                                    the empty string at the beginning of a
 333                                    word.  */
 334
 335   ENDWORD,                      /* ENDWORD is a terminal symbol that matches
 336                                    the empty string at the end of a word.  */
 337
 338   LIMWORD,                      /* LIMWORD is a terminal symbol that matches
 339                                    the empty string at the beginning or the
 340                                    end of a word.  */
 341
 342   NOTLIMWORD,                   /* NOTLIMWORD is a terminal symbol that
 343                                    matches the empty string not at
 344                                    the beginning or end of a word.  */
 345
 346   BACKREF,                      /* BACKREF is generated by \<digit>
 347                                    or by any other construct that
 348                                    is not completely handled.  If the scanner
 349                                    detects a transition on backref, it returns
 350                                    a kind of "semi-success" indicating that
 351                                    the match will have to be verified with
 352                                    a backtracking matcher.  */
 353
 354   MBCSET,                       /* MBCSET is similar to CSET, but for
 355                                    multibyte characters.  */
 356
 357   CSET                          /* CSET and (and any value greater) is a
 358                                    terminal symbol that matches any of a
 359                                    class of characters.  */
 360 };
 361
 362
 363 /* States of the recognizer correspond to sets of positions in the parse
 364    tree, together with the constraints under which they may be matched.
 365    So a position is encoded as an index into the parse tree together with
 366    a constraint.  */
 367 typedef struct
 368 {
 369   idx_t index;                  /* Index into the parse array.  */
 370   unsigned int constraint;      /* Constraint for matching this position.  */
 371 } position;
 372
 373 /* Sets of positions are stored as arrays.  */
 374 typedef struct
 375 {
 376   position *elems;              /* Elements of this position set.  */
 377   idx_t nelem;                  /* Number of elements in this set.  */
 378   idx_t alloc;                  /* Number of elements allocated in ELEMS.  */
 379 } position_set;
 380
 381 /* A state of the dfa consists of a set of positions, some flags,
 382    and the token value of the lowest-numbered position of the state that
 383    contains an END token.  */
 384 typedef struct
 385 {
 386   size_t hash;                  /* Hash of the positions of this state.  */
 387   position_set elems;           /* Positions this state could match.  */
 388   unsigned char context;        /* Context from previous state.  */
 389   unsigned short constraint;    /* Constraint for this state to accept.  */
 390   position_set mbps;            /* Positions which can match multibyte
 391                                    characters or the follows, e.g., period.
 392                                    Used only if MB_CUR_MAX > 1.  */
 393   state_num mb_trindex;         /* Index of this state in MB_TRANS, or
 394                                    negative if the state does not have
 395                                    ANYCHAR.  */
 396 } dfa_state;
 397
 398 /* Maximum for any transition table count.  This should be at least 3,
 399    for the initial state setup.  */
 400 enum { MAX_TRCOUNT = 1024 };
 401
 402 /* A bracket operator.
 403    e.g., [a-c], [[:alpha:]], etc.  */
 404 struct mb_char_classes
 405 {
 406   ptrdiff_t cset;
 407   bool invert;
 408   char32_t *chars;              /* Normal characters.  */
 409   idx_t nchars;
 410   idx_t nchars_alloc;
 411 };
 412
 413 struct regex_syntax
 414 {
 415   /* Syntax bits controlling the behavior of the lexical analyzer.  */
 416   reg_syntax_t syntax_bits;
 417   int dfaopts;
 418   bool syntax_bits_set;
 419
 420   /* Flag for case-folding letters into sets.  */
 421   bool case_fold;
 422
 423   /* End-of-line byte in data.  */
 424   unsigned char eolbyte;
 425
 426   /* Cache of char-context values.  */
 427   char sbit[NOTCHAR];
 428
 429   /* If never_trail[B], the byte B cannot be a non-initial byte in a
 430      multibyte character.  */
 431   bool never_trail[NOTCHAR];
 432
 433   /* Set of characters considered letters.  */
 434   charclass letters;
 435
 436   /* Set of characters that are newline.  */
 437   charclass newline;
 438 };
 439
 440 /* Lexical analyzer.  All the dross that deals with the obnoxious
 441    GNU Regex syntax bits is located here.  The poor, suffering
 442    reader is referred to the GNU Regex documentation for the
 443    meaning of the @#%!@#%^!@ syntax bits.  */
 444 struct lexer_state
 445 {
 446   char const *ptr;      /* Pointer to next input character.  */
 447   idx_t left;           /* Number of characters remaining.  */
 448   token lasttok;        /* Previous token returned; initially END.  */
 449   idx_t parens;         /* Count of outstanding left parens.  */
 450   int minrep, maxrep;   /* Repeat counts for {m,n}.  */
 451
 452   /* 32-bit wide character representation of the current multibyte character,
 453      or WEOF if there was an encoding error.  Used only if
 454      MB_CUR_MAX > 1.  */
 455   wint_t wctok;
 456
 457   /* The most recently analyzed multibyte bracket expression.  */
 458   struct mb_char_classes brack;
 459
 460   /* We're separated from beginning or (, | only by zero-width characters.  */
 461   bool laststart;
 462 };
 463
 464 /* Recursive descent parser for regular expressions.  */
 465
 466 struct parser_state
 467 {
 468   token tok;               /* Lookahead token.  */
 469   idx_t depth;             /* Current depth of a hypothetical stack
 470                               holding deferred productions.  This is
 471                               used to determine the depth that will be
 472                               required of the real stack later on in
 473                               dfaanalyze.  */
 474 };
 475
 476 /* A compiled regular expression.  */
 477 struct dfa
 478 {
 479   /* Fields filled by the scanner.  */
 480   charclass *charclasses;       /* Array of character sets for CSET tokens.  */
 481   idx_t cindex;                 /* Index for adding new charclasses.  */
 482   idx_t calloc;                 /* Number of charclasses allocated.  */
 483   ptrdiff_t canychar;           /* Index of anychar class, or -1.  */
 484
 485   /* Scanner state */
 486   struct lexer_state lex;
 487
 488   /* Parser state */
 489   struct parser_state parse;
 490
 491   /* Fields filled by the parser.  */
 492   token *tokens;                /* Postfix parse array.  */
 493   idx_t tindex;                 /* Index for adding new tokens.  */
 494   idx_t talloc;                 /* Number of tokens currently allocated.  */
 495   idx_t depth;                  /* Depth required of an evaluation stack
 496                                    used for depth-first traversal of the
 497                                    parse tree.  */
 498   idx_t nleaves;                /* Number of non-EMPTY leaves
 499                                    in the parse tree.  */
 500   idx_t nregexps;               /* Count of parallel regexps being built
 501                                    with dfaparse.  */
 502   bool fast;                    /* The DFA is fast.  */
 503   bool epsilon;                 /* Does a token match only the empty string?  */
 504   token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales.  */
 505   mbstate_t mbs;                /* Multibyte conversion state.  */
 506
 507   /* The following are valid only if MB_CUR_MAX > 1.  */
 508
 509   /* The value of multibyte_prop[i] is defined by following rule.
 510      if tokens[i] < NOTCHAR
 511      bit 0 : tokens[i] is the first byte of a character, including
 512      single-byte characters.
 513      bit 1 : tokens[i] is the last byte of a character, including
 514      single-byte characters.
 515
 516      e.g.
 517      tokens
 518      = 'single_byte_a', 'multi_byte_A', single_byte_b'
 519      = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
 520      multibyte_prop
 521      = 3     , 1               ,  0              ,  2              , 3
 522    */
 523   char *multibyte_prop;
 524
 525   /* Fields filled by the superset.  */
 526   struct dfa *superset;             /* Hint of the dfa.  */
 527
 528   /* Fields filled by the state builder.  */
 529   dfa_state *states;            /* States of the dfa.  */
 530   state_num sindex;             /* Index for adding new states.  */
 531   idx_t salloc;                 /* Number of states currently allocated.  */
 532
 533   /* Fields filled by the parse tree->NFA conversion.  */
 534   position_set *follows;        /* Array of follow sets, indexed by position
 535                                    index.  The follow of a position is the set
 536                                    of positions containing characters that
 537                                    could conceivably follow a character
 538                                    matching the given position in a string
 539                                    matching the regexp.  Allocated to the
 540                                    maximum possible position index.  */
 541   bool searchflag;              /* We are supposed to build a searching
 542                                    as opposed to an exact matcher.  A searching
 543                                    matcher finds the first and shortest string
 544                                    matching a regexp anywhere in the buffer,
 545                                    whereas an exact matcher finds the longest
 546                                    string matching, but anchored to the
 547                                    beginning of the buffer.  */
 548
 549   /* Fields filled by dfaanalyze.  */
 550   int *constraints;             /* Array of union of accepting constraints
 551                                    in the follow of a position.  */
 552   int *separates;               /* Array of contexts on follow of a
 553                                    position.  */
 554
 555   /* Fields filled by dfaexec.  */
 556   state_num tralloc;            /* Number of transition tables that have
 557                                    slots so far, not counting trans[-1] and
 558                                    trans[-2].  */
 559   int trcount;                  /* Number of transition tables that have
 560                                    been built, other than for initial
 561                                    states.  */
 562   int min_trcount;              /* Number of initial states.  Equivalently,
 563                                    the minimum state number for which trcount
 564                                    counts transitions.  */
 565   state_num **trans;            /* Transition tables for states that can
 566                                    never accept.  If the transitions for a
 567                                    state have not yet been computed, or the
 568                                    state could possibly accept, its entry in
 569                                    this table is NULL.  This points to two
 570                                    past the start of the allocated array,
 571                                    and trans[-1] and trans[-2] are always
 572                                    NULL.  */
 573   state_num **fails;            /* Transition tables after failing to accept
 574                                    on a state that potentially could do so.
 575                                    If trans[i] is non-null, fails[i] must
 576                                    be null.  */
 577   char *success;                /* Table of acceptance conditions used in
 578                                    dfaexec and computed in build_state.  */
 579   state_num *newlines;          /* Transitions on newlines.  The entry for a
 580                                    newline in any transition table is always
 581                                    -1 so we can count lines without wasting
 582                                    too many cycles.  The transition for a
 583                                    newline is stored separately and handled
 584                                    as a special case.  Newline is also used
 585                                    as a sentinel at the end of the buffer.  */
 586   state_num initstate_notbol;   /* Initial state for CTX_LETTER and CTX_NONE
 587                                    context in multibyte locales, in which we
 588                                    do not distinguish between their contexts,
 589                                    as not supported word.  */
 590   position_set mb_follows;      /* Follow set added by ANYCHAR on demand.  */
 591   state_num **mb_trans;         /* Transition tables for states with
 592                                    ANYCHAR.  */
 593   state_num mb_trcount;         /* Number of transition tables for states with
 594                                    ANYCHAR that have actually been built.  */
 595
 596   /* Syntax configuration.  This is near the end so that dfacopysyntax
 597      can memset up to here.  */
 598   struct regex_syntax syntax;
 599
 600   /* Information derived from the locale.  This is at the end so that
 601      a quick memset need not clear it specially.  */
 602
 603   /* dfaexec implementation.  */
 604   char *(*dfaexec) (struct dfa *, char const *, char *,
 605                     bool, idx_t *, bool *);
 606
 607   /* Other cached information derived from the locale.  */
 608   struct localeinfo localeinfo;
 609 };
 610
 611 /* User access to dfa internals.  */
 612
 613 /* S could possibly be an accepting state of R.  */
 614 static bool
 615 accepting (state_num s, struct dfa const *r)
 616 {
 617   return r->states[s].constraint != 0;
 618 }
 619
 620 /* STATE accepts in the specified context.  */
 621 static bool
 622 accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
 623 {
 624   return succeeds_in_context (dfa->states[state].constraint, prev, curr);
 625 }
 626
 627 static void regexp (struct dfa *dfa);
 628
 629 /* Store into *PWC the result of converting the leading bytes of the
 630    multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
 631    and updating the conversion state in *D.  On conversion error,
 632    convert just a single byte, to WEOF.  Return the number of bytes
 633    converted.
 634
 635    This differs from mbrtoc32 (PWC, S, N, &D->mbs) as follows:
 636
 637    * PWC points to wint_t, not to char32_t.
 638    * The last arg is a dfa *D instead of merely a multibyte conversion
 639      state D->mbs.
 640    * N is idx_t not size_t, and must be at least 1.
 641    * S[N - 1] must be a sentinel byte.
 642    * Shift encodings are not supported.
 643    * The return value is always in the range 1..N.
 644    * D->mbs is always valid afterwards.
 645    * *PWC is always set to something.  */
 646 static int
 647 mbs_to_wchar (wint_t *pwc, char const *s, idx_t n, struct dfa *d)
 648 {
 649   unsigned char uc = s[0];
 650   wint_t wc = d->localeinfo.sbctowc[uc];
 651
 652   if (wc == WEOF)
 653     {
 654       char32_t wch;
 655       size_t nbytes = mbrtoc32 (&wch, s, n, &d->mbs);
 656       if (0 < nbytes && nbytes < (size_t) -2)
 657         {
 658           *pwc = wch;
 659           /* nbytes cannot be == (size) -3 here, since we rely on the
 660              'mbrtoc32-regular' module.  */
 661           return nbytes;
 662         }
 663       mbszero (&d->mbs);
 664     }
 665
 666   *pwc = wc;
 667   return 1;
 668 }
 669
 670 #ifdef DEBUG
 671
 672 static void
 673 prtok (token t)
 674 {
 675   if (t <= END)
 676     fprintf (stderr, "END");
 677   else if (0 <= t && t < NOTCHAR)
 678     {
 679       unsigned int ch = t;
 680       fprintf (stderr, "0x%02x", ch);
 681     }
 682   else
 683     {
 684       char const *s;
 685       switch (t)
 686         {
 687         case BEG:
 688           s = "BEG";
 689           break;
 690         case EMPTY:
 691           s = "EMPTY";
 692           break;
 693         case BACKREF:
 694           s = "BACKREF";
 695           break;
 696         case BEGLINE:
 697           s = "BEGLINE";
 698           break;
 699         case ENDLINE:
 700           s = "ENDLINE";
 701           break;
 702         case BEGWORD:
 703           s = "BEGWORD";
 704           break;
 705         case ENDWORD:
 706           s = "ENDWORD";
 707           break;
 708         case LIMWORD:
 709           s = "LIMWORD";
 710           break;
 711         case NOTLIMWORD:
 712           s = "NOTLIMWORD";
 713           break;
 714         case QMARK:
 715           s = "QMARK";
 716           break;
 717         case STAR:
 718           s = "STAR";
 719           break;
 720         case PLUS:
 721           s = "PLUS";
 722           break;
 723         case CAT:
 724           s = "CAT";
 725           break;
 726         case OR:
 727           s = "OR";
 728           break;
 729         case LPAREN:
 730           s = "LPAREN";
 731           break;
 732         case RPAREN:
 733           s = "RPAREN";
 734           break;
 735         case ANYCHAR:
 736           s = "ANYCHAR";
 737           break;
 738         case MBCSET:
 739           s = "MBCSET";
 740           break;
 741         default:
 742           s = "CSET";
 743           break;
 744         }
 745       fprintf (stderr, "%s", s);
 746     }
 747 }
 748 #endif /* DEBUG */
 749
 750 /* Stuff pertaining to charclasses.  */
 751
 752 static bool
 753 tstbit (unsigned int b, charclass const *c)
 754 {
 755   return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
 756 }
 757
 758 static void
 759 setbit (unsigned int b, charclass *c)
 760 {
 761   charclass_word one = 1;
 762   c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
 763 }
 764
 765 static void
 766 clrbit (unsigned int b, charclass *c)
 767 {
 768   charclass_word one = 1;
 769   c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
 770 }
 771
 772 static void
 773 zeroset (charclass *s)
 774 {
 775   memset (s, 0, sizeof *s);
 776 }
 777
 778 static void
 779 fillset (charclass *s)
 780 {
 781   for (int i = 0; i < CHARCLASS_WORDS; i++)
 782     s->w[i] = CHARCLASS_WORD_MASK;
 783 }
 784
 785 static void
 786 notset (charclass *s)
 787 {
 788   for (int i = 0; i < CHARCLASS_WORDS; ++i)
 789     s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
 790 }
 791
 792 static bool
 793 equal (charclass const *s1, charclass const *s2)
 794 {
 795   charclass_word w = 0;
 796   for (int i = 0; i < CHARCLASS_WORDS; i++)
 797     w |= s1->w[i] ^ s2->w[i];
 798   return w == 0;
 799 }
 800
 801 static bool
 802 emptyset (charclass const *s)
 803 {
 804   charclass_word w = 0;
 805   for (int i = 0; i < CHARCLASS_WORDS; i++)
 806     w |= s->w[i];
 807   return w == 0;
 808 }
 809
 810 /* Ensure that the array addressed by PA holds at least I + 1 items.
 811    Either return PA, or reallocate the array and return its new address.
 812    Although PA may be null, the returned value is never null.
 813
 814    The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
 815    is updated on reallocation.  If PA is null, *NITEMS must be zero.
 816    Do not allocate more than NITEMS_MAX items total; -1 means no limit.
 817    ITEM_SIZE is the size of one item; it must be positive.
 818    Avoid O(N**2) behavior on arrays growing linearly.  */
 819 static void *
 820 maybe_realloc (void *pa, idx_t i, idx_t *nitems,
 821                ptrdiff_t nitems_max, idx_t item_size)
 822 {
 823   if (i < *nitems)
 824     return pa;
 825   return xpalloc (pa, nitems, 1, nitems_max, item_size);
 826 }
 827
 828 /* In DFA D, find the index of charclass S, or allocate a new one.  */
 829 static idx_t
 830 charclass_index (struct dfa *d, charclass const *s)
 831 {
 832   idx_t i;
 833
 834   for (i = 0; i < d->cindex; ++i)
 835     if (equal (s, &d->charclasses[i]))
 836       return i;
 837   d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
 838                                   TOKEN_MAX - CSET, sizeof *d->charclasses);
 839   ++d->cindex;
 840   d->charclasses[i] = *s;
 841   return i;
 842 }
 843
 844 static bool
 845 unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
 846 {
 847   return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
 848 }
 849
 850 static int
 851 char_context (struct dfa const *dfa, unsigned char c)
 852 {
 853   if (c == dfa->syntax.eolbyte && !(dfa->syntax.dfaopts & DFA_ANCHOR))
 854     return CTX_NEWLINE;
 855   if (unibyte_word_constituent (dfa, c))
 856     return CTX_LETTER;
 857   return CTX_NONE;
 858 }
 859
 860 /* Set a bit in the charclass for the given char32_t.  Do nothing if WC
 861    is represented by a multi-byte sequence.  Even for MB_CUR_MAX == 1,
 862    this may happen when folding case in weird Turkish locales where
 863    dotless i/dotted I are not included in the chosen character set.
 864    Return whether a bit was set in the charclass.  */
 865 static bool
 866 setbit_wc (char32_t wc, charclass *c)
 867 {
 868   int b = c32tob (wc);
 869   if (b < 0)
 870     return false;
 871
 872   setbit (b, c);
 873   return true;
 874 }
 875
 876 /* Set a bit for B and its case variants in the charclass C.
 877    MB_CUR_MAX must be 1.  */
 878 static void
 879 setbit_case_fold_c (int b, charclass *c)
 880 {
 881   int ub = toupper (b);
 882   for (int i = 0; i < NOTCHAR; i++)
 883     if (toupper (i) == ub)
 884       setbit (i, c);
 885 }
 886
 887 /* Fetch the next lexical input character from the pattern.  There
 888    must at least one byte of pattern input.  Set DFA->lex.wctok to the
 889    value of the character or to WEOF depending on whether the input is
 890    a valid multibyte character (possibly of length 1).  Then return
 891    the next input byte value, except return EOF if the input is a
 892    multibyte character of length greater than 1.  */
 893 static int
 894 fetch_wc (struct dfa *dfa)
 895 {
 896   int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
 897                              dfa);
 898   int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
 899   dfa->lex.ptr += nbytes;
 900   dfa->lex.left -= nbytes;
 901   return c;
 902 }
 903
 904 /* If there is no more input, report an error about unbalanced brackets.
 905    Otherwise, behave as with fetch_wc (DFA).  */
 906 static int
 907 bracket_fetch_wc (struct dfa *dfa)
 908 {
 909   if (! dfa->lex.left)
 910     dfaerror (_("unbalanced ["));
 911   return fetch_wc (dfa);
 912 }
 913
 914 typedef int predicate (int);
 915
 916 /* The following list maps the names of the Posix named character classes
 917    to predicate functions that determine whether a given character is in
 918    the class.  The leading [ has already been eaten by the lexical
 919    analyzer.  */
 920 struct dfa_ctype
 921 {
 922   const char *name;
 923   predicate *func;
 924   bool single_byte_only;
 925 };
 926
 927 static const struct dfa_ctype prednames[] = {
 928   {"alpha", isalpha, false},
 929   {"upper", isupper, false},
 930   {"lower", islower, false},
 931   {"digit", isdigit, true},
 932   {"xdigit", isxdigit, false},
 933   {"space", isspace, false},
 934   {"punct", ispunct, false},
 935   {"alnum", isalnum, false},
 936   {"print", isprint, false},
 937   {"graph", isgraph, false},
 938   {"cntrl", iscntrl, false},
 939   {"blank", isblank, false},
 940   {NULL, NULL, false}
 941 };
 942
 943 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
 944 find_pred (const char *str)
 945 {
 946   for (int i = 0; prednames[i].name; i++)
 947     if (str_eq (str, prednames[i].name))
 948       return &prednames[i];
 949   return NULL;
 950 }
 951
 952 /* Parse a bracket expression, which possibly includes multibyte
 953    characters.  */
 954 static token
 955 parse_bracket_exp (struct dfa *dfa)
 956 {
 957   /* This is a bracket expression that dfaexec is known to
 958      process correctly.  */
 959   bool known_bracket_exp = true;
 960
 961   /* Used to warn about [:space:].
 962      Bit 0 = first character is a colon.
 963      Bit 1 = last character is a colon.
 964      Bit 2 = includes any other character but a colon.
 965      Bit 3 = includes ranges, char/equiv classes or collation elements.  */
 966   int colon_warning_state;
 967
 968   dfa->lex.brack.nchars = 0;
 969   charclass ccl;
 970   zeroset (&ccl);
 971   int c = bracket_fetch_wc (dfa);
 972   bool invert = c == '^';
 973   if (invert)
 974     {
 975       c = bracket_fetch_wc (dfa);
 976       known_bracket_exp = dfa->localeinfo.simple;
 977     }
 978   wint_t wc = dfa->lex.wctok;
 979   int c1;
 980   wint_t wc1;
 981   colon_warning_state = (c == ':');
 982   do
 983     {
 984       c1 = NOTCHAR;     /* Mark c1 as not initialized.  */
 985       colon_warning_state &= ~2;
 986
 987       /* Note that if we're looking at some other [:...:] construct,
 988          we just treat it as a bunch of ordinary characters.  We can do
 989          this because we assume regex has checked for syntax errors before
 990          dfa is ever called.  */
 991       if (c == '[')
 992         {
 993           c1 = bracket_fetch_wc (dfa);
 994           wc1 = dfa->lex.wctok;
 995
 996           if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
 997               || c1 == '.' || c1 == '=')
 998             {
 999               enum { MAX_BRACKET_STRING_LEN = 32 };
1000               char str[MAX_BRACKET_STRING_LEN + 1];
1001               int len = 0;
1002               for (;;)
1003                 {
1004                   c = bracket_fetch_wc (dfa);
1005                   if (dfa->lex.left == 0
1006                       || (c == c1 && dfa->lex.ptr[0] == ']'))
1007                     break;
1008                   if (len < MAX_BRACKET_STRING_LEN)
1009                     str[len++] = c;
1010                   else
1011                     /* This is in any case an invalid class name.  */
1012                     str[0] = '\0';
1013                 }
1014               str[len] = '\0';
1015
1016               /* Fetch bracket.  */
1017               c = bracket_fetch_wc (dfa);
1018               wc = dfa->lex.wctok;
1019               if (c1 == ':')
1020                 /* Build character class.  POSIX allows character
1021                    classes to match multicharacter collating elements,
1022                    but the regex code does not support that, so do not
1023                    worry about that possibility.  */
1024                 {
1025                   char const *class
1026                     = (dfa->syntax.case_fold && (str_eq (str, "upper")
1027                                                  || str_eq (str, "lower"))
1028                        ? "alpha" : str);
1029                   const struct dfa_ctype *pred = find_pred (class);
1030                   if (!pred)
1031                     dfaerror (_("invalid character class"));
1032
1033                   if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1034                     known_bracket_exp = false;
1035                   else
1036                     for (int c2 = 0; c2 < NOTCHAR; ++c2)
1037                       if (pred->func (c2))
1038                         setbit (c2, &ccl);
1039                 }
1040               else
1041                 known_bracket_exp = false;
1042
1043               colon_warning_state |= 8;
1044
1045               /* Fetch new lookahead character.  */
1046               c1 = bracket_fetch_wc (dfa);
1047               wc1 = dfa->lex.wctok;
1048               continue;
1049             }
1050
1051           /* We treat '[' as a normal character here.  c/c1/wc/wc1
1052              are already set up.  */
1053         }
1054
1055       if (c == '\\'
1056           && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1057         {
1058           c = bracket_fetch_wc (dfa);
1059           wc = dfa->lex.wctok;
1060         }
1061
1062       if (c1 == NOTCHAR)
1063         {
1064           c1 = bracket_fetch_wc (dfa);
1065           wc1 = dfa->lex.wctok;
1066         }
1067
1068       if (c1 == '-')
1069         /* build range characters.  */
1070         {
1071           int c2 = bracket_fetch_wc (dfa);
1072           wint_t wc2 = dfa->lex.wctok;
1073
1074           /* A bracket expression like [a-[.aa.]] matches an unknown set.
1075              Treat it like [-a[.aa.]] while parsing it, and
1076              remember that the set is unknown.  */
1077           if (c2 == '[' && dfa->lex.ptr[0] == '.')
1078             {
1079               known_bracket_exp = false;
1080               c2 = ']';
1081             }
1082
1083           if (c2 == ']')
1084             {
1085               /* In the case [x-], the - is an ordinary hyphen,
1086                  which is left in c1, the lookahead character.  */
1087               dfa->lex.ptr--;
1088               dfa->lex.left++;
1089             }
1090           else
1091             {
1092               if (c2 == '\\' && (dfa->syntax.syntax_bits
1093                                  & RE_BACKSLASH_ESCAPE_IN_LISTS))
1094                 {
1095                   c2 = bracket_fetch_wc (dfa);
1096                   wc2 = dfa->lex.wctok;
1097                 }
1098
1099               colon_warning_state |= 8;
1100               c1 = bracket_fetch_wc (dfa);
1101               wc1 = dfa->lex.wctok;
1102
1103               /* Treat [x-y] as a range if x != y.  */
1104               if (wc != wc2 || wc == WEOF)
1105                 {
1106                   if (dfa->localeinfo.simple
1107                       || (c_isdigit (c) & c_isdigit (c2)))
1108                     {
1109                       for (int ci = c; ci <= c2; ci++)
1110                         if (dfa->syntax.case_fold && isalpha (ci))
1111                           setbit_case_fold_c (ci, &ccl);
1112                         else
1113                           setbit (ci, &ccl);
1114                     }
1115                   else
1116                     known_bracket_exp = false;
1117
1118                   continue;
1119                 }
1120             }
1121         }
1122
1123       colon_warning_state |= (c == ':') ? 2 : 4;
1124
1125       if (!dfa->localeinfo.multibyte)
1126         {
1127           if (dfa->syntax.case_fold && isalpha (c))
1128             setbit_case_fold_c (c, &ccl);
1129           else
1130             setbit (c, &ccl);
1131           continue;
1132         }
1133
1134       if (wc == WEOF)
1135         known_bracket_exp = false;
1136       else
1137         {
1138           char32_t folded[CASE_FOLDED_BUFSIZE + 1];
1139           int n = (dfa->syntax.case_fold
1140                    ? case_folded_counterparts (wc, folded + 1) + 1
1141                    : 1);
1142           folded[0] = wc;
1143           for (int i = 0; i < n; i++)
1144             if (!setbit_wc (folded[i], &ccl))
1145               {
1146                 dfa->lex.brack.chars
1147                   = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1148                                    &dfa->lex.brack.nchars_alloc, -1,
1149                                    sizeof *dfa->lex.brack.chars);
1150                 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1151               }
1152         }
1153     }
1154   while ((wc = wc1, (c = c1) != ']'));
1155
1156   if (colon_warning_state == 7)
1157     {
1158       char const *msg
1159         = _("character class syntax is [[:space:]], not [:space:]");
1160       if (dfa->syntax.dfaopts & DFA_CONFUSING_BRACKETS_ERROR)
1161         dfaerror (msg);
1162       else
1163         dfawarn (msg);
1164     }
1165
1166   if (! known_bracket_exp)
1167     return BACKREF;
1168
1169   if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1170     {
1171       dfa->lex.brack.invert = invert;
1172       dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1173       return MBCSET;
1174     }
1175
1176   if (invert)
1177     {
1178       notset (&ccl);
1179       if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1180         clrbit ('\n', &ccl);
1181     }
1182
1183   return CSET + charclass_index (dfa, &ccl);
1184 }
1185
1186 struct lexptr
1187 {
1188   char const *ptr;
1189   idx_t left;
1190 };
1191
1192 static void
1193 push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1194 {
1195   ls->ptr = dfa->lex.ptr;
1196   ls->left = dfa->lex.left;
1197   dfa->lex.ptr = s;
1198   dfa->lex.left = strlen (s);
1199 }
1200
1201 static void
1202 pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1203 {
1204   dfa->lex.ptr = ls->ptr;
1205   dfa->lex.left = ls->left;
1206 }
1207
1208 static token
1209 lex (struct dfa *dfa)
1210 {
1211   bool backslash = false;
1212
1213   /* Basic plan: We fetch a character.  If it's a backslash,
1214      we set the backslash flag and go through the loop again.
1215      On the plus side, this avoids having a duplicate of the
1216      main switch inside the backslash case.  On the minus side,
1217      it means that just about every case tests the backslash flag.  */
1218   for (int i = 0; ; i++)
1219     {
1220       /* This loop should consume at most a backslash and some other
1221          character.  */
1222       assume (i < 2);
1223
1224       if (! dfa->lex.left)
1225         return dfa->lex.lasttok = END;
1226       int c = fetch_wc (dfa);
1227
1228       switch (c)
1229         {
1230         case '\\':
1231           if (backslash)
1232             goto normal_char;
1233           if (dfa->lex.left == 0)
1234             dfaerror (_("unfinished \\ escape"));
1235           backslash = true;
1236           break;
1237
1238         case '^':
1239           if (backslash)
1240             goto normal_char;
1241           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1242               || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1243               || dfa->lex.lasttok == OR)
1244             return dfa->lex.lasttok = BEGLINE;
1245           goto normal_char;
1246
1247         case '$':
1248           if (backslash)
1249             goto normal_char;
1250           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1251               || dfa->lex.left == 0
1252               || ((dfa->lex.left
1253                    > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1254                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1255                                    & (dfa->lex.ptr[0] == '\\')]
1256                       == ')'))
1257               || ((dfa->lex.left
1258                    > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1259                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1260                                    & (dfa->lex.ptr[0] == '\\')]
1261                       == '|'))
1262               || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1263                   && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1264             return dfa->lex.lasttok = ENDLINE;
1265           goto normal_char;
1266
1267         case '1':
1268         case '2':
1269         case '3':
1270         case '4':
1271         case '5':
1272         case '6':
1273         case '7':
1274         case '8':
1275         case '9':
1276           if (!backslash)
1277             goto normal_char;
1278           if (dfa->syntax.syntax_bits & RE_NO_BK_REFS)
1279             goto stray_backslash;
1280
1281           dfa->lex.laststart = false;
1282           return dfa->lex.lasttok = BACKREF;
1283
1284         case '`':
1285           if (!backslash)
1286             goto normal_char;
1287           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1288             goto stray_backslash;
1289
1290           /* FIXME: should be beginning of string */
1291           return dfa->lex.lasttok = BEGLINE;
1292
1293         case '\'':
1294           if (!backslash)
1295             goto normal_char;
1296           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1297             goto stray_backslash;
1298
1299           /* FIXME: should be end of string */
1300           return dfa->lex.lasttok = ENDLINE;
1301
1302         case '<':
1303           if (!backslash)
1304             goto normal_char;
1305           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1306             goto stray_backslash;
1307
1308           return dfa->lex.lasttok = BEGWORD;
1309
1310         case '>':
1311           if (!backslash)
1312             goto normal_char;
1313           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1314             goto stray_backslash;
1315
1316           return dfa->lex.lasttok = ENDWORD;
1317
1318         case 'b':
1319           if (!backslash)
1320             goto normal_char;
1321           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1322             goto stray_backslash;
1323
1324           return dfa->lex.lasttok = LIMWORD;
1325
1326         case 'B':
1327           if (!backslash)
1328             goto normal_char;
1329           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1330             goto stray_backslash;
1331
1332           return dfa->lex.lasttok = NOTLIMWORD;
1333
1334         case '?':
1335           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1336             goto default_case;
1337           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1338             goto normal_char;
1339           if (dfa->lex.laststart)
1340             {
1341               if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1342                 goto default_case;
1343               if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1344                 dfawarn (_("? at start of expression"));
1345             }
1346           return dfa->lex.lasttok = QMARK;
1347
1348         case '*':
1349           if (backslash)
1350             goto normal_char;
1351           if (dfa->lex.laststart)
1352             {
1353               if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1354                 goto default_case;
1355               if (dfa->syntax.dfaopts & DFA_STAR_WARN)
1356                 dfawarn (_("* at start of expression"));
1357             }
1358           return dfa->lex.lasttok = STAR;
1359
1360         case '+':
1361           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1362             goto default_case;
1363           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1364             goto normal_char;
1365           if (dfa->lex.laststart)
1366             {
1367               if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1368                 goto default_case;
1369               if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1370                 dfawarn (_("+ at start of expression"));
1371             }
1372           return dfa->lex.lasttok = PLUS;
1373
1374         case '{':
1375           if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1376             goto default_case;
1377           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1378             goto normal_char;
1379
1380           /* Cases:
1381              {M} - exact count
1382              {M,} - minimum count, maximum is infinity
1383              {,N} - 0 through N
1384              {,} - 0 to infinity (same as '*')
1385              {M,N} - M through N */
1386           {
1387             char const *p = dfa->lex.ptr;
1388             char const *lim = p + dfa->lex.left;
1389             dfa->lex.minrep = dfa->lex.maxrep = -1;
1390             for (; p != lim && c_isdigit (*p); p++)
1391               dfa->lex.minrep = (dfa->lex.minrep < 0
1392                                  ? *p - '0'
1393                                  : MIN (RE_DUP_MAX + 1,
1394                                         dfa->lex.minrep * 10 + *p - '0'));
1395             if (p != lim)
1396               {
1397                 if (*p != ',')
1398                   dfa->lex.maxrep = dfa->lex.minrep;
1399                 else
1400                   {
1401                     if (dfa->lex.minrep < 0)
1402                       dfa->lex.minrep = 0;
1403                     while (++p != lim && c_isdigit (*p))
1404                       dfa->lex.maxrep
1405                         = (dfa->lex.maxrep < 0
1406                            ? *p - '0'
1407                            : MIN (RE_DUP_MAX + 1,
1408                                   dfa->lex.maxrep * 10 + *p - '0'));
1409                   }
1410               }
1411             bool invalid_content
1412               = ! ((! backslash || (p != lim && *p++ == '\\'))
1413                    && p != lim && *p++ == '}'
1414                    && 0 <= dfa->lex.minrep
1415                    && (dfa->lex.maxrep < 0
1416                        || dfa->lex.minrep <= dfa->lex.maxrep));
1417             if (invalid_content
1418                 && (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD))
1419               goto normal_char;
1420             if (dfa->lex.laststart)
1421               {
1422                 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1423                   goto default_case;
1424                 if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1425                   dfawarn (_("{...} at start of expression"));
1426               }
1427             if (invalid_content)
1428               dfaerror (_("invalid content of \\{\\}"));
1429             if (RE_DUP_MAX < dfa->lex.maxrep)
1430               dfaerror (_("regular expression too big"));
1431             dfa->lex.ptr = p;
1432             dfa->lex.left = lim - p;
1433           }
1434           dfa->lex.laststart = false;
1435           return dfa->lex.lasttok = REPMN;
1436
1437         case '|':
1438           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1439             goto default_case;
1440           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1441             goto normal_char;
1442           dfa->lex.laststart = true;
1443           return dfa->lex.lasttok = OR;
1444
1445         case '\n':
1446           if (!(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1447             goto default_case;
1448           if (backslash)
1449             goto normal_char;
1450           dfa->lex.laststart = true;
1451           return dfa->lex.lasttok = OR;
1452
1453         case '(':
1454           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1455             goto normal_char;
1456           dfa->lex.parens++;
1457           dfa->lex.laststart = true;
1458           return dfa->lex.lasttok = LPAREN;
1459
1460         case ')':
1461           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1462             goto normal_char;
1463           if (dfa->lex.parens == 0
1464               && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1465             goto normal_char;
1466           dfa->lex.parens--;
1467           dfa->lex.laststart = false;
1468           return dfa->lex.lasttok = RPAREN;
1469
1470         case '.':
1471           if (backslash)
1472             goto normal_char;
1473           if (dfa->canychar < 0)
1474             {
1475               charclass ccl;
1476               fillset (&ccl);
1477               if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1478                 clrbit ('\n', &ccl);
1479               if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1480                 clrbit ('\0', &ccl);
1481               if (dfa->localeinfo.multibyte)
1482                 for (int c2 = 0; c2 < NOTCHAR; c2++)
1483                   if (dfa->localeinfo.sbctowc[c2] == WEOF)
1484                     clrbit (c2, &ccl);
1485               dfa->canychar = charclass_index (dfa, &ccl);
1486             }
1487           dfa->lex.laststart = false;
1488           return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1489                                      ? ANYCHAR
1490                                      : CSET + dfa->canychar);
1491
1492         case 's':
1493         case 'S':
1494           if (!backslash)
1495             goto normal_char;
1496           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1497             goto stray_backslash;
1498
1499           if (!dfa->localeinfo.multibyte)
1500             {
1501               charclass ccl;
1502               zeroset (&ccl);
1503               for (int c2 = 0; c2 < NOTCHAR; ++c2)
1504                 if (isspace (c2))
1505                   setbit (c2, &ccl);
1506               if (c == 'S')
1507                 notset (&ccl);
1508               dfa->lex.laststart = false;
1509               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1510             }
1511
1512           /* FIXME: see if optimizing this, as is done with ANYCHAR and
1513              add_utf8_anychar, makes sense.  */
1514
1515           /* \s and \S are documented to be equivalent to [[:space:]] and
1516              [^[:space:]] respectively, so tell the lexer to process those
1517              strings, each minus its "already processed" '['.  */
1518           {
1519             struct lexptr ls;
1520             push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1521             dfa->lex.lasttok = parse_bracket_exp (dfa);
1522             pop_lex_state (dfa, &ls);
1523           }
1524
1525           dfa->lex.laststart = false;
1526           return dfa->lex.lasttok;
1527
1528         case 'w':
1529         case 'W':
1530           if (!backslash)
1531             goto normal_char;
1532           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1533             goto stray_backslash;
1534
1535           if (!dfa->localeinfo.multibyte)
1536             {
1537               charclass ccl;
1538               zeroset (&ccl);
1539               for (int c2 = 0; c2 < NOTCHAR; ++c2)
1540                 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1541                   setbit (c2, &ccl);
1542               if (c == 'W')
1543                 notset (&ccl);
1544               dfa->lex.laststart = false;
1545               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1546             }
1547
1548           /* FIXME: see if optimizing this, as is done with ANYCHAR and
1549              add_utf8_anychar, makes sense.  */
1550
1551           /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1552              [^_[:alnum:]] respectively, so tell the lexer to process those
1553              strings, each minus its "already processed" '['.  */
1554           {
1555             struct lexptr ls;
1556             push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1557             dfa->lex.lasttok = parse_bracket_exp (dfa);
1558             pop_lex_state (dfa, &ls);
1559           }
1560
1561           dfa->lex.laststart = false;
1562           return dfa->lex.lasttok;
1563
1564         case '[':
1565           if (backslash)
1566             goto normal_char;
1567           dfa->lex.laststart = false;
1568           return dfa->lex.lasttok = parse_bracket_exp (dfa);
1569
1570         default:
1571         default_case:
1572           if (!backslash)
1573             goto normal_char;
1574         stray_backslash:
1575           if (dfa->syntax.dfaopts & DFA_STRAY_BACKSLASH_WARN)
1576             {
1577               char const *msg;
1578               char msgbuf[100];
1579               if (!c32isprint (dfa->lex.wctok))
1580                 msg = _("stray \\ before unprintable character");
1581               else if (c32isspace (dfa->lex.wctok))
1582                 msg = _("stray \\ before white space");
1583               else
1584                 {
1585                   char buf[MB_LEN_MAX + 1];
1586                   mbstate_t s;
1587                   mbszero (&s);
1588                   size_t stored_bytes = c32rtomb (buf, dfa->lex.wctok, &s);
1589                   if (stored_bytes < (size_t) -1)
1590                     {
1591                       buf[stored_bytes] = '\0';
1592                       int n = snprintf (msgbuf, sizeof msgbuf,
1593                                         _("stray \\ before %s"), buf);
1594                       msg = 0 <= n && n < sizeof msgbuf ? msgbuf : _("stray \\");
1595                     }
1596                   else
1597                     msg = _("stray \\");
1598                 }
1599               dfawarn (msg);
1600             }
1601           FALLTHROUGH;
1602         case ']': case '}':
1603         normal_char:
1604           dfa->lex.laststart = false;
1605           /* For multibyte character sets, folding is done in atom.  Always
1606              return WCHAR.  */
1607           if (dfa->localeinfo.multibyte)
1608             return dfa->lex.lasttok = WCHAR;
1609
1610           if (dfa->syntax.case_fold && isalpha (c))
1611             {
1612               charclass ccl;
1613               zeroset (&ccl);
1614               setbit_case_fold_c (c, &ccl);
1615               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1616             }
1617
1618           return dfa->lex.lasttok = c;
1619         }
1620     }
1621 }
1622
1623 static void
1624 addtok_mb (struct dfa *dfa, token t, char mbprop)
1625 {
1626   if (dfa->talloc == dfa->tindex)
1627     {
1628       dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1629                              sizeof *dfa->tokens);
1630       if (dfa->localeinfo.multibyte)
1631         dfa->multibyte_prop = xreallocarray (dfa->multibyte_prop, dfa->talloc,
1632                                              sizeof *dfa->multibyte_prop);
1633     }
1634   if (dfa->localeinfo.multibyte)
1635     dfa->multibyte_prop[dfa->tindex] = mbprop;
1636   dfa->tokens[dfa->tindex++] = t;
1637
1638   switch (t)
1639     {
1640     case QMARK:
1641     case STAR:
1642     case PLUS:
1643       break;
1644
1645     case CAT:
1646     case OR:
1647       dfa->parse.depth--;
1648       break;
1649
1650     case EMPTY:
1651       dfa->epsilon = true;
1652       goto increment_depth;
1653
1654     case BACKREF:
1655       dfa->fast = false;
1656       goto increment_nleaves;
1657
1658     case BEGLINE:
1659     case ENDLINE:
1660     case BEGWORD:
1661     case ENDWORD:
1662     case LIMWORD:
1663     case NOTLIMWORD:
1664       dfa->epsilon = true;
1665       FALLTHROUGH;
1666     default:
1667     increment_nleaves:
1668       dfa->nleaves++;
1669     increment_depth:
1670       dfa->parse.depth++;
1671       if (dfa->depth < dfa->parse.depth)
1672         dfa->depth = dfa->parse.depth;
1673       break;
1674     }
1675 }
1676
1677 static void addtok_wc (struct dfa *dfa, wint_t wc);
1678
1679 /* Add the given token to the parse tree, maintaining the depth count and
1680    updating the maximum depth if necessary.  */
1681 static void
1682 addtok (struct dfa *dfa, token t)
1683 {
1684   if (dfa->localeinfo.multibyte && t == MBCSET)
1685     {
1686       bool need_or = false;
1687
1688       /* Extract wide characters into alternations for better performance.
1689          This does not require UTF-8.  */
1690       for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1691         {
1692           addtok_wc (dfa, dfa->lex.brack.chars[i]);
1693           if (need_or)
1694             addtok (dfa, OR);
1695           need_or = true;
1696         }
1697       dfa->lex.brack.nchars = 0;
1698
1699       /* Wide characters have been handled above, so it is possible
1700          that the set is empty now.  Do nothing in that case.  */
1701       if (dfa->lex.brack.cset != -1)
1702         {
1703           addtok (dfa, CSET + dfa->lex.brack.cset);
1704           if (need_or)
1705             addtok (dfa, OR);
1706         }
1707     }
1708   else
1709     {
1710       addtok_mb (dfa, t, 3);
1711     }
1712 }
1713
1714 /* We treat a multibyte character as a single atom, so that DFA
1715    can treat a multibyte character as a single expression.
1716
1717    e.g., we construct the following tree from "<mb1><mb2>".
1718    <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1719    <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1720 static void
1721 addtok_wc (struct dfa *dfa, wint_t wc)
1722 {
1723   unsigned char buf[MB_LEN_MAX];
1724   mbstate_t s;
1725   mbszero (&s);
1726   size_t stored_bytes = c32rtomb ((char *) buf, wc, &s);
1727   int buflen;
1728
1729   if (stored_bytes != (size_t) -1)
1730     buflen = stored_bytes;
1731   else
1732     {
1733       /* This is merely stop-gap.  buf[0] is undefined, yet skipping
1734          the addtok_mb call altogether can corrupt the heap.  */
1735       buflen = 1;
1736       buf[0] = 0;
1737     }
1738
1739   addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1740   for (int i = 1; i < buflen; i++)
1741     {
1742       addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1743       addtok (dfa, CAT);
1744     }
1745 }
1746
1747 static void
1748 add_utf8_anychar (struct dfa *dfa)
1749 {
1750   /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1751      UTF-8 byte sequence has been defined as follows:
1752
1753      ([\x00-\x7f]
1754      |[\xc2-\xdf][\x80-\xbf]
1755      |[\xe0][\xa0-\xbf][\x80-\xbf]
1756      |[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1757      |[\xed][\x80-\x9f][\x80-\xbf]
1758      |[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1759      |[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1760      |[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1761
1762      which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC",
1763      where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1764      D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1765      H = [\x80-\x9f], I = [\xf0],
1766      J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1767
1768      This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C".  */
1769
1770   /* Mnemonics for classes containing two or more bytes.  */
1771   enum { A, B, C, E, F, H, J, K, M };
1772
1773   /* Mnemonics for single-byte tokens.  */
1774   enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1775
1776   static charclass const utf8_classes[] = {
1777     /* A. 00-7f: 1-byte sequence.  */
1778     CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1779
1780     /* B. c2-df: 1st byte of a 2-byte sequence.  */
1781     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1782
1783     /* C. 80-bf: non-leading bytes.  */
1784     CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1785
1786     /* D. e0 (just a token).  */
1787
1788     /* E. a0-bf: 2nd byte of a "DEC" sequence.  */
1789     CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1790
1791     /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence.  */
1792     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1793
1794     /* G. ed (just a token).  */
1795
1796     /* H. 80-9f: 2nd byte of a "GHC" sequence.  */
1797     CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0, 0, 0),
1798
1799     /* I. f0 (just a token).  */
1800
1801     /* J. 90-bf: 2nd byte of an "IJCC" sequence.  */
1802     CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1803
1804     /* K. f1-f3: 1st byte of a "KCCC" sequence.  */
1805     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1806
1807     /* L. f4 (just a token).  */
1808
1809     /* M. 80-8f: 2nd byte of a "LMCC" sequence.  */
1810     CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1811   };
1812
1813   /* Define the character classes that are needed below.  */
1814   if (dfa->utf8_anychar_classes[0] == 0)
1815     {
1816       charclass c = utf8_classes[0];
1817       if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1818         clrbit ('\n', &c);
1819       if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1820         clrbit ('\0', &c);
1821       dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1822
1823       for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1824         dfa->utf8_anychar_classes[i]
1825           = CSET + charclass_index (dfa, &utf8_classes[i]);
1826     }
1827
1828   /* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above.
1829      The token buffer is in reverse Polish order, so we get
1830      "A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1831       C CAT OR C CAT OR C CAT OR".  */
1832   addtok (dfa, dfa->utf8_anychar_classes[A]);
1833   addtok (dfa, dfa->utf8_anychar_classes[B]);
1834   addtok (dfa, D_token);
1835   addtok (dfa, dfa->utf8_anychar_classes[E]);
1836   addtok (dfa, CAT);
1837   addtok (dfa, OR);
1838   addtok (dfa, G_token);
1839   addtok (dfa, dfa->utf8_anychar_classes[H]);
1840   addtok (dfa, CAT);
1841   addtok (dfa, OR);
1842   addtok (dfa, dfa->utf8_anychar_classes[F]);
1843   addtok (dfa, I_token);
1844   addtok (dfa, dfa->utf8_anychar_classes[J]);
1845   addtok (dfa, CAT);
1846   addtok (dfa, OR);
1847   addtok (dfa, L_token);
1848   addtok (dfa, dfa->utf8_anychar_classes[M]);
1849   addtok (dfa, CAT);
1850   addtok (dfa, OR);
1851   addtok (dfa, dfa->utf8_anychar_classes[K]);
1852   for (int i = 0; i < 3; i++)
1853     {
1854       addtok (dfa, dfa->utf8_anychar_classes[C]);
1855       addtok (dfa, CAT);
1856       addtok (dfa, OR);
1857     }
1858 }
1859
1860 /* The grammar understood by the parser is as follows.
1861
1862    regexp:
1863      regexp OR branch
1864      branch
1865
1866    branch:
1867      branch closure
1868      closure
1869
1870    closure:
1871      closure QMARK
1872      closure STAR
1873      closure PLUS
1874      closure REPMN
1875      atom
1876
1877    atom:
1878      <normal character>
1879      <multibyte character>
1880      ANYCHAR
1881      MBCSET
1882      CSET
1883      BACKREF
1884      BEGLINE
1885      ENDLINE
1886      BEGWORD
1887      ENDWORD
1888      LIMWORD
1889      NOTLIMWORD
1890      LPAREN regexp RPAREN
1891      <empty>
1892
1893    The parser builds a parse tree in postfix form in an array of tokens.  */
1894
1895 static void
1896 atom (struct dfa *dfa)
1897 {
1898   if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1899       || dfa->parse.tok >= CSET
1900       || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1901       || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1902       || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1903       || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1904       || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1905     {
1906       if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1907         {
1908           /* For UTF-8 expand the period to a series of CSETs that define a
1909              valid UTF-8 character.  This avoids using the slow multibyte
1910              path.  I'm pretty sure it would be both profitable and correct to
1911              do it for any encoding; however, the optimization must be done
1912              manually as it is done above in add_utf8_anychar.  So, let's
1913              start with UTF-8: it is the most used, and the structure of the
1914              encoding makes the correctness more obvious.  */
1915           add_utf8_anychar (dfa);
1916         }
1917       else
1918         addtok (dfa, dfa->parse.tok);
1919       dfa->parse.tok = lex (dfa);
1920     }
1921   else if (dfa->parse.tok == WCHAR)
1922     {
1923       if (dfa->lex.wctok == WEOF)
1924         addtok (dfa, BACKREF);
1925       else
1926         {
1927           addtok_wc (dfa, dfa->lex.wctok);
1928
1929           if (dfa->syntax.case_fold)
1930             {
1931               char32_t folded[CASE_FOLDED_BUFSIZE];
1932               int n = case_folded_counterparts (dfa->lex.wctok, folded);
1933               for (int i = 0; i < n; i++)
1934                 {
1935                   addtok_wc (dfa, folded[i]);
1936                   addtok (dfa, OR);
1937                 }
1938             }
1939         }
1940
1941       dfa->parse.tok = lex (dfa);
1942     }
1943   else if (dfa->parse.tok == LPAREN)
1944     {
1945       dfa->parse.tok = lex (dfa);
1946       regexp (dfa);
1947       if (dfa->parse.tok != RPAREN)
1948         dfaerror (_("unbalanced ("));
1949       dfa->parse.tok = lex (dfa);
1950     }
1951   else
1952     addtok (dfa, EMPTY);
1953 }
1954
1955 /* Return the number of tokens in the given subexpression.  */
1956 static idx_t _GL_ATTRIBUTE_PURE
1957 nsubtoks (struct dfa const *dfa, idx_t tindex)
1958 {
1959   switch (dfa->tokens[tindex - 1])
1960     {
1961     default:
1962       return 1;
1963     case QMARK:
1964     case STAR:
1965     case PLUS:
1966       return 1 + nsubtoks (dfa, tindex - 1);
1967     case CAT:
1968     case OR:
1969       {
1970         idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1971         return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1972       }
1973     }
1974 }
1975
1976 /* Copy the given subexpression to the top of the tree.  */
1977 static void
1978 copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1979 {
1980   if (dfa->localeinfo.multibyte)
1981     for (idx_t i = 0; i < ntokens; i++)
1982       addtok_mb (dfa, dfa->tokens[tindex + i],
1983                  dfa->multibyte_prop[tindex + i]);
1984   else
1985     for (idx_t i = 0; i < ntokens; i++)
1986       addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1987 }
1988
1989 static void
1990 closure (struct dfa *dfa)
1991 {
1992   atom (dfa);
1993   while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1994          || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1995     if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1996       {
1997         idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1998         idx_t tindex = dfa->tindex - ntokens;
1999         if (dfa->lex.maxrep < 0)
2000           addtok (dfa, PLUS);
2001         if (dfa->lex.minrep == 0)
2002           addtok (dfa, QMARK);
2003         int i;
2004         for (i = 1; i < dfa->lex.minrep; i++)
2005           {
2006             copytoks (dfa, tindex, ntokens);
2007             addtok (dfa, CAT);
2008           }
2009         for (; i < dfa->lex.maxrep; i++)
2010           {
2011             copytoks (dfa, tindex, ntokens);
2012             addtok (dfa, QMARK);
2013             addtok (dfa, CAT);
2014           }
2015         dfa->parse.tok = lex (dfa);
2016       }
2017     else if (dfa->parse.tok == REPMN)
2018       {
2019         dfa->tindex -= nsubtoks (dfa, dfa->tindex);
2020         dfa->parse.tok = lex (dfa);
2021         closure (dfa);
2022       }
2023     else
2024       {
2025         addtok (dfa, dfa->parse.tok);
2026         dfa->parse.tok = lex (dfa);
2027       }
2028 }
2029
2030 static void
2031 branch (struct dfa* dfa)
2032 {
2033   closure (dfa);
2034   while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
2035          && dfa->parse.tok >= 0)
2036     {
2037       closure (dfa);
2038       addtok (dfa, CAT);
2039     }
2040 }
2041
2042 static void
2043 regexp (struct dfa *dfa)
2044 {
2045   branch (dfa);
2046   while (dfa->parse.tok == OR)
2047     {
2048       dfa->parse.tok = lex (dfa);
2049       branch (dfa);
2050       addtok (dfa, OR);
2051     }
2052 }
2053
2054 /* Parse a string S of length LEN into D.  S can include NUL characters.
2055    This is the main entry point for the parser.  */
2056 void
2057 dfaparse (char const *s, idx_t len, struct dfa *d)
2058 {
2059   d->lex.ptr = s;
2060   d->lex.left = len;
2061   d->lex.lasttok = END;
2062   d->lex.laststart = true;
2063
2064   if (!d->syntax.syntax_bits_set)
2065     dfaerror (_("no syntax specified"));
2066
2067   if (!d->nregexps)
2068     addtok (d, BEG);
2069
2070   d->parse.tok = lex (d);
2071   d->parse.depth = d->depth;
2072
2073   regexp (d);
2074
2075   if (d->parse.tok != END)
2076     dfaerror (_("unbalanced )"));
2077
2078   addtok (d, END - d->nregexps);
2079   addtok (d, CAT);
2080
2081   if (d->nregexps)
2082     addtok (d, OR);
2083
2084   ++d->nregexps;
2085 }
2086
2087 /* Some primitives for operating on sets of positions.  */
2088
2089 /* Copy one set to another.  */
2090 static void
2091 copy (position_set const *src, position_set *dst)
2092 {
2093   if (dst->alloc < src->nelem)
2094     {
2095       free (dst->elems);
2096       dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2097                             sizeof *dst->elems);
2098     }
2099   dst->nelem = src->nelem;
2100   if (src->nelem != 0)
2101     memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2102 }
2103
2104 static void
2105 alloc_position_set (position_set *s, idx_t size)
2106 {
2107   s->elems = xnmalloc (size, sizeof *s->elems);
2108   s->alloc = size;
2109   s->nelem = 0;
2110 }
2111
2112 /* Insert position P in set S.  S is maintained in sorted order on
2113    decreasing index.  If there is already an entry in S with P.index
2114    then merge (logically-OR) P's constraints into the one in S.
2115    S->elems must point to an array large enough to hold the resulting set.  */
2116 static void
2117 insert (position p, position_set *s)
2118 {
2119   idx_t count = s->nelem;
2120   idx_t lo = 0, hi = count;
2121   while (lo < hi)
2122     {
2123       idx_t mid = (lo + hi) >> 1;
2124       if (s->elems[mid].index < p.index)
2125         lo = mid + 1;
2126       else if (s->elems[mid].index == p.index)
2127         {
2128           s->elems[mid].constraint |= p.constraint;
2129           return;
2130         }
2131       else
2132         hi = mid;
2133     }
2134
2135   s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2136   for (idx_t i = count; i > lo; i--)
2137     s->elems[i] = s->elems[i - 1];
2138   s->elems[lo] = p;
2139   ++s->nelem;
2140 }
2141
2142 static void
2143 append (position p, position_set *s)
2144 {
2145   idx_t count = s->nelem;
2146   s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2147   s->elems[s->nelem++] = p;
2148 }
2149
2150 /* Merge S1 and S2 (with the additional constraint C2) into M.  The
2151    result is as if the positions of S1, and of S2 with the additional
2152    constraint C2, were inserted into an initially empty set.  */
2153 static void
2154 merge_constrained (position_set const *s1, position_set const *s2,
2155                    unsigned int c2, position_set *m)
2156 {
2157   idx_t i = 0, j = 0;
2158
2159   if (m->alloc - s1->nelem < s2->nelem)
2160     {
2161       free (m->elems);
2162       m->alloc = s1->nelem;
2163       m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2164     }
2165   m->nelem = 0;
2166   while (i < s1->nelem || j < s2->nelem)
2167     if (! (j < s2->nelem)
2168         || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2169       {
2170         unsigned int c = ((i < s1->nelem && j < s2->nelem
2171                            && s1->elems[i].index == s2->elems[j].index)
2172                           ? s2->elems[j++].constraint & c2
2173                           : 0);
2174         m->elems[m->nelem].index = s1->elems[i].index;
2175         m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2176       }
2177     else
2178       {
2179         if (s2->elems[j].constraint & c2)
2180           {
2181             m->elems[m->nelem].index = s2->elems[j].index;
2182             m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2183           }
2184         j++;
2185       }
2186 }
2187
2188 /* Merge two sets of positions into a third.  The result is exactly as if
2189    the positions of both sets were inserted into an initially empty set.  */
2190 static void
2191 merge (position_set const *s1, position_set const *s2, position_set *m)
2192 {
2193   merge_constrained (s1, s2, -1, m);
2194 }
2195
2196 /* Merge into DST all the elements of SRC, possibly destroying
2197    the contents of the temporary M.  */
2198 static void
2199 merge2 (position_set *dst, position_set const *src, position_set *m)
2200 {
2201   if (src->nelem < 4)
2202     {
2203       for (idx_t i = 0; i < src->nelem; i++)
2204         insert (src->elems[i], dst);
2205     }
2206    else
2207     {
2208       merge (src, dst, m);
2209       copy (m, dst);
2210     }
2211 }
2212
2213 /* Delete a position from a set.  Return the nonzero constraint of the
2214    deleted position, or zero if there was no such position.  */
2215 static unsigned int
2216 delete (idx_t del, position_set *s)
2217 {
2218   idx_t count = s->nelem;
2219   idx_t lo = 0, hi = count;
2220   while (lo < hi)
2221     {
2222       idx_t mid = (lo + hi) >> 1;
2223       if (s->elems[mid].index < del)
2224         lo = mid + 1;
2225       else if (s->elems[mid].index == del)
2226         {
2227           unsigned int c = s->elems[mid].constraint;
2228           idx_t i;
2229           for (i = mid; i + 1 < count; i++)
2230             s->elems[i] = s->elems[i + 1];
2231           s->nelem = i;
2232           return c;
2233         }
2234       else
2235         hi = mid;
2236     }
2237   return 0;
2238 }
2239
2240 /* Replace a position with the followed set.  */
2241 static void
2242 replace (position_set *dst, idx_t del, position_set *add,
2243          unsigned int constraint, position_set *tmp)
2244 {
2245   unsigned int c = delete (del, dst) & constraint;
2246
2247   if (c)
2248     {
2249       copy (dst, tmp);
2250       merge_constrained (tmp, add, c, dst);
2251     }
2252 }
2253
2254 /* Find the index of the state corresponding to the given position set with
2255    the given preceding context, or create a new state if there is no such
2256    state.  Context tells whether we got here on a newline or letter.  */
2257 static state_num
2258 state_index (struct dfa *d, position_set const *s, int context)
2259 {
2260   size_t hash = 0;
2261   int constraint = 0;
2262   state_num i;
2263
2264   for (i = 0; i < s->nelem; ++i)
2265     {
2266       idx_t ind = s->elems[i].index;
2267       hash ^= ind + s->elems[i].constraint;
2268     }
2269
2270   /* Try to find a state that exactly matches the proposed one.  */
2271   for (i = 0; i < d->sindex; ++i)
2272     {
2273       if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2274           || context != d->states[i].context)
2275         continue;
2276       state_num j;
2277       for (j = 0; j < s->nelem; ++j)
2278         if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2279             || s->elems[j].index != d->states[i].elems.elems[j].index)
2280           break;
2281       if (j == s->nelem)
2282         return i;
2283     }
2284
2285 #ifdef DEBUG
2286   fprintf (stderr, "new state %td\n nextpos:", i);
2287   for (state_num j = 0; j < s->nelem; j++)
2288     {
2289       fprintf (stderr, " %td:", s->elems[j].index);
2290       prtok (d->tokens[s->elems[j].index]);
2291     }
2292   fprintf (stderr, "\n context:");
2293   if (context ^ CTX_ANY)
2294     {
2295       if (context & CTX_NONE)
2296         fprintf (stderr, " CTX_NONE");
2297       if (context & CTX_LETTER)
2298         fprintf (stderr, " CTX_LETTER");
2299       if (context & CTX_NEWLINE)
2300         fprintf (stderr, " CTX_NEWLINE");
2301     }
2302   else
2303     fprintf (stderr, " CTX_ANY");
2304   fprintf (stderr, "\n");
2305 #endif
2306
2307   for (state_num j = 0; j < s->nelem; j++)
2308     {
2309       int c = d->constraints[s->elems[j].index];
2310
2311       if (c != 0)
2312         {
2313           if (succeeds_in_context (c, context, CTX_ANY))
2314             constraint |= c;
2315         }
2316       else if (d->tokens[s->elems[j].index] == BACKREF)
2317         constraint = NO_CONSTRAINT;
2318     }
2319
2320
2321   /* Create a new state.  */
2322   d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2323                              sizeof *d->states);
2324   d->states[i].hash = hash;
2325   alloc_position_set (&d->states[i].elems, s->nelem);
2326   copy (s, &d->states[i].elems);
2327   d->states[i].context = context;
2328   d->states[i].constraint = constraint;
2329   d->states[i].mbps.nelem = 0;
2330   d->states[i].mbps.elems = NULL;
2331   d->states[i].mb_trindex = -1;
2332
2333   ++d->sindex;
2334
2335   return i;
2336 }
2337
2338 /* Find the epsilon closure of D's set of positions.  If any position of the set
2339    contains a symbol that matches the empty string in some context, replace
2340    that position with the elements of its follow labeled with an appropriate
2341    constraint.  Repeat exhaustively until no funny positions are left.
2342    S->elems must be large enough to hold the result.  BACKWARD is D's
2343    backward set; use and update it too.  */
2344 static void
2345 epsclosure (struct dfa const *d, position_set *backward)
2346 {
2347   position_set tmp;
2348   alloc_position_set (&tmp, d->nleaves);
2349   for (idx_t i = 0; i < d->tindex; i++)
2350     if (0 < d->follows[i].nelem)
2351       {
2352         unsigned int constraint;
2353         switch (d->tokens[i])
2354           {
2355           default:
2356             continue;
2357
2358           case BEGLINE:
2359             constraint = BEGLINE_CONSTRAINT;
2360             break;
2361           case ENDLINE:
2362             constraint = ENDLINE_CONSTRAINT;
2363             break;
2364           case BEGWORD:
2365             constraint = BEGWORD_CONSTRAINT;
2366             break;
2367           case ENDWORD:
2368             constraint = ENDWORD_CONSTRAINT;
2369             break;
2370           case LIMWORD:
2371             constraint = LIMWORD_CONSTRAINT;
2372             break;
2373           case NOTLIMWORD:
2374             constraint = NOTLIMWORD_CONSTRAINT;
2375             break;
2376           case EMPTY:
2377             constraint = NO_CONSTRAINT;
2378             break;
2379           }
2380
2381         delete (i, &d->follows[i]);
2382
2383         for (idx_t j = 0; j < backward[i].nelem; j++)
2384           replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
2385                    constraint, &tmp);
2386         for (idx_t j = 0; j < d->follows[i].nelem; j++)
2387           replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
2388                    NO_CONSTRAINT, &tmp);
2389       }
2390   free (tmp.elems);
2391 }
2392
2393 /* Returns the set of contexts for which there is at least one
2394    character included in C.  */
2395
2396 static int
2397 charclass_context (struct dfa const *dfa, charclass const *c)
2398 {
2399   int context = 0;
2400
2401   for (int j = 0; j < CHARCLASS_WORDS; j++)
2402     {
2403       if (c->w[j] & dfa->syntax.newline.w[j])
2404         context |= CTX_NEWLINE;
2405       if (c->w[j] & dfa->syntax.letters.w[j])
2406         context |= CTX_LETTER;
2407       if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2408         context |= CTX_NONE;
2409     }
2410
2411   return context;
2412 }
2413
2414 /* Returns the contexts on which the position set S depends.  Each context
2415    in the set of returned contexts (let's call it SC) may have a different
2416    follow set than other contexts in SC, and also different from the
2417    follow set of the complement set (sc ^ CTX_ANY).  However, all contexts
2418    in the complement set will have the same follow set.  */
2419
2420 static int _GL_ATTRIBUTE_PURE
2421 state_separate_contexts (struct dfa *d, position_set const *s)
2422 {
2423   int separate_contexts = 0;
2424
2425   for (idx_t j = 0; j < s->nelem; j++)
2426     separate_contexts |= d->separates[s->elems[j].index];
2427
2428   return separate_contexts;
2429 }
2430
2431 enum
2432 {
2433   /* Single token is repeated.  It is distinguished from non-repeated.  */
2434   OPT_REPEAT = (1 << 0),
2435
2436   /* Multiple tokens are repeated.  This flag is on at head of tokens.  The
2437      node is not merged.  */
2438   OPT_LPAREN = (1 << 1),
2439
2440   /* Multiple branches are joined.  The node is not merged.  */
2441   OPT_RPAREN = (1 << 2),
2442
2443   /* The node is walked.  If the node is found in walking again, OPT_RPAREN
2444      flag is turned on. */
2445   OPT_WALKED = (1 << 3),
2446
2447   /* The node is queued.  The node is not queued again.  */
2448   OPT_QUEUED = (1 << 4)
2449 };
2450
2451 static void
2452 merge_nfa_state (struct dfa *d, idx_t tindex, char *flags,
2453                  position_set *merged)
2454 {
2455   position_set *follows = d->follows;
2456   idx_t nelem = 0;
2457
2458   for (idx_t i = 0; i < follows[tindex].nelem; i++)
2459     {
2460       idx_t sindex = follows[tindex].elems[i].index;
2461
2462       /* Skip the node as pruned in future.  */
2463       unsigned int iconstraint = follows[tindex].elems[i].constraint;
2464       if (iconstraint == 0)
2465         continue;
2466
2467       if (d->tokens[follows[tindex].elems[i].index] <= END)
2468         {
2469           d->constraints[tindex] |= follows[tindex].elems[i].constraint;
2470           continue;
2471         }
2472
2473       if (sindex != tindex && !(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
2474         {
2475           idx_t j;
2476
2477           for (j = 0; j < nelem; j++)
2478             {
2479               idx_t dindex = follows[tindex].elems[j].index;
2480
2481               if (dindex == tindex)
2482                 continue;
2483
2484               if (follows[tindex].elems[j].constraint != iconstraint)
2485                 continue;
2486
2487               if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2488                 continue;
2489
2490               if (d->tokens[sindex] != d->tokens[dindex])
2491                 continue;
2492
2493               if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2494                 continue;
2495
2496               if (flags[sindex] & OPT_REPEAT)
2497                 delete (sindex, &follows[sindex]);
2498
2499               merge2 (&follows[dindex], &follows[sindex], merged);
2500
2501               break;
2502             }
2503
2504           if (j < nelem)
2505             continue;
2506         }
2507
2508       follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2509       flags[sindex] |= OPT_QUEUED;
2510     }
2511
2512   follows[tindex].nelem = nelem;
2513 }
2514
2515 static int
2516 compare (const void *a, const void *b)
2517 {
2518   position const *p = a, *q = b;
2519   return (p->index > q->index) - (p->index < q->index);
2520 }
2521
2522 static void
2523 reorder_tokens (struct dfa *d)
2524 {
2525   idx_t nleaves = 0;
2526   ptrdiff_t *map = xnmalloc (d->tindex, sizeof *map);
2527   map[0] = nleaves++;
2528   for (idx_t i = 1; i < d->tindex; i++)
2529     map[i] = -1;
2530
2531   token *tokens = xnmalloc (d->nleaves, sizeof *tokens);
2532   position_set *follows = xnmalloc (d->nleaves, sizeof *follows);
2533   int *constraints = xnmalloc (d->nleaves, sizeof *constraints);
2534   char *multibyte_prop = (d->localeinfo.multibyte
2535                           ? xnmalloc (d->nleaves, sizeof *multibyte_prop)
2536                           : NULL);
2537
2538   for (idx_t i = 0; i < d->tindex; i++)
2539     {
2540       if (map[i] < 0)
2541         {
2542           free (d->follows[i].elems);
2543           d->follows[i].elems = NULL;
2544           d->follows[i].nelem = 0;
2545           continue;
2546         }
2547
2548       tokens[map[i]] = d->tokens[i];
2549       follows[map[i]] = d->follows[i];
2550       constraints[map[i]] = d->constraints[i];
2551
2552       if (multibyte_prop != NULL)
2553         multibyte_prop[map[i]] = d->multibyte_prop[i];
2554
2555       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2556         {
2557           if (map[d->follows[i].elems[j].index] == -1)
2558             map[d->follows[i].elems[j].index] = nleaves++;
2559
2560           d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2561         }
2562
2563       qsort (d->follows[i].elems, d->follows[i].nelem,
2564              sizeof *d->follows[i].elems, compare);
2565     }
2566
2567   for (idx_t i = 0; i < nleaves; i++)
2568     {
2569       d->tokens[i] = tokens[i];
2570       d->follows[i] = follows[i];
2571       d->constraints[i] = constraints[i];
2572
2573       if (multibyte_prop != NULL)
2574         d->multibyte_prop[i] = multibyte_prop[i];
2575     }
2576
2577   d->tindex = d->nleaves = nleaves;
2578
2579   free (tokens);
2580   free (follows);
2581   free (constraints);
2582   free (multibyte_prop);
2583   free (map);
2584 }
2585
2586 static void
2587 dfaoptimize (struct dfa *d)
2588 {
2589   char *flags = xizalloc (d->tindex);
2590
2591   for (idx_t i = 0; i < d->tindex; i++)
2592     {
2593       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2594         {
2595           if (d->follows[i].elems[j].index == i)
2596             flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2597           else if (d->follows[i].elems[j].index < i)
2598             flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2599           else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2600             flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2601           else
2602             flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2603         }
2604     }
2605
2606   flags[0] |= OPT_QUEUED;
2607
2608   position_set merged0;
2609   position_set *merged = &merged0;
2610   alloc_position_set (merged, d->nleaves);
2611
2612   d->constraints = xicalloc (d->tindex, sizeof *d->constraints);
2613
2614   for (idx_t i = 0; i < d->tindex; i++)
2615     if (flags[i] & OPT_QUEUED)
2616       merge_nfa_state (d, i, flags, merged);
2617
2618   reorder_tokens (d);
2619
2620   free (merged->elems);
2621   free (flags);
2622 }
2623
2624 /* Perform bottom-up analysis on the parse tree, computing various functions.
2625    Note that at this point, we're pretending constructs like \< are real
2626    characters rather than constraints on what can follow them.
2627
2628    Nullable:  A node is nullable if it is at the root of a regexp that can
2629    match the empty string.
2630    *  EMPTY leaves are nullable.
2631    * No other leaf is nullable.
2632    * A QMARK or STAR node is nullable.
2633    * A PLUS node is nullable if its argument is nullable.
2634    * A CAT node is nullable if both its arguments are nullable.
2635    * An OR node is nullable if either argument is nullable.
2636
2637    Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
2638    that could correspond to the first character of a string matching the
2639    regexp rooted at the given node.
2640    * EMPTY leaves have empty firstpos.
2641    * The firstpos of a nonempty leaf is that leaf itself.
2642    * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2643      argument.
2644    * The firstpos of a CAT node is the firstpos of the left argument, union
2645      the firstpos of the right if the left argument is nullable.
2646    * The firstpos of an OR node is the union of firstpos of each argument.
2647
2648    Lastpos:  The lastpos of a node is the set of positions that could
2649    correspond to the last character of a string matching the regexp at
2650    the given node.
2651    * EMPTY leaves have empty lastpos.
2652    * The lastpos of a nonempty leaf is that leaf itself.
2653    * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2654      argument.
2655    * The lastpos of a CAT node is the lastpos of its right argument, union
2656      the lastpos of the left if the right argument is nullable.
2657    * The lastpos of an OR node is the union of the lastpos of each argument.
2658
2659    Follow:  The follow of a position is the set of positions that could
2660    correspond to the character following a character matching the node in
2661    a string matching the regexp.  At this point we consider special symbols
2662    that match the empty string in some context to be just normal characters.
2663    Later, if we find that a special symbol is in a follow set, we will
2664    replace it with the elements of its follow, labeled with an appropriate
2665    constraint.
2666    * Every node in the firstpos of the argument of a STAR or PLUS node is in
2667      the follow of every node in the lastpos.
2668    * Every node in the firstpos of the second argument of a CAT node is in
2669      the follow of every node in the lastpos of the first argument.
2670
2671    Because of the postfix representation of the parse tree, the depth-first
2672    analysis is conveniently done by a linear scan with the aid of a stack.
2673    Sets are stored as arrays of the elements, obeying a stack-like allocation
2674    scheme; the number of elements in each set deeper in the stack can be
2675    used to determine the address of a particular set's array.  */
2676 static void
2677 dfaanalyze (struct dfa *d, bool searchflag)
2678 {
2679   /* Array allocated to hold position sets.  */
2680   position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2681   /* Firstpos and lastpos elements.  */
2682   position *firstpos = posalloc;
2683   position *lastpos = firstpos + d->nleaves;
2684   position pos;
2685   position_set tmp;
2686
2687   /* Stack for element counts and nullable flags.  */
2688   struct
2689   {
2690     /* Whether the entry is nullable.  */
2691     bool nullable;
2692
2693     /* Counts of firstpos and lastpos sets.  */
2694     idx_t nfirstpos;
2695     idx_t nlastpos;
2696   } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2697
2698   position_set merged;          /* Result of merging sets.  */
2699
2700   addtok (d, CAT);
2701   idx_t tindex = d->tindex;
2702
2703 #ifdef DEBUG
2704   fprintf (stderr, "dfaanalyze:\n");
2705   for (idx_t i = 0; i < tindex; i++)
2706     {
2707       fprintf (stderr, " %td:", i);
2708       prtok (d->tokens[i]);
2709     }
2710   putc ('\n', stderr);
2711 #endif
2712
2713   d->searchflag = searchflag;
2714   alloc_position_set (&merged, d->nleaves);
2715   d->follows = xicalloc (tindex, sizeof *d->follows);
2716   position_set *backward
2717     = d->epsilon ? xicalloc (tindex, sizeof *backward) : NULL;
2718
2719   for (idx_t i = 0; i < tindex; i++)
2720     {
2721       switch (d->tokens[i])
2722         {
2723         case EMPTY:
2724           /* The empty set is nullable.  */
2725           stk->nullable = true;
2726
2727           /* The firstpos and lastpos of the empty leaf are both empty.  */
2728           stk->nfirstpos = stk->nlastpos = 0;
2729           stk++;
2730           break;
2731
2732         case STAR:
2733         case PLUS:
2734           /* Every element in the lastpos of the argument is in the backward
2735              set of every element in the firstpos.  */
2736           if (d->epsilon)
2737             {
2738               tmp.elems = lastpos - stk[-1].nlastpos;
2739               tmp.nelem = stk[-1].nlastpos;
2740               for (position *p = firstpos - stk[-1].nfirstpos;
2741                    p < firstpos; p++)
2742                 merge2 (&backward[p->index], &tmp, &merged);
2743             }
2744
2745           /* Every element in the firstpos of the argument is in the follow
2746              of every element in the lastpos.  */
2747           {
2748             tmp.elems = firstpos - stk[-1].nfirstpos;
2749             tmp.nelem = stk[-1].nfirstpos;
2750             for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
2751               merge2 (&d->follows[p->index], &tmp, &merged);
2752           }
2753           FALLTHROUGH;
2754         case QMARK:
2755           /* A QMARK or STAR node is automatically nullable.  */
2756           if (d->tokens[i] != PLUS)
2757             stk[-1].nullable = true;
2758           break;
2759
2760         case CAT:
2761           /* Every element in the lastpos of the first argument is in
2762              the backward set of every element in the firstpos of the
2763              second argument.  */
2764           if (backward)
2765             {
2766               tmp.nelem = stk[-2].nlastpos;
2767               tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2768               for (position *p = firstpos - stk[-1].nfirstpos;
2769                    p < firstpos; p++)
2770                 merge2 (&backward[p->index], &tmp, &merged);
2771             }
2772
2773           /* Every element in the firstpos of the second argument is in the
2774              follow of every element in the lastpos of the first argument.  */
2775           {
2776             tmp.nelem = stk[-1].nfirstpos;
2777             tmp.elems = firstpos - stk[-1].nfirstpos;
2778             for (position *plim = lastpos - stk[-1].nlastpos,
2779                    *p = plim - stk[-2].nlastpos;
2780                  p < plim; p++)
2781               merge2 (&d->follows[p->index], &tmp, &merged);
2782           }
2783
2784           /* The firstpos of a CAT node is the firstpos of the first argument,
2785              union that of the second argument if the first is nullable.  */
2786           if (stk[-2].nullable)
2787             stk[-2].nfirstpos += stk[-1].nfirstpos;
2788           else
2789             firstpos -= stk[-1].nfirstpos;
2790
2791           /* The lastpos of a CAT node is the lastpos of the second argument,
2792              union that of the first argument if the second is nullable.  */
2793           if (stk[-1].nullable)
2794             stk[-2].nlastpos += stk[-1].nlastpos;
2795           else
2796             {
2797               position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2798               for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2799                 p[j] = p[j + stk[-2].nlastpos];
2800               lastpos -= stk[-2].nlastpos;
2801               stk[-2].nlastpos = stk[-1].nlastpos;
2802             }
2803
2804           /* A CAT node is nullable if both arguments are nullable.  */
2805           stk[-2].nullable &= stk[-1].nullable;
2806           stk--;
2807           break;
2808
2809         case OR:
2810           /* The firstpos is the union of the firstpos of each argument.  */
2811           stk[-2].nfirstpos += stk[-1].nfirstpos;
2812
2813           /* The lastpos is the union of the lastpos of each argument.  */
2814           stk[-2].nlastpos += stk[-1].nlastpos;
2815
2816           /* An OR node is nullable if either argument is nullable.  */
2817           stk[-2].nullable |= stk[-1].nullable;
2818           stk--;
2819           break;
2820
2821         default:
2822           /* Anything else is a nonempty position.  (Note that special
2823              constructs like \< are treated as nonempty strings here;
2824              an "epsilon closure" effectively makes them nullable later.
2825              Backreferences have to get a real position so we can detect
2826              transitions on them later.  But they are nullable.  */
2827           stk->nullable = d->tokens[i] == BACKREF;
2828
2829           /* This position is in its own firstpos and lastpos.  */
2830           stk->nfirstpos = stk->nlastpos = 1;
2831           stk++;
2832
2833           firstpos->index = lastpos->index = i;
2834           firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2835           firstpos++, lastpos++;
2836
2837           break;
2838         }
2839 #ifdef DEBUG
2840       /* ... balance the above nonsyntactic #ifdef goo...  */
2841       fprintf (stderr, "node %td:", i);
2842       prtok (d->tokens[i]);
2843       putc ('\n', stderr);
2844       fprintf (stderr,
2845                stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2846       fprintf (stderr, " firstpos:");
2847       for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2848         {
2849           fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2850           prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2851         }
2852       fprintf (stderr, "\n lastpos:");
2853       for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2854         {
2855           fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2856           prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2857         }
2858       putc ('\n', stderr);
2859 #endif
2860     }
2861
2862   if (backward)
2863     {
2864       /* For each follow set that is the follow set of a real position,
2865          replace it with its epsilon closure.  */
2866       epsclosure (d, backward);
2867
2868       for (idx_t i = 0; i < tindex; i++)
2869         free (backward[i].elems);
2870       free (backward);
2871     }
2872
2873   dfaoptimize (d);
2874
2875 #ifdef DEBUG
2876   for (idx_t i = 0; i < tindex; i++)
2877     if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
2878         || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
2879         || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
2880       {
2881         fprintf (stderr, "follows(%td:", i);
2882         prtok (d->tokens[i]);
2883         fprintf (stderr, "):");
2884         for (idx_t j = 0; j < d->follows[i].nelem; j++)
2885           {
2886             fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2887             prtok (d->tokens[d->follows[i].elems[j].index]);
2888           }
2889         putc ('\n', stderr);
2890       }
2891 #endif
2892
2893   pos.index = 0;
2894   pos.constraint = NO_CONSTRAINT;
2895
2896   alloc_position_set (&tmp, 1);
2897
2898   append (pos, &tmp);
2899
2900   d->separates = xicalloc (tindex, sizeof *d->separates);
2901
2902   for (idx_t i = 0; i < tindex; i++)
2903     {
2904       if (prev_newline_dependent (d->constraints[i]))
2905         d->separates[i] |= CTX_NEWLINE;
2906       if (prev_letter_dependent (d->constraints[i]))
2907         d->separates[i] |= CTX_LETTER;
2908
2909       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2910         {
2911           if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2912             d->separates[i] |= CTX_NEWLINE;
2913           if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2914             d->separates[i] |= CTX_LETTER;
2915         }
2916     }
2917
2918   /* Context wanted by some position.  */
2919   int separate_contexts = state_separate_contexts (d, &tmp);
2920
2921   /* Build the initial state.  */
2922   if (separate_contexts & CTX_NEWLINE)
2923     state_index (d, &tmp, CTX_NEWLINE);
2924   d->initstate_notbol = d->min_trcount
2925     = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2926   if (separate_contexts & CTX_LETTER)
2927     d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2928   d->min_trcount++;
2929   d->trcount = 0;
2930
2931   free (posalloc);
2932   free (stkalloc);
2933   free (merged.elems);
2934   free (tmp.elems);
2935 }
2936
2937 /* Make sure D's state arrays are large enough to hold NEW_STATE.  */
2938 static void
2939 realloc_trans_if_necessary (struct dfa *d)
2940 {
2941   state_num oldalloc = d->tralloc;
2942   if (oldalloc < d->sindex)
2943     {
2944       state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2945       idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2946       realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2947                            -1, sizeof *realtrans);
2948       realtrans[0] = realtrans[1] = NULL;
2949       d->trans = realtrans + 2;
2950       idx_t newalloc = d->tralloc = newalloc1 - 2;
2951       d->fails = xreallocarray (d->fails, newalloc, sizeof *d->fails);
2952       d->success = xreallocarray (d->success, newalloc, sizeof *d->success);
2953       d->newlines = xreallocarray (d->newlines, newalloc, sizeof *d->newlines);
2954       if (d->localeinfo.multibyte)
2955         {
2956           realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2957           realtrans = xreallocarray (realtrans, newalloc1, sizeof *realtrans);
2958           if (oldalloc == 0)
2959             realtrans[0] = realtrans[1] = NULL;
2960           d->mb_trans = realtrans + 2;
2961         }
2962       for (; oldalloc < newalloc; oldalloc++)
2963         {
2964           d->trans[oldalloc] = NULL;
2965           d->fails[oldalloc] = NULL;
2966           if (d->localeinfo.multibyte)
2967             d->mb_trans[oldalloc] = NULL;
2968         }
2969     }
2970 }
2971
2972 /*
2973    Calculate the transition table for a new state derived from state s
2974    for a compiled dfa d after input character uc, and return the new
2975    state number.
2976
2977    Do not worry about all possible input characters; calculate just the group
2978    of positions that match uc.  Label it with the set of characters that
2979    every position in the group matches (taking into account, if necessary,
2980    preceding context information of s).  Then find the union
2981    of these positions' follows, i.e., the set of positions of the
2982    new state.  For each character in the group's label, set the transition
2983    on this character to be to a state corresponding to the set's positions,
2984    and its associated backward context information, if necessary.
2985
2986    When building a searching matcher, include the positions of state
2987    0 in every state.
2988
2989    The group is constructed by building an equivalence-class
2990    partition of the positions of s.
2991
2992    For each position, find the set of characters C that it matches.  Eliminate
2993    any characters from C that fail on grounds of backward context.
2994
2995    Check whether the group's label L has nonempty
2996    intersection with C.  If L - C is nonempty, create a new group labeled
2997    L - C and having the same positions as the current group, and set L to
2998    the intersection of L and C.  Insert the position in the group, set
2999    C = C - L, and resume scanning.
3000
3001    If after comparing with every group there are characters remaining in C,
3002    create a new group labeled with the characters of C and insert this
3003    position in that group.  */
3004
3005 static state_num
3006 build_state (state_num s, struct dfa *d, unsigned char uc)
3007 {
3008   position_set follows;         /* Union of the follows for each
3009                                    position of the current state.  */
3010   position_set group;           /* Positions that match the input char.  */
3011   position_set tmp;             /* Temporary space for merging sets.  */
3012   state_num state;              /* New state.  */
3013   state_num state_newline;      /* New state on a newline transition.  */
3014   state_num state_letter;       /* New state on a letter transition.  */
3015
3016 #ifdef DEBUG
3017   fprintf (stderr, "build state %td\n", s);
3018 #endif
3019
3020   /* A pointer to the new transition table, and the table itself.  */
3021   state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
3022   state_num *trans = *ptrans;
3023
3024   if (!trans)
3025     {
3026       /* MAX_TRCOUNT is an arbitrary upper limit on the number of
3027          transition tables that can exist at once, other than for
3028          initial states.  Often-used transition tables are quickly
3029          rebuilt, whereas rarely-used ones are cleared away.  */
3030       if (MAX_TRCOUNT <= d->trcount)
3031         {
3032           for (state_num i = d->min_trcount; i < d->tralloc; i++)
3033             {
3034               free (d->trans[i]);
3035               free (d->fails[i]);
3036               d->trans[i] = d->fails[i] = NULL;
3037             }
3038           d->trcount = 0;
3039         }
3040
3041       d->trcount++;
3042       *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
3043
3044       /* Fill transition table with a default value which means that the
3045          transited state has not been calculated yet.  */
3046       for (int i = 0; i < NOTCHAR; i++)
3047         trans[i] = -2;
3048     }
3049
3050   /* Set up the success bits for this state.  */
3051   d->success[s] = 0;
3052   if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
3053     d->success[s] |= CTX_NEWLINE;
3054   if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
3055     d->success[s] |= CTX_LETTER;
3056   if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
3057     d->success[s] |= CTX_NONE;
3058
3059   alloc_position_set (&follows, d->nleaves);
3060
3061   /* Find the union of the follows of the positions of the group.
3062      This is a hideously inefficient loop.  Fix it someday.  */
3063   for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
3064     for (idx_t k = 0;
3065          k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
3066       insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
3067               &follows);
3068
3069   /* Positions that match the input char.  */
3070   alloc_position_set (&group, d->nleaves);
3071
3072   /* The group's label.  */
3073   charclass label;
3074   fillset (&label);
3075
3076   for (idx_t i = 0; i < follows.nelem; i++)
3077     {
3078       charclass matches;            /* Set of matching characters.  */
3079       position pos = follows.elems[i];
3080       bool matched = false;
3081       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
3082         {
3083           zeroset (&matches);
3084           setbit (d->tokens[pos.index], &matches);
3085           if (d->tokens[pos.index] == uc)
3086             matched = true;
3087         }
3088       else if (d->tokens[pos.index] >= CSET)
3089         {
3090           matches = d->charclasses[d->tokens[pos.index] - CSET];
3091           if (tstbit (uc, &matches))
3092             matched = true;
3093         }
3094       else if (d->tokens[pos.index] == ANYCHAR)
3095         {
3096           matches = d->charclasses[d->canychar];
3097           if (tstbit (uc, &matches))
3098             matched = true;
3099
3100           /* ANYCHAR must match with a single character, so we must put
3101              it to D->states[s].mbps which contains the positions which
3102              can match with a single character not a byte.  If all
3103              positions which has ANYCHAR does not depend on context of
3104              next character, we put the follows instead of it to
3105              D->states[s].mbps to optimize.  */
3106           if (succeeds_in_context (pos.constraint, d->states[s].context,
3107                                    CTX_NONE))
3108             {
3109               if (d->states[s].mbps.nelem == 0)
3110                 alloc_position_set (&d->states[s].mbps, 1);
3111               insert (pos, &d->states[s].mbps);
3112             }
3113         }
3114       else
3115         continue;
3116
3117       /* Some characters may need to be eliminated from matches because
3118          they fail in the current context.  */
3119       if (pos.constraint != NO_CONSTRAINT)
3120         {
3121           if (!succeeds_in_context (pos.constraint,
3122                                     d->states[s].context, CTX_NEWLINE))
3123             for (int j = 0; j < CHARCLASS_WORDS; j++)
3124               matches.w[j] &= ~d->syntax.newline.w[j];
3125           if (!succeeds_in_context (pos.constraint,
3126                                     d->states[s].context, CTX_LETTER))
3127             for (int j = 0; j < CHARCLASS_WORDS; ++j)
3128               matches.w[j] &= ~d->syntax.letters.w[j];
3129           if (!succeeds_in_context (pos.constraint,
3130                                     d->states[s].context, CTX_NONE))
3131             for (int j = 0; j < CHARCLASS_WORDS; ++j)
3132               matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
3133
3134           /* If there are no characters left, there's no point in going on.  */
3135           if (emptyset (&matches))
3136             continue;
3137
3138           /* If we have reset the bit that made us declare "matched", reset
3139              that indicator, too.  This is required to avoid an infinite loop
3140              with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]'  */
3141           if (!tstbit (uc, &matches))
3142             matched = false;
3143         }
3144
3145 #ifdef DEBUG
3146       fprintf (stderr, " nextpos %td:", pos.index);
3147       prtok (d->tokens[pos.index]);
3148       fprintf (stderr, " of");
3149       for (unsigned j = 0; j < NOTCHAR; j++)
3150         if (tstbit (j, &matches))
3151           fprintf (stderr, " 0x%02x", j);
3152       fprintf (stderr, "\n");
3153 #endif
3154
3155       if (matched)
3156         {
3157           for (int k = 0; k < CHARCLASS_WORDS; ++k)
3158             label.w[k] &= matches.w[k];
3159           append (pos, &group);
3160         }
3161       else
3162         {
3163           for (int k = 0; k < CHARCLASS_WORDS; ++k)
3164             label.w[k] &= ~matches.w[k];
3165         }
3166     }
3167
3168   alloc_position_set (&tmp, d->nleaves);
3169
3170   if (group.nelem > 0)
3171     {
3172       /* If we are building a searching matcher, throw in the positions
3173          of state 0 as well, if possible.  */
3174       if (d->searchflag)
3175         {
3176           /* If a token in follows.elems is not 1st byte of a multibyte
3177              character, or the states of follows must accept the bytes
3178              which are not 1st byte of the multibyte character.
3179              Then, if a state of follows encounters a byte, it must not be
3180              a 1st byte of a multibyte character nor a single byte character.
3181              In this case, do not add state[0].follows to next state, because
3182              state[0] must accept 1st-byte.
3183
3184              For example, suppose <sb a> is a certain single byte character,
3185              <mb A> is a certain multibyte character, and the codepoint of
3186              <sb a> equals the 2nd byte of the codepoint of <mb A>.  When
3187              state[0] accepts <sb a>, state[i] transits to state[i+1] by
3188              accepting the 1st byte of <mb A>, and state[i+1] accepts the
3189              2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3190              <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3191              not add state[0].  */
3192
3193           bool mergeit = !d->localeinfo.multibyte;
3194           if (!mergeit)
3195             {
3196               mergeit = true;
3197               for (idx_t j = 0; mergeit && j < group.nelem; j++)
3198                 mergeit &= d->multibyte_prop[group.elems[j].index];
3199             }
3200           if (mergeit)
3201             merge2 (&group, &d->states[0].elems, &tmp);
3202         }
3203
3204       /* Find out if the new state will want any context information,
3205          by calculating possible contexts that the group can match,
3206          and separate contexts that the new state wants to know.  */
3207       int possible_contexts = charclass_context (d, &label);
3208       int separate_contexts = state_separate_contexts (d, &group);
3209
3210       /* Find the state(s) corresponding to the union of the follows.  */
3211       if (possible_contexts & ~separate_contexts)
3212         state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3213       else
3214         state = -1;
3215       if (separate_contexts & possible_contexts & CTX_NEWLINE)
3216         state_newline = state_index (d, &group, CTX_NEWLINE);
3217       else
3218         state_newline = state;
3219       if (separate_contexts & possible_contexts & CTX_LETTER)
3220         state_letter = state_index (d, &group, CTX_LETTER);
3221       else
3222         state_letter = state;
3223
3224       /* Reallocate now, to reallocate any newline transition properly.  */
3225       realloc_trans_if_necessary (d);
3226     }
3227
3228   /* If we are a searching matcher, the default transition is to a state
3229      containing the positions of state 0, otherwise the default transition
3230      is to fail miserably.  */
3231   else if (d->searchflag)
3232     {
3233       state_newline = 0;
3234       state_letter = d->min_trcount - 1;
3235       state = d->initstate_notbol;
3236     }
3237   else
3238     {
3239       state_newline = -1;
3240       state_letter = -1;
3241       state = -1;
3242     }
3243
3244   /* Set the transitions for each character in the label.  */
3245   for (int i = 0; i < NOTCHAR; i++)
3246     if (tstbit (i, &label))
3247       switch (d->syntax.sbit[i])
3248         {
3249         case CTX_NEWLINE:
3250           trans[i] = state_newline;
3251           break;
3252         case CTX_LETTER:
3253           trans[i] = state_letter;
3254           break;
3255         default:
3256           trans[i] = state;
3257           break;
3258         }
3259
3260 #ifdef DEBUG
3261   fprintf (stderr, "trans table %td", s);
3262   for (int i = 0; i < NOTCHAR; ++i)
3263     {
3264       if (!(i & 0xf))
3265         fprintf (stderr, "\n");
3266       fprintf (stderr, " %2td", trans[i]);
3267     }
3268   fprintf (stderr, "\n");
3269 #endif
3270
3271   free (group.elems);
3272   free (follows.elems);
3273   free (tmp.elems);
3274
3275   /* Keep the newline transition in a special place so we can use it as
3276      a sentinel.  */
3277   if (tstbit (d->syntax.eolbyte, &label))
3278     {
3279       d->newlines[s] = trans[d->syntax.eolbyte];
3280       trans[d->syntax.eolbyte] = -1;
3281     }
3282
3283   return trans[uc];
3284 }
3285
3286 /* Multibyte character handling sub-routines for dfaexec.  */
3287
3288 /* Consume a single byte and transit state from 's' to '*next_state'.
3289    This is almost the same as the state transition routine in dfaexec.
3290    But the state transition is done just once; otherwise, matching succeeds or
3291    we reach the end of the buffer.  */
3292 static state_num
3293 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
3294 {
3295   state_num *t;
3296
3297   if (d->trans[s])
3298     t = d->trans[s];
3299   else if (d->fails[s])
3300     t = d->fails[s];
3301   else
3302     {
3303       build_state (s, d, **pp);
3304       if (d->trans[s])
3305         t = d->trans[s];
3306       else
3307         {
3308           t = d->fails[s];
3309           assert (t);
3310         }
3311     }
3312
3313   if (t[**pp] == -2)
3314     build_state (s, d, **pp);
3315
3316   return t[*(*pp)++];
3317 }
3318
3319 /* Transit state from s, then return new state and update the pointer of
3320    the buffer.  This function is for a period operator which can match a
3321    multi-byte character.  */
3322 static state_num
3323 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
3324                unsigned char const *end)
3325 {
3326   wint_t wc;
3327
3328   int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3329
3330   /* This state has some operators which can match a multibyte character.  */
3331   d->mb_follows.nelem = 0;
3332
3333   /* Calculate the state which can be reached from the state 's' by
3334      consuming 'mbclen' single bytes from the buffer.  */
3335   state_num s1 = s;
3336   int mbci;
3337   for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
3338     s = transit_state_singlebyte (d, s, pp);
3339   *pp += mbclen - mbci;
3340
3341   if (wc == WEOF)
3342     {
3343       /* It is an invalid character, so ANYCHAR is not accepted.  */
3344       return s;
3345     }
3346
3347   /* If all positions which have ANYCHAR do not depend on the context
3348      of the next character, calculate the next state with
3349      pre-calculated follows and cache the result.  */
3350   if (d->states[s1].mb_trindex < 0)
3351     {
3352       if (MAX_TRCOUNT <= d->mb_trcount)
3353         {
3354           state_num s3;
3355           for (s3 = -1; s3 < d->tralloc; s3++)
3356             {
3357               free (d->mb_trans[s3]);
3358               d->mb_trans[s3] = NULL;
3359             }
3360
3361           for (state_num i = 0; i < d->sindex; i++)
3362             d->states[i].mb_trindex = -1;
3363           d->mb_trcount = 0;
3364         }
3365       d->states[s1].mb_trindex = d->mb_trcount++;
3366     }
3367
3368   if (! d->mb_trans[s])
3369     {
3370       enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3371       enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3372       d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3373       for (int i = 0; i < MAX_TRCOUNT; i++)
3374         d->mb_trans[s][i] = -1;
3375     }
3376   else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3377     return d->mb_trans[s][d->states[s1].mb_trindex];
3378
3379   if (s == -1)
3380     copy (&d->states[s1].mbps, &d->mb_follows);
3381   else
3382     merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3383
3384   int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3385   state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3386   realloc_trans_if_necessary (d);
3387
3388   d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3389
3390   return s2;
3391 }
3392
3393 /* The initial state may encounter a byte which is not a single byte character
3394    nor the first byte of a multibyte character.  But it is incorrect for the
3395    initial state to accept such a byte.  For example, in Shift JIS the regular
3396    expression "\\" accepts the codepoint 0x5c, but should not accept the second
3397    byte of the codepoint 0x815c.  Then the initial state must skip the bytes
3398    that are not a single byte character nor the first byte of a multibyte
3399    character.
3400
3401    Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3402    or exceeds P, and return the advanced MBP.  If WCP is non-NULL and
3403    the result is greater than P, set *WCP to the final wide character
3404    processed, or to WEOF if no wide character is processed.  Otherwise,
3405    if WCP is non-NULL, *WCP may or may not be updated.
3406
3407    Both P and MBP must be no larger than END.  */
3408 static unsigned char const *
3409 skip_remains_mb (struct dfa *d, unsigned char const *p,
3410                  unsigned char const *mbp, char const *end)
3411 {
3412   if (d->syntax.never_trail[*p])
3413     return p;
3414   while (mbp < p)
3415     {
3416       wint_t wc;
3417       mbp += mbs_to_wchar (&wc, (char const *) mbp,
3418                            end - (char const *) mbp, d);
3419     }
3420   return mbp;
3421 }
3422
3423 /* Search through a buffer looking for a match to the struct dfa *D.
3424    Find the first occurrence of a string matching the regexp in the
3425    buffer, and the shortest possible version thereof.  Return a pointer to
3426    the first character after the match, or NULL if none is found.  BEGIN
3427    points to the beginning of the buffer, and END points to the first byte
3428    after its end.  Note however that we store a sentinel byte (usually
3429    newline) in *END, so the actual buffer must be one byte longer.
3430    When ALLOW_NL, newlines may appear in the matching string.
3431    If COUNT is non-NULL, increment *COUNT once for each newline processed.
3432    If MULTIBYTE, the input consists of multibyte characters and/or
3433    encoding-error bytes.  Otherwise, it consists of single-byte characters.
3434    Here is the list of features that make this DFA matcher punt:
3435     - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3436     - [^...] in non-simple locale
3437     - [[=foo=]] or [[.foo.]]
3438     - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3439     - back-reference: (.)\1
3440     - word-delimiter in multibyte locale: \<, \>, \b, \B
3441    See struct localeinfo.simple for the definition of "simple locale".  */
3442
3443 static inline char *
3444 dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3445               idx_t *count, bool multibyte)
3446 {
3447   if (MAX_TRCOUNT <= d->sindex)
3448     {
3449       for (state_num s = d->min_trcount; s < d->sindex; s++)
3450         {
3451           free (d->states[s].elems.elems);
3452           free (d->states[s].mbps.elems);
3453         }
3454       d->sindex = d->min_trcount;
3455
3456       if (d->trans)
3457         {
3458           for (state_num s = 0; s < d->tralloc; s++)
3459             {
3460               free (d->trans[s]);
3461               free (d->fails[s]);
3462               d->trans[s] = d->fails[s] = NULL;
3463             }
3464           d->trcount = 0;
3465         }
3466
3467       if (d->localeinfo.multibyte && d->mb_trans)
3468         {
3469           for (state_num s = -1; s < d->tralloc; s++)
3470             {
3471               free (d->mb_trans[s]);
3472               d->mb_trans[s] = NULL;
3473             }
3474           for (state_num s = 0; s < d->min_trcount; s++)
3475             d->states[s].mb_trindex = -1;
3476           d->mb_trcount = 0;
3477         }
3478     }
3479
3480   if (!d->tralloc)
3481     realloc_trans_if_necessary (d);
3482
3483   /* Current state.  */
3484   state_num s = 0, s1 = 0;
3485
3486   /* Current input character.  */
3487   unsigned char const *p = (unsigned char const *) begin;
3488   unsigned char const *mbp = p;
3489
3490   /* Copy of d->trans so it can be optimized into a register.  */
3491   state_num **trans = d->trans;
3492   unsigned char eol = d->syntax.eolbyte;  /* Likewise for eolbyte.  */
3493   unsigned char saved_end = *(unsigned char *) end;
3494   *end = eol;
3495
3496   if (multibyte)
3497     {
3498       mbszero (&d->mbs);
3499       if (d->mb_follows.alloc == 0)
3500         alloc_position_set (&d->mb_follows, d->nleaves);
3501     }
3502
3503   idx_t nlcount = 0;
3504   for (;;)
3505     {
3506       state_num *t;
3507       while ((t = trans[s]) != NULL)
3508         {
3509           if (s < d->min_trcount)
3510             {
3511               if (!multibyte || d->states[s].mbps.nelem == 0)
3512                 {
3513                   while (t[*p] == s)
3514                     p++;
3515                 }
3516               if (multibyte)
3517                 p = mbp = skip_remains_mb (d, p, mbp, end);
3518             }
3519
3520           if (multibyte)
3521             {
3522               s1 = s;
3523
3524               if (d->states[s].mbps.nelem == 0
3525                   || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3526                 {
3527                   /* If an input character does not match ANYCHAR, do it
3528                      like a single-byte character.  */
3529                   s = t[*p++];
3530                 }
3531               else
3532                 {
3533                   s = transit_state (d, s, &p, (unsigned char *) end);
3534                   mbp = p;
3535                   trans = d->trans;
3536                 }
3537             }
3538           else
3539             {
3540               s1 = t[*p++];
3541               t = trans[s1];
3542               if (! t)
3543                 {
3544                   state_num tmp = s;
3545                   s = s1;
3546                   s1 = tmp;     /* swap */
3547                   break;
3548                 }
3549               if (s < d->min_trcount)
3550                 {
3551                   while (t[*p] == s1)
3552                     p++;
3553                 }
3554               s = t[*p++];
3555             }
3556         }
3557
3558       if (s < 0)
3559         {
3560           if (s == -2)
3561             {
3562               s = build_state (s1, d, p[-1]);
3563               trans = d->trans;
3564             }
3565           else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3566             {
3567               /* The previous character was a newline.  Count it, and skip
3568                  checking of multibyte character boundary until here.  */
3569               nlcount++;
3570               mbp = p;
3571
3572               s = (allow_nl ? d->newlines[s1]
3573                    : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3574                    : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3575                    : d->initstate_notbol);
3576             }
3577           else
3578             {
3579               p = NULL;
3580               goto done;
3581             }
3582         }
3583       else if (d->fails[s])
3584         {
3585           if ((d->success[s] & d->syntax.sbit[*p])
3586               || ((char *) p == end
3587                   && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3588                                          d)))
3589             goto done;
3590
3591           if (multibyte && s < d->min_trcount)
3592             p = mbp = skip_remains_mb (d, p, mbp, end);
3593
3594           s1 = s;
3595           if (!multibyte || d->states[s].mbps.nelem == 0
3596               || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3597             {
3598               /* If a input character does not match ANYCHAR, do it
3599                  like a single-byte character.  */
3600               s = d->fails[s][*p++];
3601             }
3602           else
3603             {
3604               s = transit_state (d, s, &p, (unsigned char *) end);
3605               mbp = p;
3606               trans = d->trans;
3607             }
3608         }
3609       else
3610         {
3611           build_state (s, d, p[0]);
3612           trans = d->trans;
3613         }
3614     }
3615
3616  done:
3617   if (count)
3618     *count += nlcount;
3619   *end = saved_end;
3620   return (char *) p;
3621 }
3622
3623 /* Specialized versions of dfaexec for multibyte and single-byte cases.
3624    This is for performance, as dfaexec_main is an inline function.  */
3625
3626 static char *
3627 dfaexec_mb (struct dfa *d, char const *begin, char *end,
3628             bool allow_nl, idx_t *count, bool *backref)
3629 {
3630   return dfaexec_main (d, begin, end, allow_nl, count, true);
3631 }
3632
3633 static char *
3634 dfaexec_sb (struct dfa *d, char const *begin, char *end,
3635             bool allow_nl, idx_t *count, bool *backref)
3636 {
3637   return dfaexec_main (d, begin, end, allow_nl, count, false);
3638 }
3639
3640 /* Always set *BACKREF and return BEGIN.  Use this wrapper for
3641    any regexp that uses a construct not supported by this code.  */
3642 static char *
3643 dfaexec_noop (struct dfa *d, char const *begin, char *end,
3644               bool allow_nl, idx_t *count, bool *backref)
3645 {
3646   *backref = true;
3647   return (char *) begin;
3648 }
3649
3650 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3651    but faster and set *BACKREF if the DFA code does not support this
3652    regexp usage.  */
3653
3654 char *
3655 dfaexec (struct dfa *d, char const *begin, char *end,
3656          bool allow_nl, idx_t *count, bool *backref)
3657 {
3658   return d->dfaexec (d, begin, end, allow_nl, count, backref);
3659 }
3660
3661 struct dfa *
3662 dfasuperset (struct dfa const *d)
3663 {
3664   return d->superset;
3665 }
3666
3667 bool
3668 dfaisfast (struct dfa const *d)
3669 {
3670   return d->fast;
3671 }
3672
3673 static void
3674 free_mbdata (struct dfa *d)
3675 {
3676   free (d->multibyte_prop);
3677   free (d->lex.brack.chars);
3678   free (d->mb_follows.elems);
3679
3680   if (d->mb_trans)
3681     {
3682       state_num s;
3683       for (s = -1; s < d->tralloc; s++)
3684         free (d->mb_trans[s]);
3685       free (d->mb_trans - 2);
3686     }
3687 }
3688
3689 /* Return true if every construct in D is supported by this DFA matcher.  */
3690 bool
3691 dfasupported (struct dfa const *d)
3692 {
3693   for (idx_t i = 0; i < d->tindex; i++)
3694     {
3695       switch (d->tokens[i])
3696         {
3697         case BEGWORD:
3698         case ENDWORD:
3699         case LIMWORD:
3700         case NOTLIMWORD:
3701           if (!d->localeinfo.multibyte)
3702             continue;
3703           FALLTHROUGH;
3704         case BACKREF:
3705         case MBCSET:
3706           return false;
3707         }
3708     }
3709   return true;
3710 }
3711
3712 /* Disable use of the superset DFA if it is not likely to help
3713    performance.  */
3714 static void
3715 maybe_disable_superset_dfa (struct dfa *d)
3716 {
3717   if (!d->localeinfo.using_utf8)
3718     return;
3719
3720   bool have_backref = false;
3721   for (idx_t i = 0; i < d->tindex; i++)
3722     {
3723       switch (d->tokens[i])
3724         {
3725         case ANYCHAR:
3726           /* Lowered.  */
3727           assume (false);
3728         case BACKREF:
3729           have_backref = true;
3730           break;
3731         case MBCSET:
3732           /* Requires multi-byte algorithm.  */
3733           return;
3734         default:
3735           break;
3736         }
3737     }
3738
3739   if (!have_backref && d->superset)
3740     {
3741       /* The superset DFA is not likely to be much faster, so remove it.  */
3742       dfafree (d->superset);
3743       free (d->superset);
3744       d->superset = NULL;
3745     }
3746
3747   free_mbdata (d);
3748   d->localeinfo.multibyte = false;
3749   d->dfaexec = dfaexec_sb;
3750   d->fast = true;
3751 }
3752
3753 static void
3754 dfassbuild (struct dfa *d)
3755 {
3756   struct dfa *sup = dfaalloc ();
3757
3758   *sup = *d;
3759   sup->localeinfo.multibyte = false;
3760   sup->dfaexec = dfaexec_sb;
3761   sup->multibyte_prop = NULL;
3762   sup->superset = NULL;
3763   sup->states = NULL;
3764   sup->sindex = 0;
3765   sup->constraints = NULL;
3766   sup->separates = NULL;
3767   sup->follows = NULL;
3768   sup->tralloc = 0;
3769   sup->trans = NULL;
3770   sup->fails = NULL;
3771   sup->success = NULL;
3772   sup->newlines = NULL;
3773
3774   sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3775   if (d->cindex)
3776     {
3777       memcpy (sup->charclasses, d->charclasses,
3778               d->cindex * sizeof *sup->charclasses);
3779     }
3780
3781   sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3782   sup->talloc = d->tindex * 2;
3783
3784   bool have_achar = false;
3785   bool have_nchar = false;
3786   idx_t j;
3787   for (idx_t i = j = 0; i < d->tindex; i++)
3788     {
3789       switch (d->tokens[i])
3790         {
3791         case ANYCHAR:
3792         case MBCSET:
3793         case BACKREF:
3794           {
3795             charclass ccl;
3796             fillset (&ccl);
3797             sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3798             sup->tokens[j++] = STAR;
3799             if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3800                 || d->tokens[i + 1] == PLUS)
3801               i++;
3802             have_achar = true;
3803           }
3804           break;
3805         case BEGWORD:
3806         case ENDWORD:
3807         case LIMWORD:
3808         case NOTLIMWORD:
3809           if (d->localeinfo.multibyte)
3810             {
3811               /* These constraints aren't supported in a multibyte locale.
3812                  Ignore them in the superset DFA.  */
3813               sup->tokens[j++] = EMPTY;
3814               break;
3815             }
3816           FALLTHROUGH;
3817         default:
3818           sup->tokens[j++] = d->tokens[i];
3819           if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3820               || d->tokens[i] >= CSET)
3821             have_nchar = true;
3822           break;
3823         }
3824     }
3825   sup->tindex = j;
3826
3827   if (have_nchar && (have_achar || d->localeinfo.multibyte))
3828     d->superset = sup;
3829   else
3830     {
3831       dfafree (sup);
3832       free (sup);
3833     }
3834 }
3835
3836 /* Parse a string S of length LEN into D (but skip this step if S is null).
3837    Then analyze D and build a matcher for it.
3838    SEARCHFLAG says whether to build a searching or an exact matcher.  */
3839 void
3840 dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag)
3841 {
3842   if (s != NULL)
3843     dfaparse (s, len, d);
3844
3845   dfassbuild (d);
3846
3847   if (dfasupported (d))
3848     {
3849       maybe_disable_superset_dfa (d);
3850       dfaanalyze (d, searchflag);
3851     }
3852   else
3853     {
3854       d->dfaexec = dfaexec_noop;
3855     }
3856
3857   if (d->superset)
3858     {
3859       d->fast = true;
3860       dfaanalyze (d->superset, searchflag);
3861     }
3862 }
3863
3864 /* Free the storage held by the components of a dfa.  */
3865 void
3866 dfafree (struct dfa *d)
3867 {
3868   free (d->charclasses);
3869   free (d->tokens);
3870
3871   if (d->localeinfo.multibyte)
3872     free_mbdata (d);
3873
3874   free (d->constraints);
3875   free (d->separates);
3876
3877   for (idx_t i = 0; i < d->sindex; i++)
3878     {
3879       free (d->states[i].elems.elems);
3880       free (d->states[i].mbps.elems);
3881     }
3882   free (d->states);
3883
3884   if (d->follows)
3885     {
3886       for (idx_t i = 0; i < d->tindex; i++)
3887         free (d->follows[i].elems);
3888       free (d->follows);
3889     }
3890
3891   if (d->trans)
3892     {
3893       for (idx_t i = 0; i < d->tralloc; i++)
3894         {
3895           free (d->trans[i]);
3896           free (d->fails[i]);
3897         }
3898
3899       free (d->trans - 2);
3900       free (d->fails);
3901       free (d->newlines);
3902       free (d->success);
3903     }
3904
3905   if (d->superset)
3906     {
3907       dfafree (d->superset);
3908       free (d->superset);
3909     }
3910 }
3911
3912 /* Having found the postfix representation of the regular expression,
3913    try to find a long sequence of characters that must appear in any line
3914    containing the r.e.
3915    Finding a "longest" sequence is beyond the scope here;
3916    we take an easy way out and hope for the best.
3917    (Take "(ab|a)b"--please.)
3918
3919    We do a bottom-up calculation of sequences of characters that must appear
3920    in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3921    representation:
3922         sequences that must appear at the left of the match ("left")
3923         sequences that must appear at the right of the match ("right")
3924         lists of sequences that must appear somewhere in the match ("in")
3925         sequences that must constitute the match ("is")
3926
3927    When we get to the root of the tree, we use one of the longest of its
3928    calculated "in" sequences as our answer.
3929
3930    The sequences calculated for the various types of node (in pseudo ANSI c)
3931    are shown below.  "p" is the operand of unary operators (and the left-hand
3932    operand of binary operators); "q" is the right-hand operand of binary
3933    operators.
3934
3935    "ZERO" means "a zero-length sequence" below.
3936
3937         Type    left            right           is              in
3938         ----    ----            -----           --              --
3939         char c  # c             # c             # c             # c
3940
3941         ANYCHAR ZERO            ZERO            ZERO            ZERO
3942
3943         MBCSET  ZERO            ZERO            ZERO            ZERO
3944
3945         CSET    ZERO            ZERO            ZERO            ZERO
3946
3947         STAR    ZERO            ZERO            ZERO            ZERO
3948
3949         QMARK   ZERO            ZERO            ZERO            ZERO
3950
3951         PLUS    p->left         p->right        ZERO            p->in
3952
3953         CAT     (p->is==ZERO)?  (q->is==ZERO)?  (p->is!=ZERO && p->in plus
3954                 p->left :       q->right :      q->is!=ZERO) ?  q->in plus
3955                 p->is##q->left  p->right##q->is p->is##q->is : p->right##q->left
3956                                                 ZERO
3957
3958         OR      longest common  longest common  (do p->is and substrings common
3959                 leading         trailing        to q->is have same p->in and
3960                 (sub)sequence   (sub)sequence   q->in length and content) ?
3961                 of p->left      of p->right
3962                 and q->left     and q->right    p->is : NULL
3963
3964    If there's anything else we recognize in the tree, all four sequences get set
3965    to zero-length sequences.  If there's something we don't recognize in the
3966    tree, we just return a zero-length sequence.
3967
3968    Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3969    'aaa')?
3970
3971    And ... is it here or someplace that we might ponder "optimizations" such as
3972         egrep 'psi|epsilon'     ->      egrep 'psi'
3973         egrep 'pepsi|epsilon'   ->      egrep 'epsi'
3974                                         (Yes, we now find "epsi" as a "string
3975                                         that must occur", but we might also
3976                                         simplify the *entire* r.e. being sought)
3977         grep '[c]'              ->      grep 'c'
3978         grep '(ab|a)b'          ->      grep 'ab'
3979         grep 'ab*'              ->      grep 'a'
3980         grep 'a*b'              ->      grep 'b'
3981
3982    There are several issues:
3983
3984    Is optimization easy (enough)?
3985
3986    Does optimization actually accomplish anything,
3987    or is the automaton you get from "psi|epsilon" (for example)
3988    the same as the one you get from "psi" (for example)?
3989
3990    Are optimizable r.e.'s likely to be used in real-life situations
3991    (something like 'ab*' is probably unlikely; something like is
3992    'psi|epsilon' is likelier)?  */
3993
3994 static char *
3995 icatalloc (char *old, char const *new)
3996 {
3997   idx_t newsize = strlen (new);
3998   if (newsize == 0)
3999     return old;
4000   idx_t oldsize = strlen (old);
4001   char *result = xirealloc (old, oldsize + newsize + 1);
4002   memcpy (result + oldsize, new, newsize + 1);
4003   return result;
4004 }
4005
4006 static void
4007 freelist (char **cpp)
4008 {
4009   while (*cpp)
4010     free (*cpp++);
4011 }
4012
4013 static char **
4014 enlistnew (char **cpp, char *new)
4015 {
4016   /* Is there already something in the list that's new (or longer)?  */
4017   idx_t i;
4018   for (i = 0; cpp[i] != NULL; i++)
4019     if (strstr (cpp[i], new) != NULL)
4020       {
4021         free (new);
4022         return cpp;
4023       }
4024   /* Eliminate any obsoleted strings.  */
4025   for (idx_t j = 0; cpp[j] != NULL; )
4026     if (strstr (new, cpp[j]) == NULL)
4027       ++j;
4028     else
4029       {
4030         free (cpp[j]);
4031         if (--i == j)
4032           break;
4033         cpp[j] = cpp[i];
4034         cpp[i] = NULL;
4035       }
4036   /* Add the new string.  */
4037   cpp = xreallocarray (cpp, i + 2, sizeof *cpp);
4038   cpp[i] = new;
4039   cpp[i + 1] = NULL;
4040   return cpp;
4041 }
4042
4043 static char **
4044 enlist (char **cpp, char const *str, idx_t len)
4045 {
4046   return enlistnew (cpp, ximemdup0 (str, len));
4047 }
4048
4049 /* Given pointers to two strings, return a pointer to an allocated
4050    list of their distinct common substrings.  */
4051 static char **
4052 comsubs (char *left, char const *right)
4053 {
4054   char **cpp = xzalloc (sizeof *cpp);
4055
4056   for (char *lcp = left; *lcp != '\0'; lcp++)
4057     {
4058       idx_t len = 0;
4059       char *rcp = strchr (right, *lcp);
4060       while (rcp != NULL)
4061         {
4062           idx_t i;
4063           for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
4064             continue;
4065           if (i > len)
4066             len = i;
4067           rcp = strchr (rcp + 1, *lcp);
4068         }
4069       if (len != 0)
4070         cpp = enlist (cpp, lcp, len);
4071     }
4072   return cpp;
4073 }
4074
4075 static char **
4076 addlists (char **old, char **new)
4077 {
4078   for (; *new; new++)
4079     old = enlistnew (old, xstrdup (*new));
4080   return old;
4081 }
4082
4083 /* Given two lists of substrings, return a new list giving substrings
4084    common to both.  */
4085 static char **
4086 inboth (char **left, char **right)
4087 {
4088   char **both = xzalloc (sizeof *both);
4089
4090   for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
4091     {
4092       for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
4093         {
4094           char **temp = comsubs (left[lnum], right[rnum]);
4095           both = addlists (both, temp);
4096           freelist (temp);
4097           free (temp);
4098         }
4099     }
4100   return both;
4101 }
4102
4103 typedef struct must must;
4104
4105 struct must
4106 {
4107   char **in;
4108   char *left;
4109   char *right;
4110   char *is;
4111   bool begline;
4112   bool endline;
4113   must *prev;
4114 };
4115
4116 static must *
4117 allocmust (must *mp, idx_t size)
4118 {
4119   must *new_mp = xmalloc (sizeof *new_mp);
4120   new_mp->in = xzalloc (sizeof *new_mp->in);
4121   new_mp->left = xizalloc (size);
4122   new_mp->right = xizalloc (size);
4123   new_mp->is = xizalloc (size);
4124   new_mp->begline = false;
4125   new_mp->endline = false;
4126   new_mp->prev = mp;
4127   return new_mp;
4128 }
4129
4130 static void
4131 resetmust (must *mp)
4132 {
4133   freelist (mp->in);
4134   mp->in[0] = NULL;
4135   mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4136   mp->begline = false;
4137   mp->endline = false;
4138 }
4139
4140 static void
4141 freemust (must *mp)
4142 {
4143   freelist (mp->in);
4144   free (mp->in);
4145   free (mp->left);
4146   free (mp->right);
4147   free (mp->is);
4148   free (mp);
4149 }
4150
4151 struct dfamust *
4152 dfamust (struct dfa const *d)
4153 {
4154   must *mp = NULL;
4155   char const *result = "";
4156   bool exact = false;
4157   bool begline = false;
4158   bool endline = false;
4159   bool need_begline = false;
4160   bool need_endline = false;
4161   bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4162
4163   for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4164     {
4165       token t = d->tokens[ri];
4166       switch (t)
4167         {
4168         case BEGLINE:
4169           mp = allocmust (mp, 2);
4170           mp->begline = true;
4171           need_begline = true;
4172           break;
4173         case ENDLINE:
4174           mp = allocmust (mp, 2);
4175           mp->endline = true;
4176           need_endline = true;
4177           break;
4178         case LPAREN:
4179         case RPAREN:
4180           assert (!"neither LPAREN nor RPAREN may appear here");
4181
4182         case EMPTY:
4183         case BEGWORD:
4184         case ENDWORD:
4185         case LIMWORD:
4186         case NOTLIMWORD:
4187         case BACKREF:
4188         case ANYCHAR:
4189         case MBCSET:
4190           mp = allocmust (mp, 2);
4191           break;
4192
4193         case STAR:
4194         case QMARK:
4195           assume_nonnull (mp);
4196           resetmust (mp);
4197           break;
4198
4199         case OR:
4200           {
4201             char **new;
4202             must *rmp = mp;
4203             assume_nonnull (rmp);
4204             must *lmp = mp = mp->prev;
4205             assume_nonnull (lmp);
4206             idx_t j, ln, rn, n;
4207
4208             /* Guaranteed to be.  Unlikely, but ...  */
4209             if (str_eq (lmp->is, rmp->is))
4210               {
4211                 lmp->begline &= rmp->begline;
4212                 lmp->endline &= rmp->endline;
4213               }
4214             else
4215               {
4216                 lmp->is[0] = '\0';
4217                 lmp->begline = false;
4218                 lmp->endline = false;
4219               }
4220             /* Left side--easy */
4221             idx_t i = 0;
4222             while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4223               ++i;
4224             lmp->left[i] = '\0';
4225             /* Right side */
4226             ln = strlen (lmp->right);
4227             rn = strlen (rmp->right);
4228             n = ln;
4229             if (n > rn)
4230               n = rn;
4231             for (i = 0; i < n; ++i)
4232               if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4233                 break;
4234             for (j = 0; j < i; ++j)
4235               lmp->right[j] = lmp->right[(ln - i) + j];
4236             lmp->right[j] = '\0';
4237             new = inboth (lmp->in, rmp->in);
4238             freelist (lmp->in);
4239             free (lmp->in);
4240             lmp->in = new;
4241             freemust (rmp);
4242           }
4243           break;
4244
4245         case PLUS:
4246           assume_nonnull (mp);
4247           mp->is[0] = '\0';
4248           break;
4249
4250         case END:
4251           assume_nonnull (mp);
4252           assert (!mp->prev);
4253           for (idx_t i = 0; mp->in[i] != NULL; i++)
4254             if (strlen (mp->in[i]) > strlen (result))
4255               result = mp->in[i];
4256           if (str_eq (result, mp->is))
4257             {
4258               if ((!need_begline || mp->begline) && (!need_endline
4259                                                      || mp->endline))
4260                 exact = true;
4261               begline = mp->begline;
4262               endline = mp->endline;
4263             }
4264           goto done;
4265
4266         case CAT:
4267           {
4268             must *rmp = mp;
4269             assume_nonnull (rmp);
4270             must *lmp = mp = mp->prev;
4271             assume_nonnull (lmp);
4272
4273             /* In.  Everything in left, plus everything in
4274                right, plus concatenation of
4275                left's right and right's left.  */
4276             lmp->in = addlists (lmp->in, rmp->in);
4277             if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4278               {
4279                 idx_t lrlen = strlen (lmp->right);
4280                 idx_t rllen = strlen (rmp->left);
4281                 char *tp = ximalloc (lrlen + rllen + 1);
4282                 memcpy (tp + lrlen, rmp->left, rllen + 1);
4283                 memcpy (tp, lmp->right, lrlen);
4284                 lmp->in = enlistnew (lmp->in, tp);
4285               }
4286             /* Left-hand */
4287             if (lmp->is[0] != '\0')
4288               lmp->left = icatalloc (lmp->left, rmp->left);
4289             /* Right-hand */
4290             if (rmp->is[0] == '\0')
4291               lmp->right[0] = '\0';
4292             lmp->right = icatalloc (lmp->right, rmp->right);
4293             /* Guaranteed to be */
4294             if ((lmp->is[0] != '\0' || lmp->begline)
4295                 && (rmp->is[0] != '\0' || rmp->endline))
4296               {
4297                 lmp->is = icatalloc (lmp->is, rmp->is);
4298                 lmp->endline = rmp->endline;
4299               }
4300             else
4301               {
4302                 lmp->is[0] = '\0';
4303                 lmp->begline = false;
4304                 lmp->endline = false;
4305               }
4306             freemust (rmp);
4307           }
4308           break;
4309
4310         case '\0':
4311           /* Not on *my* shift.  */
4312           goto done;
4313
4314         default:
4315           if (CSET <= t)
4316             {
4317               /* If T is a singleton, or if case-folding in a unibyte
4318                  locale and T's members all case-fold to the same char,
4319                  convert T to one of its members.  Otherwise, do
4320                  nothing further with T.  */
4321               charclass *ccl = &d->charclasses[t - CSET];
4322               int j;
4323               for (j = 0; j < NOTCHAR; j++)
4324                 if (tstbit (j, ccl))
4325                   break;
4326               if (! (j < NOTCHAR))
4327                 {
4328                   mp = allocmust (mp, 2);
4329                   break;
4330                 }
4331               t = j;
4332               while (++j < NOTCHAR)
4333                 if (tstbit (j, ccl)
4334                     && ! (case_fold_unibyte
4335                           && toupper (j) == toupper (t)))
4336                   break;
4337               if (j < NOTCHAR)
4338                 {
4339                   mp = allocmust (mp, 2);
4340                   break;
4341                 }
4342             }
4343
4344           idx_t rj = ri + 2;
4345           if (d->tokens[ri + 1] == CAT)
4346             {
4347               for (; rj < d->tindex - 1; rj += 2)
4348                 {
4349                   if ((rj != ri && (d->tokens[rj] <= 0
4350                                     || NOTCHAR <= d->tokens[rj]))
4351                       || d->tokens[rj + 1] != CAT)
4352                     break;
4353                 }
4354             }
4355           mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4356           mp->is[0] = mp->left[0] = mp->right[0]
4357             = case_fold_unibyte ? toupper (t) : t;
4358
4359           idx_t i;
4360           for (i = 1; ri + 2 < rj; i++)
4361             {
4362               ri += 2;
4363               t = d->tokens[ri];
4364               mp->is[i] = mp->left[i] = mp->right[i]
4365                 = case_fold_unibyte ? toupper (t) : t;
4366             }
4367           mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4368           mp->in = enlist (mp->in, mp->is, i);
4369           break;
4370         }
4371     }
4372  done:;
4373
4374   struct dfamust *dm = NULL;
4375   if (*result)
4376     {
4377       dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4378       dm->exact = exact;
4379       dm->begline = begline;
4380       dm->endline = endline;
4381       strcpy (dm->must, result);
4382     }
4383
4384   while (mp)
4385     {
4386       must *prev = mp->prev;
4387       freemust (mp);
4388       mp = prev;
4389     }
4390
4391   return dm;
4392 }
4393
4394 void
4395 dfamustfree (struct dfamust *dm)
4396 {
4397   free (dm);
4398 }
4399
4400 struct dfa *
4401 dfaalloc (void)
4402 {
4403   return xmalloc (sizeof (struct dfa));
4404 }
4405
4406 /* Initialize DFA.  */
4407 void
4408 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4409            reg_syntax_t bits, int dfaopts)
4410 {
4411   memset (dfa, 0, offsetof (struct dfa, dfaexec));
4412   dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4413   dfa->localeinfo = *linfo;
4414
4415   dfa->fast = !dfa->localeinfo.multibyte;
4416
4417   dfa->canychar = -1;
4418   dfa->syntax.syntax_bits_set = true;
4419   dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4420   dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4421   dfa->syntax.syntax_bits = bits;
4422   dfa->syntax.dfaopts = dfaopts;
4423
4424   for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4425     {
4426       unsigned char uc = i;
4427
4428       dfa->syntax.sbit[uc] = char_context (dfa, uc);
4429       switch (dfa->syntax.sbit[uc])
4430         {
4431         case CTX_LETTER:
4432           setbit (uc, &dfa->syntax.letters);
4433           break;
4434         case CTX_NEWLINE:
4435           setbit (uc, &dfa->syntax.newline);
4436           break;
4437         }
4438
4439       /* POSIX requires that the five bytes in "\n\r./" (including the
4440          terminating NUL) cannot occur inside a multibyte character.  */
4441       dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4442                                      ? (uc & 0xc0) != 0x80
4443                                      : strchr ("\n\r./", uc) != NULL);
4444     }
4445 }
4446
4447 /* Initialize TO by copying FROM's syntax settings.  */
4448 void
4449 dfacopysyntax (struct dfa *to, struct dfa const *from)
4450 {
4451   memset (to, 0, offsetof (struct dfa, syntax));
4452   to->canychar = -1;
4453   to->fast = from->fast;
4454   to->syntax = from->syntax;
4455   to->dfaexec = from->dfaexec;
4456   to->localeinfo = from->localeinfo;
4457 }
4458
4459 /* vim:set shiftwidth=2: */