external/gpl2/xcvs/dist/lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           Idx length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, Idx pat_len);
  26 #ifdef RE_ENABLE_I18N
  27 static void free_charset (re_charset_t *cset);
  28 #endif /* RE_ENABLE_I18N */
  29 static void free_workarea_compile (regex_t *preg);
  30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  31 #ifdef RE_ENABLE_I18N
  32 static void optimize_utf8 (re_dfa_t *dfa);
  33 #endif
  34 static reg_errcode_t analyze (regex_t *preg);
  35 static reg_errcode_t preorder (bin_tree_t *root,
  36                                reg_errcode_t (fn (void *, bin_tree_t *)),
  37                                void *extra);
  38 static reg_errcode_t postorder (bin_tree_t *root,
  39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                 void *extra);
  41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                  bin_tree_t *node);
  45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  50                                    unsigned int constraint);
  51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                          Idx node, bool root);
  54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55 static Idx fetch_number (re_string_t *input, re_token_t *token,
  56                          reg_syntax_t syntax);
  57 static int peek_token (re_token_t *token, re_string_t *input,
  58                         reg_syntax_t syntax);
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   Idx nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  Idx nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      Idx nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   Idx nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             bool accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 #ifdef RE_ENABLE_I18N
  89 static reg_errcode_t build_equiv_class (bitset sbcset,
  90                                         re_charset_t *mbcset,
  91                                         Idx *equiv_class_alloc,
  92                                         const unsigned char *name);
  93 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
  94                                       bitset sbcset,
  95                                       re_charset_t *mbcset,
  96                                       Idx *char_class_alloc,
  97                                       const unsigned char *class_name,
  98                                       reg_syntax_t syntax);
  99 #else  /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_equiv_class (bitset sbcset,
 101                                         const unsigned char *name);
 102 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
 103                                       bitset sbcset,
 104                                       const unsigned char *class_name,
 105                                       reg_syntax_t syntax);
 106 #endif /* not RE_ENABLE_I18N */
 107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 108                                        unsigned REG_TRANSLATE_TYPE trans,
 109                                        const unsigned char *class_name,
 110                                        const unsigned char *extra,
 111                                        bool non_match, reg_errcode_t *err);
 112 static bin_tree_t *create_tree (re_dfa_t *dfa,
 113                                 bin_tree_t *left, bin_tree_t *right,
 114                                 re_token_type_t type);
 115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 116                                       bin_tree_t *left, bin_tree_t *right,
 117                                       const re_token_t *token);
 118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 119 static void free_token (re_token_t *node);
 120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 122 \f
 123 /* This table gives an error message for each of the error codes listed
 124    in regex.h.  Obviously the order here has to be same as there.
 125    POSIX doesn't require that we do anything for REG_NOERROR,
 126    but why not be nice?  */
 127
 128 const char __re_error_msgid[] attribute_hidden =
 129   {
 130 #define REG_NOERROR_IDX 0
 131     gettext_noop ("Success")    /* REG_NOERROR */
 132     "\0"
 133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 134     gettext_noop ("No match")   /* REG_NOMATCH */
 135     "\0"
 136 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 138     "\0"
 139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 141     "\0"
 142 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 144     "\0"
 145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 147     "\0"
 148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 150     "\0"
 151 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 152     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 153     "\0"
 154 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 156     "\0"
 157 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 159     "\0"
 160 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 162     "\0"
 163 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 164     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 165     "\0"
 166 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 168     "\0"
 169 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 171     "\0"
 172 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 174     "\0"
 175 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 177     "\0"
 178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 180   };
 181
 182 const size_t __re_error_msgid_idx[] attribute_hidden =
 183   {
 184     REG_NOERROR_IDX,
 185     REG_NOMATCH_IDX,
 186     REG_BADPAT_IDX,
 187     REG_ECOLLATE_IDX,
 188     REG_ECTYPE_IDX,
 189     REG_EESCAPE_IDX,
 190     REG_ESUBREG_IDX,
 191     REG_EBRACK_IDX,
 192     REG_EPAREN_IDX,
 193     REG_EBRACE_IDX,
 194     REG_BADBR_IDX,
 195     REG_ERANGE_IDX,
 196     REG_ESPACE_IDX,
 197     REG_BADRPT_IDX,
 198     REG_EEND_IDX,
 199     REG_ESIZE_IDX,
 200     REG_ERPAREN_IDX
 201   };
 202 \f
 203 /* Entry points for GNU code.  */
 204
 205 /* re_compile_pattern is the GNU regular expression compiler: it
 206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 207    Returns 0 if the pattern was valid, otherwise an error string.
 208
 209    Assumes the `re_allocated' (and perhaps `re_buffer') and `translate' fields
 210    are set in BUFP on entry.  */
 211
 212 const char *
 213 re_compile_pattern (const char *pattern, size_t length,
 214                     struct re_pattern_buffer *bufp)
 215 {
 216   reg_errcode_t ret;
 217
 218   /* And GNU code determines whether or not to get register information
 219      by passing null for the REGS argument to re_match, etc., not by
 220      setting re_no_sub, unless REG_NO_SUB is set.  */
 221   bufp->re_no_sub = !!(re_syntax_options & REG_NO_SUB);
 222
 223   /* Match anchors at newline.  */
 224   bufp->re_newline_anchor = 1;
 225
 226   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 227
 228   if (!ret)
 229     return NULL;
 230   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 231 }
 232 #ifdef _LIBC
 233 weak_alias (__re_compile_pattern, re_compile_pattern)
 234 #endif
 235
 236 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 237    also be assigned to arbitrarily: each pattern buffer stores its own
 238    syntax, so it can be changed between regex compilations.  */
 239 /* This has no initializer because initialized variables in Emacs
 240    become read-only after dumping.  */
 241 reg_syntax_t re_syntax_options;
 242
 243
 244 /* Specify the precise syntax of regexps for compilation.  This provides
 245    for compatibility for various utilities which historically have
 246    different, incompatible syntaxes.
 247
 248    The argument SYNTAX is a bit mask comprised of the various bits
 249    defined in regex.h.  We return the old syntax.  */
 250
 251 reg_syntax_t
 252 re_set_syntax (reg_syntax_t syntax)
 253 {
 254   reg_syntax_t ret = re_syntax_options;
 255
 256   re_syntax_options = syntax;
 257   return ret;
 258 }
 259 #ifdef _LIBC
 260 weak_alias (__re_set_syntax, re_set_syntax)
 261 #endif
 262
 263 int
 264 re_compile_fastmap (struct re_pattern_buffer *bufp)
 265 {
 266   re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
 267   char *fastmap = bufp->re_fastmap;
 268
 269   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 270   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 271   if (dfa->init_state != dfa->init_state_word)
 272     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 273   if (dfa->init_state != dfa->init_state_nl)
 274     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 275   if (dfa->init_state != dfa->init_state_begbuf)
 276     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 277   bufp->re_fastmap_accurate = 1;
 278   return 0;
 279 }
 280 #ifdef _LIBC
 281 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 282 #endif
 283
 284 static inline void
 285 __attribute ((always_inline))
 286 re_set_fastmap (char *fastmap, bool icase, int ch)
 287 {
 288   fastmap[ch] = 1;
 289   if (icase)
 290     fastmap[tolower (ch)] = 1;
 291 }
 292
 293 /* Helper function for re_compile_fastmap.
 294    Compile fastmap for the initial_state INIT_STATE.  */
 295
 296 static void
 297 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 298                          char *fastmap)
 299 {
 300   re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
 301   Idx node_cnt;
 302   bool icase = (dfa->mb_cur_max == 1 && (bufp->re_syntax & REG_IGNORE_CASE));
 303   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 304     {
 305       Idx node = init_state->nodes.elems[node_cnt];
 306       re_token_type_t type = dfa->nodes[node].type;
 307
 308       if (type == CHARACTER)
 309         {
 310           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 311 #ifdef RE_ENABLE_I18N
 312           if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
 313             {
 314               unsigned char buf[MB_LEN_MAX];
 315               unsigned char *p;
 316               wchar_t wc;
 317               mbstate_t state;
 318
 319               p = buf;
 320               *p++ = dfa->nodes[node].opr.c;
 321               while (++node < dfa->nodes_len
 322                      && dfa->nodes[node].type == CHARACTER
 323                      && dfa->nodes[node].mb_partial)
 324                 *p++ = dfa->nodes[node].opr.c;
 325               memset (&state, 0, sizeof (state));
 326               if (mbrtowc (&wc, (const char *) buf, p - buf,
 327                            &state) == p - buf
 328                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 329                       != (size_t) -1))
 330                 re_set_fastmap (fastmap, false, buf[0]);
 331             }
 332 #endif
 333         }
 334       else if (type == SIMPLE_BRACKET)
 335         {
 336           int i, j, ch;
 337           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 338             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 339               if (dfa->nodes[node].opr.sbcset[i] & ((bitset_word) 1 << j))
 340                 re_set_fastmap (fastmap, icase, ch);
 341         }
 342 #ifdef RE_ENABLE_I18N
 343       else if (type == COMPLEX_BRACKET)
 344         {
 345           Idx i;
 346           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 347           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 348               || cset->nranges || cset->nchar_classes)
 349             {
 350 # ifdef _LIBC
 351               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 352                 {
 353                   /* In this case we want to catch the bytes which are
 354                      the first byte of any collation elements.
 355                      e.g. In da_DK, we want to catch 'a' since "aa"
 356                           is a valid collation element, and don't catch
 357                           'b' since 'b' is the only collation element
 358                           which starts from 'b'.  */
 359                   const int32_t *table = (const int32_t *)
 360                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 361                   for (i = 0; i < SBC_MAX; ++i)
 362                     if (table[i] < 0)
 363                       re_set_fastmap (fastmap, icase, i);
 364                 }
 365 # else
 366               if (dfa->mb_cur_max > 1)
 367                 for (i = 0; i < SBC_MAX; ++i)
 368                   if (__btowc (i) == WEOF)
 369                     re_set_fastmap (fastmap, icase, i);
 370 # endif /* not _LIBC */
 371             }
 372           for (i = 0; i < cset->nmbchars; ++i)
 373             {
 374               char buf[256];
 375               mbstate_t state;
 376               memset (&state, '\0', sizeof (state));
 377               if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 378                 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 379               if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
 380                 {
 381                   if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 382                       != (size_t) -1)
 383                     re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 384                 }
 385             }
 386         }
 387 #endif /* RE_ENABLE_I18N */
 388       else if (type == OP_PERIOD
 389 #ifdef RE_ENABLE_I18N
 390                || type == OP_UTF8_PERIOD
 391 #endif /* RE_ENABLE_I18N */
 392                || type == END_OF_RE)
 393         {
 394           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 395           if (type == END_OF_RE)
 396             bufp->re_can_be_null = 1;
 397           return;
 398         }
 399     }
 400 }
 401 \f
 402 /* Entry point for POSIX code.  */
 403 /* regcomp takes a regular expression as a string and compiles it.
 404
 405    PREG is a regex_t *.  We do not expect any fields to be initialized,
 406    since POSIX says we shouldn't.  Thus, we set
 407
 408      `re_buffer' to the compiled pattern;
 409      `re_used' to the length of the compiled pattern;
 410      `re_syntax' to REG_SYNTAX_POSIX_EXTENDED if the
 411        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 412        REG_SYNTAX_POSIX_BASIC;
 413      `re_newline_anchor' to REG_NEWLINE being set in CFLAGS;
 414      `re_fastmap' to an allocated space for the fastmap;
 415      `re_fastmap_accurate' to zero;
 416      `re_nsub' to the number of subexpressions in PATTERN.
 417
 418    PATTERN is the address of the pattern string.
 419
 420    CFLAGS is a series of bits which affect compilation.
 421
 422      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 423      use POSIX basic syntax.
 424
 425      If REG_NEWLINE is set, then . and [^...] don't match newline.
 426      Also, regexec will try a match beginning after every newline.
 427
 428      If REG_ICASE is set, then we considers upper- and lowercase
 429      versions of letters to be equivalent when matching.
 430
 431      If REG_NOSUB is set, then when PREG is passed to regexec, that
 432      routine will report only success or failure, and nothing about the
 433      registers.
 434
 435    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 436    the return codes and their meanings.)  */
 437
 438 int
 439 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
 440 {
 441   reg_errcode_t ret;
 442   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? REG_SYNTAX_POSIX_EXTENDED
 443                         : REG_SYNTAX_POSIX_BASIC);
 444
 445   preg->re_buffer = NULL;
 446   preg->re_allocated = 0;
 447   preg->re_used = 0;
 448
 449   /* Try to allocate space for the fastmap.  */
 450   preg->re_fastmap = re_malloc (char, SBC_MAX);
 451   if (BE (preg->re_fastmap == NULL, 0))
 452     return REG_ESPACE;
 453
 454   syntax |= (cflags & REG_ICASE) ? REG_IGNORE_CASE : 0;
 455
 456   /* If REG_NEWLINE is set, newlines are treated differently.  */
 457   if (cflags & REG_NEWLINE)
 458     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 459       syntax &= ~REG_DOT_NEWLINE;
 460       syntax |= REG_HAT_LISTS_NOT_NEWLINE;
 461       /* It also changes the matching behavior.  */
 462       preg->re_newline_anchor = 1;
 463     }
 464   else
 465     preg->re_newline_anchor = 0;
 466   preg->re_no_sub = !!(cflags & REG_NOSUB);
 467   preg->re_translate = NULL;
 468
 469   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 470
 471   /* POSIX doesn't distinguish between an unmatched open-group and an
 472      unmatched close-group: both are REG_EPAREN.  */
 473   if (ret == REG_ERPAREN)
 474     ret = REG_EPAREN;
 475
 476   /* We have already checked preg->re_fastmap != NULL.  */
 477   if (BE (ret == REG_NOERROR, 1))
 478     /* Compute the fastmap now, since regexec cannot modify the pattern
 479        buffer.  This function never fails in this implementation.  */
 480     (void) re_compile_fastmap (preg);
 481   else
 482     {
 483       /* Some error occurred while compiling the expression.  */
 484       re_free (preg->re_fastmap);
 485       preg->re_fastmap = NULL;
 486     }
 487
 488   return (int) ret;
 489 }
 490 #ifdef _LIBC
 491 weak_alias (__regcomp, regcomp)
 492 #endif
 493
 494 /* Returns a message corresponding to an error code, ERRCODE, returned
 495    from either regcomp or regexec.   We don't use PREG here.  */
 496
 497 size_t
 498 regerror (int errcode, const regex_t *__restrict preg,
 499           char *__restrict errbuf, size_t errbuf_size)
 500 {
 501   const char *msg;
 502   size_t msg_size;
 503
 504   if (BE (errcode < 0
 505           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 506                                / sizeof (__re_error_msgid_idx[0])), 0))
 507     /* Only error codes returned by the rest of the code should be passed
 508        to this routine.  If we are given anything else, or if other regex
 509        code generates an invalid error code, then the program has a bug.
 510        Dump core so we can fix it.  */
 511     abort ();
 512
 513   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 514
 515   msg_size = strlen (msg) + 1; /* Includes the null.  */
 516
 517   if (BE (errbuf_size != 0, 1))
 518     {
 519       if (BE (msg_size > errbuf_size, 0))
 520         {
 521 #if defined HAVE_MEMPCPY || defined _LIBC
 522           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 523 #else
 524           memcpy (errbuf, msg, errbuf_size - 1);
 525           errbuf[errbuf_size - 1] = 0;
 526 #endif
 527         }
 528       else
 529         memcpy (errbuf, msg, msg_size);
 530     }
 531
 532   return msg_size;
 533 }
 534 #ifdef _LIBC
 535 weak_alias (__regerror, regerror)
 536 #endif
 537
 538
 539 #ifdef RE_ENABLE_I18N
 540 /* This static array is used for the map to single-byte characters when
 541    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 542    it the same all the time.  UTF-8 is the preferred encoding so this is
 543    a worthwhile optimization.  */
 544 static const bitset utf8_sb_map =
 545 {
 546   /* Set the first 128 bits.  */
 547 # if 2 < BITSET_WORDS
 548   BITSET_WORD_MAX,
 549 # endif
 550 # if 4 < BITSET_WORDS
 551   BITSET_WORD_MAX,
 552 # endif
 553 # if 6 < BITSET_WORDS
 554   BITSET_WORD_MAX,
 555 # endif
 556 # if 8 < BITSET_WORDS
 557 #  error "Invalid BITSET_WORDS"
 558 # endif
 559   (BITSET_WORD_MAX
 560    >> (SBC_MAX % BITSET_WORD_BITS == 0
 561        ? 0
 562        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 563 };
 564 #endif
 565
 566
 567 static void
 568 free_dfa_content (re_dfa_t *dfa)
 569 {
 570   Idx i, j;
 571
 572   if (dfa->nodes)
 573     for (i = 0; i < dfa->nodes_len; ++i)
 574       free_token (dfa->nodes + i);
 575   re_free (dfa->nexts);
 576   for (i = 0; i < dfa->nodes_len; ++i)
 577     {
 578       if (dfa->eclosures != NULL)
 579         re_node_set_free (dfa->eclosures + i);
 580       if (dfa->inveclosures != NULL)
 581         re_node_set_free (dfa->inveclosures + i);
 582       if (dfa->edests != NULL)
 583         re_node_set_free (dfa->edests + i);
 584     }
 585   re_free (dfa->edests);
 586   re_free (dfa->eclosures);
 587   re_free (dfa->inveclosures);
 588   re_free (dfa->nodes);
 589
 590   if (dfa->state_table)
 591     for (i = 0; i <= dfa->state_hash_mask; ++i)
 592       {
 593         struct re_state_table_entry *entry = dfa->state_table + i;
 594         for (j = 0; j < entry->num; ++j)
 595           {
 596             re_dfastate_t *state = entry->array[j];
 597             free_state (state);
 598           }
 599         re_free (entry->array);
 600       }
 601   re_free (dfa->state_table);
 602 #ifdef RE_ENABLE_I18N
 603   if (dfa->sb_char != utf8_sb_map)
 604     re_free (dfa->sb_char);
 605 #endif
 606   re_free (dfa->subexp_map);
 607 #ifdef DEBUG
 608   re_free (dfa->re_str);
 609 #endif
 610
 611   re_free (dfa);
 612 }
 613
 614
 615 /* Free dynamically allocated space used by PREG.  */
 616
 617 void
 618 regfree (regex_t *preg)
 619 {
 620   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 621   if (BE (dfa != NULL, 1))
 622     free_dfa_content (dfa);
 623   preg->re_buffer = NULL;
 624   preg->re_allocated = 0;
 625
 626   re_free (preg->re_fastmap);
 627   preg->re_fastmap = NULL;
 628
 629   re_free (preg->re_translate);
 630   preg->re_translate = NULL;
 631 }
 632 #ifdef _LIBC
 633 weak_alias (__regfree, regfree)
 634 #endif
 635 \f
 636 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 637    them unless specifically requested.  */
 638
 639 #if defined _REGEX_RE_COMP || defined _LIBC
 640
 641 /* BSD has one and only one pattern buffer.  */
 642 static struct re_pattern_buffer re_comp_buf;
 643
 644 char *
 645 # ifdef _LIBC
 646 /* Make these definitions weak in libc, so POSIX programs can redefine
 647    these names if they don't use our functions, and still use
 648    regcomp/regexec above without link errors.  */
 649 weak_function
 650 # endif
 651 re_comp (const char *s)
 652 {
 653   reg_errcode_t ret;
 654   char *fastmap;
 655
 656   if (!s)
 657     {
 658       if (!re_comp_buf.re_buffer)
 659         return gettext ("No previous regular expression");
 660       return 0;
 661     }
 662
 663   if (re_comp_buf.re_buffer)
 664     {
 665       fastmap = re_comp_buf.re_fastmap;
 666       re_comp_buf.re_fastmap = NULL;
 667       __regfree (&re_comp_buf);
 668       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 669       re_comp_buf.re_fastmap = fastmap;
 670     }
 671
 672   if (re_comp_buf.re_fastmap == NULL)
 673     {
 674       re_comp_buf.re_fastmap = (char *) malloc (SBC_MAX);
 675       if (re_comp_buf.re_fastmap == NULL)
 676         return (char *) gettext (__re_error_msgid
 677                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 678     }
 679
 680   /* Since `re_exec' always passes NULL for the `regs' argument, we
 681      don't need to initialize the pattern buffer fields which affect it.  */
 682
 683   /* Match anchors at newlines.  */
 684   re_comp_buf.re_newline_anchor = 1;
 685
 686   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 687
 688   if (!ret)
 689     return NULL;
 690
 691   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 692   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 693 }
 694
 695 #ifdef _LIBC
 696 libc_freeres_fn (free_mem)
 697 {
 698   __regfree (&re_comp_buf);
 699 }
 700 #endif
 701
 702 #endif /* _REGEX_RE_COMP */
 703 \f
 704 /* Internal entry point.
 705    Compile the regular expression PATTERN, whose length is LENGTH.
 706    SYNTAX indicate regular expression's syntax.  */
 707
 708 static reg_errcode_t
 709 re_compile_internal (regex_t *preg, const char *pattern, Idx length,
 710                      reg_syntax_t syntax)
 711 {
 712   reg_errcode_t err = REG_NOERROR;
 713   re_dfa_t *dfa;
 714   re_string_t regexp;
 715
 716   /* Initialize the pattern buffer.  */
 717   preg->re_fastmap_accurate = 0;
 718   preg->re_syntax = syntax;
 719   preg->re_not_bol = preg->re_not_eol = 0;
 720   preg->re_used = 0;
 721   preg->re_nsub = 0;
 722   preg->re_can_be_null = 0;
 723   preg->re_regs_allocated = REG_UNALLOCATED;
 724
 725   /* Initialize the dfa.  */
 726   dfa = (re_dfa_t *) preg->re_buffer;
 727   if (BE (preg->re_allocated < sizeof (re_dfa_t), 0))
 728     {
 729       /* If zero allocated, but buffer is non-null, try to realloc
 730          enough space.  This loses if buffer's address is bogus, but
 731          that is the user's responsibility.  If buffer is null this
 732          is a simple allocation.  */
 733       dfa = re_realloc (preg->re_buffer, re_dfa_t, 1);
 734       if (dfa == NULL)
 735         return REG_ESPACE;
 736       preg->re_allocated = sizeof (re_dfa_t);
 737       preg->re_buffer = (unsigned char *) dfa;
 738     }
 739   preg->re_used = sizeof (re_dfa_t);
 740
 741   __libc_lock_init (dfa->lock);
 742
 743   err = init_dfa (dfa, length);
 744   if (BE (err != REG_NOERROR, 0))
 745     {
 746       free_dfa_content (dfa);
 747       preg->re_buffer = NULL;
 748       preg->re_allocated = 0;
 749       return err;
 750     }
 751 #ifdef DEBUG
 752   dfa->re_str = re_malloc (char, length + 1);
 753   strncpy (dfa->re_str, pattern, length + 1);
 754 #endif
 755
 756   err = re_string_construct (&regexp, pattern, length, preg->re_translate,
 757                              syntax & REG_IGNORE_CASE, dfa);
 758   if (BE (err != REG_NOERROR, 0))
 759     {
 760     re_compile_internal_free_return:
 761       free_workarea_compile (preg);
 762       re_string_destruct (&regexp);
 763       free_dfa_content (dfa);
 764       preg->re_buffer = NULL;
 765       preg->re_allocated = 0;
 766       return err;
 767     }
 768
 769   /* Parse the regular expression, and build a structure tree.  */
 770   preg->re_nsub = 0;
 771   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 772   if (BE (dfa->str_tree == NULL, 0))
 773     goto re_compile_internal_free_return;
 774
 775   /* Analyze the tree and create the nfa.  */
 776   err = analyze (preg);
 777   if (BE (err != REG_NOERROR, 0))
 778     goto re_compile_internal_free_return;
 779
 780 #ifdef RE_ENABLE_I18N
 781   /* If possible, do searching in single byte encoding to speed things up.  */
 782   if (dfa->is_utf8 && !(syntax & REG_IGNORE_CASE) && preg->re_translate == NULL)
 783     optimize_utf8 (dfa);
 784 #endif
 785
 786   /* Then create the initial state of the dfa.  */
 787   err = create_initial_state (dfa);
 788
 789   /* Release work areas.  */
 790   free_workarea_compile (preg);
 791   re_string_destruct (&regexp);
 792
 793   if (BE (err != REG_NOERROR, 0))
 794     {
 795       free_dfa_content (dfa);
 796       preg->re_buffer = NULL;
 797       preg->re_allocated = 0;
 798     }
 799
 800   return err;
 801 }
 802
 803 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 804    as the initial length of some arrays.  */
 805
 806 static reg_errcode_t
 807 init_dfa (re_dfa_t *dfa, Idx pat_len)
 808 {
 809   __re_size_t table_size;
 810 #ifndef _LIBC
 811   char *codeset_name;
 812 #endif
 813
 814   memset (dfa, '\0', sizeof (re_dfa_t));
 815
 816   /* Force allocation of str_tree_storage the first time.  */
 817   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 818
 819   dfa->nodes_alloc = pat_len + 1;
 820   dfa->nodes = re_xmalloc (re_token_t, dfa->nodes_alloc);
 821
 822   /*  table_size = 2 ^ ceil(log pat_len) */
 823   for (table_size = 1; table_size <= pat_len; table_size <<= 1)
 824     if (0 < (Idx) -1 && table_size == 0)
 825       return REG_ESPACE;
 826
 827   dfa->state_table = re_calloc (struct re_state_table_entry, table_size);
 828   dfa->state_hash_mask = table_size - 1;
 829
 830   dfa->mb_cur_max = MB_CUR_MAX;
 831 #ifdef _LIBC
 832   if (dfa->mb_cur_max == 6
 833       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 834     dfa->is_utf8 = 1;
 835   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 836                        != 0);
 837 #else
 838 # ifdef HAVE_LANGINFO_CODESET
 839   codeset_name = nl_langinfo (CODESET);
 840 # else
 841   codeset_name = getenv ("LC_ALL");
 842   if (codeset_name == NULL || codeset_name[0] == '\0')
 843     codeset_name = getenv ("LC_CTYPE");
 844   if (codeset_name == NULL || codeset_name[0] == '\0')
 845     codeset_name = getenv ("LANG");
 846   if (codeset_name == NULL)
 847     codeset_name = "";
 848   else if (strchr (codeset_name, '.') !=  NULL)
 849     codeset_name = strchr (codeset_name, '.') + 1;
 850 # endif
 851
 852   if (strcasecmp (codeset_name, "UTF-8") == 0
 853       || strcasecmp (codeset_name, "UTF8") == 0)
 854     dfa->is_utf8 = 1;
 855
 856   /* We check exhaustively in the loop below if this charset is a
 857      superset of ASCII.  */
 858   dfa->map_notascii = 0;
 859 #endif
 860
 861 #ifdef RE_ENABLE_I18N
 862   if (dfa->mb_cur_max > 1)
 863     {
 864       if (dfa->is_utf8)
 865         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 866       else
 867         {
 868           int i, j, ch;
 869
 870           dfa->sb_char = re_calloc (bitset_word, BITSET_WORDS);
 871           if (BE (dfa->sb_char == NULL, 0))
 872             return REG_ESPACE;
 873
 874           /* Set the bits corresponding to single byte chars.  */
 875           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 876             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 877               {
 878                 wint_t wch = __btowc (ch);
 879                 if (wch != WEOF)
 880                   dfa->sb_char[i] |= (bitset_word) 1 << j;
 881 # ifndef _LIBC
 882                 if (isascii (ch) && wch != ch)
 883                   dfa->map_notascii = 1;
 884 # endif
 885               }
 886         }
 887     }
 888 #endif
 889
 890   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 891     return REG_ESPACE;
 892   return REG_NOERROR;
 893 }
 894
 895 /* Initialize WORD_CHAR table, which indicate which character is
 896    "word".  In this case "word" means that it is the word construction
 897    character used by some operators like "\<", "\>", etc.  */
 898
 899 static void
 900 init_word_char (re_dfa_t *dfa)
 901 {
 902   int i, j, ch;
 903   dfa->word_ops_used = 1;
 904   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 905     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 906       if (isalnum (ch) || ch == '_')
 907         dfa->word_char[i] |= (bitset_word) 1 << j;
 908 }
 909
 910 /* Free the work area which are only used while compiling.  */
 911
 912 static void
 913 free_workarea_compile (regex_t *preg)
 914 {
 915   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 916   bin_tree_storage_t *storage, *next;
 917   for (storage = dfa->str_tree_storage; storage; storage = next)
 918     {
 919       next = storage->next;
 920       re_free (storage);
 921     }
 922   dfa->str_tree_storage = NULL;
 923   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 924   dfa->str_tree = NULL;
 925   re_free (dfa->org_indices);
 926   dfa->org_indices = NULL;
 927 }
 928
 929 /* Create initial states for all contexts.  */
 930
 931 static reg_errcode_t
 932 create_initial_state (re_dfa_t *dfa)
 933 {
 934   Idx first, i;
 935   reg_errcode_t err;
 936   re_node_set init_nodes;
 937
 938   /* Initial states have the epsilon closure of the node which is
 939      the first node of the regular expression.  */
 940   first = dfa->str_tree->first->node_idx;
 941   dfa->init_node = first;
 942   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 943   if (BE (err != REG_NOERROR, 0))
 944     return err;
 945
 946   /* The back-references which are in initial states can epsilon transit,
 947      since in this case all of the subexpressions can be null.
 948      Then we add epsilon closures of the nodes which are the next nodes of
 949      the back-references.  */
 950   if (dfa->nbackref > 0)
 951     for (i = 0; i < init_nodes.nelem; ++i)
 952       {
 953         Idx node_idx = init_nodes.elems[i];
 954         re_token_type_t type = dfa->nodes[node_idx].type;
 955
 956         Idx clexp_idx;
 957         if (type != OP_BACK_REF)
 958           continue;
 959         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 960           {
 961             re_token_t *clexp_node;
 962             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 963             if (clexp_node->type == OP_CLOSE_SUBEXP
 964                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
 965               break;
 966           }
 967         if (clexp_idx == init_nodes.nelem)
 968           continue;
 969
 970         if (type == OP_BACK_REF)
 971           {
 972             Idx dest_idx = dfa->edests[node_idx].elems[0];
 973             if (!re_node_set_contains (&init_nodes, dest_idx))
 974               {
 975                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
 976                 i = 0;
 977               }
 978           }
 979       }
 980
 981   /* It must be the first time to invoke acquire_state.  */
 982   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
 983   /* We don't check ERR here, since the initial state must not be NULL.  */
 984   if (BE (dfa->init_state == NULL, 0))
 985     return err;
 986   if (dfa->init_state->has_constraint)
 987     {
 988       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
 989                                                        CONTEXT_WORD);
 990       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
 991                                                      CONTEXT_NEWLINE);
 992       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
 993                                                          &init_nodes,
 994                                                          CONTEXT_NEWLINE
 995                                                          | CONTEXT_BEGBUF);
 996       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 997               || dfa->init_state_begbuf == NULL, 0))
 998         return err;
 999     }
1000   else
1001     dfa->init_state_word = dfa->init_state_nl
1002       = dfa->init_state_begbuf = dfa->init_state;
1003
1004   re_node_set_free (&init_nodes);
1005   return REG_NOERROR;
1006 }
1007 \f
1008 #ifdef RE_ENABLE_I18N
1009 /* If it is possible to do searching in single byte encoding instead of UTF-8
1010    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1011    DFA nodes where needed.  */
1012
1013 static void
1014 optimize_utf8 (re_dfa_t *dfa)
1015 {
1016   Idx node;
1017   int i;
1018   bool mb_chars = false;
1019   bool has_period = false;
1020
1021   for (node = 0; node < dfa->nodes_len; ++node)
1022     switch (dfa->nodes[node].type)
1023       {
1024       case CHARACTER:
1025         if (dfa->nodes[node].opr.c >= 0x80)
1026           mb_chars = true;
1027         break;
1028       case ANCHOR:
1029         switch (dfa->nodes[node].opr.idx)
1030           {
1031           case LINE_FIRST:
1032           case LINE_LAST:
1033           case BUF_FIRST:
1034           case BUF_LAST:
1035             break;
1036           default:
1037             /* Word anchors etc. cannot be handled.  */
1038             return;
1039           }
1040         break;
1041       case OP_PERIOD:
1042         has_period = true;
1043         break;
1044       case OP_BACK_REF:
1045       case OP_ALT:
1046       case END_OF_RE:
1047       case OP_DUP_ASTERISK:
1048       case OP_OPEN_SUBEXP:
1049       case OP_CLOSE_SUBEXP:
1050         break;
1051       case COMPLEX_BRACKET:
1052         return;
1053       case SIMPLE_BRACKET:
1054         /* Just double check.  */
1055         {
1056           int rshift =
1057             (SBC_MAX / 2 % BITSET_WORD_BITS == 0
1058              ? 0
1059              : BITSET_WORD_BITS - SBC_MAX / 2 % BITSET_WORD_BITS);
1060           for (i = SBC_MAX / 2 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1061             {
1062               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1063                 return;
1064               rshift = 0;
1065             }
1066         }
1067         break;
1068       default:
1069         abort ();
1070       }
1071
1072   if (mb_chars || has_period)
1073     for (node = 0; node < dfa->nodes_len; ++node)
1074       {
1075         if (dfa->nodes[node].type == CHARACTER
1076             && dfa->nodes[node].opr.c >= 0x80)
1077           dfa->nodes[node].mb_partial = 0;
1078         else if (dfa->nodes[node].type == OP_PERIOD)
1079           dfa->nodes[node].type = OP_UTF8_PERIOD;
1080       }
1081
1082   /* The search can be in single byte locale.  */
1083   dfa->mb_cur_max = 1;
1084   dfa->is_utf8 = 0;
1085   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1086 }
1087 #endif
1088 \f
1089 /* Analyze the structure tree, and calculate "first", "next", "edest",
1090    "eclosure", and "inveclosure".  */
1091
1092 static reg_errcode_t
1093 analyze (regex_t *preg)
1094 {
1095   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1096   reg_errcode_t ret;
1097
1098   /* Allocate arrays.  */
1099   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1100   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1101   dfa->edests = re_xmalloc (re_node_set, dfa->nodes_alloc);
1102   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1103   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1104           || dfa->eclosures == NULL, 0))
1105     return REG_ESPACE;
1106
1107   dfa->subexp_map = re_xmalloc (Idx, preg->re_nsub);
1108   if (dfa->subexp_map != NULL)
1109     {
1110       Idx i;
1111       for (i = 0; i < preg->re_nsub; i++)
1112         dfa->subexp_map[i] = i;
1113       preorder (dfa->str_tree, optimize_subexps, dfa);
1114       for (i = 0; i < preg->re_nsub; i++)
1115         if (dfa->subexp_map[i] != i)
1116           break;
1117       if (i == preg->re_nsub)
1118         {
1119           free (dfa->subexp_map);
1120           dfa->subexp_map = NULL;
1121         }
1122     }
1123
1124   ret = postorder (dfa->str_tree, lower_subexps, preg);
1125   if (BE (ret != REG_NOERROR, 0))
1126     return ret;
1127   ret = postorder (dfa->str_tree, calc_first, dfa);
1128   if (BE (ret != REG_NOERROR, 0))
1129     return ret;
1130   preorder (dfa->str_tree, calc_next, dfa);
1131   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1132   if (BE (ret != REG_NOERROR, 0))
1133     return ret;
1134   ret = calc_eclosure (dfa);
1135   if (BE (ret != REG_NOERROR, 0))
1136     return ret;
1137
1138   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1139      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1140   if ((!preg->re_no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1141       || dfa->nbackref)
1142     {
1143       dfa->inveclosures = re_xmalloc (re_node_set, dfa->nodes_len);
1144       if (BE (dfa->inveclosures == NULL, 0))
1145         return REG_ESPACE;
1146       ret = calc_inveclosure (dfa);
1147     }
1148
1149   return ret;
1150 }
1151
1152 /* Our parse trees are very unbalanced, so we cannot use a stack to
1153    implement parse tree visits.  Instead, we use parent pointers and
1154    some hairy code in these two functions.  */
1155 static reg_errcode_t
1156 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1157            void *extra)
1158 {
1159   bin_tree_t *node, *prev;
1160
1161   for (node = root; ; )
1162     {
1163       /* Descend down the tree, preferably to the left (or to the right
1164          if that's the only child).  */
1165       while (node->left || node->right)
1166         if (node->left)
1167           node = node->left;
1168         else
1169           node = node->right;
1170
1171       do
1172         {
1173           reg_errcode_t err = fn (extra, node);
1174           if (BE (err != REG_NOERROR, 0))
1175             return err;
1176           if (node->parent == NULL)
1177             return REG_NOERROR;
1178           prev = node;
1179           node = node->parent;
1180         }
1181       /* Go up while we have a node that is reached from the right.  */
1182       while (node->right == prev || node->right == NULL);
1183       node = node->right;
1184     }
1185 }
1186
1187 static reg_errcode_t
1188 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1189           void *extra)
1190 {
1191   bin_tree_t *node;
1192
1193   for (node = root; ; )
1194     {
1195       reg_errcode_t err = fn (extra, node);
1196       if (BE (err != REG_NOERROR, 0))
1197         return err;
1198
1199       /* Go to the left node, or up and to the right.  */
1200       if (node->left)
1201         node = node->left;
1202       else
1203         {
1204           bin_tree_t *prev = NULL;
1205           while (node->right == prev || node->right == NULL)
1206             {
1207               prev = node;
1208               node = node->parent;
1209               if (!node)
1210                 return REG_NOERROR;
1211             }
1212           node = node->right;
1213         }
1214     }
1215 }
1216
1217 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1218    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1219    backreferences as well.  Requires a preorder visit.  */
1220 static reg_errcode_t
1221 optimize_subexps (void *extra, bin_tree_t *node)
1222 {
1223   re_dfa_t *dfa = (re_dfa_t *) extra;
1224
1225   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1226     {
1227       int idx = node->token.opr.idx;
1228       node->token.opr.idx = dfa->subexp_map[idx];
1229       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1230     }
1231
1232   else if (node->token.type == SUBEXP
1233            && node->left && node->left->token.type == SUBEXP)
1234     {
1235       Idx other_idx = node->left->token.opr.idx;
1236
1237       node->left = node->left->left;
1238       if (node->left)
1239         node->left->parent = node;
1240
1241       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1242       if (other_idx < BITSET_WORD_BITS)
1243         dfa->used_bkref_map &= ~ ((bitset_word) 1 << other_idx);
1244     }
1245
1246   return REG_NOERROR;
1247 }
1248
1249 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1250    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1251 static reg_errcode_t
1252 lower_subexps (void *extra, bin_tree_t *node)
1253 {
1254   regex_t *preg = (regex_t *) extra;
1255   reg_errcode_t err = REG_NOERROR;
1256
1257   if (node->left && node->left->token.type == SUBEXP)
1258     {
1259       node->left = lower_subexp (&err, preg, node->left);
1260       if (node->left)
1261         node->left->parent = node;
1262     }
1263   if (node->right && node->right->token.type == SUBEXP)
1264     {
1265       node->right = lower_subexp (&err, preg, node->right);
1266       if (node->right)
1267         node->right->parent = node;
1268     }
1269
1270   return err;
1271 }
1272
1273 static bin_tree_t *
1274 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1275 {
1276   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1277   bin_tree_t *body = node->left;
1278   bin_tree_t *op, *cls, *tree1, *tree;
1279
1280   if (preg->re_no_sub
1281       /* We do not optimize empty subexpressions, because otherwise we may
1282          have bad CONCAT nodes with NULL children.  This is obviously not
1283          very common, so we do not lose much.  An example that triggers
1284          this case is the sed "script" /\(\)/x.  */
1285       && node->left != NULL
1286       && ! (node->token.opr.idx < BITSET_WORD_BITS
1287             && dfa->used_bkref_map & ((bitset_word) 1 << node->token.opr.idx)))
1288     return node->left;
1289
1290   /* Convert the SUBEXP node to the concatenation of an
1291      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1292   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1293   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1294   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1295   tree = create_tree (dfa, op, tree1, CONCAT);
1296   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1297     {
1298       *err = REG_ESPACE;
1299       return NULL;
1300     }
1301
1302   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1303   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1304   return tree;
1305 }
1306
1307 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1308    nodes.  Requires a postorder visit.  */
1309 static reg_errcode_t
1310 calc_first (void *extra, bin_tree_t *node)
1311 {
1312   re_dfa_t *dfa = (re_dfa_t *) extra;
1313   if (node->token.type == CONCAT)
1314     {
1315       node->first = node->left->first;
1316       node->node_idx = node->left->node_idx;
1317     }
1318   else
1319     {
1320       node->first = node;
1321       node->node_idx = re_dfa_add_node (dfa, node->token);
1322       if (BE (node->node_idx == REG_MISSING, 0))
1323         return REG_ESPACE;
1324     }
1325   return REG_NOERROR;
1326 }
1327
1328 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1329 static reg_errcode_t
1330 calc_next (void *extra, bin_tree_t *node)
1331 {
1332   switch (node->token.type)
1333     {
1334     case OP_DUP_ASTERISK:
1335       node->left->next = node;
1336       break;
1337     case CONCAT:
1338       node->left->next = node->right->first;
1339       node->right->next = node->next;
1340       break;
1341     default:
1342       if (node->left)
1343         node->left->next = node->next;
1344       if (node->right)
1345         node->right->next = node->next;
1346       break;
1347     }
1348   return REG_NOERROR;
1349 }
1350
1351 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1352 static reg_errcode_t
1353 link_nfa_nodes (void *extra, bin_tree_t *node)
1354 {
1355   re_dfa_t *dfa = (re_dfa_t *) extra;
1356   Idx idx = node->node_idx;
1357   reg_errcode_t err = REG_NOERROR;
1358
1359   switch (node->token.type)
1360     {
1361     case CONCAT:
1362       break;
1363
1364     case END_OF_RE:
1365       assert (node->next == NULL);
1366       break;
1367
1368     case OP_DUP_ASTERISK:
1369     case OP_ALT:
1370       {
1371         Idx left, right;
1372         dfa->has_plural_match = 1;
1373         if (node->left != NULL)
1374           left = node->left->first->node_idx;
1375         else
1376           left = node->next->node_idx;
1377         if (node->right != NULL)
1378           right = node->right->first->node_idx;
1379         else
1380           right = node->next->node_idx;
1381         assert (REG_VALID_INDEX (left));
1382         assert (REG_VALID_INDEX (right));
1383         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1384       }
1385       break;
1386
1387     case ANCHOR:
1388     case OP_OPEN_SUBEXP:
1389     case OP_CLOSE_SUBEXP:
1390       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1391       break;
1392
1393     case OP_BACK_REF:
1394       dfa->nexts[idx] = node->next->node_idx;
1395       if (node->token.type == OP_BACK_REF)
1396         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1397       break;
1398
1399     default:
1400       assert (!IS_EPSILON_NODE (node->token.type));
1401       dfa->nexts[idx] = node->next->node_idx;
1402       break;
1403     }
1404
1405   return err;
1406 }
1407
1408 /* Duplicate the epsilon closure of the node ROOT_NODE.
1409    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1410    to their own constraint.  */
1411
1412 static reg_errcode_t
1413 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node,
1414                         Idx top_clone_node, Idx root_node,
1415                         unsigned int init_constraint)
1416 {
1417   Idx org_node, clone_node;
1418   bool ok;
1419   unsigned int constraint = init_constraint;
1420   for (org_node = top_org_node, clone_node = top_clone_node;;)
1421     {
1422       Idx org_dest, clone_dest;
1423       if (dfa->nodes[org_node].type == OP_BACK_REF)
1424         {
1425           /* If the back reference epsilon-transit, its destination must
1426              also have the constraint.  Then duplicate the epsilon closure
1427              of the destination of the back reference, and store it in
1428              edests of the back reference.  */
1429           org_dest = dfa->nexts[org_node];
1430           re_node_set_empty (dfa->edests + clone_node);
1431           clone_dest = duplicate_node (dfa, org_dest, constraint);
1432           if (BE (clone_dest == REG_MISSING, 0))
1433             return REG_ESPACE;
1434           dfa->nexts[clone_node] = dfa->nexts[org_node];
1435           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1436           if (BE (! ok, 0))
1437             return REG_ESPACE;
1438         }
1439       else if (dfa->edests[org_node].nelem == 0)
1440         {
1441           /* In case of the node can't epsilon-transit, don't duplicate the
1442              destination and store the original destination as the
1443              destination of the node.  */
1444           dfa->nexts[clone_node] = dfa->nexts[org_node];
1445           break;
1446         }
1447       else if (dfa->edests[org_node].nelem == 1)
1448         {
1449           /* In case of the node can epsilon-transit, and it has only one
1450              destination.  */
1451           org_dest = dfa->edests[org_node].elems[0];
1452           re_node_set_empty (dfa->edests + clone_node);
1453           if (dfa->nodes[org_node].type == ANCHOR)
1454             {
1455               /* In case of the node has another constraint, append it.  */
1456               if (org_node == root_node && clone_node != org_node)
1457                 {
1458                   /* ...but if the node is root_node itself, it means the
1459                      epsilon closure have a loop, then tie it to the
1460                      destination of the root_node.  */
1461                   ok = re_node_set_insert (dfa->edests + clone_node,
1462                                             org_dest);
1463                   if (BE (! ok, 0))
1464                     return REG_ESPACE;
1465                   break;
1466                 }
1467               constraint |= dfa->nodes[org_node].opr.ctx_type;
1468             }
1469           clone_dest = duplicate_node (dfa, org_dest, constraint);
1470           if (BE (clone_dest == REG_MISSING, 0))
1471             return REG_ESPACE;
1472           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1473           if (BE (! ok, 0))
1474             return REG_ESPACE;
1475         }
1476       else /* dfa->edests[org_node].nelem == 2 */
1477         {
1478           /* In case of the node can epsilon-transit, and it has two
1479              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1480           org_dest = dfa->edests[org_node].elems[0];
1481           re_node_set_empty (dfa->edests + clone_node);
1482           /* Search for a duplicated node which satisfies the constraint.  */
1483           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1484           if (clone_dest == REG_MISSING)
1485             {
1486               /* There are no such a duplicated node, create a new one.  */
1487               reg_errcode_t err;
1488               clone_dest = duplicate_node (dfa, org_dest, constraint);
1489               if (BE (clone_dest == REG_MISSING, 0))
1490                 return REG_ESPACE;
1491               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1492               if (BE (! ok, 0))
1493                 return REG_ESPACE;
1494               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1495                                             root_node, constraint);
1496               if (BE (err != REG_NOERROR, 0))
1497                 return err;
1498             }
1499           else
1500             {
1501               /* There are a duplicated node which satisfy the constraint,
1502                  use it to avoid infinite loop.  */
1503               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1504               if (BE (! ok, 0))
1505                 return REG_ESPACE;
1506             }
1507
1508           org_dest = dfa->edests[org_node].elems[1];
1509           clone_dest = duplicate_node (dfa, org_dest, constraint);
1510           if (BE (clone_dest == REG_MISSING, 0))
1511             return REG_ESPACE;
1512           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1513           if (BE (! ok, 0))
1514             return REG_ESPACE;
1515         }
1516       org_node = org_dest;
1517       clone_node = clone_dest;
1518     }
1519   return REG_NOERROR;
1520 }
1521
1522 /* Search for a node which is duplicated from the node ORG_NODE, and
1523    satisfies the constraint CONSTRAINT.  */
1524
1525 static Idx
1526 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1527                         unsigned int constraint)
1528 {
1529   Idx idx;
1530   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1531     {
1532       if (org_node == dfa->org_indices[idx]
1533           && constraint == dfa->nodes[idx].constraint)
1534         return idx; /* Found.  */
1535     }
1536   return REG_MISSING; /* Not found.  */
1537 }
1538
1539 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1540    Return the index of the new node, or REG_MISSING if insufficient storage is
1541    available.  */
1542
1543 static Idx
1544 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1545 {
1546   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1547   if (BE (dup_idx != REG_MISSING, 1))
1548     {
1549       dfa->nodes[dup_idx].constraint = constraint;
1550       if (dfa->nodes[org_idx].type == ANCHOR)
1551         dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1552       dfa->nodes[dup_idx].duplicated = 1;
1553
1554       /* Store the index of the original node.  */
1555       dfa->org_indices[dup_idx] = org_idx;
1556     }
1557   return dup_idx;
1558 }
1559
1560 static reg_errcode_t
1561 calc_inveclosure (re_dfa_t *dfa)
1562 {
1563   Idx src, idx;
1564   bool ok;
1565   for (idx = 0; idx < dfa->nodes_len; ++idx)
1566     re_node_set_init_empty (dfa->inveclosures + idx);
1567
1568   for (src = 0; src < dfa->nodes_len; ++src)
1569     {
1570       Idx *elems = dfa->eclosures[src].elems;
1571       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1572         {
1573           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1574           if (BE (! ok, 0))
1575             return REG_ESPACE;
1576         }
1577     }
1578
1579   return REG_NOERROR;
1580 }
1581
1582 /* Calculate "eclosure" for all the node in DFA.  */
1583
1584 static reg_errcode_t
1585 calc_eclosure (re_dfa_t *dfa)
1586 {
1587   Idx node_idx;
1588   bool incomplete;
1589 #ifdef DEBUG
1590   assert (dfa->nodes_len > 0);
1591 #endif
1592   incomplete = false;
1593   /* For each nodes, calculate epsilon closure.  */
1594   for (node_idx = 0; ; ++node_idx)
1595     {
1596       reg_errcode_t err;
1597       re_node_set eclosure_elem;
1598       if (node_idx == dfa->nodes_len)
1599         {
1600           if (!incomplete)
1601             break;
1602           incomplete = false;
1603           node_idx = 0;
1604         }
1605
1606 #ifdef DEBUG
1607       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1608 #endif
1609
1610       /* If we have already calculated, skip it.  */
1611       if (dfa->eclosures[node_idx].nelem != 0)
1612         continue;
1613       /* Calculate epsilon closure of `node_idx'.  */
1614       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1615       if (BE (err != REG_NOERROR, 0))
1616         return err;
1617
1618       if (dfa->eclosures[node_idx].nelem == 0)
1619         {
1620           incomplete = true;
1621           re_node_set_free (&eclosure_elem);
1622         }
1623     }
1624   return REG_NOERROR;
1625 }
1626
1627 /* Calculate epsilon closure of NODE.  */
1628
1629 static reg_errcode_t
1630 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1631 {
1632   reg_errcode_t err;
1633   unsigned int constraint;
1634   Idx i;
1635   bool incomplete;
1636   bool ok;
1637   re_node_set eclosure;
1638   incomplete = false;
1639   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1640   if (BE (err != REG_NOERROR, 0))
1641     return err;
1642
1643   /* This indicates that we are calculating this node now.
1644      We reference this value to avoid infinite loop.  */
1645   dfa->eclosures[node].nelem = REG_MISSING;
1646
1647   constraint = ((dfa->nodes[node].type == ANCHOR)
1648                 ? dfa->nodes[node].opr.ctx_type : 0);
1649   /* If the current node has constraints, duplicate all nodes.
1650      Since they must inherit the constraints.  */
1651   if (constraint
1652       && dfa->edests[node].nelem
1653       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1654     {
1655       Idx org_node, cur_node;
1656       org_node = cur_node = node;
1657       err = duplicate_node_closure (dfa, node, node, node, constraint);
1658       if (BE (err != REG_NOERROR, 0))
1659         return err;
1660     }
1661
1662   /* Expand each epsilon destination nodes.  */
1663   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1664     for (i = 0; i < dfa->edests[node].nelem; ++i)
1665       {
1666         re_node_set eclosure_elem;
1667         Idx edest = dfa->edests[node].elems[i];
1668         /* If calculating the epsilon closure of `edest' is in progress,
1669            return intermediate result.  */
1670         if (dfa->eclosures[edest].nelem == REG_MISSING)
1671           {
1672             incomplete = true;
1673             continue;
1674           }
1675         /* If we haven't calculated the epsilon closure of `edest' yet,
1676            calculate now. Otherwise use calculated epsilon closure.  */
1677         if (dfa->eclosures[edest].nelem == 0)
1678           {
1679             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1680             if (BE (err != REG_NOERROR, 0))
1681               return err;
1682           }
1683         else
1684           eclosure_elem = dfa->eclosures[edest];
1685         /* Merge the epsilon closure of `edest'.  */
1686         re_node_set_merge (&eclosure, &eclosure_elem);
1687         /* If the epsilon closure of `edest' is incomplete,
1688            the epsilon closure of this node is also incomplete.  */
1689         if (dfa->eclosures[edest].nelem == 0)
1690           {
1691             incomplete = true;
1692             re_node_set_free (&eclosure_elem);
1693           }
1694       }
1695
1696   /* Epsilon closures include itself.  */
1697   ok = re_node_set_insert (&eclosure, node);
1698   if (BE (! ok, 0))
1699     return REG_ESPACE;
1700   if (incomplete && !root)
1701     dfa->eclosures[node].nelem = 0;
1702   else
1703     dfa->eclosures[node] = eclosure;
1704   *new_set = eclosure;
1705   return REG_NOERROR;
1706 }
1707 \f
1708 /* Functions for token which are used in the parser.  */
1709
1710 /* Fetch a token from INPUT.
1711    We must not use this function inside bracket expressions.  */
1712
1713 static void
1714 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1715 {
1716   re_string_skip_bytes (input, peek_token (result, input, syntax));
1717 }
1718
1719 /* Peek a token from INPUT, and return the length of the token.
1720    We must not use this function inside bracket expressions.  */
1721
1722 static int
1723 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1724 {
1725   unsigned char c;
1726
1727   if (re_string_eoi (input))
1728     {
1729       token->type = END_OF_RE;
1730       return 0;
1731     }
1732
1733   c = re_string_peek_byte (input, 0);
1734   token->opr.c = c;
1735
1736   token->word_char = 0;
1737 #ifdef RE_ENABLE_I18N
1738   token->mb_partial = 0;
1739   if (input->mb_cur_max > 1 &&
1740       !re_string_first_byte (input, re_string_cur_idx (input)))
1741     {
1742       token->type = CHARACTER;
1743       token->mb_partial = 1;
1744       return 1;
1745     }
1746 #endif
1747   if (c == '\\')
1748     {
1749       unsigned char c2;
1750       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1751         {
1752           token->type = BACK_SLASH;
1753           return 1;
1754         }
1755
1756       c2 = re_string_peek_byte_case (input, 1);
1757       token->opr.c = c2;
1758       token->type = CHARACTER;
1759 #ifdef RE_ENABLE_I18N
1760       if (input->mb_cur_max > 1)
1761         {
1762           wint_t wc = re_string_wchar_at (input,
1763                                           re_string_cur_idx (input) + 1);
1764           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1765         }
1766       else
1767 #endif
1768         token->word_char = IS_WORD_CHAR (c2) != 0;
1769
1770       switch (c2)
1771         {
1772         case '|':
1773           if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_NO_BK_VBAR))
1774             token->type = OP_ALT;
1775           break;
1776         case '1': case '2': case '3': case '4': case '5':
1777         case '6': case '7': case '8': case '9':
1778           if (!(syntax & REG_NO_BK_REFS))
1779             {
1780               token->type = OP_BACK_REF;
1781               token->opr.idx = c2 - '1';
1782             }
1783           break;
1784         case '<':
1785           if (!(syntax & REG_NO_GNU_OPS))
1786             {
1787               token->type = ANCHOR;
1788               token->opr.ctx_type = WORD_FIRST;
1789             }
1790           break;
1791         case '>':
1792           if (!(syntax & REG_NO_GNU_OPS))
1793             {
1794               token->type = ANCHOR;
1795               token->opr.ctx_type = WORD_LAST;
1796             }
1797           break;
1798         case 'b':
1799           if (!(syntax & REG_NO_GNU_OPS))
1800             {
1801               token->type = ANCHOR;
1802               token->opr.ctx_type = WORD_DELIM;
1803             }
1804           break;
1805         case 'B':
1806           if (!(syntax & REG_NO_GNU_OPS))
1807             {
1808               token->type = ANCHOR;
1809               token->opr.ctx_type = NOT_WORD_DELIM;
1810             }
1811           break;
1812         case 'w':
1813           if (!(syntax & REG_NO_GNU_OPS))
1814             token->type = OP_WORD;
1815           break;
1816         case 'W':
1817           if (!(syntax & REG_NO_GNU_OPS))
1818             token->type = OP_NOTWORD;
1819           break;
1820         case 's':
1821           if (!(syntax & REG_NO_GNU_OPS))
1822             token->type = OP_SPACE;
1823           break;
1824         case 'S':
1825           if (!(syntax & REG_NO_GNU_OPS))
1826             token->type = OP_NOTSPACE;
1827           break;
1828         case '`':
1829           if (!(syntax & REG_NO_GNU_OPS))
1830             {
1831               token->type = ANCHOR;
1832               token->opr.ctx_type = BUF_FIRST;
1833             }
1834           break;
1835         case '\'':
1836           if (!(syntax & REG_NO_GNU_OPS))
1837             {
1838               token->type = ANCHOR;
1839               token->opr.ctx_type = BUF_LAST;
1840             }
1841           break;
1842         case '(':
1843           if (!(syntax & REG_NO_BK_PARENS))
1844             token->type = OP_OPEN_SUBEXP;
1845           break;
1846         case ')':
1847           if (!(syntax & REG_NO_BK_PARENS))
1848             token->type = OP_CLOSE_SUBEXP;
1849           break;
1850         case '+':
1851           if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1852             token->type = OP_DUP_PLUS;
1853           break;
1854         case '?':
1855           if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1856             token->type = OP_DUP_QUESTION;
1857           break;
1858         case '{':
1859           if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1860             token->type = OP_OPEN_DUP_NUM;
1861           break;
1862         case '}':
1863           if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1864             token->type = OP_CLOSE_DUP_NUM;
1865           break;
1866         default:
1867           break;
1868         }
1869       return 2;
1870     }
1871
1872   token->type = CHARACTER;
1873 #ifdef RE_ENABLE_I18N
1874   if (input->mb_cur_max > 1)
1875     {
1876       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1877       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1878     }
1879   else
1880 #endif
1881     token->word_char = IS_WORD_CHAR (token->opr.c);
1882
1883   switch (c)
1884     {
1885     case '\n':
1886       if (syntax & REG_NEWLINE_ALT)
1887         token->type = OP_ALT;
1888       break;
1889     case '|':
1890       if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_NO_BK_VBAR))
1891         token->type = OP_ALT;
1892       break;
1893     case '*':
1894       token->type = OP_DUP_ASTERISK;
1895       break;
1896     case '+':
1897       if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1898         token->type = OP_DUP_PLUS;
1899       break;
1900     case '?':
1901       if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1902         token->type = OP_DUP_QUESTION;
1903       break;
1904     case '{':
1905       if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1906         token->type = OP_OPEN_DUP_NUM;
1907       break;
1908     case '}':
1909       if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1910         token->type = OP_CLOSE_DUP_NUM;
1911       break;
1912     case '(':
1913       if (syntax & REG_NO_BK_PARENS)
1914         token->type = OP_OPEN_SUBEXP;
1915       break;
1916     case ')':
1917       if (syntax & REG_NO_BK_PARENS)
1918         token->type = OP_CLOSE_SUBEXP;
1919       break;
1920     case '[':
1921       token->type = OP_OPEN_BRACKET;
1922       break;
1923     case '.':
1924       token->type = OP_PERIOD;
1925       break;
1926     case '^':
1927       if (!(syntax & (REG_CONTEXT_INDEP_ANCHORS | REG_CARET_ANCHORS_HERE)) &&
1928           re_string_cur_idx (input) != 0)
1929         {
1930           char prev = re_string_peek_byte (input, -1);
1931           if (!(syntax & REG_NEWLINE_ALT) || prev != '\n')
1932             break;
1933         }
1934       token->type = ANCHOR;
1935       token->opr.ctx_type = LINE_FIRST;
1936       break;
1937     case '$':
1938       if (!(syntax & REG_CONTEXT_INDEP_ANCHORS) &&
1939           re_string_cur_idx (input) + 1 != re_string_length (input))
1940         {
1941           re_token_t next;
1942           re_string_skip_bytes (input, 1);
1943           peek_token (&next, input, syntax);
1944           re_string_skip_bytes (input, -1);
1945           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1946             break;
1947         }
1948       token->type = ANCHOR;
1949       token->opr.ctx_type = LINE_LAST;
1950       break;
1951     default:
1952       break;
1953     }
1954   return 1;
1955 }
1956
1957 /* Peek a token from INPUT, and return the length of the token.
1958    We must not use this function out of bracket expressions.  */
1959
1960 static int
1961 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1962 {
1963   unsigned char c;
1964   if (re_string_eoi (input))
1965     {
1966       token->type = END_OF_RE;
1967       return 0;
1968     }
1969   c = re_string_peek_byte (input, 0);
1970   token->opr.c = c;
1971
1972 #ifdef RE_ENABLE_I18N
1973   if (input->mb_cur_max > 1 &&
1974       !re_string_first_byte (input, re_string_cur_idx (input)))
1975     {
1976       token->type = CHARACTER;
1977       return 1;
1978     }
1979 #endif /* RE_ENABLE_I18N */
1980
1981   if (c == '\\' && (syntax & REG_BACKSLASH_ESCAPE_IN_LISTS)
1982       && re_string_cur_idx (input) + 1 < re_string_length (input))
1983     {
1984       /* In this case, '\' escape a character.  */
1985       unsigned char c2;
1986       re_string_skip_bytes (input, 1);
1987       c2 = re_string_peek_byte (input, 0);
1988       token->opr.c = c2;
1989       token->type = CHARACTER;
1990       return 1;
1991     }
1992   if (c == '[') /* '[' is a special char in a bracket exps.  */
1993     {
1994       unsigned char c2;
1995       int token_len;
1996       if (re_string_cur_idx (input) + 1 < re_string_length (input))
1997         c2 = re_string_peek_byte (input, 1);
1998       else
1999         c2 = 0;
2000       token->opr.c = c2;
2001       token_len = 2;
2002       switch (c2)
2003         {
2004         case '.':
2005           token->type = OP_OPEN_COLL_ELEM;
2006           break;
2007         case '=':
2008           token->type = OP_OPEN_EQUIV_CLASS;
2009           break;
2010         case ':':
2011           if (syntax & REG_CHAR_CLASSES)
2012             {
2013               token->type = OP_OPEN_CHAR_CLASS;
2014               break;
2015             }
2016           /* else fall through.  */
2017         default:
2018           token->type = CHARACTER;
2019           token->opr.c = c;
2020           token_len = 1;
2021           break;
2022         }
2023       return token_len;
2024     }
2025   switch (c)
2026     {
2027     case '-':
2028       token->type = OP_CHARSET_RANGE;
2029       break;
2030     case ']':
2031       token->type = OP_CLOSE_BRACKET;
2032       break;
2033     case '^':
2034       token->type = OP_NON_MATCH_LIST;
2035       break;
2036     default:
2037       token->type = CHARACTER;
2038     }
2039   return 1;
2040 }
2041 \f
2042 /* Functions for parser.  */
2043
2044 /* Entry point of the parser.
2045    Parse the regular expression REGEXP and return the structure tree.
2046    If an error is occured, ERR is set by error code, and return NULL.
2047    This function build the following tree, from regular expression <reg_exp>:
2048            CAT
2049            / \
2050           /   \
2051    <reg_exp>  EOR
2052
2053    CAT means concatenation.
2054    EOR means end of regular expression.  */
2055
2056 static bin_tree_t *
2057 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2058        reg_errcode_t *err)
2059 {
2060   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2061   bin_tree_t *tree, *eor, *root;
2062   re_token_t current_token;
2063   dfa->syntax = syntax;
2064   fetch_token (&current_token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2065   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2066   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2067     return NULL;
2068   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2069   if (tree != NULL)
2070     root = create_tree (dfa, tree, eor, CONCAT);
2071   else
2072     root = eor;
2073   if (BE (eor == NULL || root == NULL, 0))
2074     {
2075       *err = REG_ESPACE;
2076       return NULL;
2077     }
2078   return root;
2079 }
2080
2081 /* This function build the following tree, from regular expression
2082    <branch1>|<branch2>:
2083            ALT
2084            / \
2085           /   \
2086    <branch1> <branch2>
2087
2088    ALT means alternative, which represents the operator `|'.  */
2089
2090 static bin_tree_t *
2091 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2092                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2093 {
2094   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2095   bin_tree_t *tree, *branch = NULL;
2096   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2097   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2098     return NULL;
2099
2100   while (token->type == OP_ALT)
2101     {
2102       fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2103       if (token->type != OP_ALT && token->type != END_OF_RE
2104           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2105         {
2106           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2107           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2108             return NULL;
2109         }
2110       else
2111         branch = NULL;
2112       tree = create_tree (dfa, tree, branch, OP_ALT);
2113       if (BE (tree == NULL, 0))
2114         {
2115           *err = REG_ESPACE;
2116           return NULL;
2117         }
2118     }
2119   return tree;
2120 }
2121
2122 /* This function build the following tree, from regular expression
2123    <exp1><exp2>:
2124         CAT
2125         / \
2126        /   \
2127    <exp1> <exp2>
2128
2129    CAT means concatenation.  */
2130
2131 static bin_tree_t *
2132 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2133               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2134 {
2135   bin_tree_t *tree, *exp;
2136   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2137   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2138   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2139     return NULL;
2140
2141   while (token->type != OP_ALT && token->type != END_OF_RE
2142          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2143     {
2144       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2145       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2146         {
2147           return NULL;
2148         }
2149       if (tree != NULL && exp != NULL)
2150         {
2151           tree = create_tree (dfa, tree, exp, CONCAT);
2152           if (tree == NULL)
2153             {
2154               *err = REG_ESPACE;
2155               return NULL;
2156             }
2157         }
2158       else if (tree == NULL)
2159         tree = exp;
2160       /* Otherwise exp == NULL, we don't need to create new tree.  */
2161     }
2162   return tree;
2163 }
2164
2165 /* This function build the following tree, from regular expression a*:
2166          *
2167          |
2168          a
2169 */
2170
2171 static bin_tree_t *
2172 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2173                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2174 {
2175   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2176   bin_tree_t *tree;
2177   switch (token->type)
2178     {
2179     case CHARACTER:
2180       tree = create_token_tree (dfa, NULL, NULL, token);
2181       if (BE (tree == NULL, 0))
2182         {
2183           *err = REG_ESPACE;
2184           return NULL;
2185         }
2186 #ifdef RE_ENABLE_I18N
2187       if (dfa->mb_cur_max > 1)
2188         {
2189           while (!re_string_eoi (regexp)
2190                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2191             {
2192               bin_tree_t *mbc_remain;
2193               fetch_token (token, regexp, syntax);
2194               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2195               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2196               if (BE (mbc_remain == NULL || tree == NULL, 0))
2197                 {
2198                   *err = REG_ESPACE;
2199                   return NULL;
2200                 }
2201             }
2202         }
2203 #endif
2204       break;
2205     case OP_OPEN_SUBEXP:
2206       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2207       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2208         return NULL;
2209       break;
2210     case OP_OPEN_BRACKET:
2211       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2212       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2213         return NULL;
2214       break;
2215     case OP_BACK_REF:
2216       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2217         {
2218           *err = REG_ESUBREG;
2219           return NULL;
2220         }
2221       dfa->used_bkref_map |= 1 << token->opr.idx;
2222       tree = create_token_tree (dfa, NULL, NULL, token);
2223       if (BE (tree == NULL, 0))
2224         {
2225           *err = REG_ESPACE;
2226           return NULL;
2227         }
2228       ++dfa->nbackref;
2229       dfa->has_mb_node = 1;
2230       break;
2231     case OP_OPEN_DUP_NUM:
2232       if (syntax & REG_CONTEXT_INVALID_DUP)
2233         {
2234           *err = REG_BADRPT;
2235           return NULL;
2236         }
2237       /* FALLTHROUGH */
2238     case OP_DUP_ASTERISK:
2239     case OP_DUP_PLUS:
2240     case OP_DUP_QUESTION:
2241       if (syntax & REG_CONTEXT_INVALID_OPS)
2242         {
2243           *err = REG_BADRPT;
2244           return NULL;
2245         }
2246       else if (syntax & REG_CONTEXT_INDEP_OPS)
2247         {
2248           fetch_token (token, regexp, syntax);
2249           return parse_expression (regexp, preg, token, syntax, nest, err);
2250         }
2251       /* else fall through  */
2252     case OP_CLOSE_SUBEXP:
2253       if ((token->type == OP_CLOSE_SUBEXP) &&
2254           !(syntax & REG_UNMATCHED_RIGHT_PAREN_ORD))
2255         {
2256           *err = REG_ERPAREN;
2257           return NULL;
2258         }
2259       /* else fall through  */
2260     case OP_CLOSE_DUP_NUM:
2261       /* We treat it as a normal character.  */
2262
2263       /* Then we can these characters as normal characters.  */
2264       token->type = CHARACTER;
2265       /* mb_partial and word_char bits should be initialized already
2266          by peek_token.  */
2267       tree = create_token_tree (dfa, NULL, NULL, token);
2268       if (BE (tree == NULL, 0))
2269         {
2270           *err = REG_ESPACE;
2271           return NULL;
2272         }
2273       break;
2274     case ANCHOR:
2275       if ((token->opr.ctx_type
2276            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2277           && dfa->word_ops_used == 0)
2278         init_word_char (dfa);
2279       if (token->opr.ctx_type == WORD_DELIM
2280           || token->opr.ctx_type == NOT_WORD_DELIM)
2281         {
2282           bin_tree_t *tree_first, *tree_last;
2283           if (token->opr.ctx_type == WORD_DELIM)
2284             {
2285               token->opr.ctx_type = WORD_FIRST;
2286               tree_first = create_token_tree (dfa, NULL, NULL, token);
2287               token->opr.ctx_type = WORD_LAST;
2288             }
2289           else
2290             {
2291               token->opr.ctx_type = INSIDE_WORD;
2292               tree_first = create_token_tree (dfa, NULL, NULL, token);
2293               token->opr.ctx_type = INSIDE_NOTWORD;
2294             }
2295           tree_last = create_token_tree (dfa, NULL, NULL, token);
2296           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2297           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2298             {
2299               *err = REG_ESPACE;
2300               return NULL;
2301             }
2302         }
2303       else
2304         {
2305           tree = create_token_tree (dfa, NULL, NULL, token);
2306           if (BE (tree == NULL, 0))
2307             {
2308               *err = REG_ESPACE;
2309               return NULL;
2310             }
2311         }
2312       /* We must return here, since ANCHORs can't be followed
2313          by repetition operators.
2314          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2315              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2316       fetch_token (token, regexp, syntax);
2317       return tree;
2318     case OP_PERIOD:
2319       tree = create_token_tree (dfa, NULL, NULL, token);
2320       if (BE (tree == NULL, 0))
2321         {
2322           *err = REG_ESPACE;
2323           return NULL;
2324         }
2325       if (dfa->mb_cur_max > 1)
2326         dfa->has_mb_node = 1;
2327       break;
2328     case OP_WORD:
2329     case OP_NOTWORD:
2330       tree = build_charclass_op (dfa, regexp->trans,
2331                                  (const unsigned char *) "alnum",
2332                                  (const unsigned char *) "_",
2333                                  token->type == OP_NOTWORD, err);
2334       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2335         return NULL;
2336       break;
2337     case OP_SPACE:
2338     case OP_NOTSPACE:
2339       tree = build_charclass_op (dfa, regexp->trans,
2340                                  (const unsigned char *) "space",
2341                                  (const unsigned char *) "",
2342                                  token->type == OP_NOTSPACE, err);
2343       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2344         return NULL;
2345       break;
2346     case OP_ALT:
2347     case END_OF_RE:
2348       return NULL;
2349     case BACK_SLASH:
2350       *err = REG_EESCAPE;
2351       return NULL;
2352     default:
2353       /* Must not happen?  */
2354 #ifdef DEBUG
2355       assert (0);
2356 #endif
2357       return NULL;
2358     }
2359   fetch_token (token, regexp, syntax);
2360
2361   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2362          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2363     {
2364       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2365       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2366         return NULL;
2367       /* In BRE consecutive duplications are not allowed.  */
2368       if ((syntax & REG_CONTEXT_INVALID_DUP)
2369           && (token->type == OP_DUP_ASTERISK
2370               || token->type == OP_OPEN_DUP_NUM))
2371         {
2372           *err = REG_BADRPT;
2373           return NULL;
2374         }
2375     }
2376
2377   return tree;
2378 }
2379
2380 /* This function build the following tree, from regular expression
2381    (<reg_exp>):
2382          SUBEXP
2383             |
2384         <reg_exp>
2385 */
2386
2387 static bin_tree_t *
2388 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2389                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2390 {
2391   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2392   bin_tree_t *tree;
2393   size_t cur_nsub;
2394   cur_nsub = preg->re_nsub++;
2395
2396   fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2397
2398   /* The subexpression may be a null string.  */
2399   if (token->type == OP_CLOSE_SUBEXP)
2400     tree = NULL;
2401   else
2402     {
2403       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2404       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2405         *err = REG_EPAREN;
2406       if (BE (*err != REG_NOERROR, 0))
2407         return NULL;
2408     }
2409
2410   if (cur_nsub <= '9' - '1')
2411     dfa->completed_bkref_map |= 1 << cur_nsub;
2412
2413   tree = create_tree (dfa, tree, NULL, SUBEXP);
2414   if (BE (tree == NULL, 0))
2415     {
2416       *err = REG_ESPACE;
2417       return NULL;
2418     }
2419   tree->token.opr.idx = cur_nsub;
2420   return tree;
2421 }
2422
2423 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2424
2425 static bin_tree_t *
2426 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2427               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2428 {
2429   bin_tree_t *tree = NULL, *old_tree = NULL;
2430   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2431   re_token_t start_token = *token;
2432
2433   if (token->type == OP_OPEN_DUP_NUM)
2434     {
2435       end = 0;
2436       start = fetch_number (regexp, token, syntax);
2437       if (start == REG_MISSING)
2438         {
2439           if (token->type == CHARACTER && token->opr.c == ',')
2440             start = 0; /* We treat "{,m}" as "{0,m}".  */
2441           else
2442             {
2443               *err = REG_BADBR; /* <re>{} is invalid.  */
2444               return NULL;
2445             }
2446         }
2447       if (BE (start != REG_ERROR, 1))
2448         {
2449           /* We treat "{n}" as "{n,n}".  */
2450           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2451                  : ((token->type == CHARACTER && token->opr.c == ',')
2452                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2453         }
2454       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2455         {
2456           /* Invalid sequence.  */
2457           if (BE (!(syntax & REG_INVALID_INTERVAL_ORD), 0))
2458             {
2459               if (token->type == END_OF_RE)
2460                 *err = REG_EBRACE;
2461               else
2462                 *err = REG_BADBR;
2463
2464               return NULL;
2465             }
2466
2467           /* If the syntax bit is set, rollback.  */
2468           re_string_set_index (regexp, start_idx);
2469           *token = start_token;
2470           token->type = CHARACTER;
2471           /* mb_partial and word_char bits should be already initialized by
2472              peek_token.  */
2473           return elem;
2474         }
2475
2476       if (BE (end != REG_MISSING && start > end, 0))
2477         {
2478           /* First number greater than second.  */
2479           *err = REG_BADBR;
2480           return NULL;
2481         }
2482     }
2483   else
2484     {
2485       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2486       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2487     }
2488
2489   fetch_token (token, regexp, syntax);
2490
2491   if (BE (elem == NULL, 0))
2492     return NULL;
2493   if (BE (start == 0 && end == 0, 0))
2494     {
2495       postorder (elem, free_tree, NULL);
2496       return NULL;
2497     }
2498
2499   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2500   if (BE (start > 0, 0))
2501     {
2502       tree = elem;
2503       for (i = 2; i <= start; ++i)
2504         {
2505           elem = duplicate_tree (elem, dfa);
2506           tree = create_tree (dfa, tree, elem, CONCAT);
2507           if (BE (elem == NULL || tree == NULL, 0))
2508             goto parse_dup_op_espace;
2509         }
2510
2511       if (start == end)
2512         return tree;
2513
2514       /* Duplicate ELEM before it is marked optional.  */
2515       elem = duplicate_tree (elem, dfa);
2516       old_tree = tree;
2517     }
2518   else
2519     old_tree = NULL;
2520
2521   if (elem->token.type == SUBEXP)
2522     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2523
2524   tree = create_tree (dfa, elem, NULL,
2525                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2526   if (BE (tree == NULL, 0))
2527     goto parse_dup_op_espace;
2528
2529   /* This loop is actually executed only when end != REG_MISSING,
2530      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2531      already created the start+1-th copy.  */
2532   if ((Idx) -1 < 0 || end != REG_MISSING)
2533     for (i = start + 2; i <= end; ++i)
2534       {
2535         elem = duplicate_tree (elem, dfa);
2536         tree = create_tree (dfa, tree, elem, CONCAT);
2537         if (BE (elem == NULL || tree == NULL, 0))
2538           goto parse_dup_op_espace;
2539
2540         tree = create_tree (dfa, tree, NULL, OP_ALT);
2541         if (BE (tree == NULL, 0))
2542           goto parse_dup_op_espace;
2543       }
2544
2545   if (old_tree)
2546     tree = create_tree (dfa, old_tree, tree, CONCAT);
2547
2548   return tree;
2549
2550  parse_dup_op_espace:
2551   *err = REG_ESPACE;
2552   return NULL;
2553 }
2554
2555 /* Size of the names for collating symbol/equivalence_class/character_class.
2556    I'm not sure, but maybe enough.  */
2557 #define BRACKET_NAME_BUF_SIZE 32
2558
2559 #ifndef _LIBC
2560   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2561      Build the range expression which starts from START_ELEM, and ends
2562      at END_ELEM.  The result are written to MBCSET and SBCSET.
2563      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2564      mbcset->range_ends, is a pointer argument sinse we may
2565      update it.  */
2566
2567 static reg_errcode_t
2568 build_range_exp (bitset sbcset,
2569 # ifdef RE_ENABLE_I18N
2570                  re_charset_t *mbcset, Idx *range_alloc,
2571 # endif
2572                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2573 {
2574   unsigned int start_ch, end_ch;
2575   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2576   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2577           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2578           0))
2579     return REG_ERANGE;
2580
2581   /* We can handle no multi character collating elements without libc
2582      support.  */
2583   if (BE ((start_elem->type == COLL_SYM
2584            && strlen ((char *) start_elem->opr.name) > 1)
2585           || (end_elem->type == COLL_SYM
2586               && strlen ((char *) end_elem->opr.name) > 1), 0))
2587     return REG_ECOLLATE;
2588
2589 # ifdef RE_ENABLE_I18N
2590   {
2591     wchar_t wc;
2592     wint_t start_wc, end_wc;
2593     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2594
2595     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2596                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2597                    : 0));
2598     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2599               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2600                  : 0));
2601     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2602                 ? __btowc (start_ch) : start_elem->opr.wch);
2603     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2604               ? __btowc (end_ch) : end_elem->opr.wch);
2605     if (start_wc == WEOF || end_wc == WEOF)
2606       return REG_ECOLLATE;
2607     cmp_buf[0] = start_wc;
2608     cmp_buf[4] = end_wc;
2609     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2610       return REG_ERANGE;
2611
2612     /* Got valid collation sequence values, add them as a new entry.
2613        However, for !_LIBC we have no collation elements: if the
2614        character set is single byte, the single byte character set
2615        that we build below suffices.  parse_bracket_exp passes
2616        no MBCSET if dfa->mb_cur_max == 1.  */
2617     if (mbcset)
2618       {
2619         /* Check the space of the arrays.  */
2620         if (BE (*range_alloc == mbcset->nranges, 0))
2621           {
2622             /* There is not enough space, need realloc.  */
2623             wchar_t *new_array_start, *new_array_end;
2624             Idx new_nranges;
2625
2626             new_nranges = mbcset->nranges;
2627             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2628                are NULL if *range_alloc == 0.  */
2629             new_array_start = re_x2realloc (mbcset->range_starts, wchar_t,
2630                                             &new_nranges);
2631             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2632                                         new_nranges);
2633
2634             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2635               return REG_ESPACE;
2636
2637             mbcset->range_starts = new_array_start;
2638             mbcset->range_ends = new_array_end;
2639             *range_alloc = new_nranges;
2640           }
2641
2642         mbcset->range_starts[mbcset->nranges] = start_wc;
2643         mbcset->range_ends[mbcset->nranges++] = end_wc;
2644       }
2645
2646     /* Build the table for single byte characters.  */
2647     for (wc = 0; wc < SBC_MAX; ++wc)
2648       {
2649         cmp_buf[2] = wc;
2650         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2651             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2652           bitset_set (sbcset, wc);
2653       }
2654   }
2655 # else /* not RE_ENABLE_I18N */
2656   {
2657     unsigned int ch;
2658     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2659                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2660                    : 0));
2661     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2662               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2663                  : 0));
2664     if (start_ch > end_ch)
2665       return REG_ERANGE;
2666     /* Build the table for single byte characters.  */
2667     for (ch = 0; ch < SBC_MAX; ++ch)
2668       if (start_ch <= ch  && ch <= end_ch)
2669         bitset_set (sbcset, ch);
2670   }
2671 # endif /* not RE_ENABLE_I18N */
2672   return REG_NOERROR;
2673 }
2674 #endif /* not _LIBC */
2675
2676 #ifndef _LIBC
2677 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2678    Build the collating element which is represented by NAME.
2679    The result are written to MBCSET and SBCSET.
2680    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2681    pointer argument since we may update it.  */
2682
2683 static reg_errcode_t
2684 build_collating_symbol (bitset sbcset,
2685 # ifdef RE_ENABLE_I18N
2686                         re_charset_t *mbcset, Idx *coll_sym_alloc,
2687 # endif
2688                         const unsigned char *name)
2689 {
2690   size_t name_len = strlen ((const char *) name);
2691   if (BE (name_len != 1, 0))
2692     return REG_ECOLLATE;
2693   else
2694     {
2695       bitset_set (sbcset, name[0]);
2696       return REG_NOERROR;
2697     }
2698 }
2699 #endif /* not _LIBC */
2700
2701 /* This function parse bracket expression like "[abc]", "[a-c]",
2702    "[[.a-a.]]" etc.  */
2703
2704 static bin_tree_t *
2705 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2706                    reg_syntax_t syntax, reg_errcode_t *err)
2707 {
2708 #ifdef _LIBC
2709   const unsigned char *collseqmb;
2710   const char *collseqwc;
2711   uint32_t nrules;
2712   int32_t table_size;
2713   const int32_t *symb_table;
2714   const unsigned char *extra;
2715
2716   /* Local function for parse_bracket_exp used in _LIBC environement.
2717      Seek the collating symbol entry correspondings to NAME.
2718      Return the index of the symbol in the SYMB_TABLE.  */
2719
2720   auto inline int32_t
2721   __attribute ((always_inline))
2722   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2723     {
2724       int32_t hash = elem_hash ((const char *) name, name_len);
2725       int32_t elem = hash % table_size;
2726       int32_t second = hash % (table_size - 2);
2727       while (symb_table[2 * elem] != 0)
2728         {
2729           /* First compare the hashing value.  */
2730           if (symb_table[2 * elem] == hash
2731               /* Compare the length of the name.  */
2732               && name_len == extra[symb_table[2 * elem + 1]]
2733               /* Compare the name.  */
2734               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2735                          name_len) == 0)
2736             {
2737               /* Yep, this is the entry.  */
2738               break;
2739             }
2740
2741           /* Next entry.  */
2742           elem += second;
2743         }
2744       return elem;
2745     }
2746
2747   /* Local function for parse_bracket_exp used in _LIBC environement.
2748      Look up the collation sequence value of BR_ELEM.
2749      Return the value if succeeded, UINT_MAX otherwise.  */
2750
2751   auto inline unsigned int
2752   __attribute ((always_inline))
2753   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2754     {
2755       if (br_elem->type == SB_CHAR)
2756         {
2757           /*
2758           if (MB_CUR_MAX == 1)
2759           */
2760           if (nrules == 0)
2761             return collseqmb[br_elem->opr.ch];
2762           else
2763             {
2764               wint_t wc = __btowc (br_elem->opr.ch);
2765               return __collseq_table_lookup (collseqwc, wc);
2766             }
2767         }
2768       else if (br_elem->type == MB_CHAR)
2769         {
2770           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2771         }
2772       else if (br_elem->type == COLL_SYM)
2773         {
2774           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2775           if (nrules != 0)
2776             {
2777               int32_t elem, idx;
2778               elem = seek_collating_symbol_entry (br_elem->opr.name,
2779                                                   sym_name_len);
2780               if (symb_table[2 * elem] != 0)
2781                 {
2782                   /* We found the entry.  */
2783                   idx = symb_table[2 * elem + 1];
2784                   /* Skip the name of collating element name.  */
2785                   idx += 1 + extra[idx];
2786                   /* Skip the byte sequence of the collating element.  */
2787                   idx += 1 + extra[idx];
2788                   /* Adjust for the alignment.  */
2789                   idx = (idx + 3) & ~3;
2790                   /* Skip the multibyte collation sequence value.  */
2791                   idx += sizeof (unsigned int);
2792                   /* Skip the wide char sequence of the collating element.  */
2793                   idx += sizeof (unsigned int) *
2794                     (1 + *(unsigned int *) (extra + idx));
2795                   /* Return the collation sequence value.  */
2796                   return *(unsigned int *) (extra + idx);
2797                 }
2798               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2799                 {
2800                   /* No valid character.  Match it as a single byte
2801                      character.  */
2802                   return collseqmb[br_elem->opr.name[0]];
2803                 }
2804             }
2805           else if (sym_name_len == 1)
2806             return collseqmb[br_elem->opr.name[0]];
2807         }
2808       return UINT_MAX;
2809     }
2810
2811   /* Local function for parse_bracket_exp used in _LIBC environement.
2812      Build the range expression which starts from START_ELEM, and ends
2813      at END_ELEM.  The result are written to MBCSET and SBCSET.
2814      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2815      mbcset->range_ends, is a pointer argument sinse we may
2816      update it.  */
2817
2818   auto inline reg_errcode_t
2819   __attribute ((always_inline))
2820   build_range_exp (bitset sbcset, re_charset_t *mbcset,
2821                    Idx *range_alloc,
2822                    bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2823     {
2824       unsigned int ch;
2825       uint32_t start_collseq;
2826       uint32_t end_collseq;
2827
2828       /* Equivalence Classes and Character Classes can't be a range
2829          start/end.  */
2830       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2831               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2832               0))
2833         return REG_ERANGE;
2834
2835       start_collseq = lookup_collation_sequence_value (start_elem);
2836       end_collseq = lookup_collation_sequence_value (end_elem);
2837       /* Check start/end collation sequence values.  */
2838       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2839         return REG_ECOLLATE;
2840       if (BE ((syntax & REG_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2841         return REG_ERANGE;
2842
2843       /* Got valid collation sequence values, add them as a new entry.
2844          However, if we have no collation elements, and the character set
2845          is single byte, the single byte character set that we
2846          build below suffices. */
2847       if (nrules > 0 || dfa->mb_cur_max > 1)
2848         {
2849           /* Check the space of the arrays.  */
2850           if (BE (*range_alloc == mbcset->nranges, 0))
2851             {
2852               /* There is not enough space, need realloc.  */
2853               uint32_t *new_array_start;
2854               uint32_t *new_array_end;
2855               Idx new_nranges;
2856
2857               new_nranges = mbcset->nranges;
2858               new_array_start = re_x2realloc (mbcset->range_starts, uint32_t,
2859                                               &new_nranges);
2860               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2861                                           new_nranges);
2862
2863               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2864                 return REG_ESPACE;
2865
2866               mbcset->range_starts = new_array_start;
2867               mbcset->range_ends = new_array_end;
2868               *range_alloc = new_nranges;
2869             }
2870
2871           mbcset->range_starts[mbcset->nranges] = start_collseq;
2872           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2873         }
2874
2875       /* Build the table for single byte characters.  */
2876       for (ch = 0; ch < SBC_MAX; ch++)
2877         {
2878           uint32_t ch_collseq;
2879           /*
2880           if (MB_CUR_MAX == 1)
2881           */
2882           if (nrules == 0)
2883             ch_collseq = collseqmb[ch];
2884           else
2885             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2886           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2887             bitset_set (sbcset, ch);
2888         }
2889       return REG_NOERROR;
2890     }
2891
2892   /* Local function for parse_bracket_exp used in _LIBC environement.
2893      Build the collating element which is represented by NAME.
2894      The result are written to MBCSET and SBCSET.
2895      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2896      pointer argument sinse we may update it.  */
2897
2898   auto inline reg_errcode_t
2899   __attribute ((always_inline))
2900   build_collating_symbol (bitset sbcset, re_charset_t *mbcset,
2901                           Idx *coll_sym_alloc, const unsigned char *name)
2902     {
2903       int32_t elem, idx;
2904       size_t name_len = strlen ((const char *) name);
2905       if (nrules != 0)
2906         {
2907           elem = seek_collating_symbol_entry (name, name_len);
2908           if (symb_table[2 * elem] != 0)
2909             {
2910               /* We found the entry.  */
2911               idx = symb_table[2 * elem + 1];
2912               /* Skip the name of collating element name.  */
2913               idx += 1 + extra[idx];
2914             }
2915           else if (symb_table[2 * elem] == 0 && name_len == 1)
2916             {
2917               /* No valid character, treat it as a normal
2918                  character.  */
2919               bitset_set (sbcset, name[0]);
2920               return REG_NOERROR;
2921             }
2922           else
2923             return REG_ECOLLATE;
2924
2925           /* Got valid collation sequence, add it as a new entry.  */
2926           /* Check the space of the arrays.  */
2927           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2928             {
2929               /* Not enough, realloc it.  */
2930               Idx new_coll_sym_alloc = mbcset->ncoll_syms;
2931               /* Use realloc since mbcset->coll_syms is NULL
2932                  if *alloc == 0.  */
2933               int32_t *new_coll_syms = re_x2realloc (mbcset->coll_syms, int32_t,
2934                                                      &new_coll_sym_alloc);
2935               if (BE (new_coll_syms == NULL, 0))
2936                 return REG_ESPACE;
2937               mbcset->coll_syms = new_coll_syms;
2938               *coll_sym_alloc = new_coll_sym_alloc;
2939             }
2940           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2941           return REG_NOERROR;
2942         }
2943       else
2944         {
2945           if (BE (name_len != 1, 0))
2946             return REG_ECOLLATE;
2947           else
2948             {
2949               bitset_set (sbcset, name[0]);
2950               return REG_NOERROR;
2951             }
2952         }
2953     }
2954 #endif
2955
2956   re_token_t br_token;
2957   re_bitset_ptr_t sbcset;
2958 #ifdef RE_ENABLE_I18N
2959   re_charset_t *mbcset;
2960   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2961   Idx equiv_class_alloc = 0, char_class_alloc = 0;
2962 #endif /* not RE_ENABLE_I18N */
2963   bool non_match = false;
2964   bin_tree_t *work_tree;
2965   int token_len;
2966   bool first_round = true;
2967 #ifdef _LIBC
2968   collseqmb = (const unsigned char *)
2969     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2970   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2971   if (nrules)
2972     {
2973       /*
2974       if (MB_CUR_MAX > 1)
2975       */
2976         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2977       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2978       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2979                                                   _NL_COLLATE_SYMB_TABLEMB);
2980       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2981                                                    _NL_COLLATE_SYMB_EXTRAMB);
2982     }
2983 #endif
2984   sbcset = re_calloc (bitset_word, BITSET_WORDS);
2985 #ifdef RE_ENABLE_I18N
2986   mbcset = re_calloc (re_charset_t, 1);
2987 #endif /* RE_ENABLE_I18N */
2988 #ifdef RE_ENABLE_I18N
2989   if (BE (sbcset == NULL || mbcset == NULL, 0))
2990 #else
2991   if (BE (sbcset == NULL, 0))
2992 #endif /* RE_ENABLE_I18N */
2993     {
2994       *err = REG_ESPACE;
2995       return NULL;
2996     }
2997
2998   token_len = peek_token_bracket (token, regexp, syntax);
2999   if (BE (token->type == END_OF_RE, 0))
3000     {
3001       *err = REG_BADPAT;
3002       goto parse_bracket_exp_free_return;
3003     }
3004   if (token->type == OP_NON_MATCH_LIST)
3005     {
3006 #ifdef RE_ENABLE_I18N
3007       mbcset->non_match = 1;
3008 #endif /* not RE_ENABLE_I18N */
3009       non_match = true;
3010       if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3011         bitset_set (sbcset, '\0');
3012       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3013       token_len = peek_token_bracket (token, regexp, syntax);
3014       if (BE (token->type == END_OF_RE, 0))
3015         {
3016           *err = REG_BADPAT;
3017           goto parse_bracket_exp_free_return;
3018         }
3019     }
3020
3021   /* We treat the first ']' as a normal character.  */
3022   if (token->type == OP_CLOSE_BRACKET)
3023     token->type = CHARACTER;
3024
3025   while (1)
3026     {
3027       bracket_elem_t start_elem, end_elem;
3028       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3029       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3030       reg_errcode_t ret;
3031       int token_len2 = 0;
3032       bool is_range_exp = false;
3033       re_token_t token2;
3034
3035       start_elem.opr.name = start_name_buf;
3036       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3037                                    syntax, first_round);
3038       if (BE (ret != REG_NOERROR, 0))
3039         {
3040           *err = ret;
3041           goto parse_bracket_exp_free_return;
3042         }
3043       first_round = false;
3044
3045       /* Get information about the next token.  We need it in any case.  */
3046       token_len = peek_token_bracket (token, regexp, syntax);
3047
3048       /* Do not check for ranges if we know they are not allowed.  */
3049       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3050         {
3051           if (BE (token->type == END_OF_RE, 0))
3052             {
3053               *err = REG_EBRACK;
3054               goto parse_bracket_exp_free_return;
3055             }
3056           if (token->type == OP_CHARSET_RANGE)
3057             {
3058               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3059               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3060               if (BE (token2.type == END_OF_RE, 0))
3061                 {
3062                   *err = REG_EBRACK;
3063                   goto parse_bracket_exp_free_return;
3064                 }
3065               if (token2.type == OP_CLOSE_BRACKET)
3066                 {
3067                   /* We treat the last '-' as a normal character.  */
3068                   re_string_skip_bytes (regexp, -token_len);
3069                   token->type = CHARACTER;
3070                 }
3071               else
3072                 is_range_exp = true;
3073             }
3074         }
3075
3076       if (is_range_exp == true)
3077         {
3078           end_elem.opr.name = end_name_buf;
3079           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3080                                        dfa, syntax, true);
3081           if (BE (ret != REG_NOERROR, 0))
3082             {
3083               *err = ret;
3084               goto parse_bracket_exp_free_return;
3085             }
3086
3087           token_len = peek_token_bracket (token, regexp, syntax);
3088
3089 #ifdef _LIBC
3090           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3091                                   &start_elem, &end_elem);
3092 #else
3093 # ifdef RE_ENABLE_I18N
3094           *err = build_range_exp (sbcset,
3095                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3096                                   &range_alloc, &start_elem, &end_elem);
3097 # else
3098           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3099 # endif
3100 #endif /* RE_ENABLE_I18N */
3101           if (BE (*err != REG_NOERROR, 0))
3102             goto parse_bracket_exp_free_return;
3103         }
3104       else
3105         {
3106           switch (start_elem.type)
3107             {
3108             case SB_CHAR:
3109               bitset_set (sbcset, start_elem.opr.ch);
3110               break;
3111 #ifdef RE_ENABLE_I18N
3112             case MB_CHAR:
3113               /* Check whether the array has enough space.  */
3114               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3115                 {
3116                   wchar_t *new_mbchars;
3117                   /* Not enough, realloc it.  */
3118                   mbchar_alloc = mbcset->nmbchars;
3119                   /* Use realloc since array is NULL if *alloc == 0.  */
3120                   new_mbchars = re_x2realloc (mbcset->mbchars, wchar_t,
3121                                               &mbchar_alloc);
3122                   if (BE (new_mbchars == NULL, 0))
3123                     goto parse_bracket_exp_espace;
3124                   mbcset->mbchars = new_mbchars;
3125                 }
3126               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3127               break;
3128 #endif /* RE_ENABLE_I18N */
3129             case EQUIV_CLASS:
3130               *err = build_equiv_class (sbcset,
3131 #ifdef RE_ENABLE_I18N
3132                                         mbcset, &equiv_class_alloc,
3133 #endif /* RE_ENABLE_I18N */
3134                                         start_elem.opr.name);
3135               if (BE (*err != REG_NOERROR, 0))
3136                 goto parse_bracket_exp_free_return;
3137               break;
3138             case COLL_SYM:
3139               *err = build_collating_symbol (sbcset,
3140 #ifdef RE_ENABLE_I18N
3141                                              mbcset, &coll_sym_alloc,
3142 #endif /* RE_ENABLE_I18N */
3143                                              start_elem.opr.name);
3144               if (BE (*err != REG_NOERROR, 0))
3145                 goto parse_bracket_exp_free_return;
3146               break;
3147             case CHAR_CLASS:
3148               *err = build_charclass (regexp->trans, sbcset,
3149 #ifdef RE_ENABLE_I18N
3150                                       mbcset, &char_class_alloc,
3151 #endif /* RE_ENABLE_I18N */
3152                                       start_elem.opr.name, syntax);
3153               if (BE (*err != REG_NOERROR, 0))
3154                goto parse_bracket_exp_free_return;
3155               break;
3156             default:
3157               assert (0);
3158               break;
3159             }
3160         }
3161       if (BE (token->type == END_OF_RE, 0))
3162         {
3163           *err = REG_EBRACK;
3164           goto parse_bracket_exp_free_return;
3165         }
3166       if (token->type == OP_CLOSE_BRACKET)
3167         break;
3168     }
3169
3170   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3171
3172   /* If it is non-matching list.  */
3173   if (non_match)
3174     bitset_not (sbcset);
3175
3176 #ifdef RE_ENABLE_I18N
3177   /* Ensure only single byte characters are set.  */
3178   if (dfa->mb_cur_max > 1)
3179     bitset_mask (sbcset, dfa->sb_char);
3180
3181   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3182       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3183                                                      || mbcset->non_match)))
3184     {
3185       bin_tree_t *mbc_tree;
3186       int sbc_idx;
3187       /* Build a tree for complex bracket.  */
3188       dfa->has_mb_node = 1;
3189       br_token.type = COMPLEX_BRACKET;
3190       br_token.opr.mbcset = mbcset;
3191       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3192       if (BE (mbc_tree == NULL, 0))
3193         goto parse_bracket_exp_espace;
3194       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3195         if (sbcset[sbc_idx])
3196           break;
3197       /* If there are no bits set in sbcset, there is no point
3198          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3199       if (sbc_idx < BITSET_WORDS)
3200         {
3201           /* Build a tree for simple bracket.  */
3202           br_token.type = SIMPLE_BRACKET;
3203           br_token.opr.sbcset = sbcset;
3204           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3205           if (BE (work_tree == NULL, 0))
3206             goto parse_bracket_exp_espace;
3207
3208           /* Then join them by ALT node.  */
3209           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3210           if (BE (work_tree == NULL, 0))
3211             goto parse_bracket_exp_espace;
3212         }
3213       else
3214         {
3215           re_free (sbcset);
3216           work_tree = mbc_tree;
3217         }
3218     }
3219   else
3220 #endif /* not RE_ENABLE_I18N */
3221     {
3222 #ifdef RE_ENABLE_I18N
3223       free_charset (mbcset);
3224 #endif
3225       /* Build a tree for simple bracket.  */
3226       br_token.type = SIMPLE_BRACKET;
3227       br_token.opr.sbcset = sbcset;
3228       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3229       if (BE (work_tree == NULL, 0))
3230         goto parse_bracket_exp_espace;
3231     }
3232   return work_tree;
3233
3234  parse_bracket_exp_espace:
3235   *err = REG_ESPACE;
3236  parse_bracket_exp_free_return:
3237   re_free (sbcset);
3238 #ifdef RE_ENABLE_I18N
3239   free_charset (mbcset);
3240 #endif /* RE_ENABLE_I18N */
3241   return NULL;
3242 }
3243
3244 /* Parse an element in the bracket expression.  */
3245
3246 static reg_errcode_t
3247 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3248                        re_token_t *token, int token_len, re_dfa_t *dfa,
3249                        reg_syntax_t syntax, bool accept_hyphen)
3250 {
3251 #ifdef RE_ENABLE_I18N
3252   int cur_char_size;
3253   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3254   if (cur_char_size > 1)
3255     {
3256       elem->type = MB_CHAR;
3257       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3258       re_string_skip_bytes (regexp, cur_char_size);
3259       return REG_NOERROR;
3260     }
3261 #endif /* RE_ENABLE_I18N */
3262   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3263   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3264       || token->type == OP_OPEN_EQUIV_CLASS)
3265     return parse_bracket_symbol (elem, regexp, token);
3266   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3267     {
3268       /* A '-' must only appear as anything but a range indicator before
3269          the closing bracket.  Everything else is an error.  */
3270       re_token_t token2;
3271       (void) peek_token_bracket (&token2, regexp, syntax);
3272       if (token2.type != OP_CLOSE_BRACKET)
3273         /* The actual error value is not standardized since this whole
3274            case is undefined.  But ERANGE makes good sense.  */
3275         return REG_ERANGE;
3276     }
3277   elem->type = SB_CHAR;
3278   elem->opr.ch = token->opr.c;
3279   return REG_NOERROR;
3280 }
3281
3282 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3283    such as [:<character_class>:], [.<collating_element>.], and
3284    [=<equivalent_class>=].  */
3285
3286 static reg_errcode_t
3287 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3288                       re_token_t *token)
3289 {
3290   unsigned char ch, delim = token->opr.c;
3291   int i = 0;
3292   if (re_string_eoi(regexp))
3293     return REG_EBRACK;
3294   for (;; ++i)
3295     {
3296       if (i >= BRACKET_NAME_BUF_SIZE)
3297         return REG_EBRACK;
3298       if (token->type == OP_OPEN_CHAR_CLASS)
3299         ch = re_string_fetch_byte_case (regexp);
3300       else
3301         ch = re_string_fetch_byte (regexp);
3302       if (re_string_eoi(regexp))
3303         return REG_EBRACK;
3304       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3305         break;
3306       elem->opr.name[i] = ch;
3307     }
3308   re_string_skip_bytes (regexp, 1);
3309   elem->opr.name[i] = '\0';
3310   switch (token->type)
3311     {
3312     case OP_OPEN_COLL_ELEM:
3313       elem->type = COLL_SYM;
3314       break;
3315     case OP_OPEN_EQUIV_CLASS:
3316       elem->type = EQUIV_CLASS;
3317       break;
3318     case OP_OPEN_CHAR_CLASS:
3319       elem->type = CHAR_CLASS;
3320       break;
3321     default:
3322       break;
3323     }
3324   return REG_NOERROR;
3325 }
3326
3327   /* Helper function for parse_bracket_exp.
3328      Build the equivalence class which is represented by NAME.
3329      The result are written to MBCSET and SBCSET.
3330      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3331      is a pointer argument sinse we may update it.  */
3332
3333 static reg_errcode_t
3334 build_equiv_class (bitset sbcset,
3335 #ifdef RE_ENABLE_I18N
3336                    re_charset_t *mbcset, Idx *equiv_class_alloc,
3337 #endif
3338                    const unsigned char *name)
3339 {
3340 #if defined _LIBC
3341   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3342   if (nrules != 0)
3343     {
3344       const int32_t *table, *indirect;
3345       const unsigned char *weights, *extra, *cp;
3346       unsigned char char_buf[2];
3347       int32_t idx1, idx2;
3348       unsigned int ch;
3349       size_t len;
3350       /* This #include defines a local function!  */
3351 # include <locale/weight.h>
3352       /* Calculate the index for equivalence class.  */
3353       cp = name;
3354       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3355       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3356                                                _NL_COLLATE_WEIGHTMB);
3357       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3358                                                    _NL_COLLATE_EXTRAMB);
3359       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3360                                                 _NL_COLLATE_INDIRECTMB);
3361       idx1 = findidx (&cp);
3362       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3363         /* This isn't a valid character.  */
3364         return REG_ECOLLATE;
3365
3366       /* Build single byte matcing table for this equivalence class.  */
3367       char_buf[1] = (unsigned char) '\0';
3368       len = weights[idx1];
3369       for (ch = 0; ch < SBC_MAX; ++ch)
3370         {
3371           char_buf[0] = ch;
3372           cp = char_buf;
3373           idx2 = findidx (&cp);
3374 /*
3375           idx2 = table[ch];
3376 */
3377           if (idx2 == 0)
3378             /* This isn't a valid character.  */
3379             continue;
3380           if (len == weights[idx2])
3381             {
3382               int cnt = 0;
3383               while (cnt <= len &&
3384                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3385                 ++cnt;
3386
3387               if (cnt > len)
3388                 bitset_set (sbcset, ch);
3389             }
3390         }
3391       /* Check whether the array has enough space.  */
3392       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3393         {
3394           /* Not enough, realloc it.  */
3395           Idx new_equiv_class_alloc = mbcset->nequiv_classes;
3396           /* Use realloc since the array is NULL if *alloc == 0.  */
3397           int32_t *new_equiv_classes = re_x2realloc (mbcset->equiv_classes,
3398                                                      int32_t,
3399                                                      &new_equiv_class_alloc);
3400           if (BE (new_equiv_classes == NULL, 0))
3401             return REG_ESPACE;
3402           mbcset->equiv_classes = new_equiv_classes;
3403           *equiv_class_alloc = new_equiv_class_alloc;
3404         }
3405       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3406     }
3407   else
3408 #endif /* _LIBC */
3409     {
3410       if (BE (strlen ((const char *) name) != 1, 0))
3411         return REG_ECOLLATE;
3412       bitset_set (sbcset, *name);
3413     }
3414   return REG_NOERROR;
3415 }
3416
3417   /* Helper function for parse_bracket_exp.
3418      Build the character class which is represented by NAME.
3419      The result are written to MBCSET and SBCSET.
3420      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3421      is a pointer argument sinse we may update it.  */
3422
3423 static reg_errcode_t
3424 build_charclass (unsigned REG_TRANSLATE_TYPE trans, bitset sbcset,
3425 #ifdef RE_ENABLE_I18N
3426                  re_charset_t *mbcset, Idx *char_class_alloc,
3427 #endif
3428                  const unsigned char *class_name, reg_syntax_t syntax)
3429 {
3430   int i;
3431   const char *name = (const char *) class_name;
3432
3433   /* In case of REG_ICASE "upper" and "lower" match the both of
3434      upper and lower cases.  */
3435   if ((syntax & REG_IGNORE_CASE)
3436       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3437     name = "alpha";
3438
3439 #ifdef RE_ENABLE_I18N
3440   /* Check the space of the arrays.  */
3441   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3442     {
3443       /* Not enough, realloc it.  */
3444       Idx new_char_class_alloc = mbcset->nchar_classes;
3445       /* Use realloc since array is NULL if *alloc == 0.  */
3446       wctype_t *new_char_classes = re_x2realloc (mbcset->char_classes, wctype_t,
3447                                                  &new_char_class_alloc);
3448       if (BE (new_char_classes == NULL, 0))
3449         return REG_ESPACE;
3450       mbcset->char_classes = new_char_classes;
3451       *char_class_alloc = new_char_class_alloc;
3452     }
3453   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3454 #endif /* RE_ENABLE_I18N */
3455
3456 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3457     for (i = 0; i < SBC_MAX; ++i)               \
3458       {                                         \
3459         if (ctype_func (i))                     \
3460           {                                     \
3461             int ch = trans ? trans[i] : i;      \
3462             bitset_set (sbcset, ch);            \
3463           }                                     \
3464       }
3465
3466   if (strcmp (name, "alnum") == 0)
3467     BUILD_CHARCLASS_LOOP (isalnum)
3468   else if (strcmp (name, "cntrl") == 0)
3469     BUILD_CHARCLASS_LOOP (iscntrl)
3470   else if (strcmp (name, "lower") == 0)
3471     BUILD_CHARCLASS_LOOP (islower)
3472   else if (strcmp (name, "space") == 0)
3473     BUILD_CHARCLASS_LOOP (isspace)
3474   else if (strcmp (name, "alpha") == 0)
3475     BUILD_CHARCLASS_LOOP (isalpha)
3476   else if (strcmp (name, "digit") == 0)
3477     BUILD_CHARCLASS_LOOP (isdigit)
3478   else if (strcmp (name, "print") == 0)
3479     BUILD_CHARCLASS_LOOP (isprint)
3480   else if (strcmp (name, "upper") == 0)
3481     BUILD_CHARCLASS_LOOP (isupper)
3482   else if (strcmp (name, "blank") == 0)
3483     BUILD_CHARCLASS_LOOP (isblank)
3484   else if (strcmp (name, "graph") == 0)
3485     BUILD_CHARCLASS_LOOP (isgraph)
3486   else if (strcmp (name, "punct") == 0)
3487     BUILD_CHARCLASS_LOOP (ispunct)
3488   else if (strcmp (name, "xdigit") == 0)
3489     BUILD_CHARCLASS_LOOP (isxdigit)
3490   else
3491     return REG_ECTYPE;
3492
3493   return REG_NOERROR;
3494 }
3495
3496 static bin_tree_t *
3497 build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans,
3498                     const unsigned char *class_name,
3499                     const unsigned char *extra,
3500                     bool non_match, reg_errcode_t *err)
3501 {
3502   re_bitset_ptr_t sbcset;
3503 #ifdef RE_ENABLE_I18N
3504   re_charset_t *mbcset;
3505   Idx alloc = 0;
3506 #endif /* not RE_ENABLE_I18N */
3507   reg_errcode_t ret;
3508   re_token_t br_token;
3509   bin_tree_t *tree;
3510
3511   sbcset = re_calloc (bitset_word, BITSET_WORDS);
3512 #ifdef RE_ENABLE_I18N
3513   mbcset = re_calloc (re_charset_t, 1);
3514 #endif /* RE_ENABLE_I18N */
3515
3516 #ifdef RE_ENABLE_I18N
3517   if (BE (sbcset == NULL || mbcset == NULL, 0))
3518 #else /* not RE_ENABLE_I18N */
3519   if (BE (sbcset == NULL, 0))
3520 #endif /* not RE_ENABLE_I18N */
3521     {
3522       *err = REG_ESPACE;
3523       return NULL;
3524     }
3525
3526   if (non_match)
3527     {
3528 #ifdef RE_ENABLE_I18N
3529       /*
3530       if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3531         bitset_set(cset->sbcset, '\0');
3532       */
3533       mbcset->non_match = 1;
3534 #endif /* not RE_ENABLE_I18N */
3535     }
3536
3537   /* We don't care the syntax in this case.  */
3538   ret = build_charclass (trans, sbcset,
3539 #ifdef RE_ENABLE_I18N
3540                          mbcset, &alloc,
3541 #endif /* RE_ENABLE_I18N */
3542                          class_name, 0);
3543
3544   if (BE (ret != REG_NOERROR, 0))
3545     {
3546       re_free (sbcset);
3547 #ifdef RE_ENABLE_I18N
3548       free_charset (mbcset);
3549 #endif /* RE_ENABLE_I18N */
3550       *err = ret;
3551       return NULL;
3552     }
3553   /* \w match '_' also.  */
3554   for (; *extra; extra++)
3555     bitset_set (sbcset, *extra);
3556
3557   /* If it is non-matching list.  */
3558   if (non_match)
3559     bitset_not (sbcset);
3560
3561 #ifdef RE_ENABLE_I18N
3562   /* Ensure only single byte characters are set.  */
3563   if (dfa->mb_cur_max > 1)
3564     bitset_mask (sbcset, dfa->sb_char);
3565 #endif
3566
3567   /* Build a tree for simple bracket.  */
3568   br_token.type = SIMPLE_BRACKET;
3569   br_token.opr.sbcset = sbcset;
3570   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3571   if (BE (tree == NULL, 0))
3572     goto build_word_op_espace;
3573
3574 #ifdef RE_ENABLE_I18N
3575   if (dfa->mb_cur_max > 1)
3576     {
3577       bin_tree_t *mbc_tree;
3578       /* Build a tree for complex bracket.  */
3579       br_token.type = COMPLEX_BRACKET;
3580       br_token.opr.mbcset = mbcset;
3581       dfa->has_mb_node = 1;
3582       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3583       if (BE (mbc_tree == NULL, 0))
3584         goto build_word_op_espace;
3585       /* Then join them by ALT node.  */
3586       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3587       if (BE (mbc_tree != NULL, 1))
3588         return tree;
3589     }
3590   else
3591     {
3592       free_charset (mbcset);
3593       return tree;
3594     }
3595 #else /* not RE_ENABLE_I18N */
3596   return tree;
3597 #endif /* not RE_ENABLE_I18N */
3598
3599  build_word_op_espace:
3600   re_free (sbcset);
3601 #ifdef RE_ENABLE_I18N
3602   free_charset (mbcset);
3603 #endif /* RE_ENABLE_I18N */
3604   *err = REG_ESPACE;
3605   return NULL;
3606 }
3607
3608 /* This is intended for the expressions like "a{1,3}".
3609    Fetch a number from `input', and return the number.
3610    Return REG_MISSING if the number field is empty like "{,1}".
3611    Return REG_ERROR if an error occurred.  */
3612
3613 static Idx
3614 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3615 {
3616   Idx num = REG_MISSING;
3617   unsigned char c;
3618   while (1)
3619     {
3620       fetch_token (token, input, syntax);
3621       c = token->opr.c;
3622       if (BE (token->type == END_OF_RE, 0))
3623         return REG_ERROR;
3624       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3625         break;
3626       num = ((token->type != CHARACTER || c < '0' || '9' < c
3627               || num == REG_ERROR)
3628              ? REG_ERROR
3629              : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3630       num = (num > REG_DUP_MAX) ? REG_ERROR : num;
3631     }
3632   return num;
3633 }
3634 \f
3635 #ifdef RE_ENABLE_I18N
3636 static void
3637 free_charset (re_charset_t *cset)
3638 {
3639   re_free (cset->mbchars);
3640 # ifdef _LIBC
3641   re_free (cset->coll_syms);
3642   re_free (cset->equiv_classes);
3643   re_free (cset->range_starts);
3644   re_free (cset->range_ends);
3645 # endif
3646   re_free (cset->char_classes);
3647   re_free (cset);
3648 }
3649 #endif /* RE_ENABLE_I18N */
3650 \f
3651 /* Functions for binary tree operation.  */
3652
3653 /* Create a tree node.  */
3654
3655 static bin_tree_t *
3656 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3657              re_token_type_t type)
3658 {
3659   re_token_t t;
3660   t.type = type;
3661   return create_token_tree (dfa, left, right, &t);
3662 }
3663
3664 static bin_tree_t *
3665 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3666                    const re_token_t *token)
3667 {
3668   bin_tree_t *tree;
3669   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3670     {
3671       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3672
3673       if (storage == NULL)
3674         return NULL;
3675       storage->next = dfa->str_tree_storage;
3676       dfa->str_tree_storage = storage;
3677       dfa->str_tree_storage_idx = 0;
3678     }
3679   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3680
3681   tree->parent = NULL;
3682   tree->left = left;
3683   tree->right = right;
3684   tree->token = *token;
3685   tree->token.duplicated = 0;
3686   tree->token.opt_subexp = 0;
3687   tree->first = NULL;
3688   tree->next = NULL;
3689   tree->node_idx = REG_MISSING;
3690
3691   if (left != NULL)
3692     left->parent = tree;
3693   if (right != NULL)
3694     right->parent = tree;
3695   return tree;
3696 }
3697
3698 /* Mark the tree SRC as an optional subexpression.
3699    To be called from preorder or postorder.  */
3700
3701 static reg_errcode_t
3702 mark_opt_subexp (void *extra, bin_tree_t *node)
3703 {
3704   Idx idx = (Idx) (long) extra;
3705   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3706     node->token.opt_subexp = 1;
3707
3708   return REG_NOERROR;
3709 }
3710
3711 /* Free the allocated memory inside NODE. */
3712
3713 static void
3714 free_token (re_token_t *node)
3715 {
3716 #ifdef RE_ENABLE_I18N
3717   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3718     free_charset (node->opr.mbcset);
3719   else
3720 #endif /* RE_ENABLE_I18N */
3721     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3722       re_free (node->opr.sbcset);
3723 }
3724
3725 /* Worker function for tree walking.  Free the allocated memory inside NODE
3726    and its children. */
3727
3728 static reg_errcode_t
3729 free_tree (void *extra, bin_tree_t *node)
3730 {
3731   free_token (&node->token);
3732   return REG_NOERROR;
3733 }
3734
3735
3736 /* Duplicate the node SRC, and return new node.  This is a preorder
3737    visit similar to the one implemented by the generic visitor, but
3738    we need more infrastructure to maintain two parallel trees --- so,
3739    it's easier to duplicate.  */
3740
3741 static bin_tree_t *
3742 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3743 {
3744   const bin_tree_t *node;
3745   bin_tree_t *dup_root;
3746   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3747
3748   for (node = root; ; )
3749     {
3750       /* Create a new tree and link it back to the current parent.  */
3751       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3752       if (*p_new == NULL)
3753         return NULL;
3754       (*p_new)->parent = dup_node;
3755       (*p_new)->token.duplicated = 1;
3756       dup_node = *p_new;
3757
3758       /* Go to the left node, or up and to the right.  */
3759       if (node->left)
3760         {
3761           node = node->left;
3762           p_new = &dup_node->left;
3763         }
3764       else
3765         {
3766           const bin_tree_t *prev = NULL;
3767           while (node->right == prev || node->right == NULL)
3768             {
3769               prev = node;
3770               node = node->parent;
3771               dup_node = dup_node->parent;
3772               if (!node)
3773                 return dup_root;
3774             }
3775           node = node->right;
3776           p_new = &dup_node->right;
3777         }
3778     }
3779 }