gl/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
   3    Software Foundation, Inc.
   4    This file is part of the GNU C Library.
   5    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License along
  18    with this program; if not, write to the Free Software Foundation,
  19    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  20
  21 static void re_string_construct_common (const char *str, Idx len,
  22                                         re_string_t *pstr,
  23                                         RE_TRANSLATE_TYPE trans, bool icase,
  24                                         const re_dfa_t *dfa) internal_function;
  25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  26                                           const re_node_set *nodes,
  27                                           re_hashval_t hash) internal_function;
  28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  29                                           const re_node_set *nodes,
  30                                           unsigned int context,
  31                                           re_hashval_t hash) internal_function;
  32 \f
  33 /* Functions for string operation.  */
  34
  35 /* This function allocate the buffers.  It is necessary to call
  36    re_string_reconstruct before using the object.  */
  37
  38 static reg_errcode_t
  39 internal_function __attribute_warn_unused_result__
  40 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
  41                     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  42 {
  43   reg_errcode_t ret;
  44   Idx init_buf_len;
  45
  46   /* Ensure at least one character fits into the buffers.  */
  47   if (init_len < dfa->mb_cur_max)
  48     init_len = dfa->mb_cur_max;
  49   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  50   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  51
  52   ret = re_string_realloc_buffers (pstr, init_buf_len);
  53   if (BE (ret != REG_NOERROR, 0))
  54     return ret;
  55
  56   pstr->word_char = dfa->word_char;
  57   pstr->word_ops_used = dfa->word_ops_used;
  58   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  59   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  60   pstr->valid_raw_len = pstr->valid_len;
  61   return REG_NOERROR;
  62 }
  63
  64 /* This function allocate the buffers, and initialize them.  */
  65
  66 static reg_errcode_t
  67 internal_function __attribute_warn_unused_result__
  68 re_string_construct (re_string_t *pstr, const char *str, Idx len,
  69                      RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  70 {
  71   reg_errcode_t ret;
  72   memset (pstr, '\0', sizeof (re_string_t));
  73   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  74
  75   if (len > 0)
  76     {
  77       ret = re_string_realloc_buffers (pstr, len + 1);
  78       if (BE (ret != REG_NOERROR, 0))
  79         return ret;
  80     }
  81   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  82
  83   if (icase)
  84     {
  85 #ifdef RE_ENABLE_I18N
  86       if (dfa->mb_cur_max > 1)
  87         {
  88           while (1)
  89             {
  90               ret = build_wcs_upper_buffer (pstr);
  91               if (BE (ret != REG_NOERROR, 0))
  92                 return ret;
  93               if (pstr->valid_raw_len >= len)
  94                 break;
  95               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
  96                 break;
  97               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
  98               if (BE (ret != REG_NOERROR, 0))
  99                 return ret;
 100             }
 101         }
 102       else
 103 #endif /* RE_ENABLE_I18N  */
 104         build_upper_buffer (pstr);
 105     }
 106   else
 107     {
 108 #ifdef RE_ENABLE_I18N
 109       if (dfa->mb_cur_max > 1)
 110         build_wcs_buffer (pstr);
 111       else
 112 #endif /* RE_ENABLE_I18N  */
 113         {
 114           if (trans != NULL)
 115             re_string_translate_buffer (pstr);
 116           else
 117             {
 118               pstr->valid_len = pstr->bufs_len;
 119               pstr->valid_raw_len = pstr->bufs_len;
 120             }
 121         }
 122     }
 123
 124   return REG_NOERROR;
 125 }
 126
 127 /* Helper functions for re_string_allocate, and re_string_construct.  */
 128
 129 static reg_errcode_t
 130 internal_function __attribute_warn_unused_result__
 131 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
 132 {
 133 #ifdef RE_ENABLE_I18N
 134   if (pstr->mb_cur_max > 1)
 135     {
 136       wint_t *new_wcs;
 137
 138       /* Avoid overflow.  */
 139       size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
 140       if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
 141         return REG_ESPACE;
 142
 143       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 144       if (BE (new_wcs == NULL, 0))
 145         return REG_ESPACE;
 146       pstr->wcs = new_wcs;
 147       if (pstr->offsets != NULL)
 148         {
 149           Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
 150           if (BE (new_offsets == NULL, 0))
 151             return REG_ESPACE;
 152           pstr->offsets = new_offsets;
 153         }
 154     }
 155 #endif /* RE_ENABLE_I18N  */
 156   if (pstr->mbs_allocated)
 157     {
 158       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 159                                            new_buf_len);
 160       if (BE (new_mbs == NULL, 0))
 161         return REG_ESPACE;
 162       pstr->mbs = new_mbs;
 163     }
 164   pstr->bufs_len = new_buf_len;
 165   return REG_NOERROR;
 166 }
 167
 168
 169 static void
 170 internal_function
 171 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
 172                             RE_TRANSLATE_TYPE trans, bool icase,
 173                             const re_dfa_t *dfa)
 174 {
 175   pstr->raw_mbs = (const unsigned char *) str;
 176   pstr->len = len;
 177   pstr->raw_len = len;
 178   pstr->trans = trans;
 179   pstr->icase = icase;
 180   pstr->mbs_allocated = (trans != NULL || icase);
 181   pstr->mb_cur_max = dfa->mb_cur_max;
 182   pstr->is_utf8 = dfa->is_utf8;
 183   pstr->map_notascii = dfa->map_notascii;
 184   pstr->stop = pstr->len;
 185   pstr->raw_stop = pstr->stop;
 186 }
 187
 188 #ifdef RE_ENABLE_I18N
 189
 190 /* Build wide character buffer PSTR->WCS.
 191    If the byte sequence of the string are:
 192      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 193    Then wide character buffer will be:
 194      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 195    We use WEOF for padding, they indicate that the position isn't
 196    a first byte of a multibyte character.
 197
 198    Note that this function assumes PSTR->VALID_LEN elements are already
 199    built and starts from PSTR->VALID_LEN.  */
 200
 201 static void
 202 internal_function
 203 build_wcs_buffer (re_string_t *pstr)
 204 {
 205 #ifdef _LIBC
 206   unsigned char buf[MB_LEN_MAX];
 207   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 208 #else
 209   unsigned char buf[64];
 210 #endif
 211   mbstate_t prev_st;
 212   Idx byte_idx, end_idx, remain_len;
 213   size_t mbclen;
 214
 215   /* Build the buffers from pstr->valid_len to either pstr->len or
 216      pstr->bufs_len.  */
 217   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 218   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 219     {
 220       wchar_t wc;
 221       const char *p;
 222
 223       remain_len = end_idx - byte_idx;
 224       prev_st = pstr->cur_state;
 225       /* Apply the translation if we need.  */
 226       if (BE (pstr->trans != NULL, 0))
 227         {
 228           int i, ch;
 229
 230           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 231             {
 232               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 233               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 234             }
 235           p = (const char *) buf;
 236         }
 237       else
 238         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 239       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 240       if (BE (mbclen == (size_t) -2, 0))
 241         {
 242           /* The buffer doesn't have enough space, finish to build.  */
 243           pstr->cur_state = prev_st;
 244           break;
 245         }
 246       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 247         {
 248           /* We treat these cases as a singlebyte character.  */
 249           mbclen = 1;
 250           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 251           if (BE (pstr->trans != NULL, 0))
 252             wc = pstr->trans[wc];
 253           pstr->cur_state = prev_st;
 254         }
 255
 256       /* Write wide character and padding.  */
 257       pstr->wcs[byte_idx++] = wc;
 258       /* Write paddings.  */
 259       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 260         pstr->wcs[byte_idx++] = WEOF;
 261     }
 262   pstr->valid_len = byte_idx;
 263   pstr->valid_raw_len = byte_idx;
 264 }
 265
 266 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 267    but for REG_ICASE.  */
 268
 269 static reg_errcode_t
 270 internal_function __attribute_warn_unused_result__
 271 build_wcs_upper_buffer (re_string_t *pstr)
 272 {
 273   mbstate_t prev_st;
 274   Idx src_idx, byte_idx, end_idx, remain_len;
 275   size_t mbclen;
 276 #ifdef _LIBC
 277   char buf[MB_LEN_MAX];
 278   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 279 #else
 280   char buf[64];
 281 #endif
 282
 283   byte_idx = pstr->valid_len;
 284   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 285
 286   /* The following optimization assumes that ASCII characters can be
 287      mapped to wide characters with a simple cast.  */
 288   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 289     {
 290       while (byte_idx < end_idx)
 291         {
 292           wchar_t wc;
 293
 294           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 295               && mbsinit (&pstr->cur_state))
 296             {
 297               /* In case of a singlebyte character.  */
 298               pstr->mbs[byte_idx]
 299                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 300               /* The next step uses the assumption that wchar_t is encoded
 301                  ASCII-safe: all ASCII values can be converted like this.  */
 302               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 303               ++byte_idx;
 304               continue;
 305             }
 306
 307           remain_len = end_idx - byte_idx;
 308           prev_st = pstr->cur_state;
 309           mbclen = __mbrtowc (&wc,
 310                               ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 311                                + byte_idx), remain_len, &pstr->cur_state);
 312           if (BE (mbclen < (size_t) -2, 1))
 313             {
 314               wchar_t wcu = wc;
 315               if (iswlower (wc))
 316                 {
 317                   size_t mbcdlen;
 318
 319                   wcu = towupper (wc);
 320                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 321                   if (BE (mbclen == mbcdlen, 1))
 322                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 323                   else
 324                     {
 325                       src_idx = byte_idx;
 326                       goto offsets_needed;
 327                     }
 328                 }
 329               else
 330                 memcpy (pstr->mbs + byte_idx,
 331                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 332               pstr->wcs[byte_idx++] = wcu;
 333               /* Write paddings.  */
 334               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 335                 pstr->wcs[byte_idx++] = WEOF;
 336             }
 337           else if (mbclen == (size_t) -1 || mbclen == 0)
 338             {
 339               /* It is an invalid character or '\0'.  Just use the byte.  */
 340               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 341               pstr->mbs[byte_idx] = ch;
 342               /* And also cast it to wide char.  */
 343               pstr->wcs[byte_idx++] = (wchar_t) ch;
 344               if (BE (mbclen == (size_t) -1, 0))
 345                 pstr->cur_state = prev_st;
 346             }
 347           else
 348             {
 349               /* The buffer doesn't have enough space, finish to build.  */
 350               pstr->cur_state = prev_st;
 351               break;
 352             }
 353         }
 354       pstr->valid_len = byte_idx;
 355       pstr->valid_raw_len = byte_idx;
 356       return REG_NOERROR;
 357     }
 358   else
 359     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 360       {
 361         wchar_t wc;
 362         const char *p;
 363       offsets_needed:
 364         remain_len = end_idx - byte_idx;
 365         prev_st = pstr->cur_state;
 366         if (BE (pstr->trans != NULL, 0))
 367           {
 368             int i, ch;
 369
 370             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 371               {
 372                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 373                 buf[i] = pstr->trans[ch];
 374               }
 375             p = (const char *) buf;
 376           }
 377         else
 378           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 379         mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 380         if (BE (mbclen < (size_t) -2, 1))
 381           {
 382             wchar_t wcu = wc;
 383             if (iswlower (wc))
 384               {
 385                 size_t mbcdlen;
 386
 387                 wcu = towupper (wc);
 388                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 389                 if (BE (mbclen == mbcdlen, 1))
 390                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 391                 else if (mbcdlen != (size_t) -1)
 392                   {
 393                     size_t i;
 394
 395                     if (byte_idx + mbcdlen > pstr->bufs_len)
 396                       {
 397                         pstr->cur_state = prev_st;
 398                         break;
 399                       }
 400
 401                     if (pstr->offsets == NULL)
 402                       {
 403                         pstr->offsets = re_malloc (Idx, pstr->bufs_len);
 404
 405                         if (pstr->offsets == NULL)
 406                           return REG_ESPACE;
 407                       }
 408                     if (!pstr->offsets_needed)
 409                       {
 410                         for (i = 0; i < (size_t) byte_idx; ++i)
 411                           pstr->offsets[i] = i;
 412                         pstr->offsets_needed = 1;
 413                       }
 414
 415                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 416                     pstr->wcs[byte_idx] = wcu;
 417                     pstr->offsets[byte_idx] = src_idx;
 418                     for (i = 1; i < mbcdlen; ++i)
 419                       {
 420                         pstr->offsets[byte_idx + i]
 421                           = src_idx + (i < mbclen ? i : mbclen - 1);
 422                         pstr->wcs[byte_idx + i] = WEOF;
 423                       }
 424                     pstr->len += mbcdlen - mbclen;
 425                     if (pstr->raw_stop > src_idx)
 426                       pstr->stop += mbcdlen - mbclen;
 427                     end_idx = (pstr->bufs_len > pstr->len)
 428                               ? pstr->len : pstr->bufs_len;
 429                     byte_idx += mbcdlen;
 430                     src_idx += mbclen;
 431                     continue;
 432                   }
 433                 else
 434                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 435               }
 436             else
 437               memcpy (pstr->mbs + byte_idx, p, mbclen);
 438
 439             if (BE (pstr->offsets_needed != 0, 0))
 440               {
 441                 size_t i;
 442                 for (i = 0; i < mbclen; ++i)
 443                   pstr->offsets[byte_idx + i] = src_idx + i;
 444               }
 445             src_idx += mbclen;
 446
 447             pstr->wcs[byte_idx++] = wcu;
 448             /* Write paddings.  */
 449             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 450               pstr->wcs[byte_idx++] = WEOF;
 451           }
 452         else if (mbclen == (size_t) -1 || mbclen == 0)
 453           {
 454             /* It is an invalid character or '\0'.  Just use the byte.  */
 455             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 456
 457             if (BE (pstr->trans != NULL, 0))
 458               ch = pstr->trans [ch];
 459             pstr->mbs[byte_idx] = ch;
 460
 461             if (BE (pstr->offsets_needed != 0, 0))
 462               pstr->offsets[byte_idx] = src_idx;
 463             ++src_idx;
 464
 465             /* And also cast it to wide char.  */
 466             pstr->wcs[byte_idx++] = (wchar_t) ch;
 467             if (BE (mbclen == (size_t) -1, 0))
 468               pstr->cur_state = prev_st;
 469           }
 470         else
 471           {
 472             /* The buffer doesn't have enough space, finish to build.  */
 473             pstr->cur_state = prev_st;
 474             break;
 475           }
 476       }
 477   pstr->valid_len = byte_idx;
 478   pstr->valid_raw_len = src_idx;
 479   return REG_NOERROR;
 480 }
 481
 482 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 483    Return the index.  */
 484
 485 static Idx
 486 internal_function
 487 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
 488 {
 489   mbstate_t prev_st;
 490   Idx rawbuf_idx;
 491   size_t mbclen;
 492   wint_t wc = WEOF;
 493
 494   /* Skip the characters which are not necessary to check.  */
 495   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 496        rawbuf_idx < new_raw_idx;)
 497     {
 498       wchar_t wc2;
 499       Idx remain_len;
 500       remain_len = pstr->len - rawbuf_idx;
 501       prev_st = pstr->cur_state;
 502       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
 503                           remain_len, &pstr->cur_state);
 504       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 505         {
 506           /* We treat these cases as a single byte character.  */
 507           if (mbclen == 0 || remain_len == 0)
 508             wc = L'\0';
 509           else
 510             wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 511           mbclen = 1;
 512           pstr->cur_state = prev_st;
 513         }
 514       else
 515         wc = wc2;
 516       /* Then proceed the next character.  */
 517       rawbuf_idx += mbclen;
 518     }
 519   *last_wc = wc;
 520   return rawbuf_idx;
 521 }
 522 #endif /* RE_ENABLE_I18N  */
 523
 524 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 525    This function is used in case of REG_ICASE.  */
 526
 527 static void
 528 internal_function
 529 build_upper_buffer (re_string_t *pstr)
 530 {
 531   Idx char_idx, end_idx;
 532   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 533
 534   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 535     {
 536       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 537       if (BE (pstr->trans != NULL, 0))
 538         ch = pstr->trans[ch];
 539       if (islower (ch))
 540         pstr->mbs[char_idx] = toupper (ch);
 541       else
 542         pstr->mbs[char_idx] = ch;
 543     }
 544   pstr->valid_len = char_idx;
 545   pstr->valid_raw_len = char_idx;
 546 }
 547
 548 /* Apply TRANS to the buffer in PSTR.  */
 549
 550 static void
 551 internal_function
 552 re_string_translate_buffer (re_string_t *pstr)
 553 {
 554   Idx buf_idx, end_idx;
 555   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 556
 557   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 558     {
 559       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 560       pstr->mbs[buf_idx] = pstr->trans[ch];
 561     }
 562
 563   pstr->valid_len = buf_idx;
 564   pstr->valid_raw_len = buf_idx;
 565 }
 566
 567 /* This function re-construct the buffers.
 568    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 569    convert to upper case in case of REG_ICASE, apply translation.  */
 570
 571 static reg_errcode_t
 572 internal_function __attribute_warn_unused_result__
 573 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 574 {
 575   Idx offset;
 576
 577   if (BE (pstr->raw_mbs_idx <= idx, 0))
 578     offset = idx - pstr->raw_mbs_idx;
 579   else
 580     {
 581       /* Reset buffer.  */
 582 #ifdef RE_ENABLE_I18N
 583       if (pstr->mb_cur_max > 1)
 584         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 585 #endif /* RE_ENABLE_I18N */
 586       pstr->len = pstr->raw_len;
 587       pstr->stop = pstr->raw_stop;
 588       pstr->valid_len = 0;
 589       pstr->raw_mbs_idx = 0;
 590       pstr->valid_raw_len = 0;
 591       pstr->offsets_needed = 0;
 592       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 593                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 594       if (!pstr->mbs_allocated)
 595         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 596       offset = idx;
 597     }
 598
 599   if (BE (offset != 0, 1))
 600     {
 601       /* Should the already checked characters be kept?  */
 602       if (BE (offset < pstr->valid_raw_len, 1))
 603         {
 604           /* Yes, move them to the front of the buffer.  */
 605 #ifdef RE_ENABLE_I18N
 606           if (BE (pstr->offsets_needed, 0))
 607             {
 608               Idx low = 0, high = pstr->valid_len, mid;
 609               do
 610                 {
 611                   mid = (high + low) / 2;
 612                   if (pstr->offsets[mid] > offset)
 613                     high = mid;
 614                   else if (pstr->offsets[mid] < offset)
 615                     low = mid + 1;
 616                   else
 617                     break;
 618                 }
 619               while (low < high);
 620               if (pstr->offsets[mid] < offset)
 621                 ++mid;
 622               pstr->tip_context = re_string_context_at (pstr, mid - 1,
 623                                                         eflags);
 624               /* This can be quite complicated, so handle specially
 625                  only the common and easy case where the character with
 626                  different length representation of lower and upper
 627                  case is present at or after offset.  */
 628               if (pstr->valid_len > offset
 629                   && mid == offset && pstr->offsets[mid] == offset)
 630                 {
 631                   memmove (pstr->wcs, pstr->wcs + offset,
 632                            (pstr->valid_len - offset) * sizeof (wint_t));
 633                   memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
 634                   pstr->valid_len -= offset;
 635                   pstr->valid_raw_len -= offset;
 636                   for (low = 0; low < pstr->valid_len; low++)
 637                     pstr->offsets[low] = pstr->offsets[low + offset] - offset;
 638                 }
 639               else
 640                 {
 641                   /* Otherwise, just find out how long the partial multibyte
 642                      character at offset is and fill it with WEOF/255.  */
 643                   pstr->len = pstr->raw_len - idx + offset;
 644                   pstr->stop = pstr->raw_stop - idx + offset;
 645                   pstr->offsets_needed = 0;
 646                   while (mid > 0 && pstr->offsets[mid - 1] == offset)
 647                     --mid;
 648                   while (mid < pstr->valid_len)
 649                     if (pstr->wcs[mid] != WEOF)
 650                       break;
 651                     else
 652                       ++mid;
 653                   if (mid == pstr->valid_len)
 654                     pstr->valid_len = 0;
 655                   else
 656                     {
 657                       pstr->valid_len = pstr->offsets[mid] - offset;
 658                       if (pstr->valid_len)
 659                         {
 660                           for (low = 0; low < pstr->valid_len; ++low)
 661                             pstr->wcs[low] = WEOF;
 662                           memset (pstr->mbs, 255, pstr->valid_len);
 663                         }
 664                     }
 665                   pstr->valid_raw_len = pstr->valid_len;
 666                 }
 667             }
 668           else
 669 #endif
 670             {
 671               pstr->tip_context = re_string_context_at (pstr, offset - 1,
 672                                                         eflags);
 673 #ifdef RE_ENABLE_I18N
 674               if (pstr->mb_cur_max > 1)
 675                 memmove (pstr->wcs, pstr->wcs + offset,
 676                          (pstr->valid_len - offset) * sizeof (wint_t));
 677 #endif /* RE_ENABLE_I18N */
 678               if (BE (pstr->mbs_allocated, 0))
 679                 memmove (pstr->mbs, pstr->mbs + offset,
 680                          pstr->valid_len - offset);
 681               pstr->valid_len -= offset;
 682               pstr->valid_raw_len -= offset;
 683 #if DEBUG
 684               assert (pstr->valid_len > 0);
 685 #endif
 686             }
 687         }
 688       else
 689         {
 690 #ifdef RE_ENABLE_I18N
 691           /* No, skip all characters until IDX.  */
 692           Idx prev_valid_len = pstr->valid_len;
 693
 694           if (BE (pstr->offsets_needed, 0))
 695             {
 696               pstr->len = pstr->raw_len - idx + offset;
 697               pstr->stop = pstr->raw_stop - idx + offset;
 698               pstr->offsets_needed = 0;
 699             }
 700 #endif
 701           pstr->valid_len = 0;
 702 #ifdef RE_ENABLE_I18N
 703           if (pstr->mb_cur_max > 1)
 704             {
 705               Idx wcs_idx;
 706               wint_t wc = WEOF;
 707
 708               if (pstr->is_utf8)
 709                 {
 710                   const unsigned char *raw, *p, *end;
 711
 712                   /* Special case UTF-8.  Multi-byte chars start with any
 713                      byte other than 0x80 - 0xbf.  */
 714                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 715                   end = raw + (offset - pstr->mb_cur_max);
 716                   if (end < pstr->raw_mbs)
 717                     end = pstr->raw_mbs;
 718                   p = raw + offset - 1;
 719 #ifdef _LIBC
 720                   /* We know the wchar_t encoding is UCS4, so for the simple
 721                      case, ASCII characters, skip the conversion step.  */
 722                   if (isascii (*p) && BE (pstr->trans == NULL, 1))
 723                     {
 724                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 725                       /* pstr->valid_len = 0; */
 726                       wc = (wchar_t) *p;
 727                     }
 728                   else
 729 #endif
 730                     for (; p >= end; --p)
 731                       if ((*p & 0xc0) != 0x80)
 732                         {
 733                           mbstate_t cur_state;
 734                           wchar_t wc2;
 735                           Idx mlen = raw + pstr->len - p;
 736                           size_t mbclen;
 737
 738 #if 0 /* dead code: buf is set but never used */
 739                           unsigned char buf[6];
 740                           if (BE (pstr->trans != NULL, 0))
 741                             {
 742                               int i = mlen < 6 ? mlen : 6;
 743                               while (--i >= 0)
 744                                 buf[i] = pstr->trans[p[i]];
 745                             }
 746 #endif
 747                           /* XXX Don't use mbrtowc, we know which conversion
 748                              to use (UTF-8 -> UCS4).  */
 749                           memset (&cur_state, 0, sizeof (cur_state));
 750                           mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
 751                                               &cur_state);
 752                           if (raw + offset - p <= mbclen
 753                               && mbclen < (size_t) -2)
 754                             {
 755                               memset (&pstr->cur_state, '\0',
 756                                       sizeof (mbstate_t));
 757                               pstr->valid_len = mbclen - (raw + offset - p);
 758                               wc = wc2;
 759                             }
 760                           break;
 761                         }
 762                 }
 763
 764               if (wc == WEOF)
 765                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 766               if (wc == WEOF)
 767                 pstr->tip_context
 768                   = re_string_context_at (pstr, prev_valid_len - 1, eflags);
 769               else
 770                 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 771                                       && IS_WIDE_WORD_CHAR (wc))
 772                                      ? CONTEXT_WORD
 773                                      : ((IS_WIDE_NEWLINE (wc)
 774                                          && pstr->newline_anchor)
 775                                         ? CONTEXT_NEWLINE : 0));
 776               if (BE (pstr->valid_len, 0))
 777                 {
 778                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 779                     pstr->wcs[wcs_idx] = WEOF;
 780                   if (pstr->mbs_allocated)
 781                     memset (pstr->mbs, 255, pstr->valid_len);
 782                 }
 783               pstr->valid_raw_len = pstr->valid_len;
 784             }
 785           else
 786 #endif /* RE_ENABLE_I18N */
 787             {
 788               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 789               pstr->valid_raw_len = 0;
 790               if (pstr->trans)
 791                 c = pstr->trans[c];
 792               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 793                                    ? CONTEXT_WORD
 794                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 795                                       ? CONTEXT_NEWLINE : 0));
 796             }
 797         }
 798       if (!BE (pstr->mbs_allocated, 0))
 799         pstr->mbs += offset;
 800     }
 801   pstr->raw_mbs_idx = idx;
 802   pstr->len -= offset;
 803   pstr->stop -= offset;
 804
 805   /* Then build the buffers.  */
 806 #ifdef RE_ENABLE_I18N
 807   if (pstr->mb_cur_max > 1)
 808     {
 809       if (pstr->icase)
 810         {
 811           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 812           if (BE (ret != REG_NOERROR, 0))
 813             return ret;
 814         }
 815       else
 816         build_wcs_buffer (pstr);
 817     }
 818   else
 819 #endif /* RE_ENABLE_I18N */
 820     if (BE (pstr->mbs_allocated, 0))
 821       {
 822         if (pstr->icase)
 823           build_upper_buffer (pstr);
 824         else if (pstr->trans != NULL)
 825           re_string_translate_buffer (pstr);
 826       }
 827     else
 828       pstr->valid_len = pstr->len;
 829
 830   pstr->cur_idx = 0;
 831   return REG_NOERROR;
 832 }
 833
 834 static unsigned char
 835 internal_function __attribute ((pure))
 836 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
 837 {
 838   int ch;
 839   Idx off;
 840
 841   /* Handle the common (easiest) cases first.  */
 842   if (BE (!pstr->mbs_allocated, 1))
 843     return re_string_peek_byte (pstr, idx);
 844
 845 #ifdef RE_ENABLE_I18N
 846   if (pstr->mb_cur_max > 1
 847       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 848     return re_string_peek_byte (pstr, idx);
 849 #endif
 850
 851   off = pstr->cur_idx + idx;
 852 #ifdef RE_ENABLE_I18N
 853   if (pstr->offsets_needed)
 854     off = pstr->offsets[off];
 855 #endif
 856
 857   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 858
 859 #ifdef RE_ENABLE_I18N
 860   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 861      this function returns CAPITAL LETTER I instead of first byte of
 862      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 863      since peek_byte_case doesn't advance cur_idx in any way.  */
 864   if (pstr->offsets_needed && !isascii (ch))
 865     return re_string_peek_byte (pstr, idx);
 866 #endif
 867
 868   return ch;
 869 }
 870
 871 static unsigned char
 872 internal_function __attribute ((pure))
 873 re_string_fetch_byte_case (re_string_t *pstr)
 874 {
 875   if (BE (!pstr->mbs_allocated, 1))
 876     return re_string_fetch_byte (pstr);
 877
 878 #ifdef RE_ENABLE_I18N
 879   if (pstr->offsets_needed)
 880     {
 881       Idx off;
 882       int ch;
 883
 884       /* For tr_TR.UTF-8 [[:islower:]] there is
 885          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 886          in that case the whole multi-byte character and return
 887          the original letter.  On the other side, with
 888          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 889          anything else would complicate things too much.  */
 890
 891       if (!re_string_first_byte (pstr, pstr->cur_idx))
 892         return re_string_fetch_byte (pstr);
 893
 894       off = pstr->offsets[pstr->cur_idx];
 895       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 896
 897       if (! isascii (ch))
 898         return re_string_fetch_byte (pstr);
 899
 900       re_string_skip_bytes (pstr,
 901                             re_string_char_size_at (pstr, pstr->cur_idx));
 902       return ch;
 903     }
 904 #endif
 905
 906   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 907 }
 908
 909 static void
 910 internal_function
 911 re_string_destruct (re_string_t *pstr)
 912 {
 913 #ifdef RE_ENABLE_I18N
 914   re_free (pstr->wcs);
 915   re_free (pstr->offsets);
 916 #endif /* RE_ENABLE_I18N  */
 917   if (pstr->mbs_allocated)
 918     re_free (pstr->mbs);
 919 }
 920
 921 /* Return the context at IDX in INPUT.  */
 922
 923 static unsigned int
 924 internal_function
 925 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
 926 {
 927   int c;
 928   if (BE (! REG_VALID_INDEX (idx), 0))
 929     /* In this case, we use the value stored in input->tip_context,
 930        since we can't know the character in input->mbs[-1] here.  */
 931     return input->tip_context;
 932   if (BE (idx == input->len, 0))
 933     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 934             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 935 #ifdef RE_ENABLE_I18N
 936   if (input->mb_cur_max > 1)
 937     {
 938       wint_t wc;
 939       Idx wc_idx = idx;
 940       while(input->wcs[wc_idx] == WEOF)
 941         {
 942 #ifdef DEBUG
 943           /* It must not happen.  */
 944           assert (REG_VALID_INDEX (wc_idx));
 945 #endif
 946           --wc_idx;
 947           if (! REG_VALID_INDEX (wc_idx))
 948             return input->tip_context;
 949         }
 950       wc = input->wcs[wc_idx];
 951       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 952         return CONTEXT_WORD;
 953       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 954               ? CONTEXT_NEWLINE : 0);
 955     }
 956   else
 957 #endif
 958     {
 959       c = re_string_byte_at (input, idx);
 960       if (bitset_contain (input->word_char, c))
 961         return CONTEXT_WORD;
 962       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 963     }
 964 }
 965 \f
 966 /* Functions for set operation.  */
 967
 968 static reg_errcode_t
 969 internal_function __attribute_warn_unused_result__
 970 re_node_set_alloc (re_node_set *set, Idx size)
 971 {
 972   set->alloc = size;
 973   set->nelem = 0;
 974   set->elems = re_malloc (Idx, size);
 975   if (BE (set->elems == NULL, 0))
 976     return REG_ESPACE;
 977   return REG_NOERROR;
 978 }
 979
 980 static reg_errcode_t
 981 internal_function __attribute_warn_unused_result__
 982 re_node_set_init_1 (re_node_set *set, Idx elem)
 983 {
 984   set->alloc = 1;
 985   set->nelem = 1;
 986   set->elems = re_malloc (Idx, 1);
 987   if (BE (set->elems == NULL, 0))
 988     {
 989       set->alloc = set->nelem = 0;
 990       return REG_ESPACE;
 991     }
 992   set->elems[0] = elem;
 993   return REG_NOERROR;
 994 }
 995
 996 static reg_errcode_t
 997 internal_function __attribute_warn_unused_result__
 998 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
 999 {
1000   set->alloc = 2;
1001   set->elems = re_malloc (Idx, 2);
1002   if (BE (set->elems == NULL, 0))
1003     return REG_ESPACE;
1004   if (elem1 == elem2)
1005     {
1006       set->nelem = 1;
1007       set->elems[0] = elem1;
1008     }
1009   else
1010     {
1011       set->nelem = 2;
1012       if (elem1 < elem2)
1013         {
1014           set->elems[0] = elem1;
1015           set->elems[1] = elem2;
1016         }
1017       else
1018         {
1019           set->elems[0] = elem2;
1020           set->elems[1] = elem1;
1021         }
1022     }
1023   return REG_NOERROR;
1024 }
1025
1026 static reg_errcode_t
1027 internal_function __attribute_warn_unused_result__
1028 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1029 {
1030   dest->nelem = src->nelem;
1031   if (src->nelem > 0)
1032     {
1033       dest->alloc = dest->nelem;
1034       dest->elems = re_malloc (Idx, dest->alloc);
1035       if (BE (dest->elems == NULL, 0))
1036         {
1037           dest->alloc = dest->nelem = 0;
1038           return REG_ESPACE;
1039         }
1040       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1041     }
1042   else
1043     re_node_set_init_empty (dest);
1044   return REG_NOERROR;
1045 }
1046
1047 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1048    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1049    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1050
1051 static reg_errcode_t
1052 internal_function __attribute_warn_unused_result__
1053 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1054                            const re_node_set *src2)
1055 {
1056   Idx i1, i2, is, id, delta, sbase;
1057   if (src1->nelem == 0 || src2->nelem == 0)
1058     return REG_NOERROR;
1059
1060   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1061      conservative estimate.  */
1062   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1063     {
1064       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1065       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1066       if (BE (new_elems == NULL, 0))
1067         return REG_ESPACE;
1068       dest->elems = new_elems;
1069       dest->alloc = new_alloc;
1070     }
1071
1072   /* Find the items in the intersection of SRC1 and SRC2, and copy
1073      into the top of DEST those that are not already in DEST itself.  */
1074   sbase = dest->nelem + src1->nelem + src2->nelem;
1075   i1 = src1->nelem - 1;
1076   i2 = src2->nelem - 1;
1077   id = dest->nelem - 1;
1078   for (;;)
1079     {
1080       if (src1->elems[i1] == src2->elems[i2])
1081         {
1082           /* Try to find the item in DEST.  Maybe we could binary search?  */
1083           while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1084             --id;
1085
1086           if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1087             dest->elems[--sbase] = src1->elems[i1];
1088
1089           if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1090             break;
1091         }
1092
1093       /* Lower the highest of the two items.  */
1094       else if (src1->elems[i1] < src2->elems[i2])
1095         {
1096           if (! REG_VALID_INDEX (--i2))
1097             break;
1098         }
1099       else
1100         {
1101           if (! REG_VALID_INDEX (--i1))
1102             break;
1103         }
1104     }
1105
1106   id = dest->nelem - 1;
1107   is = dest->nelem + src1->nelem + src2->nelem - 1;
1108   delta = is - sbase + 1;
1109
1110   /* Now copy.  When DELTA becomes zero, the remaining
1111      DEST elements are already in place; this is more or
1112      less the same loop that is in re_node_set_merge.  */
1113   dest->nelem += delta;
1114   if (delta > 0 && REG_VALID_INDEX (id))
1115     for (;;)
1116       {
1117         if (dest->elems[is] > dest->elems[id])
1118           {
1119             /* Copy from the top.  */
1120             dest->elems[id + delta--] = dest->elems[is--];
1121             if (delta == 0)
1122               break;
1123           }
1124         else
1125           {
1126             /* Slide from the bottom.  */
1127             dest->elems[id + delta] = dest->elems[id];
1128             if (! REG_VALID_INDEX (--id))
1129               break;
1130           }
1131       }
1132
1133   /* Copy remaining SRC elements.  */
1134   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1135
1136   return REG_NOERROR;
1137 }
1138
1139 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1140    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1141
1142 static reg_errcode_t
1143 internal_function __attribute_warn_unused_result__
1144 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1145                         const re_node_set *src2)
1146 {
1147   Idx i1, i2, id;
1148   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1149     {
1150       dest->alloc = src1->nelem + src2->nelem;
1151       dest->elems = re_malloc (Idx, dest->alloc);
1152       if (BE (dest->elems == NULL, 0))
1153         return REG_ESPACE;
1154     }
1155   else
1156     {
1157       if (src1 != NULL && src1->nelem > 0)
1158         return re_node_set_init_copy (dest, src1);
1159       else if (src2 != NULL && src2->nelem > 0)
1160         return re_node_set_init_copy (dest, src2);
1161       else
1162         re_node_set_init_empty (dest);
1163       return REG_NOERROR;
1164     }
1165   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1166     {
1167       if (src1->elems[i1] > src2->elems[i2])
1168         {
1169           dest->elems[id++] = src2->elems[i2++];
1170           continue;
1171         }
1172       if (src1->elems[i1] == src2->elems[i2])
1173         ++i2;
1174       dest->elems[id++] = src1->elems[i1++];
1175     }
1176   if (i1 < src1->nelem)
1177     {
1178       memcpy (dest->elems + id, src1->elems + i1,
1179              (src1->nelem - i1) * sizeof (Idx));
1180       id += src1->nelem - i1;
1181     }
1182   else if (i2 < src2->nelem)
1183     {
1184       memcpy (dest->elems + id, src2->elems + i2,
1185              (src2->nelem - i2) * sizeof (Idx));
1186       id += src2->nelem - i2;
1187     }
1188   dest->nelem = id;
1189   return REG_NOERROR;
1190 }
1191
1192 /* Calculate the union set of the sets DEST and SRC. And store it to
1193    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1194
1195 static reg_errcode_t
1196 internal_function __attribute_warn_unused_result__
1197 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1198 {
1199   Idx is, id, sbase, delta;
1200   if (src == NULL || src->nelem == 0)
1201     return REG_NOERROR;
1202   if (dest->alloc < 2 * src->nelem + dest->nelem)
1203     {
1204       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1205       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1206       if (BE (new_buffer == NULL, 0))
1207         return REG_ESPACE;
1208       dest->elems = new_buffer;
1209       dest->alloc = new_alloc;
1210     }
1211
1212   if (BE (dest->nelem == 0, 0))
1213     {
1214       dest->nelem = src->nelem;
1215       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1216       return REG_NOERROR;
1217     }
1218
1219   /* Copy into the top of DEST the items of SRC that are not
1220      found in DEST.  Maybe we could binary search in DEST?  */
1221   for (sbase = dest->nelem + 2 * src->nelem,
1222        is = src->nelem - 1, id = dest->nelem - 1;
1223        REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1224     {
1225       if (dest->elems[id] == src->elems[is])
1226         is--, id--;
1227       else if (dest->elems[id] < src->elems[is])
1228         dest->elems[--sbase] = src->elems[is--];
1229       else /* if (dest->elems[id] > src->elems[is]) */
1230         --id;
1231     }
1232
1233   if (REG_VALID_INDEX (is))
1234     {
1235       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1236       sbase -= is + 1;
1237       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1238     }
1239
1240   id = dest->nelem - 1;
1241   is = dest->nelem + 2 * src->nelem - 1;
1242   delta = is - sbase + 1;
1243   if (delta == 0)
1244     return REG_NOERROR;
1245
1246   /* Now copy.  When DELTA becomes zero, the remaining
1247      DEST elements are already in place.  */
1248   dest->nelem += delta;
1249   for (;;)
1250     {
1251       if (dest->elems[is] > dest->elems[id])
1252         {
1253           /* Copy from the top.  */
1254           dest->elems[id + delta--] = dest->elems[is--];
1255           if (delta == 0)
1256             break;
1257         }
1258       else
1259         {
1260           /* Slide from the bottom.  */
1261           dest->elems[id + delta] = dest->elems[id];
1262           if (! REG_VALID_INDEX (--id))
1263             {
1264               /* Copy remaining SRC elements.  */
1265               memcpy (dest->elems, dest->elems + sbase,
1266                       delta * sizeof (Idx));
1267               break;
1268             }
1269         }
1270     }
1271
1272   return REG_NOERROR;
1273 }
1274
1275 /* Insert the new element ELEM to the re_node_set* SET.
1276    SET should not already have ELEM.
1277    Return true if successful.  */
1278
1279 static bool
1280 internal_function __attribute_warn_unused_result__
1281 re_node_set_insert (re_node_set *set, Idx elem)
1282 {
1283   Idx idx;
1284   /* In case the set is empty.  */
1285   if (set->alloc == 0)
1286     return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1287
1288   if (BE (set->nelem, 0) == 0)
1289     {
1290       /* We already guaranteed above that set->alloc != 0.  */
1291       set->elems[0] = elem;
1292       ++set->nelem;
1293       return true;
1294     }
1295
1296   /* Realloc if we need.  */
1297   if (set->alloc == set->nelem)
1298     {
1299       Idx *new_elems;
1300       set->alloc = set->alloc * 2;
1301       new_elems = re_realloc (set->elems, Idx, set->alloc);
1302       if (BE (new_elems == NULL, 0))
1303         return false;
1304       set->elems = new_elems;
1305     }
1306
1307   /* Move the elements which follows the new element.  Test the
1308      first element separately to skip a check in the inner loop.  */
1309   if (elem < set->elems[0])
1310     {
1311       idx = 0;
1312       for (idx = set->nelem; idx > 0; idx--)
1313         set->elems[idx] = set->elems[idx - 1];
1314     }
1315   else
1316     {
1317       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1318         set->elems[idx] = set->elems[idx - 1];
1319     }
1320
1321   /* Insert the new element.  */
1322   set->elems[idx] = elem;
1323   ++set->nelem;
1324   return true;
1325 }
1326
1327 /* Insert the new element ELEM to the re_node_set* SET.
1328    SET should not already have any element greater than or equal to ELEM.
1329    Return true if successful.  */
1330
1331 static bool
1332 internal_function __attribute_warn_unused_result__
1333 re_node_set_insert_last (re_node_set *set, Idx elem)
1334 {
1335   /* Realloc if we need.  */
1336   if (set->alloc == set->nelem)
1337     {
1338       Idx *new_elems;
1339       set->alloc = (set->alloc + 1) * 2;
1340       new_elems = re_realloc (set->elems, Idx, set->alloc);
1341       if (BE (new_elems == NULL, 0))
1342         return false;
1343       set->elems = new_elems;
1344     }
1345
1346   /* Insert the new element.  */
1347   set->elems[set->nelem++] = elem;
1348   return true;
1349 }
1350
1351 /* Compare two node sets SET1 and SET2.
1352    Return true if SET1 and SET2 are equivalent.  */
1353
1354 static bool
1355 internal_function __attribute ((pure))
1356 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1357 {
1358   Idx i;
1359   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1360     return false;
1361   for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1362     if (set1->elems[i] != set2->elems[i])
1363       return false;
1364   return true;
1365 }
1366
1367 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1368
1369 static Idx
1370 internal_function __attribute ((pure))
1371 re_node_set_contains (const re_node_set *set, Idx elem)
1372 {
1373   __re_size_t idx, right, mid;
1374   if (! REG_VALID_NONZERO_INDEX (set->nelem))
1375     return 0;
1376
1377   /* Binary search the element.  */
1378   idx = 0;
1379   right = set->nelem - 1;
1380   while (idx < right)
1381     {
1382       mid = (idx + right) / 2;
1383       if (set->elems[mid] < elem)
1384         idx = mid + 1;
1385       else
1386         right = mid;
1387     }
1388   return set->elems[idx] == elem ? idx + 1 : 0;
1389 }
1390
1391 static void
1392 internal_function
1393 re_node_set_remove_at (re_node_set *set, Idx idx)
1394 {
1395   if (idx < 0 || idx >= set->nelem)
1396     return;
1397   --set->nelem;
1398   for (; idx < set->nelem; idx++)
1399     set->elems[idx] = set->elems[idx + 1];
1400 }
1401 \f
1402
1403 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1404    Or return REG_MISSING if an error occurred.  */
1405
1406 static Idx
1407 internal_function
1408 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1409 {
1410   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1411     {
1412       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1413       Idx *new_nexts, *new_indices;
1414       re_node_set *new_edests, *new_eclosures;
1415       re_token_t *new_nodes;
1416       size_t max_object_size =
1417         MAX (sizeof (re_token_t),
1418              MAX (sizeof (re_node_set),
1419                   sizeof (Idx)));
1420
1421       /* Avoid overflows.  */
1422       if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
1423         return REG_MISSING;
1424
1425       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1426       if (BE (new_nodes == NULL, 0))
1427         return REG_MISSING;
1428       dfa->nodes = new_nodes;
1429       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1430       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1431       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1432       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1433       if (BE (new_nexts == NULL || new_indices == NULL
1434               || new_edests == NULL || new_eclosures == NULL, 0))
1435         return REG_MISSING;
1436       dfa->nexts = new_nexts;
1437       dfa->org_indices = new_indices;
1438       dfa->edests = new_edests;
1439       dfa->eclosures = new_eclosures;
1440       dfa->nodes_alloc = new_nodes_alloc;
1441     }
1442   dfa->nodes[dfa->nodes_len] = token;
1443   dfa->nodes[dfa->nodes_len].constraint = 0;
1444 #ifdef RE_ENABLE_I18N
1445   {
1446   int type = token.type;
1447   dfa->nodes[dfa->nodes_len].accept_mb =
1448     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1449   }
1450 #endif
1451   dfa->nexts[dfa->nodes_len] = REG_MISSING;
1452   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1453   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1454   return dfa->nodes_len++;
1455 }
1456
1457 static inline re_hashval_t
1458 internal_function
1459 calc_state_hash (const re_node_set *nodes, unsigned int context)
1460 {
1461   re_hashval_t hash = nodes->nelem + context;
1462   Idx i;
1463   for (i = 0 ; i < nodes->nelem ; i++)
1464     hash += nodes->elems[i];
1465   return hash;
1466 }
1467
1468 /* Search for the state whose node_set is equivalent to NODES.
1469    Return the pointer to the state, if we found it in the DFA.
1470    Otherwise create the new one and return it.  In case of an error
1471    return NULL and set the error code in ERR.
1472    Note: - We assume NULL as the invalid state, then it is possible that
1473            return value is NULL and ERR is REG_NOERROR.
1474          - We never return non-NULL value in case of any errors, it is for
1475            optimization.  */
1476
1477 static re_dfastate_t *
1478 internal_function __attribute_warn_unused_result__
1479 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1480                   const re_node_set *nodes)
1481 {
1482   re_hashval_t hash;
1483   re_dfastate_t *new_state;
1484   struct re_state_table_entry *spot;
1485   Idx i;
1486 #ifdef lint
1487   /* Suppress bogus uninitialized-variable warnings.  */
1488   *err = REG_NOERROR;
1489 #endif
1490   if (BE (nodes->nelem == 0, 0))
1491     {
1492       *err = REG_NOERROR;
1493       return NULL;
1494     }
1495   hash = calc_state_hash (nodes, 0);
1496   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1497
1498   for (i = 0 ; i < spot->num ; i++)
1499     {
1500       re_dfastate_t *state = spot->array[i];
1501       if (hash != state->hash)
1502         continue;
1503       if (re_node_set_compare (&state->nodes, nodes))
1504         return state;
1505     }
1506
1507   /* There are no appropriate state in the dfa, create the new one.  */
1508   new_state = create_ci_newstate (dfa, nodes, hash);
1509   if (BE (new_state == NULL, 0))
1510     *err = REG_ESPACE;
1511
1512   return new_state;
1513 }
1514
1515 /* Search for the state whose node_set is equivalent to NODES and
1516    whose context is equivalent to CONTEXT.
1517    Return the pointer to the state, if we found it in the DFA.
1518    Otherwise create the new one and return it.  In case of an error
1519    return NULL and set the error code in ERR.
1520    Note: - We assume NULL as the invalid state, then it is possible that
1521            return value is NULL and ERR is REG_NOERROR.
1522          - We never return non-NULL value in case of any errors, it is for
1523            optimization.  */
1524
1525 static re_dfastate_t *
1526 internal_function __attribute_warn_unused_result__
1527 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1528                           const re_node_set *nodes, unsigned int context)
1529 {
1530   re_hashval_t hash;
1531   re_dfastate_t *new_state;
1532   struct re_state_table_entry *spot;
1533   Idx i;
1534 #ifdef lint
1535   /* Suppress bogus uninitialized-variable warnings.  */
1536   *err = REG_NOERROR;
1537 #endif
1538   if (nodes->nelem == 0)
1539     {
1540       *err = REG_NOERROR;
1541       return NULL;
1542     }
1543   hash = calc_state_hash (nodes, context);
1544   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1545
1546   for (i = 0 ; i < spot->num ; i++)
1547     {
1548       re_dfastate_t *state = spot->array[i];
1549       if (state->hash == hash
1550           && state->context == context
1551           && re_node_set_compare (state->entrance_nodes, nodes))
1552         return state;
1553     }
1554   /* There are no appropriate state in `dfa', create the new one.  */
1555   new_state = create_cd_newstate (dfa, nodes, context, hash);
1556   if (BE (new_state == NULL, 0))
1557     *err = REG_ESPACE;
1558
1559   return new_state;
1560 }
1561
1562 /* Finish initialization of the new state NEWSTATE, and using its hash value
1563    HASH put in the appropriate bucket of DFA's state table.  Return value
1564    indicates the error code if failed.  */
1565
1566 static reg_errcode_t
1567 __attribute_warn_unused_result__
1568 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1569                 re_hashval_t hash)
1570 {
1571   struct re_state_table_entry *spot;
1572   reg_errcode_t err;
1573   Idx i;
1574
1575   newstate->hash = hash;
1576   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1577   if (BE (err != REG_NOERROR, 0))
1578     return REG_ESPACE;
1579   for (i = 0; i < newstate->nodes.nelem; i++)
1580     {
1581       Idx elem = newstate->nodes.elems[i];
1582       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1583         if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
1584           return REG_ESPACE;
1585     }
1586
1587   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1588   if (BE (spot->alloc <= spot->num, 0))
1589     {
1590       Idx new_alloc = 2 * spot->num + 2;
1591       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1592                                               new_alloc);
1593       if (BE (new_array == NULL, 0))
1594         return REG_ESPACE;
1595       spot->array = new_array;
1596       spot->alloc = new_alloc;
1597     }
1598   spot->array[spot->num++] = newstate;
1599   return REG_NOERROR;
1600 }
1601
1602 static void
1603 free_state (re_dfastate_t *state)
1604 {
1605   re_node_set_free (&state->non_eps_nodes);
1606   re_node_set_free (&state->inveclosure);
1607   if (state->entrance_nodes != &state->nodes)
1608     {
1609       re_node_set_free (state->entrance_nodes);
1610       re_free (state->entrance_nodes);
1611     }
1612   re_node_set_free (&state->nodes);
1613   re_free (state->word_trtable);
1614   re_free (state->trtable);
1615   re_free (state);
1616 }
1617
1618 /* Create the new state which is independ of contexts.
1619    Return the new state if succeeded, otherwise return NULL.  */
1620
1621 static re_dfastate_t *
1622 internal_function __attribute_warn_unused_result__
1623 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1624                     re_hashval_t hash)
1625 {
1626   Idx i;
1627   reg_errcode_t err;
1628   re_dfastate_t *newstate;
1629
1630   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1631   if (BE (newstate == NULL, 0))
1632     return NULL;
1633   err = re_node_set_init_copy (&newstate->nodes, nodes);
1634   if (BE (err != REG_NOERROR, 0))
1635     {
1636       re_free (newstate);
1637       return NULL;
1638     }
1639
1640   newstate->entrance_nodes = &newstate->nodes;
1641   for (i = 0 ; i < nodes->nelem ; i++)
1642     {
1643       re_token_t *node = dfa->nodes + nodes->elems[i];
1644       re_token_type_t type = node->type;
1645       if (type == CHARACTER && !node->constraint)
1646         continue;
1647 #ifdef RE_ENABLE_I18N
1648       newstate->accept_mb |= node->accept_mb;
1649 #endif /* RE_ENABLE_I18N */
1650
1651       /* If the state has the halt node, the state is a halt state.  */
1652       if (type == END_OF_RE)
1653         newstate->halt = 1;
1654       else if (type == OP_BACK_REF)
1655         newstate->has_backref = 1;
1656       else if (type == ANCHOR || node->constraint)
1657         newstate->has_constraint = 1;
1658     }
1659   err = register_state (dfa, newstate, hash);
1660   if (BE (err != REG_NOERROR, 0))
1661     {
1662       free_state (newstate);
1663       newstate = NULL;
1664     }
1665   return newstate;
1666 }
1667
1668 /* Create the new state which is depend on the context CONTEXT.
1669    Return the new state if succeeded, otherwise return NULL.  */
1670
1671 static re_dfastate_t *
1672 internal_function __attribute_warn_unused_result__
1673 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1674                     unsigned int context, re_hashval_t hash)
1675 {
1676   Idx i, nctx_nodes = 0;
1677   reg_errcode_t err;
1678   re_dfastate_t *newstate;
1679
1680   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1681   if (BE (newstate == NULL, 0))
1682     return NULL;
1683   err = re_node_set_init_copy (&newstate->nodes, nodes);
1684   if (BE (err != REG_NOERROR, 0))
1685     {
1686       re_free (newstate);
1687       return NULL;
1688     }
1689
1690   newstate->context = context;
1691   newstate->entrance_nodes = &newstate->nodes;
1692
1693   for (i = 0 ; i < nodes->nelem ; i++)
1694     {
1695       re_token_t *node = dfa->nodes + nodes->elems[i];
1696       re_token_type_t type = node->type;
1697       unsigned int constraint = node->constraint;
1698
1699       if (type == CHARACTER && !constraint)
1700         continue;
1701 #ifdef RE_ENABLE_I18N
1702       newstate->accept_mb |= node->accept_mb;
1703 #endif /* RE_ENABLE_I18N */
1704
1705       /* If the state has the halt node, the state is a halt state.  */
1706       if (type == END_OF_RE)
1707         newstate->halt = 1;
1708       else if (type == OP_BACK_REF)
1709         newstate->has_backref = 1;
1710
1711       if (constraint)
1712         {
1713           if (newstate->entrance_nodes == &newstate->nodes)
1714             {
1715               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1716               if (BE (newstate->entrance_nodes == NULL, 0))
1717                 {
1718                   free_state (newstate);
1719                   return NULL;
1720                 }
1721               if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1722                   != REG_NOERROR)
1723                 return NULL;
1724               nctx_nodes = 0;
1725               newstate->has_constraint = 1;
1726             }
1727
1728           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1729             {
1730               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1731               ++nctx_nodes;
1732             }
1733         }
1734     }
1735   err = register_state (dfa, newstate, hash);
1736   if (BE (err != REG_NOERROR, 0))
1737     {
1738       free_state (newstate);
1739       newstate = NULL;
1740     }
1741   return  newstate;
1742 }