devspec.en_US/project/recutils/lib/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2019 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU General Public
   8    License as published by the Free Software Foundation; either
   9    version 3 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 static void re_string_construct_common (const char *str, Idx len,
  21                                         re_string_t *pstr,
  22                                         RE_TRANSLATE_TYPE trans, bool icase,
  23                                         const re_dfa_t *dfa);
     /* NOTE(review): the two state constructors below are only declared in
        this part of the file; "ci"/"cd" presumably mean
        context-independent / context-dependent -- confirm at their
        definitions.  */
  24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  25                                           const re_node_set *nodes,
  26                                           re_hashval_t hash);
  27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  28                                           const re_node_set *nodes,
  29                                           unsigned int context,
  30                                           re_hashval_t hash);
  31 static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
  32                                                 Idx new_buf_len);
     /* The wide character buffer builders are only declared when
        RE_ENABLE_I18N is defined.  */
  33 #ifdef RE_ENABLE_I18N
  34 static void build_wcs_buffer (re_string_t *pstr);
  35 static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
  36 #endif /* RE_ENABLE_I18N */
  37 static void build_upper_buffer (re_string_t *pstr);
  38 static void re_string_translate_buffer (re_string_t *pstr);
  39 static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
  40                                           int eflags) __attribute__ ((pure));
  41 \f
  42 /* Functions for string operations.  */
  43
  44 /* This function allocate the buffers.  It is necessary to call
  45    re_string_reconstruct before using the object.  */
  46
  47 static reg_errcode_t
  48 __attribute_warn_unused_result__
  49 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
  50                     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  51 {
  52   reg_errcode_t ret;
  53   Idx init_buf_len;
  54
  55   /* Ensure at least one character fits into the buffers.  */
  56   if (init_len < dfa->mb_cur_max)
  57     init_len = dfa->mb_cur_max;
  58   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  59   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  60
  61   ret = re_string_realloc_buffers (pstr, init_buf_len);
  62   if (__glibc_unlikely (ret != REG_NOERROR))
  63     return ret;
  64
  65   pstr->word_char = dfa->word_char;
  66   pstr->word_ops_used = dfa->word_ops_used;
  67   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  68   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  69   pstr->valid_raw_len = pstr->valid_len;
  70   return REG_NOERROR;
  71 }
  72
  73 /* This function allocate the buffers, and initialize them.  */
  74
  75 static reg_errcode_t
  76 __attribute_warn_unused_result__
  77 re_string_construct (re_string_t *pstr, const char *str, Idx len,
  78                      RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  79 {
  80   reg_errcode_t ret;
  81   memset (pstr, '\0', sizeof (re_string_t));
  82   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  83
  84   if (len > 0)
  85     {
  86       ret = re_string_realloc_buffers (pstr, len + 1);
  87       if (__glibc_unlikely (ret != REG_NOERROR))
  88         return ret;
  89     }
  90   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  91
  92   if (icase)
  93     {
  94 #ifdef RE_ENABLE_I18N
  95       if (dfa->mb_cur_max > 1)
  96         {
  97           while (1)
  98             {
  99               ret = build_wcs_upper_buffer (pstr);
 100               if (__glibc_unlikely (ret != REG_NOERROR))
 101                 return ret;
 102               if (pstr->valid_raw_len >= len)
 103                 break;
 104               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
 105                 break;
 106               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
 107               if (__glibc_unlikely (ret != REG_NOERROR))
 108                 return ret;
 109             }
 110         }
 111       else
 112 #endif /* RE_ENABLE_I18N  */
 113         build_upper_buffer (pstr);
 114     }
 115   else
 116     {
 117 #ifdef RE_ENABLE_I18N
 118       if (dfa->mb_cur_max > 1)
 119         build_wcs_buffer (pstr);
 120       else
 121 #endif /* RE_ENABLE_I18N  */
 122         {
 123           if (trans != NULL)
 124             re_string_translate_buffer (pstr);
 125           else
 126             {
 127               pstr->valid_len = pstr->bufs_len;
 128               pstr->valid_raw_len = pstr->bufs_len;
 129             }
 130         }
 131     }
 132
 133   return REG_NOERROR;
 134 }
 135
 136 /* Helper functions for re_string_allocate, and re_string_construct.  */
 137
 138 static reg_errcode_t
 139 __attribute_warn_unused_result__
 140 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
 141 {
 142 #ifdef RE_ENABLE_I18N
 143   if (pstr->mb_cur_max > 1)
 144     {
 145       wint_t *new_wcs;
 146
 147       /* Avoid overflow in realloc.  */
 148       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
 149       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
 150                             < new_buf_len))
 151         return REG_ESPACE;
 152
 153       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 154       if (__glibc_unlikely (new_wcs == NULL))
 155         return REG_ESPACE;
 156       pstr->wcs = new_wcs;
 157       if (pstr->offsets != NULL)
 158         {
 159           Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
 160           if (__glibc_unlikely (new_offsets == NULL))
 161             return REG_ESPACE;
 162           pstr->offsets = new_offsets;
 163         }
 164     }
 165 #endif /* RE_ENABLE_I18N  */
 166   if (pstr->mbs_allocated)
 167     {
 168       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 169                                            new_buf_len);
 170       if (__glibc_unlikely (new_mbs == NULL))
 171         return REG_ESPACE;
 172       pstr->mbs = new_mbs;
 173     }
 174   pstr->bufs_len = new_buf_len;
 175   return REG_NOERROR;
 176 }
 177
 178
 179 static void
 180 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
 181                             RE_TRANSLATE_TYPE trans, bool icase,
 182                             const re_dfa_t *dfa)
 183 {
 184   pstr->raw_mbs = (const unsigned char *) str;
 185   pstr->len = len;
 186   pstr->raw_len = len;
 187   pstr->trans = trans;
 188   pstr->icase = icase;
 189   pstr->mbs_allocated = (trans != NULL || icase);
 190   pstr->mb_cur_max = dfa->mb_cur_max;
 191   pstr->is_utf8 = dfa->is_utf8;
 192   pstr->map_notascii = dfa->map_notascii;
 193   pstr->stop = pstr->len;
 194   pstr->raw_stop = pstr->stop;
 195 }
 196
 197 #ifdef RE_ENABLE_I18N
 198
 199 /* Build wide character buffer PSTR->WCS.
 200    If the byte sequence of the string are:
 201      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 202    Then wide character buffer will be:
 203      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 204    We use WEOF for padding, they indicate that the position isn't
 205    a first byte of a multibyte character.
 206
 207    Note that this function assumes PSTR->VALID_LEN elements are already
 208    built and starts from PSTR->VALID_LEN.  */
 209
 210 static void
 211 build_wcs_buffer (re_string_t *pstr)
 212 {
 213 #ifdef _LIBC
 214   unsigned char buf[MB_LEN_MAX];
 215   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 216 #else
 217   unsigned char buf[64];
 218 #endif
 219   mbstate_t prev_st;
 220   Idx byte_idx, end_idx, remain_len;
 221   size_t mbclen;
 222
 223   /* Build the buffers from pstr->valid_len to either pstr->len or
 224      pstr->bufs_len.  */
 225   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 226   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 227     {
 228       wchar_t wc;
 229       const char *p;
 230
 231       remain_len = end_idx - byte_idx;
 232       prev_st = pstr->cur_state;
 233       /* Apply the translation if we need.  */
 234       if (__glibc_unlikely (pstr->trans != NULL))
 235         {
 236           int i, ch;
 237
 238           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 239             {
 240               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 241               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 242             }
 243           p = (const char *) buf;
 244         }
 245       else
 246         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 247       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 248       if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
 249                             || (mbclen == (size_t) -2
 250                                 && pstr->bufs_len >= pstr->len)))
 251         {
 252           /* We treat these cases as a singlebyte character.  */
 253           mbclen = 1;
 254           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 255           if (__glibc_unlikely (pstr->trans != NULL))
 256             wc = pstr->trans[wc];
 257           pstr->cur_state = prev_st;
 258         }
 259       else if (__glibc_unlikely (mbclen == (size_t) -2))
 260         {
 261           /* The buffer doesn't have enough space, finish to build.  */
 262           pstr->cur_state = prev_st;
 263           break;
 264         }
 265
 266       /* Write wide character and padding.  */
 267       pstr->wcs[byte_idx++] = wc;
 268       /* Write paddings.  */
 269       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 270         pstr->wcs[byte_idx++] = WEOF;
 271     }
 272   pstr->valid_len = byte_idx;
 273   pstr->valid_raw_len = byte_idx;
 274 }
 275
 276 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 277    but for REG_ICASE.
        Besides PSTR->WCS this fills PSTR->MBS with the upper-cased
        bytes.  Conversion resumes at PSTR->VALID_LEN and stops at the
        smaller of PSTR->LEN and PSTR->BUFS_LEN, or earlier when a
        character does not fit.  When the upper-case form of a character
        has a different byte length than the original, PSTR->OFFSETS is
        allocated and maintained to map buffer positions back to raw
        positions, and PSTR->LEN / PSTR->STOP are adjusted.
        Returns REG_ESPACE if PSTR->OFFSETS cannot be allocated,
        REG_NOERROR otherwise.  */
 278
 279 static reg_errcode_t
 280 __attribute_warn_unused_result__
 281 build_wcs_upper_buffer (re_string_t *pstr)
 282 {
 283   mbstate_t prev_st;
 284   Idx src_idx, byte_idx, end_idx, remain_len;
 285   size_t mbclen;
 286 #ifdef _LIBC
 287   char buf[MB_LEN_MAX];
 288   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 289 #else
 290   char buf[64];
 291 #endif
 292
 293   byte_idx = pstr->valid_len;
 294   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 295
 296   /* The following optimization assumes that ASCII characters can be
 297      mapped to wide characters with a simple cast.  */
       /* Fast path: no translation table and no offsets table in use, so
          raw index and buffer index coincide (a single BYTE_IDX).  */
 298   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 299     {
 300       while (byte_idx < end_idx)
 301         {
 302           wchar_t wc;
 303
 304           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 305               && mbsinit (&pstr->cur_state))
 306             {
 307               /* In case of a singlebyte character.  */
 308               pstr->mbs[byte_idx]
 309                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 310               /* The next step uses the assumption that wchar_t is encoded
 311                  ASCII-safe: all ASCII values can be converted like this.  */
 312               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 313               ++byte_idx;
 314               continue;
 315             }
 316
 317           remain_len = end_idx - byte_idx;
 318           prev_st = pstr->cur_state;
 319           mbclen = __mbrtowc (&wc,
 320                               ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 321                                + byte_idx), remain_len, &pstr->cur_state);
 322           if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
 323             {
 324               wchar_t wcu = __towupper (wc);
 325               if (wcu != wc)
 326                 {
 327                   size_t mbcdlen;
 328
                       /* Re-encode the upper-case character, starting from
                          the shift state saved before decoding.  */
 329                   mbcdlen = __wcrtomb (buf, wcu, &prev_st);
 330                   if (__glibc_likely (mbclen == mbcdlen))
 331                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 332                   else
 333                     {
                           /* The byte length changed (or the encoding
                              failed): redo this character in the general
                              loop below.  The jump lands inside that
                              loop's body, so SRC_IDX is set here.  */
 334                       src_idx = byte_idx;
 335                       goto offsets_needed;
 336                     }
 337                 }
 338               else
 339                 memcpy (pstr->mbs + byte_idx,
 340                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 341               pstr->wcs[byte_idx++] = wcu;
 342               /* Write paddings.  */
 343               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 344                 pstr->wcs[byte_idx++] = WEOF;
 345             }
 346           else if (mbclen == (size_t) -1 || mbclen == 0
 347                    || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 348             {
 349               /* It is an invalid character, an incomplete character
 350                  at the end of the string, or '\0'.  Just use the byte.  */
 351               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 352               pstr->mbs[byte_idx] = ch;
 353               /* And also cast it to wide char.  */
 354               pstr->wcs[byte_idx++] = (wchar_t) ch;
 355               if (__glibc_unlikely (mbclen == (size_t) -1))
 356                 pstr->cur_state = prev_st;
 357             }
 358           else
 359             {
 360               /* The buffer doesn't have enough space, finish to build.  */
 361               pstr->cur_state = prev_st;
 362               break;
 363             }
 364         }
 365       pstr->valid_len = byte_idx;
 366       pstr->valid_raw_len = byte_idx;
 367       return REG_NOERROR;
 368     }
 369   else
         /* General path: SRC_IDX walks the raw string while BYTE_IDX walks
            the buffers; the two may diverge.  */
 370     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 371       {
 372         wchar_t wc;
 373         const char *p;
             /* Also entered by goto from the fast path above.  */
 374       offsets_needed:
 375         remain_len = end_idx - byte_idx;
 376         prev_st = pstr->cur_state;
 377         if (__glibc_unlikely (pstr->trans != NULL))
 378           {
 379             int i, ch;
 380
                 /* Decode from a translated copy of the next bytes.  */
 381             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 382               {
 383                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 384                 buf[i] = pstr->trans[ch];
 385               }
 386             p = (const char *) buf;
 387           }
 388         else
 389           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 390         mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 391         if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
 392           {
 393             wchar_t wcu = __towupper (wc);
 394             if (wcu != wc)
 395               {
 396                 size_t mbcdlen;
 397
 398                 mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
 399                 if (__glibc_likely (mbclen == mbcdlen))
 400                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 401                 else if (mbcdlen != (size_t) -1)
 402                   {
                         /* The upper-case form occupies MBCDLEN bytes
                            instead of MBCLEN.  */
 403                     size_t i;
 404
 405                     if (byte_idx + mbcdlen > pstr->bufs_len)
 406                       {
                             /* Not enough room for the re-encoded
                                character: stop before it.  */
 407                         pstr->cur_state = prev_st;
 408                         break;
 409                       }
 410
 411                     if (pstr->offsets == NULL)
 412                       {
 413                         pstr->offsets = re_malloc (Idx, pstr->bufs_len);
 414
 415                         if (pstr->offsets == NULL)
 416                           return REG_ESPACE;
 417                       }
 418                     if (!pstr->offsets_needed)
 419                       {
                             /* Up to here buffer and raw positions were
                                identical.  */
 420                         for (i = 0; i < (size_t) byte_idx; ++i)
 421                           pstr->offsets[i] = i;
 422                         pstr->offsets_needed = 1;
 423                       }
 424
 425                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 426                     pstr->wcs[byte_idx] = wcu;
 427                     pstr->offsets[byte_idx] = src_idx;
                         /* Trailing bytes map to the matching raw byte,
                            clamped to the last byte of the raw
                            character.  */
 428                     for (i = 1; i < mbcdlen; ++i)
 429                       {
 430                         pstr->offsets[byte_idx + i]
 431                           = src_idx + (i < mbclen ? i : mbclen - 1);
 432                         pstr->wcs[byte_idx + i] = WEOF;
 433                       }
                         /* Account for the length difference in the
                            buffer-relative length and stop position, then
                            recompute the loop bound.  */
 434                     pstr->len += mbcdlen - mbclen;
 435                     if (pstr->raw_stop > src_idx)
 436                       pstr->stop += mbcdlen - mbclen;
 437                     end_idx = (pstr->bufs_len > pstr->len)
 438                               ? pstr->len : pstr->bufs_len;
 439                     byte_idx += mbcdlen;
 440                     src_idx += mbclen;
 441                     continue;
 442                   }
 443                 else
                       /* Re-encoding failed: keep the bytes decoded
                          from P.  */
 444                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 445               }
 446             else
 447               memcpy (pstr->mbs + byte_idx, p, mbclen);
 448
 449             if (__glibc_unlikely (pstr->offsets_needed != 0))
 450               {
 451                 size_t i;
 452                 for (i = 0; i < mbclen; ++i)
 453                   pstr->offsets[byte_idx + i] = src_idx + i;
 454               }
 455             src_idx += mbclen;
 456
 457             pstr->wcs[byte_idx++] = wcu;
 458             /* Write paddings.  */
 459             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 460               pstr->wcs[byte_idx++] = WEOF;
 461           }
 462         else if (mbclen == (size_t) -1 || mbclen == 0
 463                  || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 464           {
 465             /* It is an invalid character or '\0'.  Just use the byte.  */
 466             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 467
 468             if (__glibc_unlikely (pstr->trans != NULL))
 469               ch = pstr->trans [ch];
 470             pstr->mbs[byte_idx] = ch;
 471
 472             if (__glibc_unlikely (pstr->offsets_needed != 0))
 473               pstr->offsets[byte_idx] = src_idx;
 474             ++src_idx;
 475
 476             /* And also cast it to wide char.  */
 477             pstr->wcs[byte_idx++] = (wchar_t) ch;
 478             if (__glibc_unlikely (mbclen == (size_t) -1))
 479               pstr->cur_state = prev_st;
 480           }
 481         else
 482           {
 483             /* The buffer doesn't have enough space, finish to build.  */
 484             pstr->cur_state = prev_st;
 485             break;
 486           }
 487       }
       /* Here the valid buffer length and the valid raw length are
          recorded separately, since they may differ.  */
 488   pstr->valid_len = byte_idx;
 489   pstr->valid_raw_len = src_idx;
 490   return REG_NOERROR;
 491 }
 492
 493 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 494    Return the index.  */
 495
 496 static Idx
 497 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
 498 {
 499   mbstate_t prev_st;
 500   Idx rawbuf_idx;
 501   size_t mbclen;
 502   wint_t wc = WEOF;
 503
 504   /* Skip the characters which are not necessary to check.  */
 505   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 506        rawbuf_idx < new_raw_idx;)
 507     {
 508       wchar_t wc2;
 509       Idx remain_len = pstr->raw_len - rawbuf_idx;
 510       prev_st = pstr->cur_state;
 511       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
 512                           remain_len, &pstr->cur_state);
 513       if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
 514                             || mbclen == 0))
 515         {
 516           /* We treat these cases as a single byte character.  */
 517           if (mbclen == 0 || remain_len == 0)
 518             wc = L'\0';
 519           else
 520             wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 521           mbclen = 1;
 522           pstr->cur_state = prev_st;
 523         }
 524       else
 525         wc = wc2;
 526       /* Then proceed the next character.  */
 527       rawbuf_idx += mbclen;
 528     }
 529   *last_wc = wc;
 530   return rawbuf_idx;
 531 }
 532 #endif /* RE_ENABLE_I18N  */
 533
 534 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 535    This function is used in case of REG_ICASE.  */
 536
 537 static void
 538 build_upper_buffer (re_string_t *pstr)
 539 {
 540   Idx char_idx, end_idx;
 541   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 542
 543   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 544     {
 545       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 546       if (__glibc_unlikely (pstr->trans != NULL))
 547         ch = pstr->trans[ch];
 548       pstr->mbs[char_idx] = toupper (ch);
 549     }
 550   pstr->valid_len = char_idx;
 551   pstr->valid_raw_len = char_idx;
 552 }
 553
 554 /* Apply TRANS to the buffer in PSTR.  */
 555
 556 static void
 557 re_string_translate_buffer (re_string_t *pstr)
 558 {
 559   Idx buf_idx, end_idx;
 560   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 561
 562   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 563     {
 564       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 565       pstr->mbs[buf_idx] = pstr->trans[ch];
 566     }
 567
 568   pstr->valid_len = buf_idx;
 569   pstr->valid_raw_len = buf_idx;
 570 }
 571
 572 /* This function reconstructs the buffers so that they start at raw
 573    index IDX.  Concretely, convert to wide character in case of
 574    pstr->mb_cur_max > 1, convert to upper case in case of REG_ICASE,
        apply translation.  EFLAGS is checked for REG_NOTBOL and passed to
        re_string_context_at when PSTR->TIP_CONTEXT is recomputed.
        Returns REG_NOERROR, or the error from build_wcs_upper_buffer.
        On success PSTR->CUR_IDX is 0 and PSTR->RAW_MBS_IDX is IDX.  */
 575
 576 static reg_errcode_t
 577 __attribute_warn_unused_result__
 578 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 579 {
 580   Idx offset;
 581
       /* IDX at or after the current window start: OFFSET is how far the
          window moves forward.  */
 582   if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
 583     offset = idx - pstr->raw_mbs_idx;
 584   else
 585     {
 586       /* Reset buffer.  */
           /* IDX lies before the window start: restart from raw index 0
              and treat IDX itself as the distance to skip.  */
 587 #ifdef RE_ENABLE_I18N
 588       if (pstr->mb_cur_max > 1)
 589         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 590 #endif /* RE_ENABLE_I18N */
 591       pstr->len = pstr->raw_len;
 592       pstr->stop = pstr->raw_stop;
 593       pstr->valid_len = 0;
 594       pstr->raw_mbs_idx = 0;
 595       pstr->valid_raw_len = 0;
 596       pstr->offsets_needed = 0;
 597       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 598                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 599       if (!pstr->mbs_allocated)
 600         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 601       offset = idx;
 602     }
 603
 604   if (__glibc_likely (offset != 0))
 605     {
 606       /* Should the already checked characters be kept?  */
 607       if (__glibc_likely (offset < pstr->valid_raw_len))
 608         {
 609           /* Yes, move them to the front of the buffer.  */
 610 #ifdef RE_ENABLE_I18N
 611           if (__glibc_unlikely (pstr->offsets_needed))
 612             {
                   /* Binary search PSTR->OFFSETS for the buffer position
                      whose raw offset is OFFSET.  */
 613               Idx low = 0, high = pstr->valid_len, mid;
 614               do
 615                 {
 616                   mid = (high + low) / 2;
 617                   if (pstr->offsets[mid] > offset)
 618                     high = mid;
 619                   else if (pstr->offsets[mid] < offset)
 620                     low = mid + 1;
 621                   else
 622                     break;
 623                 }
 624               while (low < high);
 625               if (pstr->offsets[mid] < offset)
 626                 ++mid;
 627               pstr->tip_context = re_string_context_at (pstr, mid - 1,
 628                                                         eflags);
 629               /* This can be quite complicated, so handle specially
 630                  only the common and easy case where the character with
 631                  different length representation of lower and upper
 632                  case is present at or after offset.  */
 633               if (pstr->valid_len > offset
 634                   && mid == offset && pstr->offsets[mid] == offset)
 635                 {
                       /* Buffer and raw positions agree up to OFFSET:
                          shift all three tables down by OFFSET.  */
 636                   memmove (pstr->wcs, pstr->wcs + offset,
 637                            (pstr->valid_len - offset) * sizeof (wint_t));
 638                   memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
 639                   pstr->valid_len -= offset;
 640                   pstr->valid_raw_len -= offset;
 641                   for (low = 0; low < pstr->valid_len; low++)
 642                     pstr->offsets[low] = pstr->offsets[low + offset] - offset;
 643                 }
 644               else
 645                 {
 646                   /* Otherwise, just find out how long the partial multibyte
 647                      character at offset is and fill it with WEOF/255.  */
                       /* LEN and STOP are set so that the common
                          "-= offset" at the end of this function yields
                          raw_len - idx and raw_stop - idx.  */
 648                   pstr->len = pstr->raw_len - idx + offset;
 649                   pstr->stop = pstr->raw_stop - idx + offset;
 650                   pstr->offsets_needed = 0;
 651                   while (mid > 0 && pstr->offsets[mid - 1] == offset)
 652                     --mid;
 653                   while (mid < pstr->valid_len)
 654                     if (pstr->wcs[mid] != WEOF)
 655                       break;
 656                     else
 657                       ++mid;
 658                   if (mid == pstr->valid_len)
 659                     pstr->valid_len = 0;
 660                   else
 661                     {
 662                       pstr->valid_len = pstr->offsets[mid] - offset;
 663                       if (pstr->valid_len)
 664                         {
 665                           for (low = 0; low < pstr->valid_len; ++low)
 666                             pstr->wcs[low] = WEOF;
 667                           memset (pstr->mbs, 255, pstr->valid_len);
 668                         }
 669                     }
 670                   pstr->valid_raw_len = pstr->valid_len;
 671                 }
 672             }
 673           else
 674 #endif
 675             {
                   /* Buffer and raw positions coincide: the context comes
                      from the element just before the new start, then the
                      kept part is shifted down.  */
 676               pstr->tip_context = re_string_context_at (pstr, offset - 1,
 677                                                         eflags);
 678 #ifdef RE_ENABLE_I18N
 679               if (pstr->mb_cur_max > 1)
 680                 memmove (pstr->wcs, pstr->wcs + offset,
 681                          (pstr->valid_len - offset) * sizeof (wint_t));
 682 #endif /* RE_ENABLE_I18N */
 683               if (__glibc_unlikely (pstr->mbs_allocated))
 684                 memmove (pstr->mbs, pstr->mbs + offset,
 685                          pstr->valid_len - offset);
 686               pstr->valid_len -= offset;
 687               pstr->valid_raw_len -= offset;
 688 #if defined DEBUG && DEBUG
 689               assert (pstr->valid_len > 0);
 690 #endif
 691             }
 692         }
 693       else
 694         {
 695 #ifdef RE_ENABLE_I18N
 696           /* No, skip all characters until IDX.  */
 697           Idx prev_valid_len = pstr->valid_len;
 698
 699           if (__glibc_unlikely (pstr->offsets_needed))
 700             {
 701               pstr->len = pstr->raw_len - idx + offset;
 702               pstr->stop = pstr->raw_stop - idx + offset;
 703               pstr->offsets_needed = 0;
 704             }
 705 #endif
 706           pstr->valid_len = 0;
 707 #ifdef RE_ENABLE_I18N
 708           if (pstr->mb_cur_max > 1)
 709             {
                   /* WC will hold the character preceding the new start;
                      it determines PSTR->TIP_CONTEXT below.  */
 710               Idx wcs_idx;
 711               wint_t wc = WEOF;
 712
 713               if (pstr->is_utf8)
 714                 {
 715                   const unsigned char *raw, *p, *end;
 716
 717                   /* Special case UTF-8.  Multi-byte chars start with any
 718                      byte other than 0x80 - 0xbf.  */
                       /* Scan backwards from the byte before the new
                          start, at most PSTR->MB_CUR_MAX bytes and never
                          before the beginning of the raw string.  */
 719                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 720                   end = raw + (offset - pstr->mb_cur_max);
 721                   if (end < pstr->raw_mbs)
 722                     end = pstr->raw_mbs;
 723                   p = raw + offset - 1;
 724 #ifdef _LIBC
 725                   /* We know the wchar_t encoding is UCS4, so for the simple
 726                      case, ASCII characters, skip the conversion step.  */
 727                   if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
 728                     {
 729                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 730                       /* pstr->valid_len = 0; */
 731                       wc = (wchar_t) *p;
 732                     }
 733                   else
 734 #endif
 735                     for (; p >= end; --p)
 736                       if ((*p & 0xc0) != 0x80)
 737                         {
                               /* P is at a lead byte: decode one character
                                  from here with a fresh shift state.  */
 738                           mbstate_t cur_state;
 739                           wchar_t wc2;
 740                           Idx mlen = raw + pstr->len - p;
 741                           unsigned char buf[6];
 742                           size_t mbclen;
 743
 744                           const unsigned char *pp = p;
 745                           if (__glibc_unlikely (pstr->trans != NULL))
 746                             {
 747                               int i = mlen < 6 ? mlen : 6;
 748                               while (--i >= 0)
 749                                 buf[i] = pstr->trans[p[i]];
 750                               pp = buf;
 751                             }
 752                           /* XXX Don't use mbrtowc, we know which conversion
 753                              to use (UTF-8 -> UCS4).  */
 754                           memset (&cur_state, 0, sizeof (cur_state));
 755                           mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
 756                                               &cur_state);
                               /* Accept the character only if it was
                                  decoded successfully and reaches at least
                                  up to the new start; the bytes extending
                                  past the new start become the initial
                                  VALID_LEN.  */
 757                           if (raw + offset - p <= mbclen
 758                               && mbclen < (size_t) -2)
 759                             {
 760                               memset (&pstr->cur_state, '\0',
 761                                       sizeof (mbstate_t));
 762                               pstr->valid_len = mbclen - (raw + offset - p);
 763                               wc = wc2;
 764                             }
 765                           break;
 766                         }
 767                 }
 768
                   /* No previous character found that way: decode forward
                      from the end of the validated part instead.  */
 769               if (wc == WEOF)
 770                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 771               if (wc == WEOF)
 772                 pstr->tip_context
 773                   = re_string_context_at (pstr, prev_valid_len - 1, eflags);
 774               else
 775                 pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
 776                                       && IS_WIDE_WORD_CHAR (wc))
 777                                      ? CONTEXT_WORD
 778                                      : ((IS_WIDE_NEWLINE (wc)
 779                                          && pstr->newline_anchor)
 780                                         ? CONTEXT_NEWLINE : 0));
                   /* The first VALID_LEN positions belong to a character
                      that began before the new start: mark them with
                      WEOF (and 255 in MBS when PSTR owns it).  */
 781               if (__glibc_unlikely (pstr->valid_len))
 782                 {
 783                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 784                     pstr->wcs[wcs_idx] = WEOF;
 785                   if (pstr->mbs_allocated)
 786                     memset (pstr->mbs, 255, pstr->valid_len);
 787                 }
 788               pstr->valid_raw_len = pstr->valid_len;
 789             }
 790           else
 791 #endif /* RE_ENABLE_I18N */
 792             {
                   /* Single-byte case: the context comes from the raw byte
                      just before the new start, translated if needed.  */
 793               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 794               pstr->valid_raw_len = 0;
 795               if (pstr->trans)
 796                 c = pstr->trans[c];
 797               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 798                                    ? CONTEXT_WORD
 799                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 800                                       ? CONTEXT_NEWLINE : 0));
 801             }
 802         }
           /* When PSTR->MBS is not a private buffer, advance the pointer
              instead of moving data.  */
 803       if (!__glibc_unlikely (pstr->mbs_allocated))
 804         pstr->mbs += offset;
 805     }
 806   pstr->raw_mbs_idx = idx;
 807   pstr->len -= offset;
 808   pstr->stop -= offset;
 809
 810   /* Then build the buffers.  */
 811 #ifdef RE_ENABLE_I18N
 812   if (pstr->mb_cur_max > 1)
 813     {
 814       if (pstr->icase)
 815         {
 816           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 817           if (__glibc_unlikely (ret != REG_NOERROR))
 818             return ret;
 819         }
 820       else
 821         build_wcs_buffer (pstr);
 822     }
 823   else
 824 #endif /* RE_ENABLE_I18N */
 825     if (__glibc_unlikely (pstr->mbs_allocated))
 826       {
 827         if (pstr->icase)
 828           build_upper_buffer (pstr);
 829         else if (pstr->trans != NULL)
 830           re_string_translate_buffer (pstr);
 831       }
 832     else
           /* Nothing to build: the whole remaining string is valid.  */
 833       pstr->valid_len = pstr->len;
 834
 835   pstr->cur_idx = 0;
 836   return REG_NOERROR;
 837 }
 838
 839 static unsigned char
 840 __attribute__ ((pure))
 841 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
 842 {
 843   int ch;
 844   Idx off;
 845
 846   /* Handle the common (easiest) cases first.  */
 847   if (__glibc_likely (!pstr->mbs_allocated))
 848     return re_string_peek_byte (pstr, idx);
 849
 850 #ifdef RE_ENABLE_I18N
 851   if (pstr->mb_cur_max > 1
 852       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 853     return re_string_peek_byte (pstr, idx);
 854 #endif
 855
 856   off = pstr->cur_idx + idx;
 857 #ifdef RE_ENABLE_I18N
 858   if (pstr->offsets_needed)
 859     off = pstr->offsets[off];
 860 #endif
 861
 862   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 863
 864 #ifdef RE_ENABLE_I18N
 865   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 866      this function returns CAPITAL LETTER I instead of first byte of
 867      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 868      since peek_byte_case doesn't advance cur_idx in any way.  */
 869   if (pstr->offsets_needed && !isascii (ch))
 870     return re_string_peek_byte (pstr, idx);
 871 #endif
 872
 873   return ch;
 874 }
 875
 876 static unsigned char
 877 re_string_fetch_byte_case (re_string_t *pstr)
 878 {
 879   if (__glibc_likely (!pstr->mbs_allocated))
 880     return re_string_fetch_byte (pstr);
 881
 882 #ifdef RE_ENABLE_I18N
 883   if (pstr->offsets_needed)
 884     {
 885       Idx off;
 886       int ch;
 887
 888       /* For tr_TR.UTF-8 [[:islower:]] there is
 889          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 890          in that case the whole multi-byte character and return
 891          the original letter.  On the other side, with
 892          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 893          anything else would complicate things too much.  */
 894
 895       if (!re_string_first_byte (pstr, pstr->cur_idx))
 896         return re_string_fetch_byte (pstr);
 897
 898       off = pstr->offsets[pstr->cur_idx];
 899       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 900
 901       if (! isascii (ch))
 902         return re_string_fetch_byte (pstr);
 903
 904       re_string_skip_bytes (pstr,
 905                             re_string_char_size_at (pstr, pstr->cur_idx));
 906       return ch;
 907     }
 908 #endif
 909
 910   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 911 }
 912
 913 static void
 914 re_string_destruct (re_string_t *pstr)
 915 {
 916 #ifdef RE_ENABLE_I18N
 917   re_free (pstr->wcs);
 918   re_free (pstr->offsets);
 919 #endif /* RE_ENABLE_I18N  */
 920   if (pstr->mbs_allocated)
 921     re_free (pstr->mbs);
 922 }
 923
 924 /* Return the context at IDX in INPUT.  */
 925
 926 static unsigned int
 927 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
 928 {
 929   int c;
 930   if (__glibc_unlikely (idx < 0))
 931     /* In this case, we use the value stored in input->tip_context,
 932        since we can't know the character in input->mbs[-1] here.  */
 933     return input->tip_context;
 934   if (__glibc_unlikely (idx == input->len))
 935     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 936             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 937 #ifdef RE_ENABLE_I18N
 938   if (input->mb_cur_max > 1)
 939     {
 940       wint_t wc;
 941       Idx wc_idx = idx;
 942       while(input->wcs[wc_idx] == WEOF)
 943         {
 944 #if defined DEBUG && DEBUG
 945           /* It must not happen.  */
 946           assert (wc_idx >= 0);
 947 #endif
 948           --wc_idx;
 949           if (wc_idx < 0)
 950             return input->tip_context;
 951         }
 952       wc = input->wcs[wc_idx];
 953       if (__glibc_unlikely (input->word_ops_used != 0)
 954           && IS_WIDE_WORD_CHAR (wc))
 955         return CONTEXT_WORD;
 956       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 957               ? CONTEXT_NEWLINE : 0);
 958     }
 959   else
 960 #endif
 961     {
 962       c = re_string_byte_at (input, idx);
 963       if (bitset_contain (input->word_char, c))
 964         return CONTEXT_WORD;
 965       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 966     }
 967 }
 968 \f
/* Functions for set operations.  */
 970
 971 static reg_errcode_t
 972 __attribute_warn_unused_result__
 973 re_node_set_alloc (re_node_set *set, Idx size)
 974 {
 975   set->alloc = size;
 976   set->nelem = 0;
 977   set->elems = re_malloc (Idx, size);
 978   if (__glibc_unlikely (set->elems == NULL)
 979       && (MALLOC_0_IS_NONNULL || size != 0))
 980     return REG_ESPACE;
 981   return REG_NOERROR;
 982 }
 983
 984 static reg_errcode_t
 985 __attribute_warn_unused_result__
 986 re_node_set_init_1 (re_node_set *set, Idx elem)
 987 {
 988   set->alloc = 1;
 989   set->nelem = 1;
 990   set->elems = re_malloc (Idx, 1);
 991   if (__glibc_unlikely (set->elems == NULL))
 992     {
 993       set->alloc = set->nelem = 0;
 994       return REG_ESPACE;
 995     }
 996   set->elems[0] = elem;
 997   return REG_NOERROR;
 998 }
 999
1000 static reg_errcode_t
1001 __attribute_warn_unused_result__
1002 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1003 {
1004   set->alloc = 2;
1005   set->elems = re_malloc (Idx, 2);
1006   if (__glibc_unlikely (set->elems == NULL))
1007     return REG_ESPACE;
1008   if (elem1 == elem2)
1009     {
1010       set->nelem = 1;
1011       set->elems[0] = elem1;
1012     }
1013   else
1014     {
1015       set->nelem = 2;
1016       if (elem1 < elem2)
1017         {
1018           set->elems[0] = elem1;
1019           set->elems[1] = elem2;
1020         }
1021       else
1022         {
1023           set->elems[0] = elem2;
1024           set->elems[1] = elem1;
1025         }
1026     }
1027   return REG_NOERROR;
1028 }
1029
1030 static reg_errcode_t
1031 __attribute_warn_unused_result__
1032 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1033 {
1034   dest->nelem = src->nelem;
1035   if (src->nelem > 0)
1036     {
1037       dest->alloc = dest->nelem;
1038       dest->elems = re_malloc (Idx, dest->alloc);
1039       if (__glibc_unlikely (dest->elems == NULL))
1040         {
1041           dest->alloc = dest->nelem = 0;
1042           return REG_ESPACE;
1043         }
1044       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1045     }
1046   else
1047     re_node_set_init_empty (dest);
1048   return REG_NOERROR;
1049 }
1050
1051 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1052    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1053    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1054
1055 static reg_errcode_t
1056 __attribute_warn_unused_result__
1057 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1058                            const re_node_set *src2)
1059 {
1060   Idx i1, i2, is, id, delta, sbase;
1061   if (src1->nelem == 0 || src2->nelem == 0)
1062     return REG_NOERROR;
1063
1064   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1065      conservative estimate.  */
1066   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1067     {
1068       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1069       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1070       if (__glibc_unlikely (new_elems == NULL))
1071         return REG_ESPACE;
1072       dest->elems = new_elems;
1073       dest->alloc = new_alloc;
1074     }
1075
1076   /* Find the items in the intersection of SRC1 and SRC2, and copy
1077      into the top of DEST those that are not already in DEST itself.  */
1078   sbase = dest->nelem + src1->nelem + src2->nelem;
1079   i1 = src1->nelem - 1;
1080   i2 = src2->nelem - 1;
1081   id = dest->nelem - 1;
1082   for (;;)
1083     {
1084       if (src1->elems[i1] == src2->elems[i2])
1085         {
1086           /* Try to find the item in DEST.  Maybe we could binary search?  */
1087           while (id >= 0 && dest->elems[id] > src1->elems[i1])
1088             --id;
1089
1090           if (id < 0 || dest->elems[id] != src1->elems[i1])
1091             dest->elems[--sbase] = src1->elems[i1];
1092
1093           if (--i1 < 0 || --i2 < 0)
1094             break;
1095         }
1096
1097       /* Lower the highest of the two items.  */
1098       else if (src1->elems[i1] < src2->elems[i2])
1099         {
1100           if (--i2 < 0)
1101             break;
1102         }
1103       else
1104         {
1105           if (--i1 < 0)
1106             break;
1107         }
1108     }
1109
1110   id = dest->nelem - 1;
1111   is = dest->nelem + src1->nelem + src2->nelem - 1;
1112   delta = is - sbase + 1;
1113
1114   /* Now copy.  When DELTA becomes zero, the remaining
1115      DEST elements are already in place; this is more or
1116      less the same loop that is in re_node_set_merge.  */
1117   dest->nelem += delta;
1118   if (delta > 0 && id >= 0)
1119     for (;;)
1120       {
1121         if (dest->elems[is] > dest->elems[id])
1122           {
1123             /* Copy from the top.  */
1124             dest->elems[id + delta--] = dest->elems[is--];
1125             if (delta == 0)
1126               break;
1127           }
1128         else
1129           {
1130             /* Slide from the bottom.  */
1131             dest->elems[id + delta] = dest->elems[id];
1132             if (--id < 0)
1133               break;
1134           }
1135       }
1136
1137   /* Copy remaining SRC elements.  */
1138   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1139
1140   return REG_NOERROR;
1141 }
1142
1143 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1144    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1145
1146 static reg_errcode_t
1147 __attribute_warn_unused_result__
1148 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1149                         const re_node_set *src2)
1150 {
1151   Idx i1, i2, id;
1152   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1153     {
1154       dest->alloc = src1->nelem + src2->nelem;
1155       dest->elems = re_malloc (Idx, dest->alloc);
1156       if (__glibc_unlikely (dest->elems == NULL))
1157         return REG_ESPACE;
1158     }
1159   else
1160     {
1161       if (src1 != NULL && src1->nelem > 0)
1162         return re_node_set_init_copy (dest, src1);
1163       else if (src2 != NULL && src2->nelem > 0)
1164         return re_node_set_init_copy (dest, src2);
1165       else
1166         re_node_set_init_empty (dest);
1167       return REG_NOERROR;
1168     }
1169   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1170     {
1171       if (src1->elems[i1] > src2->elems[i2])
1172         {
1173           dest->elems[id++] = src2->elems[i2++];
1174           continue;
1175         }
1176       if (src1->elems[i1] == src2->elems[i2])
1177         ++i2;
1178       dest->elems[id++] = src1->elems[i1++];
1179     }
1180   if (i1 < src1->nelem)
1181     {
1182       memcpy (dest->elems + id, src1->elems + i1,
1183              (src1->nelem - i1) * sizeof (Idx));
1184       id += src1->nelem - i1;
1185     }
1186   else if (i2 < src2->nelem)
1187     {
1188       memcpy (dest->elems + id, src2->elems + i2,
1189              (src2->nelem - i2) * sizeof (Idx));
1190       id += src2->nelem - i2;
1191     }
1192   dest->nelem = id;
1193   return REG_NOERROR;
1194 }
1195
1196 /* Calculate the union set of the sets DEST and SRC. And store it to
1197    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1198
1199 static reg_errcode_t
1200 __attribute_warn_unused_result__
1201 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1202 {
1203   Idx is, id, sbase, delta;
1204   if (src == NULL || src->nelem == 0)
1205     return REG_NOERROR;
1206   if (dest->alloc < 2 * src->nelem + dest->nelem)
1207     {
1208       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1209       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1210       if (__glibc_unlikely (new_buffer == NULL))
1211         return REG_ESPACE;
1212       dest->elems = new_buffer;
1213       dest->alloc = new_alloc;
1214     }
1215
1216   if (__glibc_unlikely (dest->nelem == 0))
1217     {
1218       dest->nelem = src->nelem;
1219       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1220       return REG_NOERROR;
1221     }
1222
1223   /* Copy into the top of DEST the items of SRC that are not
1224      found in DEST.  Maybe we could binary search in DEST?  */
1225   for (sbase = dest->nelem + 2 * src->nelem,
1226        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1227     {
1228       if (dest->elems[id] == src->elems[is])
1229         is--, id--;
1230       else if (dest->elems[id] < src->elems[is])
1231         dest->elems[--sbase] = src->elems[is--];
1232       else /* if (dest->elems[id] > src->elems[is]) */
1233         --id;
1234     }
1235
1236   if (is >= 0)
1237     {
1238       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1239       sbase -= is + 1;
1240       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1241     }
1242
1243   id = dest->nelem - 1;
1244   is = dest->nelem + 2 * src->nelem - 1;
1245   delta = is - sbase + 1;
1246   if (delta == 0)
1247     return REG_NOERROR;
1248
1249   /* Now copy.  When DELTA becomes zero, the remaining
1250      DEST elements are already in place.  */
1251   dest->nelem += delta;
1252   for (;;)
1253     {
1254       if (dest->elems[is] > dest->elems[id])
1255         {
1256           /* Copy from the top.  */
1257           dest->elems[id + delta--] = dest->elems[is--];
1258           if (delta == 0)
1259             break;
1260         }
1261       else
1262         {
1263           /* Slide from the bottom.  */
1264           dest->elems[id + delta] = dest->elems[id];
1265           if (--id < 0)
1266             {
1267               /* Copy remaining SRC elements.  */
1268               memcpy (dest->elems, dest->elems + sbase,
1269                       delta * sizeof (Idx));
1270               break;
1271             }
1272         }
1273     }
1274
1275   return REG_NOERROR;
1276 }
1277
1278 /* Insert the new element ELEM to the re_node_set* SET.
1279    SET should not already have ELEM.
1280    Return true if successful.  */
1281
1282 static bool
1283 __attribute_warn_unused_result__
1284 re_node_set_insert (re_node_set *set, Idx elem)
1285 {
1286   Idx idx;
1287   /* In case the set is empty.  */
1288   if (set->alloc == 0)
1289     return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
1290
1291   if (__glibc_unlikely (set->nelem) == 0)
1292     {
1293       /* We already guaranteed above that set->alloc != 0.  */
1294       set->elems[0] = elem;
1295       ++set->nelem;
1296       return true;
1297     }
1298
1299   /* Realloc if we need.  */
1300   if (set->alloc == set->nelem)
1301     {
1302       Idx *new_elems;
1303       set->alloc = set->alloc * 2;
1304       new_elems = re_realloc (set->elems, Idx, set->alloc);
1305       if (__glibc_unlikely (new_elems == NULL))
1306         return false;
1307       set->elems = new_elems;
1308     }
1309
1310   /* Move the elements which follows the new element.  Test the
1311      first element separately to skip a check in the inner loop.  */
1312   if (elem < set->elems[0])
1313     {
1314       idx = 0;
1315       for (idx = set->nelem; idx > 0; idx--)
1316         set->elems[idx] = set->elems[idx - 1];
1317     }
1318   else
1319     {
1320       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1321         set->elems[idx] = set->elems[idx - 1];
1322     }
1323
1324   /* Insert the new element.  */
1325   set->elems[idx] = elem;
1326   ++set->nelem;
1327   return true;
1328 }
1329
1330 /* Insert the new element ELEM to the re_node_set* SET.
1331    SET should not already have any element greater than or equal to ELEM.
1332    Return true if successful.  */
1333
1334 static bool
1335 __attribute_warn_unused_result__
1336 re_node_set_insert_last (re_node_set *set, Idx elem)
1337 {
1338   /* Realloc if we need.  */
1339   if (set->alloc == set->nelem)
1340     {
1341       Idx *new_elems;
1342       set->alloc = (set->alloc + 1) * 2;
1343       new_elems = re_realloc (set->elems, Idx, set->alloc);
1344       if (__glibc_unlikely (new_elems == NULL))
1345         return false;
1346       set->elems = new_elems;
1347     }
1348
1349   /* Insert the new element.  */
1350   set->elems[set->nelem++] = elem;
1351   return true;
1352 }
1353
1354 /* Compare two node sets SET1 and SET2.
1355    Return true if SET1 and SET2 are equivalent.  */
1356
1357 static bool
1358 __attribute__ ((pure))
1359 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1360 {
1361   Idx i;
1362   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1363     return false;
1364   for (i = set1->nelem ; --i >= 0 ; )
1365     if (set1->elems[i] != set2->elems[i])
1366       return false;
1367   return true;
1368 }
1369
1370 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1371
1372 static Idx
1373 __attribute__ ((pure))
1374 re_node_set_contains (const re_node_set *set, Idx elem)
1375 {
1376   __re_size_t idx, right, mid;
1377   if (set->nelem <= 0)
1378     return 0;
1379
1380   /* Binary search the element.  */
1381   idx = 0;
1382   right = set->nelem - 1;
1383   while (idx < right)
1384     {
1385       mid = (idx + right) / 2;
1386       if (set->elems[mid] < elem)
1387         idx = mid + 1;
1388       else
1389         right = mid;
1390     }
1391   return set->elems[idx] == elem ? idx + 1 : 0;
1392 }
1393
1394 static void
1395 re_node_set_remove_at (re_node_set *set, Idx idx)
1396 {
1397   if (idx < 0 || idx >= set->nelem)
1398     return;
1399   --set->nelem;
1400   for (; idx < set->nelem; idx++)
1401     set->elems[idx] = set->elems[idx + 1];
1402 }
1403 \f
1404
1405 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1406    Or return -1 if an error occurred.  */
1407
1408 static Idx
1409 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1410 {
1411   if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
1412     {
1413       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1414       Idx *new_nexts, *new_indices;
1415       re_node_set *new_edests, *new_eclosures;
1416       re_token_t *new_nodes;
1417
1418       /* Avoid overflows in realloc.  */
1419       const size_t max_object_size = MAX (sizeof (re_token_t),
1420                                           MAX (sizeof (re_node_set),
1421                                                sizeof (Idx)));
1422       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1423                             < new_nodes_alloc))
1424         return -1;
1425
1426       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1427       if (__glibc_unlikely (new_nodes == NULL))
1428         return -1;
1429       dfa->nodes = new_nodes;
1430       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1431       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1432       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1433       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1434       if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
1435                             || new_edests == NULL || new_eclosures == NULL))
1436         {
1437            re_free (new_nexts);
1438            re_free (new_indices);
1439            re_free (new_edests);
1440            re_free (new_eclosures);
1441            return -1;
1442         }
1443       dfa->nexts = new_nexts;
1444       dfa->org_indices = new_indices;
1445       dfa->edests = new_edests;
1446       dfa->eclosures = new_eclosures;
1447       dfa->nodes_alloc = new_nodes_alloc;
1448     }
1449   dfa->nodes[dfa->nodes_len] = token;
1450   dfa->nodes[dfa->nodes_len].constraint = 0;
1451 #ifdef RE_ENABLE_I18N
1452   dfa->nodes[dfa->nodes_len].accept_mb =
1453     ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1454      || token.type == COMPLEX_BRACKET);
1455 #endif
1456   dfa->nexts[dfa->nodes_len] = -1;
1457   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1458   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1459   return dfa->nodes_len++;
1460 }
1461
1462 static re_hashval_t
1463 calc_state_hash (const re_node_set *nodes, unsigned int context)
1464 {
1465   re_hashval_t hash = nodes->nelem + context;
1466   Idx i;
1467   for (i = 0 ; i < nodes->nelem ; i++)
1468     hash += nodes->elems[i];
1469   return hash;
1470 }
1471
1472 /* Search for the state whose node_set is equivalent to NODES.
1473    Return the pointer to the state, if we found it in the DFA.
1474    Otherwise create the new one and return it.  In case of an error
1475    return NULL and set the error code in ERR.
1476    Note: - We assume NULL as the invalid state, then it is possible that
1477            return value is NULL and ERR is REG_NOERROR.
1478          - We never return non-NULL value in case of any errors, it is for
1479            optimization.  */
1480
1481 static re_dfastate_t *
1482 __attribute_warn_unused_result__
1483 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1484                   const re_node_set *nodes)
1485 {
1486   re_hashval_t hash;
1487   re_dfastate_t *new_state;
1488   struct re_state_table_entry *spot;
1489   Idx i;
1490 #if defined GCC_LINT || defined lint
1491   /* Suppress bogus uninitialized-variable warnings.  */
1492   *err = REG_NOERROR;
1493 #endif
1494   if (__glibc_unlikely (nodes->nelem == 0))
1495     {
1496       *err = REG_NOERROR;
1497       return NULL;
1498     }
1499   hash = calc_state_hash (nodes, 0);
1500   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1501
1502   for (i = 0 ; i < spot->num ; i++)
1503     {
1504       re_dfastate_t *state = spot->array[i];
1505       if (hash != state->hash)
1506         continue;
1507       if (re_node_set_compare (&state->nodes, nodes))
1508         return state;
1509     }
1510
1511   /* There are no appropriate state in the dfa, create the new one.  */
1512   new_state = create_ci_newstate (dfa, nodes, hash);
1513   if (__glibc_unlikely (new_state == NULL))
1514     *err = REG_ESPACE;
1515
1516   return new_state;
1517 }
1518
1519 /* Search for the state whose node_set is equivalent to NODES and
1520    whose context is equivalent to CONTEXT.
1521    Return the pointer to the state, if we found it in the DFA.
1522    Otherwise create the new one and return it.  In case of an error
1523    return NULL and set the error code in ERR.
1524    Note: - We assume NULL as the invalid state, then it is possible that
1525            return value is NULL and ERR is REG_NOERROR.
1526          - We never return non-NULL value in case of any errors, it is for
1527            optimization.  */
1528
1529 static re_dfastate_t *
1530 __attribute_warn_unused_result__
1531 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1532                           const re_node_set *nodes, unsigned int context)
1533 {
1534   re_hashval_t hash;
1535   re_dfastate_t *new_state;
1536   struct re_state_table_entry *spot;
1537   Idx i;
1538 #if defined GCC_LINT || defined lint
1539   /* Suppress bogus uninitialized-variable warnings.  */
1540   *err = REG_NOERROR;
1541 #endif
1542   if (nodes->nelem == 0)
1543     {
1544       *err = REG_NOERROR;
1545       return NULL;
1546     }
1547   hash = calc_state_hash (nodes, context);
1548   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1549
1550   for (i = 0 ; i < spot->num ; i++)
1551     {
1552       re_dfastate_t *state = spot->array[i];
1553       if (state->hash == hash
1554           && state->context == context
1555           && re_node_set_compare (state->entrance_nodes, nodes))
1556         return state;
1557     }
1558   /* There are no appropriate state in 'dfa', create the new one.  */
1559   new_state = create_cd_newstate (dfa, nodes, context, hash);
1560   if (__glibc_unlikely (new_state == NULL))
1561     *err = REG_ESPACE;
1562
1563   return new_state;
1564 }
1565
1566 /* Finish initialization of the new state NEWSTATE, and using its hash value
1567    HASH put in the appropriate bucket of DFA's state table.  Return value
1568    indicates the error code if failed.  */
1569
1570 static reg_errcode_t
1571 __attribute_warn_unused_result__
1572 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1573                 re_hashval_t hash)
1574 {
1575   struct re_state_table_entry *spot;
1576   reg_errcode_t err;
1577   Idx i;
1578
1579   newstate->hash = hash;
1580   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1581   if (__glibc_unlikely (err != REG_NOERROR))
1582     return REG_ESPACE;
1583   for (i = 0; i < newstate->nodes.nelem; i++)
1584     {
1585       Idx elem = newstate->nodes.elems[i];
1586       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1587         if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1588           return REG_ESPACE;
1589     }
1590
1591   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1592   if (__glibc_unlikely (spot->alloc <= spot->num))
1593     {
1594       Idx new_alloc = 2 * spot->num + 2;
1595       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1596                                               new_alloc);
1597       if (__glibc_unlikely (new_array == NULL))
1598         return REG_ESPACE;
1599       spot->array = new_array;
1600       spot->alloc = new_alloc;
1601     }
1602   spot->array[spot->num++] = newstate;
1603   return REG_NOERROR;
1604 }
1605
1606 static void
1607 free_state (re_dfastate_t *state)
1608 {
1609   re_node_set_free (&state->non_eps_nodes);
1610   re_node_set_free (&state->inveclosure);
1611   if (state->entrance_nodes != &state->nodes)
1612     {
1613       re_node_set_free (state->entrance_nodes);
1614       re_free (state->entrance_nodes);
1615     }
1616   re_node_set_free (&state->nodes);
1617   re_free (state->word_trtable);
1618   re_free (state->trtable);
1619   re_free (state);
1620 }
1621
1622 /* Create the new state which is independent of contexts.
1623    Return the new state if succeeded, otherwise return NULL.  */
1624
1625 static re_dfastate_t *
1626 __attribute_warn_unused_result__
1627 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1628                     re_hashval_t hash)
1629 {
1630   Idx i;
1631   reg_errcode_t err;
1632   re_dfastate_t *newstate;
1633
1634   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1635   if (__glibc_unlikely (newstate == NULL))
1636     return NULL;
1637   err = re_node_set_init_copy (&newstate->nodes, nodes);
1638   if (__glibc_unlikely (err != REG_NOERROR))
1639     {
1640       re_free (newstate);
1641       return NULL;
1642     }
1643
1644   newstate->entrance_nodes = &newstate->nodes;
1645   for (i = 0 ; i < nodes->nelem ; i++)
1646     {
1647       re_token_t *node = dfa->nodes + nodes->elems[i];
1648       re_token_type_t type = node->type;
1649       if (type == CHARACTER && !node->constraint)
1650         continue;
1651 #ifdef RE_ENABLE_I18N
1652       newstate->accept_mb |= node->accept_mb;
1653 #endif /* RE_ENABLE_I18N */
1654
1655       /* If the state has the halt node, the state is a halt state.  */
1656       if (type == END_OF_RE)
1657         newstate->halt = 1;
1658       else if (type == OP_BACK_REF)
1659         newstate->has_backref = 1;
1660       else if (type == ANCHOR || node->constraint)
1661         newstate->has_constraint = 1;
1662     }
1663   err = register_state (dfa, newstate, hash);
1664   if (__glibc_unlikely (err != REG_NOERROR))
1665     {
1666       free_state (newstate);
1667       newstate = NULL;
1668     }
1669   return newstate;
1670 }
1671
1672 /* Create the new state which is depend on the context CONTEXT.
1673    Return the new state if succeeded, otherwise return NULL.  */
1674
1675 static re_dfastate_t *
1676 __attribute_warn_unused_result__
1677 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1678                     unsigned int context, re_hashval_t hash)
1679 {
1680   Idx i, nctx_nodes = 0;
1681   reg_errcode_t err;
1682   re_dfastate_t *newstate;
1683
1684   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1685   if (__glibc_unlikely (newstate == NULL))
1686     return NULL;
1687   err = re_node_set_init_copy (&newstate->nodes, nodes);
1688   if (__glibc_unlikely (err != REG_NOERROR))
1689     {
1690       re_free (newstate);
1691       return NULL;
1692     }
1693
1694   newstate->context = context;
1695   newstate->entrance_nodes = &newstate->nodes;
1696
1697   for (i = 0 ; i < nodes->nelem ; i++)
1698     {
1699       re_token_t *node = dfa->nodes + nodes->elems[i];
1700       re_token_type_t type = node->type;
1701       unsigned int constraint = node->constraint;
1702
1703       if (type == CHARACTER && !constraint)
1704         continue;
1705 #ifdef RE_ENABLE_I18N
1706       newstate->accept_mb |= node->accept_mb;
1707 #endif /* RE_ENABLE_I18N */
1708
1709       /* If the state has the halt node, the state is a halt state.  */
1710       if (type == END_OF_RE)
1711         newstate->halt = 1;
1712       else if (type == OP_BACK_REF)
1713         newstate->has_backref = 1;
1714
1715       if (constraint)
1716         {
1717           if (newstate->entrance_nodes == &newstate->nodes)
1718             {
1719               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1720               if (__glibc_unlikely (newstate->entrance_nodes == NULL))
1721                 {
1722                   free_state (newstate);
1723                   return NULL;
1724                 }
1725               if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1726                   != REG_NOERROR)
1727                 return NULL;
1728               nctx_nodes = 0;
1729               newstate->has_constraint = 1;
1730             }
1731
1732           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1733             {
1734               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1735               ++nctx_nodes;
1736             }
1737         }
1738     }
1739   err = register_state (dfa, newstate, hash);
1740   if (__glibc_unlikely (err != REG_NOERROR))
1741     {
1742       free_state (newstate);
1743       newstate = NULL;
1744     }
1745   return  newstate;
1746 }