gl/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 static void re_string_construct_common (const char *str, Idx len,
  21                                         re_string_t *pstr,
  22                                         RE_TRANSLATE_TYPE trans, bool icase,
  23                                         const re_dfa_t *dfa);
  24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  25                                           const re_node_set *nodes,
  26                                           re_hashval_t hash);
  27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  28                                           const re_node_set *nodes,
  29                                           unsigned int context,
  30                                           re_hashval_t hash);
  31 static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
  32                                                 Idx new_buf_len);
  33 static void build_wcs_buffer (re_string_t *pstr);
  34 static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
  35 static void build_upper_buffer (re_string_t *pstr);
  36 static void re_string_translate_buffer (re_string_t *pstr);
  37 static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
  38                                           int eflags) __attribute__ ((pure));
  39 \f
  40 /* Functions for string operation.  */
  41
  42 /* This function allocate the buffers.  It is necessary to call
  43    re_string_reconstruct before using the object.  */
  44
  45 static reg_errcode_t
  46 __attribute_warn_unused_result__
  47 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
  48                     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  49 {
  50   reg_errcode_t ret;
  51   Idx init_buf_len;
  52
  53   /* Ensure at least one character fits into the buffers.  */
  54   if (init_len < dfa->mb_cur_max)
  55     init_len = dfa->mb_cur_max;
  56   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  57   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  58
  59   ret = re_string_realloc_buffers (pstr, init_buf_len);
  60   if (__glibc_unlikely (ret != REG_NOERROR))
  61     return ret;
  62
  63   pstr->word_char = dfa->word_char;
  64   pstr->word_ops_used = dfa->word_ops_used;
  65   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  66   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  67   pstr->valid_raw_len = pstr->valid_len;
  68   return REG_NOERROR;
  69 }
  70
  71 /* This function allocate the buffers, and initialize them.  */
  72
  73 static reg_errcode_t
  74 __attribute_warn_unused_result__
  75 re_string_construct (re_string_t *pstr, const char *str, Idx len,
  76                      RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  77 {
  78   reg_errcode_t ret;
  79   memset (pstr, '\0', sizeof (re_string_t));
  80   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  81
  82   if (len > 0)
  83     {
  84       ret = re_string_realloc_buffers (pstr, len + 1);
  85       if (__glibc_unlikely (ret != REG_NOERROR))
  86         return ret;
  87     }
  88   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  89
  90   if (icase)
  91     {
  92       if (dfa->mb_cur_max > 1)
  93         {
  94           while (1)
  95             {
  96               ret = build_wcs_upper_buffer (pstr);
  97               if (__glibc_unlikely (ret != REG_NOERROR))
  98                 return ret;
  99               if (pstr->valid_raw_len >= len)
 100                 break;
 101               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
 102                 break;
 103               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
 104               if (__glibc_unlikely (ret != REG_NOERROR))
 105                 return ret;
 106             }
 107         }
 108       else
 109         build_upper_buffer (pstr);
 110     }
 111   else
 112     {
 113       if (dfa->mb_cur_max > 1)
 114         build_wcs_buffer (pstr);
 115       else
 116         {
 117           if (trans != NULL)
 118             re_string_translate_buffer (pstr);
 119           else
 120             {
 121               pstr->valid_len = pstr->bufs_len;
 122               pstr->valid_raw_len = pstr->bufs_len;
 123             }
 124         }
 125     }
 126
 127   return REG_NOERROR;
 128 }
 129
 130 /* Helper functions for re_string_allocate, and re_string_construct.  */
 131
 132 static reg_errcode_t
 133 __attribute_warn_unused_result__
 134 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
 135 {
 136   if (pstr->mb_cur_max > 1)
 137     {
 138       wint_t *new_wcs;
 139
 140       /* Avoid overflow in realloc.  */
 141       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
 142       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
 143                             < new_buf_len))
 144         return REG_ESPACE;
 145
 146       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 147       if (__glibc_unlikely (new_wcs == NULL))
 148         return REG_ESPACE;
 149       pstr->wcs = new_wcs;
 150       if (pstr->offsets != NULL)
 151         {
 152           Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
 153           if (__glibc_unlikely (new_offsets == NULL))
 154             return REG_ESPACE;
 155           pstr->offsets = new_offsets;
 156         }
 157     }
 158   if (pstr->mbs_allocated)
 159     {
 160       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 161                                            new_buf_len);
 162       if (__glibc_unlikely (new_mbs == NULL))
 163         return REG_ESPACE;
 164       pstr->mbs = new_mbs;
 165     }
 166   pstr->bufs_len = new_buf_len;
 167   return REG_NOERROR;
 168 }
 169
 170
 171 static void
 172 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
 173                             RE_TRANSLATE_TYPE trans, bool icase,
 174                             const re_dfa_t *dfa)
 175 {
 176   pstr->raw_mbs = (const unsigned char *) str;
 177   pstr->len = len;
 178   pstr->raw_len = len;
 179   pstr->trans = trans;
 180   pstr->icase = icase;
 181   pstr->mbs_allocated = (trans != NULL || icase);
 182   pstr->mb_cur_max = dfa->mb_cur_max;
 183   pstr->is_utf8 = dfa->is_utf8;
 184   pstr->map_notascii = dfa->map_notascii;
 185   pstr->stop = pstr->len;
 186   pstr->raw_stop = pstr->stop;
 187 }
 188
 189
 190 /* Build wide character buffer PSTR->WCS.
 191    If the byte sequence of the string are:
 192      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 193    Then wide character buffer will be:
 194      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 195    We use WEOF for padding, they indicate that the position isn't
 196    a first byte of a multibyte character.
 197
 198    Note that this function assumes PSTR->VALID_LEN elements are already
 199    built and starts from PSTR->VALID_LEN.  */
 200
 201 static void
 202 build_wcs_buffer (re_string_t *pstr)
 203 {
 204 #ifdef _LIBC
 205   unsigned char buf[MB_LEN_MAX];
 206   DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
 207 #else
 208   unsigned char buf[64];
 209 #endif
 210   mbstate_t prev_st;
 211   Idx byte_idx, end_idx, remain_len;
 212   size_t mbclen;
 213
 214   /* Build the buffers from pstr->valid_len to either pstr->len or
 215      pstr->bufs_len.  */
 216   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 217   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 218     {
 219       wchar_t wc;
 220       const char *p;
 221
 222       remain_len = end_idx - byte_idx;
 223       prev_st = pstr->cur_state;
 224       /* Apply the translation if we need.  */
 225       if (__glibc_unlikely (pstr->trans != NULL))
 226         {
 227           int i, ch;
 228
 229           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 230             {
 231               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 232               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 233             }
 234           p = (const char *) buf;
 235         }
 236       else
 237         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 238       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 239       if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
 240                             || (mbclen == (size_t) -2
 241                                 && pstr->bufs_len >= pstr->len)))
 242         {
 243           /* We treat these cases as a singlebyte character.  */
 244           mbclen = 1;
 245           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 246           if (__glibc_unlikely (pstr->trans != NULL))
 247             wc = pstr->trans[wc];
 248           pstr->cur_state = prev_st;
 249         }
 250       else if (__glibc_unlikely (mbclen == (size_t) -2))
 251         {
 252           /* The buffer doesn't have enough space, finish to build.  */
 253           pstr->cur_state = prev_st;
 254           break;
 255         }
 256
 257       /* Write wide character and padding.  */
 258       pstr->wcs[byte_idx++] = wc;
 259       /* Write paddings.  */
 260       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 261         pstr->wcs[byte_idx++] = WEOF;
 262     }
 263   pstr->valid_len = byte_idx;
 264   pstr->valid_raw_len = byte_idx;
 265 }
 266
 267 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 268    but for REG_ICASE.  */
 269
 270 static reg_errcode_t
 271 __attribute_warn_unused_result__
 272 build_wcs_upper_buffer (re_string_t *pstr)
 273 {
 274   mbstate_t prev_st;
 275   Idx src_idx, byte_idx, end_idx, remain_len;
 276   size_t mbclen;
 277 #ifdef _LIBC
 278   char buf[MB_LEN_MAX];
 279   DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
 280 #else
 281   char buf[64];
 282 #endif
 283
 284   byte_idx = pstr->valid_len;
 285   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 286
 287   /* The following optimization assumes that ASCII characters can be
 288      mapped to wide characters with a simple cast.  */
 289   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 290     {
 291       while (byte_idx < end_idx)
 292         {
 293           wchar_t wc;
 294           unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 295
 296           if (isascii (ch) && mbsinit (&pstr->cur_state))
 297             {
 298               /* The next step uses the assumption that wchar_t is encoded
 299                  ASCII-safe: all ASCII values can be converted like this.  */
 300               wchar_t wcu = __towupper (ch);
 301               if (isascii (wcu))
 302                 {
 303                   pstr->mbs[byte_idx] = wcu;
 304                   pstr->wcs[byte_idx] = wcu;
 305                   byte_idx++;
 306                   continue;
 307                 }
 308             }
 309
 310           remain_len = end_idx - byte_idx;
 311           prev_st = pstr->cur_state;
 312           mbclen = __mbrtowc (&wc,
 313                               ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 314                                + byte_idx), remain_len, &pstr->cur_state);
 315           if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
 316             {
 317               wchar_t wcu = __towupper (wc);
 318               if (wcu != wc)
 319                 {
 320                   size_t mbcdlen;
 321
 322                   mbcdlen = __wcrtomb (buf, wcu, &prev_st);
 323                   if (__glibc_likely (mbclen == mbcdlen))
 324                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 325                   else
 326                     {
 327                       src_idx = byte_idx;
 328                       goto offsets_needed;
 329                     }
 330                 }
 331               else
 332                 memcpy (pstr->mbs + byte_idx,
 333                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 334               pstr->wcs[byte_idx++] = wcu;
 335               /* Write paddings.  */
 336               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 337                 pstr->wcs[byte_idx++] = WEOF;
 338             }
 339           else if (mbclen == (size_t) -1 || mbclen == 0
 340                    || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 341             {
 342               /* It is an invalid character, an incomplete character
 343                  at the end of the string, or '\0'.  Just use the byte.  */
 344               pstr->mbs[byte_idx] = ch;
 345               /* And also cast it to wide char.  */
 346               pstr->wcs[byte_idx++] = (wchar_t) ch;
 347               if (__glibc_unlikely (mbclen == (size_t) -1))
 348                 pstr->cur_state = prev_st;
 349             }
 350           else
 351             {
 352               /* The buffer doesn't have enough space, finish to build.  */
 353               pstr->cur_state = prev_st;
 354               break;
 355             }
 356         }
 357       pstr->valid_len = byte_idx;
 358       pstr->valid_raw_len = byte_idx;
 359       return REG_NOERROR;
 360     }
 361   else
 362     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 363       {
 364         wchar_t wc;
 365         const char *p;
 366       offsets_needed:
 367         remain_len = end_idx - byte_idx;
 368         prev_st = pstr->cur_state;
 369         if (__glibc_unlikely (pstr->trans != NULL))
 370           {
 371             int i, ch;
 372
 373             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 374               {
 375                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 376                 buf[i] = pstr->trans[ch];
 377               }
 378             p = (const char *) buf;
 379           }
 380         else
 381           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 382         mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 383         if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
 384           {
 385             wchar_t wcu = __towupper (wc);
 386             if (wcu != wc)
 387               {
 388                 size_t mbcdlen;
 389
 390                 mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
 391                 if (__glibc_likely (mbclen == mbcdlen))
 392                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 393                 else if (mbcdlen != (size_t) -1)
 394                   {
 395                     size_t i;
 396
 397                     if (byte_idx + mbcdlen > pstr->bufs_len)
 398                       {
 399                         pstr->cur_state = prev_st;
 400                         break;
 401                       }
 402
 403                     if (pstr->offsets == NULL)
 404                       {
 405                         pstr->offsets = re_malloc (Idx, pstr->bufs_len);
 406
 407                         if (pstr->offsets == NULL)
 408                           return REG_ESPACE;
 409                       }
 410                     if (!pstr->offsets_needed)
 411                       {
 412                         for (i = 0; i < (size_t) byte_idx; ++i)
 413                           pstr->offsets[i] = i;
 414                         pstr->offsets_needed = 1;
 415                       }
 416
 417                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 418                     pstr->wcs[byte_idx] = wcu;
 419                     pstr->offsets[byte_idx] = src_idx;
 420                     for (i = 1; i < mbcdlen; ++i)
 421                       {
 422                         pstr->offsets[byte_idx + i]
 423                           = src_idx + (i < mbclen ? i : mbclen - 1);
 424                         pstr->wcs[byte_idx + i] = WEOF;
 425                       }
 426                     pstr->len += mbcdlen - mbclen;
 427                     if (pstr->raw_stop > src_idx)
 428                       pstr->stop += mbcdlen - mbclen;
 429                     end_idx = (pstr->bufs_len > pstr->len)
 430                               ? pstr->len : pstr->bufs_len;
 431                     byte_idx += mbcdlen;
 432                     src_idx += mbclen;
 433                     continue;
 434                   }
 435                 else
 436                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 437               }
 438             else
 439               memcpy (pstr->mbs + byte_idx, p, mbclen);
 440
 441             if (__glibc_unlikely (pstr->offsets_needed != 0))
 442               {
 443                 size_t i;
 444                 for (i = 0; i < mbclen; ++i)
 445                   pstr->offsets[byte_idx + i] = src_idx + i;
 446               }
 447             src_idx += mbclen;
 448
 449             pstr->wcs[byte_idx++] = wcu;
 450             /* Write paddings.  */
 451             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 452               pstr->wcs[byte_idx++] = WEOF;
 453           }
 454         else if (mbclen == (size_t) -1 || mbclen == 0
 455                  || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 456           {
 457             /* It is an invalid character or '\0'.  Just use the byte.  */
 458             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 459
 460             if (__glibc_unlikely (pstr->trans != NULL))
 461               ch = pstr->trans [ch];
 462             pstr->mbs[byte_idx] = ch;
 463
 464             if (__glibc_unlikely (pstr->offsets_needed != 0))
 465               pstr->offsets[byte_idx] = src_idx;
 466             ++src_idx;
 467
 468             /* And also cast it to wide char.  */
 469             pstr->wcs[byte_idx++] = (wchar_t) ch;
 470             if (__glibc_unlikely (mbclen == (size_t) -1))
 471               pstr->cur_state = prev_st;
 472           }
 473         else
 474           {
 475             /* The buffer doesn't have enough space, finish to build.  */
 476             pstr->cur_state = prev_st;
 477             break;
 478           }
 479       }
 480   pstr->valid_len = byte_idx;
 481   pstr->valid_raw_len = src_idx;
 482   return REG_NOERROR;
 483 }
 484
 485 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 486    Return the index.  */
 487
 488 static Idx
 489 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
 490 {
 491   mbstate_t prev_st;
 492   Idx rawbuf_idx;
 493   size_t mbclen;
 494   wint_t wc = WEOF;
 495
 496   /* Skip the characters which are not necessary to check.  */
 497   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 498        rawbuf_idx < new_raw_idx;)
 499     {
 500       wchar_t wc2;
 501       Idx remain_len = pstr->raw_len - rawbuf_idx;
 502       prev_st = pstr->cur_state;
 503       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
 504                           remain_len, &pstr->cur_state);
 505       if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
 506                             || mbclen == 0))
 507         {
 508           /* We treat these cases as a single byte character.  */
 509           if (mbclen == 0 || remain_len == 0)
 510             wc = L'\0';
 511           else
 512             wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 513           mbclen = 1;
 514           pstr->cur_state = prev_st;
 515         }
 516       else
 517         wc = wc2;
 518       /* Then proceed the next character.  */
 519       rawbuf_idx += mbclen;
 520     }
 521   *last_wc = wc;
 522   return rawbuf_idx;
 523 }
 524
 525 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 526    This function is used in case of REG_ICASE.  */
 527
 528 static void
 529 build_upper_buffer (re_string_t *pstr)
 530 {
 531   Idx char_idx, end_idx;
 532   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 533
 534   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 535     {
 536       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 537       if (__glibc_unlikely (pstr->trans != NULL))
 538         ch = pstr->trans[ch];
 539       pstr->mbs[char_idx] = toupper (ch);
 540     }
 541   pstr->valid_len = char_idx;
 542   pstr->valid_raw_len = char_idx;
 543 }
 544
 545 /* Apply TRANS to the buffer in PSTR.  */
 546
 547 static void
 548 re_string_translate_buffer (re_string_t *pstr)
 549 {
 550   Idx buf_idx, end_idx;
 551   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 552
 553   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 554     {
 555       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 556       pstr->mbs[buf_idx] = pstr->trans[ch];
 557     }
 558
 559   pstr->valid_len = buf_idx;
 560   pstr->valid_raw_len = buf_idx;
 561 }
 562
 563 /* This function re-construct the buffers.
 564    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 565    convert to upper case in case of REG_ICASE, apply translation.  */
 566
 567 static reg_errcode_t
 568 __attribute_warn_unused_result__
 569 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 570 {
 571   Idx offset;
 572
 573   if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
 574     offset = idx - pstr->raw_mbs_idx;
 575   else
 576     {
 577       /* Reset buffer.  */
 578       if (pstr->mb_cur_max > 1)
 579         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 580       pstr->len = pstr->raw_len;
 581       pstr->stop = pstr->raw_stop;
 582       pstr->valid_len = 0;
 583       pstr->raw_mbs_idx = 0;
 584       pstr->valid_raw_len = 0;
 585       pstr->offsets_needed = 0;
 586       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 587                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 588       if (!pstr->mbs_allocated)
 589         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 590       offset = idx;
 591     }
 592
 593   if (__glibc_likely (offset != 0))
 594     {
 595       /* Should the already checked characters be kept?  */
 596       if (__glibc_likely (offset < pstr->valid_raw_len))
 597         {
 598           /* Yes, move them to the front of the buffer.  */
 599           if (__glibc_unlikely (pstr->offsets_needed))
 600             {
 601               Idx low = 0, high = pstr->valid_len, mid;
 602               do
 603                 {
 604                   mid = (high + low) / 2;
 605                   if (pstr->offsets[mid] > offset)
 606                     high = mid;
 607                   else if (pstr->offsets[mid] < offset)
 608                     low = mid + 1;
 609                   else
 610                     break;
 611                 }
 612               while (low < high);
 613               if (pstr->offsets[mid] < offset)
 614                 ++mid;
 615               pstr->tip_context = re_string_context_at (pstr, mid - 1,
 616                                                         eflags);
 617               /* This can be quite complicated, so handle specially
 618                  only the common and easy case where the character with
 619                  different length representation of lower and upper
 620                  case is present at or after offset.  */
 621               if (pstr->valid_len > offset
 622                   && mid == offset && pstr->offsets[mid] == offset)
 623                 {
 624                   memmove (pstr->wcs, pstr->wcs + offset,
 625                            (pstr->valid_len - offset) * sizeof (wint_t));
 626                   memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
 627                   pstr->valid_len -= offset;
 628                   pstr->valid_raw_len -= offset;
 629                   for (low = 0; low < pstr->valid_len; low++)
 630                     pstr->offsets[low] = pstr->offsets[low + offset] - offset;
 631                 }
 632               else
 633                 {
 634                   /* Otherwise, just find out how long the partial multibyte
 635                      character at offset is and fill it with WEOF/255.  */
 636                   pstr->len = pstr->raw_len - idx + offset;
 637                   pstr->stop = pstr->raw_stop - idx + offset;
 638                   pstr->offsets_needed = 0;
 639                   while (mid > 0 && pstr->offsets[mid - 1] == offset)
 640                     --mid;
 641                   while (mid < pstr->valid_len)
 642                     if (pstr->wcs[mid] != WEOF)
 643                       break;
 644                     else
 645                       ++mid;
 646                   if (mid == pstr->valid_len)
 647                     pstr->valid_len = 0;
 648                   else
 649                     {
 650                       pstr->valid_len = pstr->offsets[mid] - offset;
 651                       if (pstr->valid_len)
 652                         {
 653                           for (low = 0; low < pstr->valid_len; ++low)
 654                             pstr->wcs[low] = WEOF;
 655                           memset (pstr->mbs, 255, pstr->valid_len);
 656                         }
 657                     }
 658                   pstr->valid_raw_len = pstr->valid_len;
 659                 }
 660             }
 661           else
 662             {
 663               pstr->tip_context = re_string_context_at (pstr, offset - 1,
 664                                                         eflags);
 665               if (pstr->mb_cur_max > 1)
 666                 memmove (pstr->wcs, pstr->wcs + offset,
 667                          (pstr->valid_len - offset) * sizeof (wint_t));
 668               if (__glibc_unlikely (pstr->mbs_allocated))
 669                 memmove (pstr->mbs, pstr->mbs + offset,
 670                          pstr->valid_len - offset);
 671               pstr->valid_len -= offset;
 672               pstr->valid_raw_len -= offset;
 673               DEBUG_ASSERT (pstr->valid_len > 0);
 674             }
 675         }
 676       else
 677         {
 678           /* No, skip all characters until IDX.  */
 679           Idx prev_valid_len = pstr->valid_len;
 680
 681           if (__glibc_unlikely (pstr->offsets_needed))
 682             {
 683               pstr->len = pstr->raw_len - idx + offset;
 684               pstr->stop = pstr->raw_stop - idx + offset;
 685               pstr->offsets_needed = 0;
 686             }
 687           pstr->valid_len = 0;
 688           if (pstr->mb_cur_max > 1)
 689             {
 690               Idx wcs_idx;
 691               wint_t wc = WEOF;
 692
 693               if (pstr->is_utf8)
 694                 {
 695                   const unsigned char *raw, *p, *end;
 696
 697                   /* Special case UTF-8.  Multi-byte chars start with any
 698                      byte other than 0x80 - 0xbf.  */
 699                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 700                   end = raw + (offset - pstr->mb_cur_max);
 701                   if (end < pstr->raw_mbs)
 702                     end = pstr->raw_mbs;
 703                   p = raw + offset - 1;
 704 #ifdef _LIBC
 705                   /* We know the wchar_t encoding is UCS4, so for the simple
 706                      case, ASCII characters, skip the conversion step.  */
 707                   if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
 708                     {
 709                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 710                       /* pstr->valid_len = 0; */
 711                       wc = (wchar_t) *p;
 712                     }
 713                   else
 714 #endif
 715                     for (; p >= end; --p)
 716                       if ((*p & 0xc0) != 0x80)
 717                         {
 718                           mbstate_t cur_state;
 719                           wchar_t wc2;
 720                           Idx mlen = raw + pstr->len - p;
 721                           unsigned char buf[6];
 722                           size_t mbclen;
 723
 724                           const unsigned char *pp = p;
 725                           if (__glibc_unlikely (pstr->trans != NULL))
 726                             {
 727                               int i = mlen < 6 ? mlen : 6;
 728                               while (--i >= 0)
 729                                 buf[i] = pstr->trans[p[i]];
 730                               pp = buf;
 731                             }
 732                           /* XXX Don't use mbrtowc, we know which conversion
 733                              to use (UTF-8 -> UCS4).  */
 734                           memset (&cur_state, 0, sizeof (cur_state));
 735                           mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
 736                                               &cur_state);
 737                           if (raw + offset - p <= mbclen
 738                               && mbclen < (size_t) -2)
 739                             {
 740                               memset (&pstr->cur_state, '\0',
 741                                       sizeof (mbstate_t));
 742                               pstr->valid_len = mbclen - (raw + offset - p);
 743                               wc = wc2;
 744                             }
 745                           break;
 746                         }
 747                 }
 748
 749               if (wc == WEOF)
 750                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 751               if (wc == WEOF)
 752                 pstr->tip_context
 753                   = re_string_context_at (pstr, prev_valid_len - 1, eflags);
 754               else
 755                 pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
 756                                       && IS_WIDE_WORD_CHAR (wc))
 757                                      ? CONTEXT_WORD
 758                                      : ((IS_WIDE_NEWLINE (wc)
 759                                          && pstr->newline_anchor)
 760                                         ? CONTEXT_NEWLINE : 0));
 761               if (__glibc_unlikely (pstr->valid_len))
 762                 {
 763                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 764                     pstr->wcs[wcs_idx] = WEOF;
 765                   if (pstr->mbs_allocated)
 766                     memset (pstr->mbs, 255, pstr->valid_len);
 767                 }
 768               pstr->valid_raw_len = pstr->valid_len;
 769             }
 770           else
 771             {
 772               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 773               pstr->valid_raw_len = 0;
 774               if (pstr->trans)
 775                 c = pstr->trans[c];
 776               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 777                                    ? CONTEXT_WORD
 778                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 779                                       ? CONTEXT_NEWLINE : 0));
 780             }
 781         }
 782       if (!__glibc_unlikely (pstr->mbs_allocated))
 783         pstr->mbs += offset;
 784     }
 785   pstr->raw_mbs_idx = idx;
 786   pstr->len -= offset;
 787   pstr->stop -= offset;
 788
 789   /* Then build the buffers.  */
 790   if (pstr->mb_cur_max > 1)
 791     {
 792       if (pstr->icase)
 793         {
 794           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 795           if (__glibc_unlikely (ret != REG_NOERROR))
 796             return ret;
 797         }
 798       else
 799         build_wcs_buffer (pstr);
 800     }
 801   else
 802     if (__glibc_unlikely (pstr->mbs_allocated))
 803       {
 804         if (pstr->icase)
 805           build_upper_buffer (pstr);
 806         else if (pstr->trans != NULL)
 807           re_string_translate_buffer (pstr);
 808       }
 809     else
 810       pstr->valid_len = pstr->len;
 811
 812   pstr->cur_idx = 0;
 813   return REG_NOERROR;
 814 }
 815
 816 static unsigned char
 817 __attribute__ ((pure))
 818 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
 819 {
 820   int ch;
 821   Idx off;
 822
 823   /* Handle the common (easiest) cases first.  */
 824   if (__glibc_likely (!pstr->mbs_allocated))
 825     return re_string_peek_byte (pstr, idx);
 826
 827   if (pstr->mb_cur_max > 1
 828       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 829     return re_string_peek_byte (pstr, idx);
 830
 831   off = pstr->cur_idx + idx;
 832   if (pstr->offsets_needed)
 833     off = pstr->offsets[off];
 834
 835   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 836
 837   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 838      this function returns CAPITAL LETTER I instead of first byte of
 839      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 840      since peek_byte_case doesn't advance cur_idx in any way.  */
 841   if (pstr->offsets_needed && !isascii (ch))
 842     return re_string_peek_byte (pstr, idx);
 843
 844   return ch;
 845 }
 846
 847 static unsigned char
 848 re_string_fetch_byte_case (re_string_t *pstr)
 849 {
 850   if (__glibc_likely (!pstr->mbs_allocated))
 851     return re_string_fetch_byte (pstr);
 852
 853   if (pstr->offsets_needed)
 854     {
 855       Idx off;
 856       int ch;
 857
 858       /* For tr_TR.UTF-8 [[:islower:]] there is
 859          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 860          in that case the whole multi-byte character and return
 861          the original letter.  On the other side, with
 862          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 863          anything else would complicate things too much.  */
 864
 865       if (!re_string_first_byte (pstr, pstr->cur_idx))
 866         return re_string_fetch_byte (pstr);
 867
 868       off = pstr->offsets[pstr->cur_idx];
 869       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 870
 871       if (! isascii (ch))
 872         return re_string_fetch_byte (pstr);
 873
 874       re_string_skip_bytes (pstr,
 875                             re_string_char_size_at (pstr, pstr->cur_idx));
 876       return ch;
 877     }
 878
 879   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 880 }
 881
 882 static void
 883 re_string_destruct (re_string_t *pstr)
 884 {
 885   re_free (pstr->wcs);
 886   re_free (pstr->offsets);
 887   if (pstr->mbs_allocated)
 888     re_free (pstr->mbs);
 889 }
 890
 891 /* Return the context at IDX in INPUT.  */
 892
 893 static unsigned int
 894 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
 895 {
 896   int c;
 897   if (__glibc_unlikely (idx < 0))
 898     /* In this case, we use the value stored in input->tip_context,
 899        since we can't know the character in input->mbs[-1] here.  */
 900     return input->tip_context;
 901   if (__glibc_unlikely (idx == input->len))
 902     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 903             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 904   if (input->mb_cur_max > 1)
 905     {
 906       wint_t wc;
 907       Idx wc_idx = idx;
 908       while(input->wcs[wc_idx] == WEOF)
 909         {
 910           DEBUG_ASSERT (wc_idx >= 0);
 911           --wc_idx;
 912           if (wc_idx < 0)
 913             return input->tip_context;
 914         }
 915       wc = input->wcs[wc_idx];
 916       if (__glibc_unlikely (input->word_ops_used != 0)
 917           && IS_WIDE_WORD_CHAR (wc))
 918         return CONTEXT_WORD;
 919       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 920               ? CONTEXT_NEWLINE : 0);
 921     }
 922   else
 923     {
 924       c = re_string_byte_at (input, idx);
 925       if (bitset_contain (input->word_char, c))
 926         return CONTEXT_WORD;
 927       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 928     }
 929 }
 930 \f
 931 /* Functions for set operation.  */
 932
 933 static reg_errcode_t
 934 __attribute_warn_unused_result__
 935 re_node_set_alloc (re_node_set *set, Idx size)
 936 {
 937   set->alloc = size;
 938   set->nelem = 0;
 939   set->elems = re_malloc (Idx, size);
 940   if (__glibc_unlikely (set->elems == NULL)
 941       && (MALLOC_0_IS_NONNULL || size != 0))
 942     return REG_ESPACE;
 943   return REG_NOERROR;
 944 }
 945
 946 static reg_errcode_t
 947 __attribute_warn_unused_result__
 948 re_node_set_init_1 (re_node_set *set, Idx elem)
 949 {
 950   set->alloc = 1;
 951   set->nelem = 1;
 952   set->elems = re_malloc (Idx, 1);
 953   if (__glibc_unlikely (set->elems == NULL))
 954     {
 955       set->alloc = set->nelem = 0;
 956       return REG_ESPACE;
 957     }
 958   set->elems[0] = elem;
 959   return REG_NOERROR;
 960 }
 961
 962 static reg_errcode_t
 963 __attribute_warn_unused_result__
 964 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
 965 {
 966   set->alloc = 2;
 967   set->elems = re_malloc (Idx, 2);
 968   if (__glibc_unlikely (set->elems == NULL))
 969     return REG_ESPACE;
 970   if (elem1 == elem2)
 971     {
 972       set->nelem = 1;
 973       set->elems[0] = elem1;
 974     }
 975   else
 976     {
 977       set->nelem = 2;
 978       if (elem1 < elem2)
 979         {
 980           set->elems[0] = elem1;
 981           set->elems[1] = elem2;
 982         }
 983       else
 984         {
 985           set->elems[0] = elem2;
 986           set->elems[1] = elem1;
 987         }
 988     }
 989   return REG_NOERROR;
 990 }
 991
 992 static reg_errcode_t
 993 __attribute_warn_unused_result__
 994 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
 995 {
 996   dest->nelem = src->nelem;
 997   if (src->nelem > 0)
 998     {
 999       dest->alloc = dest->nelem;
1000       dest->elems = re_malloc (Idx, dest->alloc);
1001       if (__glibc_unlikely (dest->elems == NULL))
1002         {
1003           dest->alloc = dest->nelem = 0;
1004           return REG_ESPACE;
1005         }
1006       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1007     }
1008   else
1009     re_node_set_init_empty (dest);
1010   return REG_NOERROR;
1011 }
1012
1013 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1014    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1015    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1016
1017 static reg_errcode_t
1018 __attribute_warn_unused_result__
1019 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1020                            const re_node_set *src2)
1021 {
1022   Idx i1, i2, is, id, delta, sbase;
1023   if (src1->nelem == 0 || src2->nelem == 0)
1024     return REG_NOERROR;
1025
1026   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1027      conservative estimate.  */
1028   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1029     {
1030       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1031       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1032       if (__glibc_unlikely (new_elems == NULL))
1033         return REG_ESPACE;
1034       dest->elems = new_elems;
1035       dest->alloc = new_alloc;
1036     }
1037
1038   /* Find the items in the intersection of SRC1 and SRC2, and copy
1039      into the top of DEST those that are not already in DEST itself.  */
1040   sbase = dest->nelem + src1->nelem + src2->nelem;
1041   i1 = src1->nelem - 1;
1042   i2 = src2->nelem - 1;
1043   id = dest->nelem - 1;
1044   for (;;)
1045     {
1046       if (src1->elems[i1] == src2->elems[i2])
1047         {
1048           /* Try to find the item in DEST.  Maybe we could binary search?  */
1049           while (id >= 0 && dest->elems[id] > src1->elems[i1])
1050             --id;
1051
1052           if (id < 0 || dest->elems[id] != src1->elems[i1])
1053             dest->elems[--sbase] = src1->elems[i1];
1054
1055           if (--i1 < 0 || --i2 < 0)
1056             break;
1057         }
1058
1059       /* Lower the highest of the two items.  */
1060       else if (src1->elems[i1] < src2->elems[i2])
1061         {
1062           if (--i2 < 0)
1063             break;
1064         }
1065       else
1066         {
1067           if (--i1 < 0)
1068             break;
1069         }
1070     }
1071
1072   id = dest->nelem - 1;
1073   is = dest->nelem + src1->nelem + src2->nelem - 1;
1074   delta = is - sbase + 1;
1075
1076   /* Now copy.  When DELTA becomes zero, the remaining
1077      DEST elements are already in place; this is more or
1078      less the same loop that is in re_node_set_merge.  */
1079   dest->nelem += delta;
1080   if (delta > 0 && id >= 0)
1081     for (;;)
1082       {
1083         if (dest->elems[is] > dest->elems[id])
1084           {
1085             /* Copy from the top.  */
1086             dest->elems[id + delta--] = dest->elems[is--];
1087             if (delta == 0)
1088               break;
1089           }
1090         else
1091           {
1092             /* Slide from the bottom.  */
1093             dest->elems[id + delta] = dest->elems[id];
1094             if (--id < 0)
1095               break;
1096           }
1097       }
1098
1099   /* Copy remaining SRC elements.  */
1100   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1101
1102   return REG_NOERROR;
1103 }
1104
1105 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1106    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1107
1108 static reg_errcode_t
1109 __attribute_warn_unused_result__
1110 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1111                         const re_node_set *src2)
1112 {
1113   Idx i1, i2, id;
1114   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1115     {
1116       dest->alloc = src1->nelem + src2->nelem;
1117       dest->elems = re_malloc (Idx, dest->alloc);
1118       if (__glibc_unlikely (dest->elems == NULL))
1119         return REG_ESPACE;
1120     }
1121   else
1122     {
1123       if (src1 != NULL && src1->nelem > 0)
1124         return re_node_set_init_copy (dest, src1);
1125       else if (src2 != NULL && src2->nelem > 0)
1126         return re_node_set_init_copy (dest, src2);
1127       else
1128         re_node_set_init_empty (dest);
1129       return REG_NOERROR;
1130     }
1131   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1132     {
1133       if (src1->elems[i1] > src2->elems[i2])
1134         {
1135           dest->elems[id++] = src2->elems[i2++];
1136           continue;
1137         }
1138       if (src1->elems[i1] == src2->elems[i2])
1139         ++i2;
1140       dest->elems[id++] = src1->elems[i1++];
1141     }
1142   if (i1 < src1->nelem)
1143     {
1144       memcpy (dest->elems + id, src1->elems + i1,
1145              (src1->nelem - i1) * sizeof (Idx));
1146       id += src1->nelem - i1;
1147     }
1148   else if (i2 < src2->nelem)
1149     {
1150       memcpy (dest->elems + id, src2->elems + i2,
1151              (src2->nelem - i2) * sizeof (Idx));
1152       id += src2->nelem - i2;
1153     }
1154   dest->nelem = id;
1155   return REG_NOERROR;
1156 }
1157
1158 /* Calculate the union set of the sets DEST and SRC. And store it to
1159    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1160
1161 static reg_errcode_t
1162 __attribute_warn_unused_result__
1163 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1164 {
1165   Idx is, id, sbase, delta;
1166   if (src == NULL || src->nelem == 0)
1167     return REG_NOERROR;
1168   if (dest->alloc < 2 * src->nelem + dest->nelem)
1169     {
1170       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1171       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1172       if (__glibc_unlikely (new_buffer == NULL))
1173         return REG_ESPACE;
1174       dest->elems = new_buffer;
1175       dest->alloc = new_alloc;
1176     }
1177
1178   if (__glibc_unlikely (dest->nelem == 0))
1179     {
1180       /* Although we already guaranteed above that dest->alloc != 0 and
1181          therefore dest->elems != NULL, add a debug assertion to pacify
1182          GCC 11.2.1's -fanalyzer.  */
1183       DEBUG_ASSERT (dest->elems);
1184       dest->nelem = src->nelem;
1185       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1186       return REG_NOERROR;
1187     }
1188
1189   /* Copy into the top of DEST the items of SRC that are not
1190      found in DEST.  Maybe we could binary search in DEST?  */
1191   for (sbase = dest->nelem + 2 * src->nelem,
1192        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1193     {
1194       if (dest->elems[id] == src->elems[is])
1195         is--, id--;
1196       else if (dest->elems[id] < src->elems[is])
1197         dest->elems[--sbase] = src->elems[is--];
1198       else /* if (dest->elems[id] > src->elems[is]) */
1199         --id;
1200     }
1201
1202   if (is >= 0)
1203     {
1204       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1205       sbase -= is + 1;
1206       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1207     }
1208
1209   id = dest->nelem - 1;
1210   is = dest->nelem + 2 * src->nelem - 1;
1211   delta = is - sbase + 1;
1212   if (delta == 0)
1213     return REG_NOERROR;
1214
1215   /* Now copy.  When DELTA becomes zero, the remaining
1216      DEST elements are already in place.  */
1217   dest->nelem += delta;
1218   for (;;)
1219     {
1220       if (dest->elems[is] > dest->elems[id])
1221         {
1222           /* Copy from the top.  */
1223           dest->elems[id + delta--] = dest->elems[is--];
1224           if (delta == 0)
1225             break;
1226         }
1227       else
1228         {
1229           /* Slide from the bottom.  */
1230           dest->elems[id + delta] = dest->elems[id];
1231           if (--id < 0)
1232             {
1233               /* Copy remaining SRC elements.  */
1234               memcpy (dest->elems, dest->elems + sbase,
1235                       delta * sizeof (Idx));
1236               break;
1237             }
1238         }
1239     }
1240
1241   return REG_NOERROR;
1242 }
1243
1244 /* Insert the new element ELEM to the re_node_set* SET.
1245    SET should not already have ELEM.
1246    Return true if successful.  */
1247
1248 static bool
1249 __attribute_warn_unused_result__
1250 re_node_set_insert (re_node_set *set, Idx elem)
1251 {
1252   Idx idx;
1253   /* In case the set is empty.  */
1254   if (set->alloc == 0)
1255     return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
1256
1257   if (__glibc_unlikely (set->nelem) == 0)
1258     {
1259       /* Although we already guaranteed above that set->alloc != 0 and
1260          therefore set->elems != NULL, add a debug assertion to pacify
1261          GCC 11.2 -fanalyzer.  */
1262       DEBUG_ASSERT (set->elems);
1263       set->elems[0] = elem;
1264       ++set->nelem;
1265       return true;
1266     }
1267
1268   /* Realloc if we need.  */
1269   if (set->alloc == set->nelem)
1270     {
1271       Idx *new_elems;
1272       set->alloc = set->alloc * 2;
1273       new_elems = re_realloc (set->elems, Idx, set->alloc);
1274       if (__glibc_unlikely (new_elems == NULL))
1275         return false;
1276       set->elems = new_elems;
1277     }
1278
1279   /* Move the elements which follows the new element.  Test the
1280      first element separately to skip a check in the inner loop.  */
1281   if (elem < set->elems[0])
1282     {
1283       for (idx = set->nelem; idx > 0; idx--)
1284         set->elems[idx] = set->elems[idx - 1];
1285     }
1286   else
1287     {
1288       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1289         set->elems[idx] = set->elems[idx - 1];
1290       DEBUG_ASSERT (set->elems[idx - 1] < elem);
1291     }
1292
1293   /* Insert the new element.  */
1294   set->elems[idx] = elem;
1295   ++set->nelem;
1296   return true;
1297 }
1298
1299 /* Insert the new element ELEM to the re_node_set* SET.
1300    SET should not already have any element greater than or equal to ELEM.
1301    Return true if successful.  */
1302
1303 static bool
1304 __attribute_warn_unused_result__
1305 re_node_set_insert_last (re_node_set *set, Idx elem)
1306 {
1307   /* Realloc if we need.  */
1308   if (set->alloc == set->nelem)
1309     {
1310       Idx *new_elems;
1311       set->alloc = (set->alloc + 1) * 2;
1312       new_elems = re_realloc (set->elems, Idx, set->alloc);
1313       if (__glibc_unlikely (new_elems == NULL))
1314         return false;
1315       set->elems = new_elems;
1316     }
1317
1318   /* Insert the new element.  */
1319   set->elems[set->nelem++] = elem;
1320   return true;
1321 }
1322
1323 /* Compare two node sets SET1 and SET2.
1324    Return true if SET1 and SET2 are equivalent.  */
1325
1326 static bool
1327 __attribute__ ((pure))
1328 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1329 {
1330   Idx i;
1331   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1332     return false;
1333   for (i = set1->nelem ; --i >= 0 ; )
1334     if (set1->elems[i] != set2->elems[i])
1335       return false;
1336   return true;
1337 }
1338
1339 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1340
1341 static Idx
1342 __attribute__ ((pure))
1343 re_node_set_contains (const re_node_set *set, Idx elem)
1344 {
1345   __re_size_t idx, right, mid;
1346   if (set->nelem <= 0)
1347     return 0;
1348
1349   /* Binary search the element.  */
1350   idx = 0;
1351   right = set->nelem - 1;
1352   while (idx < right)
1353     {
1354       mid = (idx + right) / 2;
1355       if (set->elems[mid] < elem)
1356         idx = mid + 1;
1357       else
1358         right = mid;
1359     }
1360   return set->elems[idx] == elem ? idx + 1 : 0;
1361 }
1362
1363 static void
1364 re_node_set_remove_at (re_node_set *set, Idx idx)
1365 {
1366   if (idx < 0 || idx >= set->nelem)
1367     return;
1368   --set->nelem;
1369   for (; idx < set->nelem; idx++)
1370     set->elems[idx] = set->elems[idx + 1];
1371 }
1372 \f
1373
1374 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1375    Or return -1 if an error occurred.  */
1376
1377 static Idx
1378 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1379 {
1380   if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
1381     {
1382       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1383       Idx *new_nexts, *new_indices;
1384       re_node_set *new_edests, *new_eclosures;
1385       re_token_t *new_nodes;
1386
1387       /* Avoid overflows in realloc.  */
1388       const size_t max_object_size = MAX (sizeof (re_token_t),
1389                                           MAX (sizeof (re_node_set),
1390                                                sizeof (Idx)));
1391       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1392                             < new_nodes_alloc))
1393         return -1;
1394
1395       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1396       if (__glibc_unlikely (new_nodes == NULL))
1397         return -1;
1398       dfa->nodes = new_nodes;
1399       dfa->nodes_alloc = new_nodes_alloc;
1400       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1401       if (new_nexts != NULL)
1402         dfa->nexts = new_nexts;
1403       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1404       if (new_indices != NULL)
1405         dfa->org_indices = new_indices;
1406       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1407       if (new_edests != NULL)
1408         dfa->edests = new_edests;
1409       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1410       if (new_eclosures != NULL)
1411         dfa->eclosures = new_eclosures;
1412       if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
1413                             || new_edests == NULL || new_eclosures == NULL))
1414         return -1;
1415     }
1416   dfa->nodes[dfa->nodes_len] = token;
1417   dfa->nodes[dfa->nodes_len].constraint = 0;
1418   dfa->nodes[dfa->nodes_len].accept_mb =
1419     ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1420      || token.type == COMPLEX_BRACKET);
1421   dfa->nexts[dfa->nodes_len] = -1;
1422   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1423   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1424   return dfa->nodes_len++;
1425 }
1426
1427 static re_hashval_t
1428 calc_state_hash (const re_node_set *nodes, unsigned int context)
1429 {
1430   re_hashval_t hash = nodes->nelem + context;
1431   Idx i;
1432   for (i = 0 ; i < nodes->nelem ; i++)
1433     hash += nodes->elems[i];
1434   return hash;
1435 }
1436
1437 /* Search for the state whose node_set is equivalent to NODES.
1438    Return the pointer to the state, if we found it in the DFA.
1439    Otherwise create the new one and return it.  In case of an error
1440    return NULL and set the error code in ERR.
1441    Note: - We assume NULL as the invalid state, then it is possible that
1442            return value is NULL and ERR is REG_NOERROR.
1443          - We never return non-NULL value in case of any errors, it is for
1444            optimization.  */
1445
1446 static re_dfastate_t *
1447 __attribute_warn_unused_result__
1448 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1449                   const re_node_set *nodes)
1450 {
1451   re_hashval_t hash;
1452   re_dfastate_t *new_state;
1453   struct re_state_table_entry *spot;
1454   Idx i;
1455 #if defined GCC_LINT || defined lint
1456   /* Suppress bogus uninitialized-variable warnings.  */
1457   *err = REG_NOERROR;
1458 #endif
1459   if (__glibc_unlikely (nodes->nelem == 0))
1460     {
1461       *err = REG_NOERROR;
1462       return NULL;
1463     }
1464   hash = calc_state_hash (nodes, 0);
1465   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1466
1467   for (i = 0 ; i < spot->num ; i++)
1468     {
1469       re_dfastate_t *state = spot->array[i];
1470       if (hash != state->hash)
1471         continue;
1472       if (re_node_set_compare (&state->nodes, nodes))
1473         return state;
1474     }
1475
1476   /* There are no appropriate state in the dfa, create the new one.  */
1477   new_state = create_ci_newstate (dfa, nodes, hash);
1478   if (__glibc_unlikely (new_state == NULL))
1479     *err = REG_ESPACE;
1480
1481   return new_state;
1482 }
1483
1484 /* Search for the state whose node_set is equivalent to NODES and
1485    whose context is equivalent to CONTEXT.
1486    Return the pointer to the state, if we found it in the DFA.
1487    Otherwise create the new one and return it.  In case of an error
1488    return NULL and set the error code in ERR.
1489    Note: - We assume NULL as the invalid state, then it is possible that
1490            return value is NULL and ERR is REG_NOERROR.
1491          - We never return non-NULL value in case of any errors, it is for
1492            optimization.  */
1493
1494 static re_dfastate_t *
1495 __attribute_warn_unused_result__
1496 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1497                           const re_node_set *nodes, unsigned int context)
1498 {
1499   re_hashval_t hash;
1500   re_dfastate_t *new_state;
1501   struct re_state_table_entry *spot;
1502   Idx i;
1503 #if defined GCC_LINT || defined lint
1504   /* Suppress bogus uninitialized-variable warnings.  */
1505   *err = REG_NOERROR;
1506 #endif
1507   if (nodes->nelem == 0)
1508     {
1509       *err = REG_NOERROR;
1510       return NULL;
1511     }
1512   hash = calc_state_hash (nodes, context);
1513   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1514
1515   for (i = 0 ; i < spot->num ; i++)
1516     {
1517       re_dfastate_t *state = spot->array[i];
1518       if (state->hash == hash
1519           && state->context == context
1520           && re_node_set_compare (state->entrance_nodes, nodes))
1521         return state;
1522     }
1523   /* There are no appropriate state in 'dfa', create the new one.  */
1524   new_state = create_cd_newstate (dfa, nodes, context, hash);
1525   if (__glibc_unlikely (new_state == NULL))
1526     *err = REG_ESPACE;
1527
1528   return new_state;
1529 }
1530
1531 /* Finish initialization of the new state NEWSTATE, and using its hash value
1532    HASH put in the appropriate bucket of DFA's state table.  Return value
1533    indicates the error code if failed.  */
1534
1535 static reg_errcode_t
1536 __attribute_warn_unused_result__
1537 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1538                 re_hashval_t hash)
1539 {
1540   struct re_state_table_entry *spot;
1541   reg_errcode_t err;
1542   Idx i;
1543
1544   newstate->hash = hash;
1545   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1546   if (__glibc_unlikely (err != REG_NOERROR))
1547     return REG_ESPACE;
1548   for (i = 0; i < newstate->nodes.nelem; i++)
1549     {
1550       Idx elem = newstate->nodes.elems[i];
1551       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1552         if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1553           return REG_ESPACE;
1554     }
1555
1556   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1557   if (__glibc_unlikely (spot->alloc <= spot->num))
1558     {
1559       Idx new_alloc = 2 * spot->num + 2;
1560       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1561                                               new_alloc);
1562       if (__glibc_unlikely (new_array == NULL))
1563         return REG_ESPACE;
1564       spot->array = new_array;
1565       spot->alloc = new_alloc;
1566     }
1567   spot->array[spot->num++] = newstate;
1568   return REG_NOERROR;
1569 }
1570
1571 static void
1572 free_state (re_dfastate_t *state)
1573 {
1574   re_node_set_free (&state->non_eps_nodes);
1575   re_node_set_free (&state->inveclosure);
1576   if (state->entrance_nodes != &state->nodes)
1577     {
1578       re_node_set_free (state->entrance_nodes);
1579       re_free (state->entrance_nodes);
1580     }
1581   re_node_set_free (&state->nodes);
1582   re_free (state->word_trtable);
1583   re_free (state->trtable);
1584   re_free (state);
1585 }
1586
1587 /* Create the new state which is independent of contexts.
1588    Return the new state if succeeded, otherwise return NULL.  */
1589
1590 static re_dfastate_t *
1591 __attribute_warn_unused_result__
1592 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1593                     re_hashval_t hash)
1594 {
1595   Idx i;
1596   reg_errcode_t err;
1597   re_dfastate_t *newstate;
1598
1599   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1600   if (__glibc_unlikely (newstate == NULL))
1601     return NULL;
1602   err = re_node_set_init_copy (&newstate->nodes, nodes);
1603   if (__glibc_unlikely (err != REG_NOERROR))
1604     {
1605       re_free (newstate);
1606       return NULL;
1607     }
1608
1609   newstate->entrance_nodes = &newstate->nodes;
1610   for (i = 0 ; i < nodes->nelem ; i++)
1611     {
1612       re_token_t *node = dfa->nodes + nodes->elems[i];
1613       re_token_type_t type = node->type;
1614       if (type == CHARACTER && !node->constraint)
1615         continue;
1616       newstate->accept_mb |= node->accept_mb;
1617
1618       /* If the state has the halt node, the state is a halt state.  */
1619       if (type == END_OF_RE)
1620         newstate->halt = 1;
1621       else if (type == OP_BACK_REF)
1622         newstate->has_backref = 1;
1623       else if (type == ANCHOR || node->constraint)
1624         newstate->has_constraint = 1;
1625     }
1626   err = register_state (dfa, newstate, hash);
1627   if (__glibc_unlikely (err != REG_NOERROR))
1628     {
1629       free_state (newstate);
1630       newstate = NULL;
1631     }
1632   return newstate;
1633 }
1634
1635 /* Create the new state which is depend on the context CONTEXT.
1636    Return the new state if succeeded, otherwise return NULL.  */
1637
1638 static re_dfastate_t *
1639 __attribute_warn_unused_result__
1640 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1641                     unsigned int context, re_hashval_t hash)
1642 {
1643   Idx i, nctx_nodes = 0;
1644   reg_errcode_t err;
1645   re_dfastate_t *newstate;
1646
1647   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1648   if (__glibc_unlikely (newstate == NULL))
1649     return NULL;
1650   err = re_node_set_init_copy (&newstate->nodes, nodes);
1651   if (__glibc_unlikely (err != REG_NOERROR))
1652     {
1653       re_free (newstate);
1654       return NULL;
1655     }
1656
1657   newstate->context = context;
1658   newstate->entrance_nodes = &newstate->nodes;
1659
1660   for (i = 0 ; i < nodes->nelem ; i++)
1661     {
1662       re_token_t *node = dfa->nodes + nodes->elems[i];
1663       re_token_type_t type = node->type;
1664       unsigned int constraint = node->constraint;
1665
1666       if (type == CHARACTER && !constraint)
1667         continue;
1668       newstate->accept_mb |= node->accept_mb;
1669
1670       /* If the state has the halt node, the state is a halt state.  */
1671       if (type == END_OF_RE)
1672         newstate->halt = 1;
1673       else if (type == OP_BACK_REF)
1674         newstate->has_backref = 1;
1675
1676       if (constraint)
1677         {
1678           if (newstate->entrance_nodes == &newstate->nodes)
1679             {
1680               re_node_set *entrance_nodes = re_malloc (re_node_set, 1);
1681               if (__glibc_unlikely (entrance_nodes == NULL))
1682                 {
1683                   free_state (newstate);
1684                   return NULL;
1685                 }
1686               newstate->entrance_nodes = entrance_nodes;
1687               if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1688                   != REG_NOERROR)
1689                 {
1690                   free_state (newstate);
1691                   return NULL;
1692                 }
1693               nctx_nodes = 0;
1694               newstate->has_constraint = 1;
1695             }
1696
1697           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1698             {
1699               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1700               ++nctx_nodes;
1701             }
1702         }
1703     }
1704   err = register_state (dfa, newstate, hash);
1705   if (__glibc_unlikely (err != REG_NOERROR))
1706     {
1707       free_state (newstate);
1708       newstate = NULL;
1709     }
1710   return  newstate;
1711 }