posix/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static void re_string_construct_common (const char *str, int len,
  22                                         re_string_t *pstr,
  23                                         RE_TRANSLATE_TYPE trans, int icase,
  24                                         const re_dfa_t *dfa) internal_function;
  25 #ifdef RE_ENABLE_I18N
  26 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
  27                                  wint_t *last_wc) internal_function;
  28 #endif /* RE_ENABLE_I18N */
  29 static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
  30                                      unsigned int hash) internal_function;
  31 static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
  32                                           const re_node_set *nodes,
  33                                           unsigned int hash) internal_function;
  34 static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
  35                                           const re_node_set *nodes,
  36                                           unsigned int context,
  37                                           unsigned int hash) internal_function;
  38 static unsigned int inline calc_state_hash (const re_node_set *nodes,
  39                                             unsigned int context) internal_function;
  40 \f
  41 /* Functions for string operation.  */
  42
  43 /* This function allocate the buffers.  It is necessary to call
  44    re_string_reconstruct before using the object.  */
  45
  46 static reg_errcode_t
  47 re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
  48      re_string_t *pstr;
  49      const char *str;
  50      int len, init_len, icase;
  51      RE_TRANSLATE_TYPE trans;
  52      const re_dfa_t *dfa;
  53 {
  54   reg_errcode_t ret;
  55   int init_buf_len;
  56
  57   /* Ensure at least one character fits into the buffers.  */
  58   if (init_len < dfa->mb_cur_max)
  59     init_len = dfa->mb_cur_max;
  60   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  61   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  62
  63   ret = re_string_realloc_buffers (pstr, init_buf_len);
  64   if (BE (ret != REG_NOERROR, 0))
  65     return ret;
  66
  67   pstr->word_char = dfa->word_char;
  68   pstr->word_ops_used = dfa->word_ops_used;
  69   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  70   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  71   pstr->valid_raw_len = pstr->valid_len;
  72   return REG_NOERROR;
  73 }
  74
  75 /* This function allocate the buffers, and initialize them.  */
  76
  77 static reg_errcode_t
  78 re_string_construct (pstr, str, len, trans, icase, dfa)
  79      re_string_t *pstr;
  80      const char *str;
  81      int len, icase;
  82      RE_TRANSLATE_TYPE trans;
  83      const re_dfa_t *dfa;
  84 {
  85   reg_errcode_t ret;
  86   memset (pstr, '\0', sizeof (re_string_t));
  87   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  88
  89   if (len > 0)
  90     {
  91       ret = re_string_realloc_buffers (pstr, len + 1);
  92       if (BE (ret != REG_NOERROR, 0))
  93         return ret;
  94     }
  95   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  96
  97   if (icase)
  98     {
  99 #ifdef RE_ENABLE_I18N
 100       if (dfa->mb_cur_max > 1)
 101         {
 102           while (1)
 103             {
 104               ret = build_wcs_upper_buffer (pstr);
 105               if (BE (ret != REG_NOERROR, 0))
 106                 return ret;
 107               if (pstr->valid_raw_len >= len)
 108                 break;
 109               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
 110                 break;
 111               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
 112               if (BE (ret != REG_NOERROR, 0))
 113                 return ret;
 114             }
 115         }
 116       else
 117 #endif /* RE_ENABLE_I18N  */
 118         build_upper_buffer (pstr);
 119     }
 120   else
 121     {
 122 #ifdef RE_ENABLE_I18N
 123       if (dfa->mb_cur_max > 1)
 124         build_wcs_buffer (pstr);
 125       else
 126 #endif /* RE_ENABLE_I18N  */
 127         {
 128           if (trans != NULL)
 129             re_string_translate_buffer (pstr);
 130           else
 131             {
 132               pstr->valid_len = pstr->bufs_len;
 133               pstr->valid_raw_len = pstr->bufs_len;
 134             }
 135         }
 136     }
 137
 138   return REG_NOERROR;
 139 }
 140
 141 /* Helper functions for re_string_allocate, and re_string_construct.  */
 142
 143 static reg_errcode_t
 144 re_string_realloc_buffers (pstr, new_buf_len)
 145      re_string_t *pstr;
 146      int new_buf_len;
 147 {
 148 #ifdef RE_ENABLE_I18N
 149   if (pstr->mb_cur_max > 1)
 150     {
 151       wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
 152       if (BE (new_array == NULL, 0))
 153         return REG_ESPACE;
 154       pstr->wcs = new_array;
 155       if (pstr->offsets != NULL)
 156         {
 157           int *new_array = re_realloc (pstr->offsets, int, new_buf_len);
 158           if (BE (new_array == NULL, 0))
 159             return REG_ESPACE;
 160           pstr->offsets = new_array;
 161         }
 162     }
 163 #endif /* RE_ENABLE_I18N  */
 164   if (pstr->mbs_allocated)
 165     {
 166       unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
 167                                              new_buf_len);
 168       if (BE (new_array == NULL, 0))
 169         return REG_ESPACE;
 170       pstr->mbs = new_array;
 171     }
 172   pstr->bufs_len = new_buf_len;
 173   return REG_NOERROR;
 174 }
 175
 176
 177 static void
 178 re_string_construct_common (str, len, pstr, trans, icase, dfa)
 179      const char *str;
 180      int len;
 181      re_string_t *pstr;
 182      RE_TRANSLATE_TYPE trans;
 183      int icase;
 184      const re_dfa_t *dfa;
 185 {
 186   pstr->raw_mbs = (const unsigned char *) str;
 187   pstr->len = len;
 188   pstr->raw_len = len;
 189   pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
 190   pstr->icase = icase ? 1 : 0;
 191   pstr->mbs_allocated = (trans != NULL || icase);
 192   pstr->mb_cur_max = dfa->mb_cur_max;
 193   pstr->is_utf8 = dfa->is_utf8;
 194   pstr->map_notascii = dfa->map_notascii;
 195   pstr->stop = pstr->len;
 196   pstr->raw_stop = pstr->stop;
 197 }
 198
 199 #ifdef RE_ENABLE_I18N
 200
 201 /* Build wide character buffer PSTR->WCS.
 202    If the byte sequence of the string are:
 203      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 204    Then wide character buffer will be:
 205      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 206    We use WEOF for padding, they indicate that the position isn't
 207    a first byte of a multibyte character.
 208
 209    Note that this function assumes PSTR->VALID_LEN elements are already
 210    built and starts from PSTR->VALID_LEN.  */
 211
 212 static void
 213 build_wcs_buffer (pstr)
 214      re_string_t *pstr;
 215 {
 216 #ifdef _LIBC
 217   unsigned char buf[MB_LEN_MAX];
 218   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 219 #else
 220   unsigned char buf[64];
 221 #endif
 222   mbstate_t prev_st;
 223   int byte_idx, end_idx, mbclen, remain_len;
 224
 225   /* Build the buffers from pstr->valid_len to either pstr->len or
 226      pstr->bufs_len.  */
 227   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 228   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 229     {
 230       wchar_t wc;
 231       const char *p;
 232
 233       remain_len = end_idx - byte_idx;
 234       prev_st = pstr->cur_state;
 235       /* Apply the translation if we need.  */
 236       if (BE (pstr->trans != NULL, 0))
 237         {
 238           int i, ch;
 239
 240           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 241             {
 242               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 243               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 244             }
 245           p = (const char *) buf;
 246         }
 247       else
 248         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 249       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 250       if (BE (mbclen == (size_t) -2, 0))
 251         {
 252           /* The buffer doesn't have enough space, finish to build.  */
 253           pstr->cur_state = prev_st;
 254           break;
 255         }
 256       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 257         {
 258           /* We treat these cases as a singlebyte character.  */
 259           mbclen = 1;
 260           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 261           if (BE (pstr->trans != NULL, 0))
 262             wc = pstr->trans[wc];
 263           pstr->cur_state = prev_st;
 264         }
 265
 266       /* Write wide character and padding.  */
 267       pstr->wcs[byte_idx++] = wc;
 268       /* Write paddings.  */
 269       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 270         pstr->wcs[byte_idx++] = WEOF;
 271     }
 272   pstr->valid_len = byte_idx;
 273   pstr->valid_raw_len = byte_idx;
 274 }
 275
 276 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 277    but for REG_ICASE.  */
 278
 279 static int
 280 build_wcs_upper_buffer (pstr)
 281      re_string_t *pstr;
 282 {
 283   mbstate_t prev_st;
 284   int src_idx, byte_idx, end_idx, mbclen, remain_len;
 285 #ifdef _LIBC
 286   unsigned char buf[MB_LEN_MAX];
 287   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 288 #else
 289   unsigned char buf[64];
 290 #endif
 291
 292   byte_idx = pstr->valid_len;
 293   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 294
 295   /* The following optimization assumes that ASCII characters can be
 296      mapped to wide characters with a simple cast.  */
 297   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 298     {
 299       while (byte_idx < end_idx)
 300         {
 301           wchar_t wc;
 302
 303           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 304               && mbsinit (&pstr->cur_state))
 305             {
 306               /* In case of a singlebyte character.  */
 307               pstr->mbs[byte_idx]
 308                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 309               /* The next step uses the assumption that wchar_t is encoded
 310                  ASCII-safe: all ASCII values can be converted like this.  */
 311               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 312               ++byte_idx;
 313               continue;
 314             }
 315
 316           remain_len = end_idx - byte_idx;
 317           prev_st = pstr->cur_state;
 318           mbclen = mbrtowc (&wc,
 319                             ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 320                              + byte_idx), remain_len, &pstr->cur_state);
 321           if (BE (mbclen > 0, 1))
 322             {
 323               wchar_t wcu = wc;
 324               if (iswlower (wc))
 325                 {
 326                   int mbcdlen;
 327
 328                   wcu = towupper (wc);
 329                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 330                   if (BE (mbclen == mbcdlen, 1))
 331                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 332                   else
 333                     {
 334                       src_idx = byte_idx;
 335                       goto offsets_needed;
 336                     }
 337                 }
 338               else
 339                 memcpy (pstr->mbs + byte_idx,
 340                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 341               pstr->wcs[byte_idx++] = wcu;
 342               /* Write paddings.  */
 343               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 344                 pstr->wcs[byte_idx++] = WEOF;
 345             }
 346           else if (mbclen == (size_t) -1 || mbclen == 0)
 347             {
 348               /* It is an invalid character or '\0'.  Just use the byte.  */
 349               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 350               pstr->mbs[byte_idx] = ch;
 351               /* And also cast it to wide char.  */
 352               pstr->wcs[byte_idx++] = (wchar_t) ch;
 353               if (BE (mbclen == (size_t) -1, 0))
 354                 pstr->cur_state = prev_st;
 355             }
 356           else
 357             {
 358               /* The buffer doesn't have enough space, finish to build.  */
 359               pstr->cur_state = prev_st;
 360               break;
 361             }
 362         }
 363       pstr->valid_len = byte_idx;
 364       pstr->valid_raw_len = byte_idx;
 365       return REG_NOERROR;
 366     }
 367   else
 368     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 369       {
 370         wchar_t wc;
 371         const char *p;
 372       offsets_needed:
 373         remain_len = end_idx - byte_idx;
 374         prev_st = pstr->cur_state;
 375         if (BE (pstr->trans != NULL, 0))
 376           {
 377             int i, ch;
 378
 379             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 380               {
 381                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 382                 buf[i] = pstr->trans[ch];
 383               }
 384             p = (const char *) buf;
 385           }
 386         else
 387           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 388         mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 389         if (BE (mbclen > 0, 1))
 390           {
 391             wchar_t wcu = wc;
 392             if (iswlower (wc))
 393               {
 394                 int mbcdlen;
 395
 396                 wcu = towupper (wc);
 397                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 398                 if (BE (mbclen == mbcdlen, 1))
 399                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 400                 else
 401                   {
 402                     int i;
 403
 404                     if (byte_idx + mbcdlen > pstr->bufs_len)
 405                       {
 406                         pstr->cur_state = prev_st;
 407                         break;
 408                       }
 409
 410                     if (pstr->offsets == NULL)
 411                       {
 412                         pstr->offsets = re_malloc (int, pstr->bufs_len);
 413
 414                         if (pstr->offsets == NULL)
 415                           return REG_ESPACE;
 416                       }
 417                     if (!pstr->offsets_needed)
 418                       {
 419                         for (i = 0; i < byte_idx; ++i)
 420                           pstr->offsets[i] = i;
 421                         pstr->offsets_needed = 1;
 422                       }
 423
 424                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 425                     pstr->wcs[byte_idx] = wcu;
 426                     pstr->offsets[byte_idx] = src_idx;
 427                     for (i = 1; i < mbcdlen; ++i)
 428                       {
 429                         pstr->offsets[byte_idx + i]
 430                           = src_idx + (i < mbclen ? i : mbclen - 1);
 431                         pstr->wcs[byte_idx + i] = WEOF;
 432                       }
 433                     pstr->len += mbcdlen - mbclen;
 434                     if (pstr->raw_stop > src_idx)
 435                       pstr->stop += mbcdlen - mbclen;
 436                     end_idx = (pstr->bufs_len > pstr->len)
 437                               ? pstr->len : pstr->bufs_len;
 438                     byte_idx += mbcdlen;
 439                     src_idx += mbclen;
 440                     continue;
 441                   }
 442               }
 443             else
 444               memcpy (pstr->mbs + byte_idx, p, mbclen);
 445
 446             if (BE (pstr->offsets_needed != 0, 0))
 447               {
 448                 int i;
 449                 for (i = 0; i < mbclen; ++i)
 450                   pstr->offsets[byte_idx + i] = src_idx + i;
 451               }
 452             src_idx += mbclen;
 453
 454             pstr->wcs[byte_idx++] = wcu;
 455             /* Write paddings.  */
 456             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 457               pstr->wcs[byte_idx++] = WEOF;
 458           }
 459         else if (mbclen == (size_t) -1 || mbclen == 0)
 460           {
 461             /* It is an invalid character or '\0'.  Just use the byte.  */
 462             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 463
 464             if (BE (pstr->trans != NULL, 0))
 465               ch = pstr->trans [ch];
 466             pstr->mbs[byte_idx] = ch;
 467
 468             if (BE (pstr->offsets_needed != 0, 0))
 469               pstr->offsets[byte_idx] = src_idx;
 470             ++src_idx;
 471
 472             /* And also cast it to wide char.  */
 473             pstr->wcs[byte_idx++] = (wchar_t) ch;
 474             if (BE (mbclen == (size_t) -1, 0))
 475               pstr->cur_state = prev_st;
 476           }
 477         else
 478           {
 479             /* The buffer doesn't have enough space, finish to build.  */
 480             pstr->cur_state = prev_st;
 481             break;
 482           }
 483       }
 484   pstr->valid_len = byte_idx;
 485   pstr->valid_raw_len = src_idx;
 486   return REG_NOERROR;
 487 }
 488
 489 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 490    Return the index.  */
 491
 492 static int
 493 re_string_skip_chars (pstr, new_raw_idx, last_wc)
 494      re_string_t *pstr;
 495      int new_raw_idx;
 496      wint_t *last_wc;
 497 {
 498   mbstate_t prev_st;
 499   int rawbuf_idx, mbclen;
 500   wchar_t wc = 0;
 501
 502   /* Skip the characters which are not necessary to check.  */
 503   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 504        rawbuf_idx < new_raw_idx;)
 505     {
 506       int remain_len;
 507       remain_len = pstr->len - rawbuf_idx;
 508       prev_st = pstr->cur_state;
 509       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
 510                         remain_len, &pstr->cur_state);
 511       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 512         {
 513           /* We treat these cases as a singlebyte character.  */
 514           mbclen = 1;
 515           pstr->cur_state = prev_st;
 516         }
 517       /* Then proceed the next character.  */
 518       rawbuf_idx += mbclen;
 519     }
 520   *last_wc = (wint_t) wc;
 521   return rawbuf_idx;
 522 }
 523 #endif /* RE_ENABLE_I18N  */
 524
 525 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 526    This function is used in case of REG_ICASE.  */
 527
 528 static void
 529 build_upper_buffer (pstr)
 530      re_string_t *pstr;
 531 {
 532   int char_idx, end_idx;
 533   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 534
 535   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 536     {
 537       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 538       if (BE (pstr->trans != NULL, 0))
 539         ch = pstr->trans[ch];
 540       if (islower (ch))
 541         pstr->mbs[char_idx] = toupper (ch);
 542       else
 543         pstr->mbs[char_idx] = ch;
 544     }
 545   pstr->valid_len = char_idx;
 546   pstr->valid_raw_len = char_idx;
 547 }
 548
 549 /* Apply TRANS to the buffer in PSTR.  */
 550
 551 static void
 552 re_string_translate_buffer (pstr)
 553      re_string_t *pstr;
 554 {
 555   int buf_idx, end_idx;
 556   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 557
 558   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 559     {
 560       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 561       pstr->mbs[buf_idx] = pstr->trans[ch];
 562     }
 563
 564   pstr->valid_len = buf_idx;
 565   pstr->valid_raw_len = buf_idx;
 566 }
 567
 568 /* This function re-construct the buffers.
 569    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 570    convert to upper case in case of REG_ICASE, apply translation.  */
 571
 572 static reg_errcode_t
 573 re_string_reconstruct (pstr, idx, eflags)
 574      re_string_t *pstr;
 575      int idx, eflags;
 576 {
 577   int offset = idx - pstr->raw_mbs_idx;
 578   if (BE (offset < 0, 0))
 579     {
 580       /* Reset buffer.  */
 581 #ifdef RE_ENABLE_I18N
 582       if (pstr->mb_cur_max > 1)
 583         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 584 #endif /* RE_ENABLE_I18N */
 585       pstr->len = pstr->raw_len;
 586       pstr->stop = pstr->raw_stop;
 587       pstr->valid_len = 0;
 588       pstr->raw_mbs_idx = 0;
 589       pstr->valid_raw_len = 0;
 590       pstr->offsets_needed = 0;
 591       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 592                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 593       if (!pstr->mbs_allocated)
 594         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 595       offset = idx;
 596     }
 597
 598   if (BE (offset != 0, 1))
 599     {
 600       /* Are the characters which are already checked remain?  */
 601       if (BE (offset < pstr->valid_raw_len, 1)
 602 #ifdef RE_ENABLE_I18N
 603           /* Handling this would enlarge the code too much.
 604              Accept a slowdown in that case.  */
 605           && pstr->offsets_needed == 0
 606 #endif
 607          )
 608         {
 609           /* Yes, move them to the front of the buffer.  */
 610           pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
 611 #ifdef RE_ENABLE_I18N
 612           if (pstr->mb_cur_max > 1)
 613             memmove (pstr->wcs, pstr->wcs + offset,
 614                      (pstr->valid_len - offset) * sizeof (wint_t));
 615 #endif /* RE_ENABLE_I18N */
 616           if (BE (pstr->mbs_allocated, 0))
 617             memmove (pstr->mbs, pstr->mbs + offset,
 618                      pstr->valid_len - offset);
 619           pstr->valid_len -= offset;
 620           pstr->valid_raw_len -= offset;
 621 #if DEBUG
 622           assert (pstr->valid_len > 0);
 623 #endif
 624         }
 625       else
 626         {
 627           /* No, skip all characters until IDX.  */
 628 #ifdef RE_ENABLE_I18N
 629           if (BE (pstr->offsets_needed, 0))
 630             {
 631               pstr->len = pstr->raw_len - idx + offset;
 632               pstr->stop = pstr->raw_stop - idx + offset;
 633               pstr->offsets_needed = 0;
 634             }
 635 #endif
 636           pstr->valid_len = 0;
 637           pstr->valid_raw_len = 0;
 638 #ifdef RE_ENABLE_I18N
 639           if (pstr->mb_cur_max > 1)
 640             {
 641               int wcs_idx;
 642               wint_t wc = WEOF;
 643
 644               if (pstr->is_utf8)
 645                 {
 646                   const unsigned char *raw, *p, *q, *end;
 647
 648                   /* Special case UTF-8.  Multi-byte chars start with any
 649                      byte other than 0x80 - 0xbf.  */
 650                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 651                   end = raw + (offset - pstr->mb_cur_max);
 652                   for (p = raw + offset - 1; p >= end; --p)
 653                     if ((*p & 0xc0) != 0x80)
 654                       {
 655                         mbstate_t cur_state;
 656                         wchar_t wc2;
 657                         int mlen = raw + pstr->len - p;
 658                         unsigned char buf[6];
 659
 660                         q = p;
 661                         if (BE (pstr->trans != NULL, 0))
 662                           {
 663                             int i = mlen < 6 ? mlen : 6;
 664                             while (--i >= 0)
 665                               buf[i] = pstr->trans[p[i]];
 666                             q = buf;
 667                           }
 668                         /* XXX Don't use mbrtowc, we know which conversion
 669                            to use (UTF-8 -> UCS4).  */
 670                         memset (&cur_state, 0, sizeof (cur_state));
 671                         mlen = mbrtowc (&wc2, p, mlen, &cur_state)
 672                                - (raw + offset - p);
 673                         if (mlen >= 0)
 674                           {
 675                             memset (&pstr->cur_state, '\0',
 676                                     sizeof (mbstate_t));
 677                             pstr->valid_len = mlen;
 678                             wc = wc2;
 679                           }
 680                         break;
 681                       }
 682                 }
 683
 684               if (wc == WEOF)
 685                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 686               if (BE (pstr->valid_len, 0))
 687                 {
 688                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 689                     pstr->wcs[wcs_idx] = WEOF;
 690                   if (pstr->mbs_allocated)
 691                     memset (pstr->mbs, 255, pstr->valid_len);
 692                 }
 693               pstr->valid_raw_len = pstr->valid_len;
 694               pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 695                                     && IS_WIDE_WORD_CHAR (wc))
 696                                    ? CONTEXT_WORD
 697                                    : ((IS_WIDE_NEWLINE (wc)
 698                                        && pstr->newline_anchor)
 699                                       ? CONTEXT_NEWLINE : 0));
 700             }
 701           else
 702 #endif /* RE_ENABLE_I18N */
 703             {
 704               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 705               if (pstr->trans)
 706                 c = pstr->trans[c];
 707               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 708                                    ? CONTEXT_WORD
 709                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 710                                       ? CONTEXT_NEWLINE : 0));
 711             }
 712         }
 713       if (!BE (pstr->mbs_allocated, 0))
 714         pstr->mbs += offset;
 715     }
 716   pstr->raw_mbs_idx = idx;
 717   pstr->len -= offset;
 718   pstr->stop -= offset;
 719
 720   /* Then build the buffers.  */
 721 #ifdef RE_ENABLE_I18N
 722   if (pstr->mb_cur_max > 1)
 723     {
 724       if (pstr->icase)
 725         {
 726           int ret = build_wcs_upper_buffer (pstr);
 727           if (BE (ret != REG_NOERROR, 0))
 728             return ret;
 729         }
 730       else
 731         build_wcs_buffer (pstr);
 732     }
 733   else
 734 #endif /* RE_ENABLE_I18N */
 735   if (BE (pstr->mbs_allocated, 0))
 736     {
 737       if (pstr->icase)
 738         build_upper_buffer (pstr);
 739       else if (pstr->trans != NULL)
 740         re_string_translate_buffer (pstr);
 741     }
 742   else
 743     pstr->valid_len = pstr->len;
 744
 745   pstr->cur_idx = 0;
 746   return REG_NOERROR;
 747 }
 748
 749 static unsigned char
 750 re_string_peek_byte_case (pstr, idx)
 751      const re_string_t *pstr;
 752      int idx;
 753 {
 754   int ch, off;
 755
 756   /* Handle the common (easiest) cases first.  */
 757   if (BE (!pstr->mbs_allocated, 1))
 758     return re_string_peek_byte (pstr, idx);
 759
 760 #ifdef RE_ENABLE_I18N
 761   if (pstr->mb_cur_max > 1
 762       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 763     return re_string_peek_byte (pstr, idx);
 764 #endif
 765
 766   off = pstr->cur_idx + idx;
 767 #ifdef RE_ENABLE_I18N
 768   if (pstr->offsets_needed)
 769     off = pstr->offsets[off];
 770 #endif
 771
 772   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 773
 774 #ifdef RE_ENABLE_I18N
 775   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 776      this function returns CAPITAL LETTER I instead of first byte of
 777      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 778      since peek_byte_case doesn't advance cur_idx in any way.  */
 779   if (pstr->offsets_needed && !isascii (ch))
 780     return re_string_peek_byte (pstr, idx);
 781 #endif
 782
 783   return ch;
 784 }
 785
 786 static unsigned char
 787 re_string_fetch_byte_case (pstr)
 788      re_string_t *pstr;
 789 {
 790   if (BE (!pstr->mbs_allocated, 1))
 791     return re_string_fetch_byte (pstr);
 792
 793 #ifdef RE_ENABLE_I18N
 794   if (pstr->offsets_needed)
 795     {
 796       int off, ch;
 797
 798       /* For tr_TR.UTF-8 [[:islower:]] there is
 799          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 800          in that case the whole multi-byte character and return
 801          the original letter.  On the other side, with
 802          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 803          anything else would complicate things too much.  */
 804
 805       if (!re_string_first_byte (pstr, pstr->cur_idx))
 806         return re_string_fetch_byte (pstr);
 807
 808       off = pstr->offsets[pstr->cur_idx];
 809       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 810
 811       if (! isascii (ch))
 812         return re_string_fetch_byte (pstr);
 813
 814       re_string_skip_bytes (pstr,
 815                             re_string_char_size_at (pstr, pstr->cur_idx));
 816       return ch;
 817     }
 818 #endif
 819
 820   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 821 }
 822
 823 static void
 824 re_string_destruct (pstr)
 825      re_string_t *pstr;
 826 {
 827 #ifdef RE_ENABLE_I18N
 828   re_free (pstr->wcs);
 829   re_free (pstr->offsets);
 830 #endif /* RE_ENABLE_I18N  */
 831   if (pstr->mbs_allocated)
 832     re_free (pstr->mbs);
 833 }
 834
 835 /* Return the context at IDX in INPUT.  */
 836
 837 static unsigned int
 838 re_string_context_at (input, idx, eflags)
 839      const re_string_t *input;
 840      int idx, eflags;
 841 {
 842   int c;
 843   if (BE (idx < 0, 0))
 844     /* In this case, we use the value stored in input->tip_context,
 845        since we can't know the character in input->mbs[-1] here.  */
 846     return input->tip_context;
 847   if (BE (idx == input->len, 0))
 848     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 849             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 850 #ifdef RE_ENABLE_I18N
 851   if (input->mb_cur_max > 1)
 852     {
 853       wint_t wc;
 854       int wc_idx = idx;
 855       while(input->wcs[wc_idx] == WEOF)
 856         {
 857 #ifdef DEBUG
 858           /* It must not happen.  */
 859           assert (wc_idx >= 0);
 860 #endif
 861           --wc_idx;
 862           if (wc_idx < 0)
 863             return input->tip_context;
 864         }
 865       wc = input->wcs[wc_idx];
 866       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 867         return CONTEXT_WORD;
 868       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 869               ? CONTEXT_NEWLINE : 0);
 870     }
 871   else
 872 #endif
 873     {
 874       c = re_string_byte_at (input, idx);
 875       if (bitset_contain (input->word_char, c))
 876         return CONTEXT_WORD;
 877       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 878     }
 879 }
 880 \f
 881 /* Functions for set operation.  */
 882
 883 static reg_errcode_t
 884 re_node_set_alloc (set, size)
 885      re_node_set *set;
 886      int size;
 887 {
 888   set->alloc = size;
 889   set->nelem = 0;
 890   set->elems = re_malloc (int, size);
 891   if (BE (set->elems == NULL, 0))
 892     return REG_ESPACE;
 893   return REG_NOERROR;
 894 }
 895
 896 static reg_errcode_t
 897 re_node_set_init_1 (set, elem)
 898      re_node_set *set;
 899      int elem;
 900 {
 901   set->alloc = 1;
 902   set->nelem = 1;
 903   set->elems = re_malloc (int, 1);
 904   if (BE (set->elems == NULL, 0))
 905     {
 906       set->alloc = set->nelem = 0;
 907       return REG_ESPACE;
 908     }
 909   set->elems[0] = elem;
 910   return REG_NOERROR;
 911 }
 912
 913 static reg_errcode_t
 914 re_node_set_init_2 (set, elem1, elem2)
 915      re_node_set *set;
 916      int elem1, elem2;
 917 {
 918   set->alloc = 2;
 919   set->elems = re_malloc (int, 2);
 920   if (BE (set->elems == NULL, 0))
 921     return REG_ESPACE;
 922   if (elem1 == elem2)
 923     {
 924       set->nelem = 1;
 925       set->elems[0] = elem1;
 926     }
 927   else
 928     {
 929       set->nelem = 2;
 930       if (elem1 < elem2)
 931         {
 932           set->elems[0] = elem1;
 933           set->elems[1] = elem2;
 934         }
 935       else
 936         {
 937           set->elems[0] = elem2;
 938           set->elems[1] = elem1;
 939         }
 940     }
 941   return REG_NOERROR;
 942 }
 943
 944 static reg_errcode_t
 945 re_node_set_init_copy (dest, src)
 946      re_node_set *dest;
 947      const re_node_set *src;
 948 {
 949   dest->nelem = src->nelem;
 950   if (src->nelem > 0)
 951     {
 952       dest->alloc = dest->nelem;
 953       dest->elems = re_malloc (int, dest->alloc);
 954       if (BE (dest->elems == NULL, 0))
 955         {
 956           dest->alloc = dest->nelem = 0;
 957           return REG_ESPACE;
 958         }
 959       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
 960     }
 961   else
 962     re_node_set_init_empty (dest);
 963   return REG_NOERROR;
 964 }
 965
 966 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
 967    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
 968    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
 969
 970 static reg_errcode_t
 971 re_node_set_add_intersect (dest, src1, src2)
 972      re_node_set *dest;
 973      const re_node_set *src1, *src2;
 974 {
 975   int i1, i2, is, id, delta, sbase;
 976   if (src1->nelem == 0 || src2->nelem == 0)
 977     return REG_NOERROR;
 978
 979   /* We need dest->nelem + 2 * elems_in_intersection; this is a
 980      conservative estimate.  */
 981   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
 982     {
 983       int new_alloc = src1->nelem + src2->nelem + dest->alloc;
 984       int *new_elems = re_realloc (dest->elems, int, new_alloc);
 985       if (BE (new_elems == NULL, 0))
 986         return REG_ESPACE;
 987       dest->elems = new_elems;
 988       dest->alloc = new_alloc;
 989     }
 990
 991   /* Find the items in the intersection of SRC1 and SRC2, and copy
 992      into the top of DEST those that are not already in DEST itself.  */
 993   sbase = dest->nelem + src1->nelem + src2->nelem;
 994   i1 = src1->nelem - 1;
 995   i2 = src2->nelem - 1;
 996   id = dest->nelem - 1;
 997   for (;;)
 998     {
 999       if (src1->elems[i1] == src2->elems[i2])
1000         {
1001           /* Try to find the item in DEST.  Maybe we could binary search?  */
1002           while (id >= 0 && dest->elems[id] > src1->elems[i1])
1003             --id;
1004
1005           if (id < 0 || dest->elems[id] != src1->elems[i1])
1006             dest->elems[--sbase] = src1->elems[i1];
1007
1008           if (--i1 < 0 || --i2 < 0)
1009             break;
1010         }
1011
1012       /* Lower the highest of the two items.  */
1013       else if (src1->elems[i1] < src2->elems[i2])
1014         {
1015           if (--i2 < 0)
1016             break;
1017         }
1018       else
1019         {
1020           if (--i1 < 0)
1021             break;
1022         }
1023     }
1024
1025   id = dest->nelem - 1;
1026   is = dest->nelem + src1->nelem + src2->nelem - 1;
1027   delta = is - sbase + 1;
1028
1029   /* Now copy.  When DELTA becomes zero, the remaining
1030      DEST elements are already in place; this is more or
1031      less the same loop that is in re_node_set_merge.  */
1032   dest->nelem += delta;
1033   if (delta > 0 && id >= 0)
1034     for (;;)
1035       {
1036         if (dest->elems[is] > dest->elems[id])
1037           {
1038             /* Copy from the top.  */
1039             dest->elems[id + delta--] = dest->elems[is--];
1040             if (delta == 0)
1041               break;
1042           }
1043         else
1044           {
1045             /* Slide from the bottom.  */
1046             dest->elems[id + delta] = dest->elems[id];
1047             if (--id < 0)
1048               break;
1049           }
1050       }
1051
1052   /* Copy remaining SRC elements.  */
1053   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1054
1055   return REG_NOERROR;
1056 }
1057
1058 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1059    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1060
1061 static reg_errcode_t
1062 re_node_set_init_union (dest, src1, src2)
1063      re_node_set *dest;
1064      const re_node_set *src1, *src2;
1065 {
1066   int i1, i2, id;
1067   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1068     {
1069       dest->alloc = src1->nelem + src2->nelem;
1070       dest->elems = re_malloc (int, dest->alloc);
1071       if (BE (dest->elems == NULL, 0))
1072         return REG_ESPACE;
1073     }
1074   else
1075     {
1076       if (src1 != NULL && src1->nelem > 0)
1077         return re_node_set_init_copy (dest, src1);
1078       else if (src2 != NULL && src2->nelem > 0)
1079         return re_node_set_init_copy (dest, src2);
1080       else
1081         re_node_set_init_empty (dest);
1082       return REG_NOERROR;
1083     }
1084   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1085     {
1086       if (src1->elems[i1] > src2->elems[i2])
1087         {
1088           dest->elems[id++] = src2->elems[i2++];
1089           continue;
1090         }
1091       if (src1->elems[i1] == src2->elems[i2])
1092         ++i2;
1093       dest->elems[id++] = src1->elems[i1++];
1094     }
1095   if (i1 < src1->nelem)
1096     {
1097       memcpy (dest->elems + id, src1->elems + i1,
1098              (src1->nelem - i1) * sizeof (int));
1099       id += src1->nelem - i1;
1100     }
1101   else if (i2 < src2->nelem)
1102     {
1103       memcpy (dest->elems + id, src2->elems + i2,
1104              (src2->nelem - i2) * sizeof (int));
1105       id += src2->nelem - i2;
1106     }
1107   dest->nelem = id;
1108   return REG_NOERROR;
1109 }
1110
1111 /* Calculate the union set of the sets DEST and SRC. And store it to
1112    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1113
1114 static reg_errcode_t
1115 re_node_set_merge (dest, src)
1116      re_node_set *dest;
1117      const re_node_set *src;
1118 {
1119   int is, id, sbase, delta;
1120   if (src == NULL || src->nelem == 0)
1121     return REG_NOERROR;
1122   if (dest->alloc < 2 * src->nelem + dest->nelem)
1123     {
1124       int new_alloc = 2 * (src->nelem + dest->alloc);
1125       int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1126       if (BE (new_buffer == NULL, 0))
1127         return REG_ESPACE;
1128       dest->elems = new_buffer;
1129       dest->alloc = new_alloc;
1130     }
1131
1132   if (BE (dest->nelem == 0, 0))
1133     {
1134       dest->nelem = src->nelem;
1135       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1136       return REG_NOERROR;
1137     }
1138
1139   /* Copy into the top of DEST the items of SRC that are not
1140      found in DEST.  Maybe we could binary search in DEST?  */
1141   for (sbase = dest->nelem + 2 * src->nelem,
1142        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1143     {
1144       if (dest->elems[id] == src->elems[is])
1145         is--, id--;
1146       else if (dest->elems[id] < src->elems[is])
1147         dest->elems[--sbase] = src->elems[is--];
1148       else /* if (dest->elems[id] > src->elems[is]) */
1149         --id;
1150     }
1151
1152   if (is >= 0)
1153     {
1154       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1155       sbase -= is + 1;
1156       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1157     }
1158
1159   id = dest->nelem - 1;
1160   is = dest->nelem + 2 * src->nelem - 1;
1161   delta = is - sbase + 1;
1162   if (delta == 0)
1163     return REG_NOERROR;
1164
1165   /* Now copy.  When DELTA becomes zero, the remaining
1166      DEST elements are already in place.  */
1167   dest->nelem += delta;
1168   for (;;)
1169     {
1170       if (dest->elems[is] > dest->elems[id])
1171         {
1172           /* Copy from the top.  */
1173           dest->elems[id + delta--] = dest->elems[is--];
1174           if (delta == 0)
1175             break;
1176         }
1177       else
1178         {
1179           /* Slide from the bottom.  */
1180           dest->elems[id + delta] = dest->elems[id];
1181           if (--id < 0)
1182             {
1183               /* Copy remaining SRC elements.  */
1184               memcpy (dest->elems, dest->elems + sbase,
1185                       delta * sizeof (int));
1186               break;
1187             }
1188         }
1189     }
1190
1191   return REG_NOERROR;
1192 }
1193
1194 /* Insert the new element ELEM to the re_node_set* SET.
1195    SET should not already have ELEM.
1196    return -1 if an error is occured, return 1 otherwise.  */
1197
1198 static int
1199 re_node_set_insert (set, elem)
1200      re_node_set *set;
1201      int elem;
1202 {
1203   int idx;
1204   /* In case the set is empty.  */
1205   if (set->alloc == 0)
1206     {
1207       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1208         return 1;
1209       else
1210         return -1;
1211     }
1212
1213   if (BE (set->nelem, 0) == 0)
1214     {
1215       /* We already guaranteed above that set->alloc != 0.  */
1216       set->elems[0] = elem;
1217       ++set->nelem;
1218       return 1;
1219     }
1220
1221   /* Realloc if we need.  */
1222   if (set->alloc == set->nelem)
1223     {
1224       int *new_array;
1225       set->alloc = set->alloc * 2;
1226       new_array = re_realloc (set->elems, int, set->alloc);
1227       if (BE (new_array == NULL, 0))
1228         return -1;
1229       set->elems = new_array;
1230     }
1231
1232   /* Move the elements which follows the new element.  Test the
1233      first element separately to skip a check in the inner loop.  */
1234   if (elem < set->elems[0])
1235     {
1236       idx = 0;
1237       for (idx = set->nelem; idx > 0; idx--)
1238         set->elems[idx] = set->elems[idx - 1];
1239     }
1240   else
1241     {
1242       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1243         set->elems[idx] = set->elems[idx - 1];
1244     }
1245
1246   /* Insert the new element.  */
1247   set->elems[idx] = elem;
1248   ++set->nelem;
1249   return 1;
1250 }
1251
1252 /* Insert the new element ELEM to the re_node_set* SET.
1253    SET should not already have any element greater than or equal to ELEM.
1254    Return -1 if an error is occured, return 1 otherwise.  */
1255
1256 static int
1257 re_node_set_insert_last (set, elem)
1258      re_node_set *set;
1259      int elem;
1260 {
1261   /* Realloc if we need.  */
1262   if (set->alloc == set->nelem)
1263     {
1264       int *new_array;
1265       set->alloc = (set->alloc + 1) * 2;
1266       new_array = re_realloc (set->elems, int, set->alloc);
1267       if (BE (new_array == NULL, 0))
1268         return -1;
1269       set->elems = new_array;
1270     }
1271
1272   /* Insert the new element.  */
1273   set->elems[set->nelem++] = elem;
1274   return 1;
1275 }
1276
1277 /* Compare two node sets SET1 and SET2.
1278    return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1279
1280 static int
1281 re_node_set_compare (set1, set2)
1282      const re_node_set *set1, *set2;
1283 {
1284   int i;
1285   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1286     return 0;
1287   for (i = set1->nelem ; --i >= 0 ; )
1288     if (set1->elems[i] != set2->elems[i])
1289       return 0;
1290   return 1;
1291 }
1292
1293 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1294
1295 static int
1296 re_node_set_contains (set, elem)
1297      const re_node_set *set;
1298      int elem;
1299 {
1300   unsigned int idx, right, mid;
1301   if (set->nelem <= 0)
1302     return 0;
1303
1304   /* Binary search the element.  */
1305   idx = 0;
1306   right = set->nelem - 1;
1307   while (idx < right)
1308     {
1309       mid = (idx + right) / 2;
1310       if (set->elems[mid] < elem)
1311         idx = mid + 1;
1312       else
1313         right = mid;
1314     }
1315   return set->elems[idx] == elem ? idx + 1 : 0;
1316 }
1317
1318 static void
1319 re_node_set_remove_at (set, idx)
1320      re_node_set *set;
1321      int idx;
1322 {
1323   if (idx < 0 || idx >= set->nelem)
1324     return;
1325   --set->nelem;
1326   for (; idx < set->nelem; idx++)
1327     set->elems[idx] = set->elems[idx + 1];
1328 }
1329 \f
1330
1331 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1332    Or return -1, if an error will be occured.  */
1333
1334 static int
1335 re_dfa_add_node (dfa, token, mode)
1336      re_dfa_t *dfa;
1337      re_token_t token;
1338      int mode;
1339 {
1340   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1341     {
1342       int new_nodes_alloc = dfa->nodes_alloc * 2;
1343       re_token_t *new_array = re_realloc (dfa->nodes, re_token_t,
1344                                           new_nodes_alloc);
1345       if (BE (new_array == NULL, 0))
1346         return -1;
1347       dfa->nodes = new_array;
1348       if (mode)
1349         {
1350           int *new_nexts, *new_indices;
1351           re_node_set *new_edests, *new_eclosures, *new_inveclosures;
1352
1353           new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1354           new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1355           new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1356           new_eclosures = re_realloc (dfa->eclosures, re_node_set,
1357                                       new_nodes_alloc);
1358           new_inveclosures = re_realloc (dfa->inveclosures, re_node_set,
1359                                          new_nodes_alloc);
1360           if (BE (new_nexts == NULL || new_indices == NULL
1361                   || new_edests == NULL || new_eclosures == NULL
1362                   || new_inveclosures == NULL, 0))
1363             return -1;
1364           dfa->nexts = new_nexts;
1365           dfa->org_indices = new_indices;
1366           dfa->edests = new_edests;
1367           dfa->eclosures = new_eclosures;
1368           dfa->inveclosures = new_inveclosures;
1369         }
1370       dfa->nodes_alloc = new_nodes_alloc;
1371     }
1372   dfa->nodes[dfa->nodes_len] = token;
1373   dfa->nodes[dfa->nodes_len].opt_subexp = 0;
1374   dfa->nodes[dfa->nodes_len].duplicated = 0;
1375   dfa->nodes[dfa->nodes_len].constraint = 0;
1376   return dfa->nodes_len++;
1377 }
1378
1379 static unsigned int inline
1380 calc_state_hash (nodes, context)
1381      const re_node_set *nodes;
1382      unsigned int context;
1383 {
1384   unsigned int hash = nodes->nelem + context;
1385   int i;
1386   for (i = 0 ; i < nodes->nelem ; i++)
1387     hash += nodes->elems[i];
1388   return hash;
1389 }
1390
1391 /* Search for the state whose node_set is equivalent to NODES.
1392    Return the pointer to the state, if we found it in the DFA.
1393    Otherwise create the new one and return it.  In case of an error
1394    return NULL and set the error code in ERR.
1395    Note: - We assume NULL as the invalid state, then it is possible that
1396            return value is NULL and ERR is REG_NOERROR.
1397          - We never return non-NULL value in case of any errors, it is for
1398            optimization.  */
1399
1400 static re_dfastate_t*
1401 re_acquire_state (err, dfa, nodes)
1402      reg_errcode_t *err;
1403      re_dfa_t *dfa;
1404      const re_node_set *nodes;
1405 {
1406   unsigned int hash;
1407   re_dfastate_t *new_state;
1408   struct re_state_table_entry *spot;
1409   int i;
1410   if (BE (nodes->nelem == 0, 0))
1411     {
1412       *err = REG_NOERROR;
1413       return NULL;
1414     }
1415   hash = calc_state_hash (nodes, 0);
1416   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1417
1418   for (i = 0 ; i < spot->num ; i++)
1419     {
1420       re_dfastate_t *state = spot->array[i];
1421       if (hash != state->hash)
1422         continue;
1423       if (re_node_set_compare (&state->nodes, nodes))
1424         return state;
1425     }
1426
1427   /* There are no appropriate state in the dfa, create the new one.  */
1428   new_state = create_ci_newstate (dfa, nodes, hash);
1429   if (BE (new_state != NULL, 1))
1430     return new_state;
1431   else
1432     {
1433       *err = REG_ESPACE;
1434       return NULL;
1435     }
1436 }
1437
1438 /* Search for the state whose node_set is equivalent to NODES and
1439    whose context is equivalent to CONTEXT.
1440    Return the pointer to the state, if we found it in the DFA.
1441    Otherwise create the new one and return it.  In case of an error
1442    return NULL and set the error code in ERR.
1443    Note: - We assume NULL as the invalid state, then it is possible that
1444            return value is NULL and ERR is REG_NOERROR.
1445          - We never return non-NULL value in case of any errors, it is for
1446            optimization.  */
1447
1448 static re_dfastate_t*
1449 re_acquire_state_context (err, dfa, nodes, context)
1450      reg_errcode_t *err;
1451      re_dfa_t *dfa;
1452      const re_node_set *nodes;
1453      unsigned int context;
1454 {
1455   unsigned int hash;
1456   re_dfastate_t *new_state;
1457   struct re_state_table_entry *spot;
1458   int i;
1459   if (nodes->nelem == 0)
1460     {
1461       *err = REG_NOERROR;
1462       return NULL;
1463     }
1464   hash = calc_state_hash (nodes, context);
1465   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1466
1467   for (i = 0 ; i < spot->num ; i++)
1468     {
1469       re_dfastate_t *state = spot->array[i];
1470       if (state->hash == hash
1471           && state->context == context
1472           && re_node_set_compare (state->entrance_nodes, nodes))
1473         return state;
1474     }
1475   /* There are no appropriate state in `dfa', create the new one.  */
1476   new_state = create_cd_newstate (dfa, nodes, context, hash);
1477   if (BE (new_state != NULL, 1))
1478     return new_state;
1479   else
1480     {
1481       *err = REG_ESPACE;
1482       return NULL;
1483     }
1484 }
1485
1486 /* Finish initialization of the new state NEWSTATE, and using its hash value
1487    HASH put in the appropriate bucket of DFA's state table.  Return value
1488    indicates the error code if failed.  */
1489
1490 static reg_errcode_t
1491 register_state (dfa, newstate, hash)
1492      re_dfa_t *dfa;
1493      re_dfastate_t *newstate;
1494      unsigned int hash;
1495 {
1496   struct re_state_table_entry *spot;
1497   reg_errcode_t err;
1498   int i;
1499
1500   newstate->hash = hash;
1501   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1502   if (BE (err != REG_NOERROR, 0))
1503     return REG_ESPACE;
1504   for (i = 0; i < newstate->nodes.nelem; i++)
1505     {
1506       int elem = newstate->nodes.elems[i];
1507       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1508         re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1509     }
1510
1511   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1512   if (BE (spot->alloc <= spot->num, 0))
1513     {
1514       int new_alloc = 2 * spot->num + 2;
1515       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1516                                               new_alloc);
1517       if (BE (new_array == NULL, 0))
1518         return REG_ESPACE;
1519       spot->array = new_array;
1520       spot->alloc = new_alloc;
1521     }
1522   spot->array[spot->num++] = newstate;
1523   return REG_NOERROR;
1524 }
1525
1526 /* Create the new state which is independ of contexts.
1527    Return the new state if succeeded, otherwise return NULL.  */
1528
1529 static re_dfastate_t *
1530 create_ci_newstate (dfa, nodes, hash)
1531      re_dfa_t *dfa;
1532      const re_node_set *nodes;
1533      unsigned int hash;
1534 {
1535   int i;
1536   reg_errcode_t err;
1537   re_dfastate_t *newstate;
1538
1539   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1540   if (BE (newstate == NULL, 0))
1541     return NULL;
1542   err = re_node_set_init_copy (&newstate->nodes, nodes);
1543   if (BE (err != REG_NOERROR, 0))
1544     {
1545       re_free (newstate);
1546       return NULL;
1547     }
1548
1549   newstate->entrance_nodes = &newstate->nodes;
1550   for (i = 0 ; i < nodes->nelem ; i++)
1551     {
1552       re_token_t *node = dfa->nodes + nodes->elems[i];
1553       re_token_type_t type = node->type;
1554       if (type == CHARACTER && !node->constraint)
1555         continue;
1556
1557       /* If the state has the halt node, the state is a halt state.  */
1558       else if (type == END_OF_RE)
1559         newstate->halt = 1;
1560 #ifdef RE_ENABLE_I18N
1561       else if (type == COMPLEX_BRACKET
1562                || type == OP_UTF8_PERIOD
1563                || (type == OP_PERIOD && dfa->mb_cur_max > 1))
1564         newstate->accept_mb = 1;
1565 #endif /* RE_ENABLE_I18N */
1566       else if (type == OP_BACK_REF)
1567         newstate->has_backref = 1;
1568       else if (type == ANCHOR || node->constraint)
1569         newstate->has_constraint = 1;
1570     }
1571   err = register_state (dfa, newstate, hash);
1572   if (BE (err != REG_NOERROR, 0))
1573     {
1574       free_state (newstate);
1575       newstate = NULL;
1576     }
1577   return newstate;
1578 }
1579
1580 /* Create the new state which is depend on the context CONTEXT.
1581    Return the new state if succeeded, otherwise return NULL.  */
1582
1583 static re_dfastate_t *
1584 create_cd_newstate (dfa, nodes, context, hash)
1585      re_dfa_t *dfa;
1586      const re_node_set *nodes;
1587      unsigned int context, hash;
1588 {
1589   int i, nctx_nodes = 0;
1590   reg_errcode_t err;
1591   re_dfastate_t *newstate;
1592
1593   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1594   if (BE (newstate == NULL, 0))
1595     return NULL;
1596   err = re_node_set_init_copy (&newstate->nodes, nodes);
1597   if (BE (err != REG_NOERROR, 0))
1598     {
1599       re_free (newstate);
1600       return NULL;
1601     }
1602
1603   newstate->context = context;
1604   newstate->entrance_nodes = &newstate->nodes;
1605
1606   for (i = 0 ; i < nodes->nelem ; i++)
1607     {
1608       unsigned int constraint = 0;
1609       re_token_t *node = dfa->nodes + nodes->elems[i];
1610       re_token_type_t type = node->type;
1611       if (node->constraint)
1612         constraint = node->constraint;
1613
1614       if (type == CHARACTER && !constraint)
1615         continue;
1616       /* If the state has the halt node, the state is a halt state.  */
1617       else if (type == END_OF_RE)
1618         newstate->halt = 1;
1619 #ifdef RE_ENABLE_I18N
1620       else if (type == COMPLEX_BRACKET
1621                || type == OP_UTF8_PERIOD
1622                || (type == OP_PERIOD && dfa->mb_cur_max > 1))
1623         newstate->accept_mb = 1;
1624 #endif /* RE_ENABLE_I18N */
1625       else if (type == OP_BACK_REF)
1626         newstate->has_backref = 1;
1627       else if (type == ANCHOR)
1628         constraint = node->opr.ctx_type;
1629
1630       if (constraint)
1631         {
1632           if (newstate->entrance_nodes == &newstate->nodes)
1633             {
1634               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1635               if (BE (newstate->entrance_nodes == NULL, 0))
1636                 {
1637                   free_state (newstate);
1638                   return NULL;
1639                 }
1640               re_node_set_init_copy (newstate->entrance_nodes, nodes);
1641               nctx_nodes = 0;
1642               newstate->has_constraint = 1;
1643             }
1644
1645           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1646             {
1647               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1648               ++nctx_nodes;
1649             }
1650         }
1651     }
1652   err = register_state (dfa, newstate, hash);
1653   if (BE (err != REG_NOERROR, 0))
1654     {
1655       free_state (newstate);
1656       newstate = NULL;
1657     }
1658   return  newstate;
1659 }
1660
1661 static void
1662 free_state (state)
1663      re_dfastate_t *state;
1664 {
1665   re_node_set_free (&state->non_eps_nodes);
1666   re_node_set_free (&state->inveclosure);
1667   if (state->entrance_nodes != &state->nodes)
1668     {
1669       re_node_set_free (state->entrance_nodes);
1670       re_free (state->entrance_nodes);
1671     }
1672   re_node_set_free (&state->nodes);
1673   re_free (state->trtable);
1674   re_free (state);
1675 }