2005-03-15 Jakub Jelinek <jakub@redhat.com>
[glibc/history.git] / posix / regex_internal.c
blobd590104fce3db4baa3e398c13157764f6fbda072
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static void re_string_construct_common (const char *str, int len,
22 re_string_t *pstr,
23 RE_TRANSLATE_TYPE trans, int icase,
24 const re_dfa_t *dfa) internal_function;
25 #ifdef RE_ENABLE_I18N
26 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
27 wint_t *last_wc) internal_function;
28 #endif /* RE_ENABLE_I18N */
29 static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
30 unsigned int hash) internal_function;
31 static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
32 const re_node_set *nodes,
33 unsigned int hash) internal_function;
34 static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
35 const re_node_set *nodes,
36 unsigned int context,
37 unsigned int hash) internal_function;
38 static unsigned int inline calc_state_hash (const re_node_set *nodes,
39 unsigned int context) internal_function;
41 /* Functions for string operation. */
43 /* This function allocate the buffers. It is necessary to call
44 re_string_reconstruct before using the object. */
46 static reg_errcode_t
47 re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
48 re_string_t *pstr;
49 const char *str;
50 int len, init_len, icase;
51 RE_TRANSLATE_TYPE trans;
52 const re_dfa_t *dfa;
54 reg_errcode_t ret;
55 int init_buf_len;
57 /* Ensure at least one character fits into the buffers. */
58 if (init_len < dfa->mb_cur_max)
59 init_len = dfa->mb_cur_max;
60 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
61 re_string_construct_common (str, len, pstr, trans, icase, dfa);
63 ret = re_string_realloc_buffers (pstr, init_buf_len);
64 if (BE (ret != REG_NOERROR, 0))
65 return ret;
67 pstr->word_char = dfa->word_char;
68 pstr->word_ops_used = dfa->word_ops_used;
69 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
70 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
71 pstr->valid_raw_len = pstr->valid_len;
72 return REG_NOERROR;
75 /* This function allocate the buffers, and initialize them. */
77 static reg_errcode_t
78 re_string_construct (pstr, str, len, trans, icase, dfa)
79 re_string_t *pstr;
80 const char *str;
81 int len, icase;
82 RE_TRANSLATE_TYPE trans;
83 const re_dfa_t *dfa;
85 reg_errcode_t ret;
86 memset (pstr, '\0', sizeof (re_string_t));
87 re_string_construct_common (str, len, pstr, trans, icase, dfa);
89 if (len > 0)
91 ret = re_string_realloc_buffers (pstr, len + 1);
92 if (BE (ret != REG_NOERROR, 0))
93 return ret;
95 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
97 if (icase)
99 #ifdef RE_ENABLE_I18N
100 if (dfa->mb_cur_max > 1)
102 while (1)
104 ret = build_wcs_upper_buffer (pstr);
105 if (BE (ret != REG_NOERROR, 0))
106 return ret;
107 if (pstr->valid_raw_len >= len)
108 break;
109 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
110 break;
111 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
112 if (BE (ret != REG_NOERROR, 0))
113 return ret;
116 else
117 #endif /* RE_ENABLE_I18N */
118 build_upper_buffer (pstr);
120 else
122 #ifdef RE_ENABLE_I18N
123 if (dfa->mb_cur_max > 1)
124 build_wcs_buffer (pstr);
125 else
126 #endif /* RE_ENABLE_I18N */
128 if (trans != NULL)
129 re_string_translate_buffer (pstr);
130 else
132 pstr->valid_len = pstr->bufs_len;
133 pstr->valid_raw_len = pstr->bufs_len;
138 return REG_NOERROR;
141 /* Helper functions for re_string_allocate, and re_string_construct. */
143 static reg_errcode_t
144 re_string_realloc_buffers (pstr, new_buf_len)
145 re_string_t *pstr;
146 int new_buf_len;
148 #ifdef RE_ENABLE_I18N
149 if (pstr->mb_cur_max > 1)
151 wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
152 if (BE (new_array == NULL, 0))
153 return REG_ESPACE;
154 pstr->wcs = new_array;
155 if (pstr->offsets != NULL)
157 int *new_array = re_realloc (pstr->offsets, int, new_buf_len);
158 if (BE (new_array == NULL, 0))
159 return REG_ESPACE;
160 pstr->offsets = new_array;
163 #endif /* RE_ENABLE_I18N */
164 if (pstr->mbs_allocated)
166 unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
167 new_buf_len);
168 if (BE (new_array == NULL, 0))
169 return REG_ESPACE;
170 pstr->mbs = new_array;
172 pstr->bufs_len = new_buf_len;
173 return REG_NOERROR;
177 static void
178 re_string_construct_common (str, len, pstr, trans, icase, dfa)
179 const char *str;
180 int len;
181 re_string_t *pstr;
182 RE_TRANSLATE_TYPE trans;
183 int icase;
184 const re_dfa_t *dfa;
186 pstr->raw_mbs = (const unsigned char *) str;
187 pstr->len = len;
188 pstr->raw_len = len;
189 pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
190 pstr->icase = icase ? 1 : 0;
191 pstr->mbs_allocated = (trans != NULL || icase);
192 pstr->mb_cur_max = dfa->mb_cur_max;
193 pstr->is_utf8 = dfa->is_utf8;
194 pstr->map_notascii = dfa->map_notascii;
195 pstr->stop = pstr->len;
196 pstr->raw_stop = pstr->stop;
199 #ifdef RE_ENABLE_I18N
201 /* Build wide character buffer PSTR->WCS.
202 If the byte sequence of the string are:
203 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
204 Then wide character buffer will be:
205 <wc1> , WEOF , <wc2> , WEOF , <wc3>
206 We use WEOF for padding, they indicate that the position isn't
207 a first byte of a multibyte character.
209 Note that this function assumes PSTR->VALID_LEN elements are already
210 built and starts from PSTR->VALID_LEN. */
212 static void
213 build_wcs_buffer (pstr)
214 re_string_t *pstr;
216 #ifdef _LIBC
217 unsigned char buf[MB_LEN_MAX];
218 assert (MB_LEN_MAX >= pstr->mb_cur_max);
219 #else
220 unsigned char buf[64];
221 #endif
222 mbstate_t prev_st;
223 int byte_idx, end_idx, mbclen, remain_len;
225 /* Build the buffers from pstr->valid_len to either pstr->len or
226 pstr->bufs_len. */
227 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
228 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
230 wchar_t wc;
231 const char *p;
233 remain_len = end_idx - byte_idx;
234 prev_st = pstr->cur_state;
235 /* Apply the translation if we need. */
236 if (BE (pstr->trans != NULL, 0))
238 int i, ch;
240 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
242 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
243 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
245 p = (const char *) buf;
247 else
248 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
249 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
250 if (BE (mbclen == (size_t) -2, 0))
252 /* The buffer doesn't have enough space, finish to build. */
253 pstr->cur_state = prev_st;
254 break;
256 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
258 /* We treat these cases as a singlebyte character. */
259 mbclen = 1;
260 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
261 if (BE (pstr->trans != NULL, 0))
262 wc = pstr->trans[wc];
263 pstr->cur_state = prev_st;
266 /* Write wide character and padding. */
267 pstr->wcs[byte_idx++] = wc;
268 /* Write paddings. */
269 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
270 pstr->wcs[byte_idx++] = WEOF;
272 pstr->valid_len = byte_idx;
273 pstr->valid_raw_len = byte_idx;
276 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
277 but for REG_ICASE. */
279 static int
280 build_wcs_upper_buffer (pstr)
281 re_string_t *pstr;
283 mbstate_t prev_st;
284 int src_idx, byte_idx, end_idx, mbclen, remain_len;
285 #ifdef _LIBC
286 unsigned char buf[MB_LEN_MAX];
287 assert (MB_LEN_MAX >= pstr->mb_cur_max);
288 #else
289 unsigned char buf[64];
290 #endif
292 byte_idx = pstr->valid_len;
293 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
295 /* The following optimization assumes that ASCII characters can be
296 mapped to wide characters with a simple cast. */
297 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
299 while (byte_idx < end_idx)
301 wchar_t wc;
303 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
304 && mbsinit (&pstr->cur_state))
306 /* In case of a singlebyte character. */
307 pstr->mbs[byte_idx]
308 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
309 /* The next step uses the assumption that wchar_t is encoded
310 ASCII-safe: all ASCII values can be converted like this. */
311 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
312 ++byte_idx;
313 continue;
316 remain_len = end_idx - byte_idx;
317 prev_st = pstr->cur_state;
318 mbclen = mbrtowc (&wc,
319 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
320 + byte_idx), remain_len, &pstr->cur_state);
321 if (BE (mbclen > 0, 1))
323 wchar_t wcu = wc;
324 if (iswlower (wc))
326 int mbcdlen;
328 wcu = towupper (wc);
329 mbcdlen = wcrtomb (buf, wcu, &prev_st);
330 if (BE (mbclen == mbcdlen, 1))
331 memcpy (pstr->mbs + byte_idx, buf, mbclen);
332 else
334 src_idx = byte_idx;
335 goto offsets_needed;
338 else
339 memcpy (pstr->mbs + byte_idx,
340 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
341 pstr->wcs[byte_idx++] = wcu;
342 /* Write paddings. */
343 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
344 pstr->wcs[byte_idx++] = WEOF;
346 else if (mbclen == (size_t) -1 || mbclen == 0)
348 /* It is an invalid character or '\0'. Just use the byte. */
349 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
350 pstr->mbs[byte_idx] = ch;
351 /* And also cast it to wide char. */
352 pstr->wcs[byte_idx++] = (wchar_t) ch;
353 if (BE (mbclen == (size_t) -1, 0))
354 pstr->cur_state = prev_st;
356 else
358 /* The buffer doesn't have enough space, finish to build. */
359 pstr->cur_state = prev_st;
360 break;
363 pstr->valid_len = byte_idx;
364 pstr->valid_raw_len = byte_idx;
365 return REG_NOERROR;
367 else
368 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
370 wchar_t wc;
371 const char *p;
372 offsets_needed:
373 remain_len = end_idx - byte_idx;
374 prev_st = pstr->cur_state;
375 if (BE (pstr->trans != NULL, 0))
377 int i, ch;
379 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
381 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
382 buf[i] = pstr->trans[ch];
384 p = (const char *) buf;
386 else
387 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
388 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
389 if (BE (mbclen > 0, 1))
391 wchar_t wcu = wc;
392 if (iswlower (wc))
394 int mbcdlen;
396 wcu = towupper (wc);
397 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
398 if (BE (mbclen == mbcdlen, 1))
399 memcpy (pstr->mbs + byte_idx, buf, mbclen);
400 else
402 int i;
404 if (byte_idx + mbcdlen > pstr->bufs_len)
406 pstr->cur_state = prev_st;
407 break;
410 if (pstr->offsets == NULL)
412 pstr->offsets = re_malloc (int, pstr->bufs_len);
414 if (pstr->offsets == NULL)
415 return REG_ESPACE;
417 if (!pstr->offsets_needed)
419 for (i = 0; i < byte_idx; ++i)
420 pstr->offsets[i] = i;
421 pstr->offsets_needed = 1;
424 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
425 pstr->wcs[byte_idx] = wcu;
426 pstr->offsets[byte_idx] = src_idx;
427 for (i = 1; i < mbcdlen; ++i)
429 pstr->offsets[byte_idx + i]
430 = src_idx + (i < mbclen ? i : mbclen - 1);
431 pstr->wcs[byte_idx + i] = WEOF;
433 pstr->len += mbcdlen - mbclen;
434 if (pstr->raw_stop > src_idx)
435 pstr->stop += mbcdlen - mbclen;
436 end_idx = (pstr->bufs_len > pstr->len)
437 ? pstr->len : pstr->bufs_len;
438 byte_idx += mbcdlen;
439 src_idx += mbclen;
440 continue;
443 else
444 memcpy (pstr->mbs + byte_idx, p, mbclen);
446 if (BE (pstr->offsets_needed != 0, 0))
448 int i;
449 for (i = 0; i < mbclen; ++i)
450 pstr->offsets[byte_idx + i] = src_idx + i;
452 src_idx += mbclen;
454 pstr->wcs[byte_idx++] = wcu;
455 /* Write paddings. */
456 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
457 pstr->wcs[byte_idx++] = WEOF;
459 else if (mbclen == (size_t) -1 || mbclen == 0)
461 /* It is an invalid character or '\0'. Just use the byte. */
462 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
464 if (BE (pstr->trans != NULL, 0))
465 ch = pstr->trans [ch];
466 pstr->mbs[byte_idx] = ch;
468 if (BE (pstr->offsets_needed != 0, 0))
469 pstr->offsets[byte_idx] = src_idx;
470 ++src_idx;
472 /* And also cast it to wide char. */
473 pstr->wcs[byte_idx++] = (wchar_t) ch;
474 if (BE (mbclen == (size_t) -1, 0))
475 pstr->cur_state = prev_st;
477 else
479 /* The buffer doesn't have enough space, finish to build. */
480 pstr->cur_state = prev_st;
481 break;
484 pstr->valid_len = byte_idx;
485 pstr->valid_raw_len = src_idx;
486 return REG_NOERROR;
489 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
490 Return the index. */
492 static int
493 re_string_skip_chars (pstr, new_raw_idx, last_wc)
494 re_string_t *pstr;
495 int new_raw_idx;
496 wint_t *last_wc;
498 mbstate_t prev_st;
499 int rawbuf_idx, mbclen;
500 wchar_t wc = 0;
502 /* Skip the characters which are not necessary to check. */
503 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
504 rawbuf_idx < new_raw_idx;)
506 int remain_len;
507 remain_len = pstr->len - rawbuf_idx;
508 prev_st = pstr->cur_state;
509 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
510 remain_len, &pstr->cur_state);
511 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
513 /* We treat these cases as a singlebyte character. */
514 mbclen = 1;
515 pstr->cur_state = prev_st;
517 /* Then proceed the next character. */
518 rawbuf_idx += mbclen;
520 *last_wc = (wint_t) wc;
521 return rawbuf_idx;
523 #endif /* RE_ENABLE_I18N */
525 /* Build the buffer PSTR->MBS, and apply the translation if we need.
526 This function is used in case of REG_ICASE. */
528 static void
529 build_upper_buffer (pstr)
530 re_string_t *pstr;
532 int char_idx, end_idx;
533 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
535 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
537 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
538 if (BE (pstr->trans != NULL, 0))
539 ch = pstr->trans[ch];
540 if (islower (ch))
541 pstr->mbs[char_idx] = toupper (ch);
542 else
543 pstr->mbs[char_idx] = ch;
545 pstr->valid_len = char_idx;
546 pstr->valid_raw_len = char_idx;
549 /* Apply TRANS to the buffer in PSTR. */
551 static void
552 re_string_translate_buffer (pstr)
553 re_string_t *pstr;
555 int buf_idx, end_idx;
556 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
558 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
560 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
561 pstr->mbs[buf_idx] = pstr->trans[ch];
564 pstr->valid_len = buf_idx;
565 pstr->valid_raw_len = buf_idx;
568 /* This function re-construct the buffers.
569 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
570 convert to upper case in case of REG_ICASE, apply translation. */
572 static reg_errcode_t
573 re_string_reconstruct (pstr, idx, eflags)
574 re_string_t *pstr;
575 int idx, eflags;
577 int offset = idx - pstr->raw_mbs_idx;
578 if (BE (offset < 0, 0))
580 /* Reset buffer. */
581 #ifdef RE_ENABLE_I18N
582 if (pstr->mb_cur_max > 1)
583 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
584 #endif /* RE_ENABLE_I18N */
585 pstr->len = pstr->raw_len;
586 pstr->stop = pstr->raw_stop;
587 pstr->valid_len = 0;
588 pstr->raw_mbs_idx = 0;
589 pstr->valid_raw_len = 0;
590 pstr->offsets_needed = 0;
591 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
592 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
593 if (!pstr->mbs_allocated)
594 pstr->mbs = (unsigned char *) pstr->raw_mbs;
595 offset = idx;
598 if (BE (offset != 0, 1))
600 /* Are the characters which are already checked remain? */
601 if (BE (offset < pstr->valid_raw_len, 1)
602 #ifdef RE_ENABLE_I18N
603 /* Handling this would enlarge the code too much.
604 Accept a slowdown in that case. */
605 && pstr->offsets_needed == 0
606 #endif
609 /* Yes, move them to the front of the buffer. */
610 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
611 #ifdef RE_ENABLE_I18N
612 if (pstr->mb_cur_max > 1)
613 memmove (pstr->wcs, pstr->wcs + offset,
614 (pstr->valid_len - offset) * sizeof (wint_t));
615 #endif /* RE_ENABLE_I18N */
616 if (BE (pstr->mbs_allocated, 0))
617 memmove (pstr->mbs, pstr->mbs + offset,
618 pstr->valid_len - offset);
619 pstr->valid_len -= offset;
620 pstr->valid_raw_len -= offset;
621 #if DEBUG
622 assert (pstr->valid_len > 0);
623 #endif
625 else
627 /* No, skip all characters until IDX. */
628 #ifdef RE_ENABLE_I18N
629 if (BE (pstr->offsets_needed, 0))
631 pstr->len = pstr->raw_len - idx + offset;
632 pstr->stop = pstr->raw_stop - idx + offset;
633 pstr->offsets_needed = 0;
635 #endif
636 pstr->valid_len = 0;
637 pstr->valid_raw_len = 0;
638 #ifdef RE_ENABLE_I18N
639 if (pstr->mb_cur_max > 1)
641 int wcs_idx;
642 wint_t wc = WEOF;
644 if (pstr->is_utf8)
646 const unsigned char *raw, *p, *q, *end;
648 /* Special case UTF-8. Multi-byte chars start with any
649 byte other than 0x80 - 0xbf. */
650 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
651 end = raw + (offset - pstr->mb_cur_max);
652 for (p = raw + offset - 1; p >= end; --p)
653 if ((*p & 0xc0) != 0x80)
655 mbstate_t cur_state;
656 wchar_t wc2;
657 int mlen = raw + pstr->len - p;
658 unsigned char buf[6];
660 q = p;
661 if (BE (pstr->trans != NULL, 0))
663 int i = mlen < 6 ? mlen : 6;
664 while (--i >= 0)
665 buf[i] = pstr->trans[p[i]];
666 q = buf;
668 /* XXX Don't use mbrtowc, we know which conversion
669 to use (UTF-8 -> UCS4). */
670 memset (&cur_state, 0, sizeof (cur_state));
671 mlen = mbrtowc (&wc2, p, mlen, &cur_state)
672 - (raw + offset - p);
673 if (mlen >= 0)
675 memset (&pstr->cur_state, '\0',
676 sizeof (mbstate_t));
677 pstr->valid_len = mlen;
678 wc = wc2;
680 break;
684 if (wc == WEOF)
685 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
686 if (BE (pstr->valid_len, 0))
688 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
689 pstr->wcs[wcs_idx] = WEOF;
690 if (pstr->mbs_allocated)
691 memset (pstr->mbs, 255, pstr->valid_len);
693 pstr->valid_raw_len = pstr->valid_len;
694 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
695 && IS_WIDE_WORD_CHAR (wc))
696 ? CONTEXT_WORD
697 : ((IS_WIDE_NEWLINE (wc)
698 && pstr->newline_anchor)
699 ? CONTEXT_NEWLINE : 0));
701 else
702 #endif /* RE_ENABLE_I18N */
704 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
705 if (pstr->trans)
706 c = pstr->trans[c];
707 pstr->tip_context = (bitset_contain (pstr->word_char, c)
708 ? CONTEXT_WORD
709 : ((IS_NEWLINE (c) && pstr->newline_anchor)
710 ? CONTEXT_NEWLINE : 0));
713 if (!BE (pstr->mbs_allocated, 0))
714 pstr->mbs += offset;
716 pstr->raw_mbs_idx = idx;
717 pstr->len -= offset;
718 pstr->stop -= offset;
720 /* Then build the buffers. */
721 #ifdef RE_ENABLE_I18N
722 if (pstr->mb_cur_max > 1)
724 if (pstr->icase)
726 int ret = build_wcs_upper_buffer (pstr);
727 if (BE (ret != REG_NOERROR, 0))
728 return ret;
730 else
731 build_wcs_buffer (pstr);
733 else
734 #endif /* RE_ENABLE_I18N */
735 if (BE (pstr->mbs_allocated, 0))
737 if (pstr->icase)
738 build_upper_buffer (pstr);
739 else if (pstr->trans != NULL)
740 re_string_translate_buffer (pstr);
742 else
743 pstr->valid_len = pstr->len;
745 pstr->cur_idx = 0;
746 return REG_NOERROR;
749 static unsigned char
750 re_string_peek_byte_case (pstr, idx)
751 const re_string_t *pstr;
752 int idx;
754 int ch, off;
756 /* Handle the common (easiest) cases first. */
757 if (BE (!pstr->mbs_allocated, 1))
758 return re_string_peek_byte (pstr, idx);
760 #ifdef RE_ENABLE_I18N
761 if (pstr->mb_cur_max > 1
762 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
763 return re_string_peek_byte (pstr, idx);
764 #endif
766 off = pstr->cur_idx + idx;
767 #ifdef RE_ENABLE_I18N
768 if (pstr->offsets_needed)
769 off = pstr->offsets[off];
770 #endif
772 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
774 #ifdef RE_ENABLE_I18N
775 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
776 this function returns CAPITAL LETTER I instead of first byte of
777 DOTLESS SMALL LETTER I. The latter would confuse the parser,
778 since peek_byte_case doesn't advance cur_idx in any way. */
779 if (pstr->offsets_needed && !isascii (ch))
780 return re_string_peek_byte (pstr, idx);
781 #endif
783 return ch;
786 static unsigned char
787 re_string_fetch_byte_case (pstr)
788 re_string_t *pstr;
790 if (BE (!pstr->mbs_allocated, 1))
791 return re_string_fetch_byte (pstr);
793 #ifdef RE_ENABLE_I18N
794 if (pstr->offsets_needed)
796 int off, ch;
798 /* For tr_TR.UTF-8 [[:islower:]] there is
799 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
800 in that case the whole multi-byte character and return
801 the original letter. On the other side, with
802 [[: DOTLESS SMALL LETTER I return [[:I, as doing
803 anything else would complicate things too much. */
805 if (!re_string_first_byte (pstr, pstr->cur_idx))
806 return re_string_fetch_byte (pstr);
808 off = pstr->offsets[pstr->cur_idx];
809 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
811 if (! isascii (ch))
812 return re_string_fetch_byte (pstr);
814 re_string_skip_bytes (pstr,
815 re_string_char_size_at (pstr, pstr->cur_idx));
816 return ch;
818 #endif
820 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
823 static void
824 re_string_destruct (pstr)
825 re_string_t *pstr;
827 #ifdef RE_ENABLE_I18N
828 re_free (pstr->wcs);
829 re_free (pstr->offsets);
830 #endif /* RE_ENABLE_I18N */
831 if (pstr->mbs_allocated)
832 re_free (pstr->mbs);
835 /* Return the context at IDX in INPUT. */
837 static unsigned int
838 re_string_context_at (input, idx, eflags)
839 const re_string_t *input;
840 int idx, eflags;
842 int c;
843 if (BE (idx < 0, 0))
844 /* In this case, we use the value stored in input->tip_context,
845 since we can't know the character in input->mbs[-1] here. */
846 return input->tip_context;
847 if (BE (idx == input->len, 0))
848 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
849 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
850 #ifdef RE_ENABLE_I18N
851 if (input->mb_cur_max > 1)
853 wint_t wc;
854 int wc_idx = idx;
855 while(input->wcs[wc_idx] == WEOF)
857 #ifdef DEBUG
858 /* It must not happen. */
859 assert (wc_idx >= 0);
860 #endif
861 --wc_idx;
862 if (wc_idx < 0)
863 return input->tip_context;
865 wc = input->wcs[wc_idx];
866 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
867 return CONTEXT_WORD;
868 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
869 ? CONTEXT_NEWLINE : 0);
871 else
872 #endif
874 c = re_string_byte_at (input, idx);
875 if (bitset_contain (input->word_char, c))
876 return CONTEXT_WORD;
877 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
881 /* Functions for set operation. */
883 static reg_errcode_t
884 re_node_set_alloc (set, size)
885 re_node_set *set;
886 int size;
888 set->alloc = size;
889 set->nelem = 0;
890 set->elems = re_malloc (int, size);
891 if (BE (set->elems == NULL, 0))
892 return REG_ESPACE;
893 return REG_NOERROR;
896 static reg_errcode_t
897 re_node_set_init_1 (set, elem)
898 re_node_set *set;
899 int elem;
901 set->alloc = 1;
902 set->nelem = 1;
903 set->elems = re_malloc (int, 1);
904 if (BE (set->elems == NULL, 0))
906 set->alloc = set->nelem = 0;
907 return REG_ESPACE;
909 set->elems[0] = elem;
910 return REG_NOERROR;
913 static reg_errcode_t
914 re_node_set_init_2 (set, elem1, elem2)
915 re_node_set *set;
916 int elem1, elem2;
918 set->alloc = 2;
919 set->elems = re_malloc (int, 2);
920 if (BE (set->elems == NULL, 0))
921 return REG_ESPACE;
922 if (elem1 == elem2)
924 set->nelem = 1;
925 set->elems[0] = elem1;
927 else
929 set->nelem = 2;
930 if (elem1 < elem2)
932 set->elems[0] = elem1;
933 set->elems[1] = elem2;
935 else
937 set->elems[0] = elem2;
938 set->elems[1] = elem1;
941 return REG_NOERROR;
944 static reg_errcode_t
945 re_node_set_init_copy (dest, src)
946 re_node_set *dest;
947 const re_node_set *src;
949 dest->nelem = src->nelem;
950 if (src->nelem > 0)
952 dest->alloc = dest->nelem;
953 dest->elems = re_malloc (int, dest->alloc);
954 if (BE (dest->elems == NULL, 0))
956 dest->alloc = dest->nelem = 0;
957 return REG_ESPACE;
959 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
961 else
962 re_node_set_init_empty (dest);
963 return REG_NOERROR;
966 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
967 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
968 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
970 static reg_errcode_t
971 re_node_set_add_intersect (dest, src1, src2)
972 re_node_set *dest;
973 const re_node_set *src1, *src2;
975 int i1, i2, is, id, delta, sbase;
976 if (src1->nelem == 0 || src2->nelem == 0)
977 return REG_NOERROR;
979 /* We need dest->nelem + 2 * elems_in_intersection; this is a
980 conservative estimate. */
981 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
983 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
984 int *new_elems = re_realloc (dest->elems, int, new_alloc);
985 if (BE (new_elems == NULL, 0))
986 return REG_ESPACE;
987 dest->elems = new_elems;
988 dest->alloc = new_alloc;
991 /* Find the items in the intersection of SRC1 and SRC2, and copy
992 into the top of DEST those that are not already in DEST itself. */
993 sbase = dest->nelem + src1->nelem + src2->nelem;
994 i1 = src1->nelem - 1;
995 i2 = src2->nelem - 1;
996 id = dest->nelem - 1;
997 for (;;)
999 if (src1->elems[i1] == src2->elems[i2])
1001 /* Try to find the item in DEST. Maybe we could binary search? */
1002 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1003 --id;
1005 if (id < 0 || dest->elems[id] != src1->elems[i1])
1006 dest->elems[--sbase] = src1->elems[i1];
1008 if (--i1 < 0 || --i2 < 0)
1009 break;
1012 /* Lower the highest of the two items. */
1013 else if (src1->elems[i1] < src2->elems[i2])
1015 if (--i2 < 0)
1016 break;
1018 else
1020 if (--i1 < 0)
1021 break;
1025 id = dest->nelem - 1;
1026 is = dest->nelem + src1->nelem + src2->nelem - 1;
1027 delta = is - sbase + 1;
1029 /* Now copy. When DELTA becomes zero, the remaining
1030 DEST elements are already in place; this is more or
1031 less the same loop that is in re_node_set_merge. */
1032 dest->nelem += delta;
1033 if (delta > 0 && id >= 0)
1034 for (;;)
1036 if (dest->elems[is] > dest->elems[id])
1038 /* Copy from the top. */
1039 dest->elems[id + delta--] = dest->elems[is--];
1040 if (delta == 0)
1041 break;
1043 else
1045 /* Slide from the bottom. */
1046 dest->elems[id + delta] = dest->elems[id];
1047 if (--id < 0)
1048 break;
1052 /* Copy remaining SRC elements. */
1053 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1055 return REG_NOERROR;
1058 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1059 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1061 static reg_errcode_t
1062 re_node_set_init_union (dest, src1, src2)
1063 re_node_set *dest;
1064 const re_node_set *src1, *src2;
1066 int i1, i2, id;
1067 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1069 dest->alloc = src1->nelem + src2->nelem;
1070 dest->elems = re_malloc (int, dest->alloc);
1071 if (BE (dest->elems == NULL, 0))
1072 return REG_ESPACE;
1074 else
1076 if (src1 != NULL && src1->nelem > 0)
1077 return re_node_set_init_copy (dest, src1);
1078 else if (src2 != NULL && src2->nelem > 0)
1079 return re_node_set_init_copy (dest, src2);
1080 else
1081 re_node_set_init_empty (dest);
1082 return REG_NOERROR;
1084 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1086 if (src1->elems[i1] > src2->elems[i2])
1088 dest->elems[id++] = src2->elems[i2++];
1089 continue;
1091 if (src1->elems[i1] == src2->elems[i2])
1092 ++i2;
1093 dest->elems[id++] = src1->elems[i1++];
1095 if (i1 < src1->nelem)
1097 memcpy (dest->elems + id, src1->elems + i1,
1098 (src1->nelem - i1) * sizeof (int));
1099 id += src1->nelem - i1;
1101 else if (i2 < src2->nelem)
1103 memcpy (dest->elems + id, src2->elems + i2,
1104 (src2->nelem - i2) * sizeof (int));
1105 id += src2->nelem - i2;
1107 dest->nelem = id;
1108 return REG_NOERROR;
1111 /* Calculate the union set of the sets DEST and SRC. And store it to
1112 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1114 static reg_errcode_t
1115 re_node_set_merge (dest, src)
1116 re_node_set *dest;
1117 const re_node_set *src;
1119 int is, id, sbase, delta;
1120 if (src == NULL || src->nelem == 0)
1121 return REG_NOERROR;
1122 if (dest->alloc < 2 * src->nelem + dest->nelem)
1124 int new_alloc = 2 * (src->nelem + dest->alloc);
1125 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1126 if (BE (new_buffer == NULL, 0))
1127 return REG_ESPACE;
1128 dest->elems = new_buffer;
1129 dest->alloc = new_alloc;
1132 if (BE (dest->nelem == 0, 0))
1134 dest->nelem = src->nelem;
1135 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1136 return REG_NOERROR;
1139 /* Copy into the top of DEST the items of SRC that are not
1140 found in DEST. Maybe we could binary search in DEST? */
1141 for (sbase = dest->nelem + 2 * src->nelem,
1142 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1144 if (dest->elems[id] == src->elems[is])
1145 is--, id--;
1146 else if (dest->elems[id] < src->elems[is])
1147 dest->elems[--sbase] = src->elems[is--];
1148 else /* if (dest->elems[id] > src->elems[is]) */
1149 --id;
1152 if (is >= 0)
1154 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1155 sbase -= is + 1;
1156 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1159 id = dest->nelem - 1;
1160 is = dest->nelem + 2 * src->nelem - 1;
1161 delta = is - sbase + 1;
1162 if (delta == 0)
1163 return REG_NOERROR;
1165 /* Now copy. When DELTA becomes zero, the remaining
1166 DEST elements are already in place. */
1167 dest->nelem += delta;
1168 for (;;)
1170 if (dest->elems[is] > dest->elems[id])
1172 /* Copy from the top. */
1173 dest->elems[id + delta--] = dest->elems[is--];
1174 if (delta == 0)
1175 break;
1177 else
1179 /* Slide from the bottom. */
1180 dest->elems[id + delta] = dest->elems[id];
1181 if (--id < 0)
1183 /* Copy remaining SRC elements. */
1184 memcpy (dest->elems, dest->elems + sbase,
1185 delta * sizeof (int));
1186 break;
1191 return REG_NOERROR;
1194 /* Insert the new element ELEM to the re_node_set* SET.
1195 SET should not already have ELEM.
1196 return -1 if an error is occured, return 1 otherwise. */
1198 static int
1199 re_node_set_insert (set, elem)
1200 re_node_set *set;
1201 int elem;
1203 int idx;
1204 /* In case the set is empty. */
1205 if (set->alloc == 0)
1207 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1208 return 1;
1209 else
1210 return -1;
1213 if (BE (set->nelem, 0) == 0)
1215 /* We already guaranteed above that set->alloc != 0. */
1216 set->elems[0] = elem;
1217 ++set->nelem;
1218 return 1;
1221 /* Realloc if we need. */
1222 if (set->alloc == set->nelem)
1224 int *new_array;
1225 set->alloc = set->alloc * 2;
1226 new_array = re_realloc (set->elems, int, set->alloc);
1227 if (BE (new_array == NULL, 0))
1228 return -1;
1229 set->elems = new_array;
1232 /* Move the elements which follows the new element. Test the
1233 first element separately to skip a check in the inner loop. */
1234 if (elem < set->elems[0])
1236 idx = 0;
1237 for (idx = set->nelem; idx > 0; idx--)
1238 set->elems[idx] = set->elems[idx - 1];
1240 else
1242 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1243 set->elems[idx] = set->elems[idx - 1];
1246 /* Insert the new element. */
1247 set->elems[idx] = elem;
1248 ++set->nelem;
1249 return 1;
1252 /* Insert the new element ELEM to the re_node_set* SET.
1253 SET should not already have any element greater than or equal to ELEM.
1254 Return -1 if an error is occured, return 1 otherwise. */
1256 static int
1257 re_node_set_insert_last (set, elem)
1258 re_node_set *set;
1259 int elem;
1261 /* Realloc if we need. */
1262 if (set->alloc == set->nelem)
1264 int *new_array;
1265 set->alloc = (set->alloc + 1) * 2;
1266 new_array = re_realloc (set->elems, int, set->alloc);
1267 if (BE (new_array == NULL, 0))
1268 return -1;
1269 set->elems = new_array;
1272 /* Insert the new element. */
1273 set->elems[set->nelem++] = elem;
1274 return 1;
1277 /* Compare two node sets SET1 and SET2.
1278 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1280 static int
1281 re_node_set_compare (set1, set2)
1282 const re_node_set *set1, *set2;
1284 int i;
1285 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1286 return 0;
1287 for (i = set1->nelem ; --i >= 0 ; )
1288 if (set1->elems[i] != set2->elems[i])
1289 return 0;
1290 return 1;
1293 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1295 static int
1296 re_node_set_contains (set, elem)
1297 const re_node_set *set;
1298 int elem;
1300 unsigned int idx, right, mid;
1301 if (set->nelem <= 0)
1302 return 0;
1304 /* Binary search the element. */
1305 idx = 0;
1306 right = set->nelem - 1;
1307 while (idx < right)
1309 mid = (idx + right) / 2;
1310 if (set->elems[mid] < elem)
1311 idx = mid + 1;
1312 else
1313 right = mid;
1315 return set->elems[idx] == elem ? idx + 1 : 0;
1318 static void
1319 re_node_set_remove_at (set, idx)
1320 re_node_set *set;
1321 int idx;
1323 if (idx < 0 || idx >= set->nelem)
1324 return;
1325 --set->nelem;
1326 for (; idx < set->nelem; idx++)
1327 set->elems[idx] = set->elems[idx + 1];
1331 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1332 Or return -1, if an error will be occured. */
1334 static int
1335 re_dfa_add_node (dfa, token, mode)
1336 re_dfa_t *dfa;
1337 re_token_t token;
1338 int mode;
1340 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1342 int new_nodes_alloc = dfa->nodes_alloc * 2;
1343 re_token_t *new_array = re_realloc (dfa->nodes, re_token_t,
1344 new_nodes_alloc);
1345 if (BE (new_array == NULL, 0))
1346 return -1;
1347 dfa->nodes = new_array;
1348 if (mode)
1350 int *new_nexts, *new_indices;
1351 re_node_set *new_edests, *new_eclosures, *new_inveclosures;
1353 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1354 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1355 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1356 new_eclosures = re_realloc (dfa->eclosures, re_node_set,
1357 new_nodes_alloc);
1358 new_inveclosures = re_realloc (dfa->inveclosures, re_node_set,
1359 new_nodes_alloc);
1360 if (BE (new_nexts == NULL || new_indices == NULL
1361 || new_edests == NULL || new_eclosures == NULL
1362 || new_inveclosures == NULL, 0))
1363 return -1;
1364 dfa->nexts = new_nexts;
1365 dfa->org_indices = new_indices;
1366 dfa->edests = new_edests;
1367 dfa->eclosures = new_eclosures;
1368 dfa->inveclosures = new_inveclosures;
1370 dfa->nodes_alloc = new_nodes_alloc;
1372 dfa->nodes[dfa->nodes_len] = token;
1373 dfa->nodes[dfa->nodes_len].opt_subexp = 0;
1374 dfa->nodes[dfa->nodes_len].duplicated = 0;
1375 dfa->nodes[dfa->nodes_len].constraint = 0;
1376 return dfa->nodes_len++;
1379 static unsigned int inline
1380 calc_state_hash (nodes, context)
1381 const re_node_set *nodes;
1382 unsigned int context;
1384 unsigned int hash = nodes->nelem + context;
1385 int i;
1386 for (i = 0 ; i < nodes->nelem ; i++)
1387 hash += nodes->elems[i];
1388 return hash;
1391 /* Search for the state whose node_set is equivalent to NODES.
1392 Return the pointer to the state, if we found it in the DFA.
1393 Otherwise create the new one and return it. In case of an error
1394 return NULL and set the error code in ERR.
1395 Note: - We assume NULL as the invalid state, then it is possible that
1396 return value is NULL and ERR is REG_NOERROR.
1397 - We never return non-NULL value in case of any errors, it is for
1398 optimization. */
1400 static re_dfastate_t*
1401 re_acquire_state (err, dfa, nodes)
1402 reg_errcode_t *err;
1403 re_dfa_t *dfa;
1404 const re_node_set *nodes;
1406 unsigned int hash;
1407 re_dfastate_t *new_state;
1408 struct re_state_table_entry *spot;
1409 int i;
1410 if (BE (nodes->nelem == 0, 0))
1412 *err = REG_NOERROR;
1413 return NULL;
1415 hash = calc_state_hash (nodes, 0);
1416 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1418 for (i = 0 ; i < spot->num ; i++)
1420 re_dfastate_t *state = spot->array[i];
1421 if (hash != state->hash)
1422 continue;
1423 if (re_node_set_compare (&state->nodes, nodes))
1424 return state;
1427 /* There are no appropriate state in the dfa, create the new one. */
1428 new_state = create_ci_newstate (dfa, nodes, hash);
1429 if (BE (new_state != NULL, 1))
1430 return new_state;
1431 else
1433 *err = REG_ESPACE;
1434 return NULL;
1438 /* Search for the state whose node_set is equivalent to NODES and
1439 whose context is equivalent to CONTEXT.
1440 Return the pointer to the state, if we found it in the DFA.
1441 Otherwise create the new one and return it. In case of an error
1442 return NULL and set the error code in ERR.
1443 Note: - We assume NULL as the invalid state, then it is possible that
1444 return value is NULL and ERR is REG_NOERROR.
1445 - We never return non-NULL value in case of any errors, it is for
1446 optimization. */
1448 static re_dfastate_t*
1449 re_acquire_state_context (err, dfa, nodes, context)
1450 reg_errcode_t *err;
1451 re_dfa_t *dfa;
1452 const re_node_set *nodes;
1453 unsigned int context;
1455 unsigned int hash;
1456 re_dfastate_t *new_state;
1457 struct re_state_table_entry *spot;
1458 int i;
1459 if (nodes->nelem == 0)
1461 *err = REG_NOERROR;
1462 return NULL;
1464 hash = calc_state_hash (nodes, context);
1465 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1467 for (i = 0 ; i < spot->num ; i++)
1469 re_dfastate_t *state = spot->array[i];
1470 if (state->hash == hash
1471 && state->context == context
1472 && re_node_set_compare (state->entrance_nodes, nodes))
1473 return state;
1475 /* There are no appropriate state in `dfa', create the new one. */
1476 new_state = create_cd_newstate (dfa, nodes, context, hash);
1477 if (BE (new_state != NULL, 1))
1478 return new_state;
1479 else
1481 *err = REG_ESPACE;
1482 return NULL;
1486 /* Finish initialization of the new state NEWSTATE, and using its hash value
1487 HASH put in the appropriate bucket of DFA's state table. Return value
1488 indicates the error code if failed. */
1490 static reg_errcode_t
1491 register_state (dfa, newstate, hash)
1492 re_dfa_t *dfa;
1493 re_dfastate_t *newstate;
1494 unsigned int hash;
1496 struct re_state_table_entry *spot;
1497 reg_errcode_t err;
1498 int i;
1500 newstate->hash = hash;
1501 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1502 if (BE (err != REG_NOERROR, 0))
1503 return REG_ESPACE;
1504 for (i = 0; i < newstate->nodes.nelem; i++)
1506 int elem = newstate->nodes.elems[i];
1507 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1508 re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1511 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1512 if (BE (spot->alloc <= spot->num, 0))
1514 int new_alloc = 2 * spot->num + 2;
1515 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1516 new_alloc);
1517 if (BE (new_array == NULL, 0))
1518 return REG_ESPACE;
1519 spot->array = new_array;
1520 spot->alloc = new_alloc;
1522 spot->array[spot->num++] = newstate;
1523 return REG_NOERROR;
1526 /* Create the new state which is independ of contexts.
1527 Return the new state if succeeded, otherwise return NULL. */
1529 static re_dfastate_t *
1530 create_ci_newstate (dfa, nodes, hash)
1531 re_dfa_t *dfa;
1532 const re_node_set *nodes;
1533 unsigned int hash;
1535 int i;
1536 reg_errcode_t err;
1537 re_dfastate_t *newstate;
1539 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1540 if (BE (newstate == NULL, 0))
1541 return NULL;
1542 err = re_node_set_init_copy (&newstate->nodes, nodes);
1543 if (BE (err != REG_NOERROR, 0))
1545 re_free (newstate);
1546 return NULL;
1549 newstate->entrance_nodes = &newstate->nodes;
1550 for (i = 0 ; i < nodes->nelem ; i++)
1552 re_token_t *node = dfa->nodes + nodes->elems[i];
1553 re_token_type_t type = node->type;
1554 if (type == CHARACTER && !node->constraint)
1555 continue;
1557 /* If the state has the halt node, the state is a halt state. */
1558 else if (type == END_OF_RE)
1559 newstate->halt = 1;
1560 #ifdef RE_ENABLE_I18N
1561 else if (type == COMPLEX_BRACKET
1562 || type == OP_UTF8_PERIOD
1563 || (type == OP_PERIOD && dfa->mb_cur_max > 1))
1564 newstate->accept_mb = 1;
1565 #endif /* RE_ENABLE_I18N */
1566 else if (type == OP_BACK_REF)
1567 newstate->has_backref = 1;
1568 else if (type == ANCHOR || node->constraint)
1569 newstate->has_constraint = 1;
1571 err = register_state (dfa, newstate, hash);
1572 if (BE (err != REG_NOERROR, 0))
1574 free_state (newstate);
1575 newstate = NULL;
1577 return newstate;
1580 /* Create the new state which is depend on the context CONTEXT.
1581 Return the new state if succeeded, otherwise return NULL. */
1583 static re_dfastate_t *
1584 create_cd_newstate (dfa, nodes, context, hash)
1585 re_dfa_t *dfa;
1586 const re_node_set *nodes;
1587 unsigned int context, hash;
1589 int i, nctx_nodes = 0;
1590 reg_errcode_t err;
1591 re_dfastate_t *newstate;
1593 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1594 if (BE (newstate == NULL, 0))
1595 return NULL;
1596 err = re_node_set_init_copy (&newstate->nodes, nodes);
1597 if (BE (err != REG_NOERROR, 0))
1599 re_free (newstate);
1600 return NULL;
1603 newstate->context = context;
1604 newstate->entrance_nodes = &newstate->nodes;
1606 for (i = 0 ; i < nodes->nelem ; i++)
1608 unsigned int constraint = 0;
1609 re_token_t *node = dfa->nodes + nodes->elems[i];
1610 re_token_type_t type = node->type;
1611 if (node->constraint)
1612 constraint = node->constraint;
1614 if (type == CHARACTER && !constraint)
1615 continue;
1616 /* If the state has the halt node, the state is a halt state. */
1617 else if (type == END_OF_RE)
1618 newstate->halt = 1;
1619 #ifdef RE_ENABLE_I18N
1620 else if (type == COMPLEX_BRACKET
1621 || type == OP_UTF8_PERIOD
1622 || (type == OP_PERIOD && dfa->mb_cur_max > 1))
1623 newstate->accept_mb = 1;
1624 #endif /* RE_ENABLE_I18N */
1625 else if (type == OP_BACK_REF)
1626 newstate->has_backref = 1;
1627 else if (type == ANCHOR)
1628 constraint = node->opr.ctx_type;
1630 if (constraint)
1632 if (newstate->entrance_nodes == &newstate->nodes)
1634 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1635 if (BE (newstate->entrance_nodes == NULL, 0))
1637 free_state (newstate);
1638 return NULL;
1640 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1641 nctx_nodes = 0;
1642 newstate->has_constraint = 1;
1645 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1647 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1648 ++nctx_nodes;
1652 err = register_state (dfa, newstate, hash);
1653 if (BE (err != REG_NOERROR, 0))
1655 free_state (newstate);
1656 newstate = NULL;
1658 return newstate;
1661 static void
1662 free_state (state)
1663 re_dfastate_t *state;
1665 re_node_set_free (&state->non_eps_nodes);
1666 re_node_set_free (&state->inveclosure);
1667 if (state->entrance_nodes != &state->nodes)
1669 re_node_set_free (state->entrance_nodes);
1670 re_free (state->entrance_nodes);
1672 re_node_set_free (&state->nodes);
1673 re_free (state->trtable);
1674 re_free (state);