Revert last. It is still wrong.
[gnupg.git] / util / regex_internal.c
blob0f8b897b1073b0162fcabd4a17d4d22a300f3206
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02110-1301 USA. */
21 #include <assert.h>
22 #include <ctype.h>
23 #include <limits.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
28 #if defined HAVE_WCHAR_H || defined _LIBC
29 # include <wchar.h>
30 #endif /* HAVE_WCHAR_H || _LIBC */
31 #if defined HAVE_WCTYPE_H || defined _LIBC
32 # include <wctype.h>
33 #endif /* HAVE_WCTYPE_H || _LIBC */
35 #ifdef _LIBC
36 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
37 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
38 # include <locale/localeinfo.h>
39 # include <locale/elem-hash.h>
40 # include <locale/coll-lookup.h>
41 # endif
42 #endif
44 /* This is for other GNU distributions with internationalized messages. */
45 #if HAVE_LIBINTL_H || defined _LIBC
46 # include <libintl.h>
47 # ifdef _LIBC
48 # undef gettext
49 # define gettext(msgid) \
50 INTUSE(__dcgettext) (_libc_intl_domainname_internal, msgid, LC_MESSAGES)
51 # endif
52 #else
53 # define gettext(msgid) (msgid)
54 #endif
56 #ifndef gettext_noop
57 /* This define is so xgettext can find the internationalizable
58 strings. */
59 # define gettext_noop(String) String
60 #endif
62 #include "_regex.h" /* gnupg */
63 #include "regex_internal.h"
65 static void re_string_construct_common (const char *str, int len,
66 re_string_t *pstr,
67 RE_TRANSLATE_TYPE trans, int icase);
68 #ifdef RE_ENABLE_I18N
69 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx);
70 #endif /* RE_ENABLE_I18N */
71 static re_dfastate_t *create_newstate_common (re_dfa_t *dfa,
72 const re_node_set *nodes,
73 unsigned int hash);
74 static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
75 unsigned int hash);
76 static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
77 const re_node_set *nodes,
78 unsigned int hash);
79 static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
80 const re_node_set *nodes,
81 unsigned int context,
82 unsigned int hash);
83 static unsigned int inline calc_state_hash (const re_node_set *nodes,
84 unsigned int context);
86 /* Functions for string operation. */
88 /* This function allocate the buffers. It is necessary to call
89 re_string_reconstruct before using the object. */
91 static reg_errcode_t
92 re_string_allocate (pstr, str, len, init_len, trans, icase)
93 re_string_t *pstr;
94 const char *str;
95 int len, init_len, icase;
96 RE_TRANSLATE_TYPE trans;
98 reg_errcode_t ret;
99 int init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
100 re_string_construct_common (str, len, pstr, trans, icase);
101 pstr->stop = pstr->len;
103 ret = re_string_realloc_buffers (pstr, init_buf_len);
104 if (BE (ret != REG_NOERROR, 0))
105 return ret;
107 pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case
108 : (unsigned char *) str);
109 pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case;
110 pstr->valid_len = (MBS_CASE_ALLOCATED (pstr) || MBS_ALLOCATED (pstr)
111 || MB_CUR_MAX > 1) ? pstr->valid_len : len;
112 return REG_NOERROR;
115 /* This function allocate the buffers, and initialize them. */
117 static reg_errcode_t
118 re_string_construct (pstr, str, len, trans, icase)
119 re_string_t *pstr;
120 const char *str;
121 int len, icase;
122 RE_TRANSLATE_TYPE trans;
124 reg_errcode_t ret;
125 re_string_construct_common (str, len, pstr, trans, icase);
126 pstr->stop = pstr->len;
127 /* Set 0 so that this function can initialize whole buffers. */
128 pstr->valid_len = 0;
130 if (len > 0)
132 ret = re_string_realloc_buffers (pstr, len + 1);
133 if (BE (ret != REG_NOERROR, 0))
134 return ret;
136 pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case
137 : (unsigned char *) str);
138 pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case;
140 if (icase)
142 #ifdef RE_ENABLE_I18N
143 if (MB_CUR_MAX > 1)
144 build_wcs_upper_buffer (pstr);
145 else
146 #endif /* RE_ENABLE_I18N */
147 build_upper_buffer (pstr);
149 else
151 #ifdef RE_ENABLE_I18N
152 if (MB_CUR_MAX > 1)
153 build_wcs_buffer (pstr);
154 else
155 #endif /* RE_ENABLE_I18N */
157 if (trans != NULL)
158 re_string_translate_buffer (pstr);
159 else
160 pstr->valid_len = len;
164 /* Initialized whole buffers, then valid_len == bufs_len. */
165 pstr->valid_len = pstr->bufs_len;
166 return REG_NOERROR;
169 /* Helper functions for re_string_allocate, and re_string_construct. */
171 static reg_errcode_t
172 re_string_realloc_buffers (pstr, new_buf_len)
173 re_string_t *pstr;
174 int new_buf_len;
176 #ifdef RE_ENABLE_I18N
177 if (MB_CUR_MAX > 1)
179 pstr->wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
180 if (BE (pstr->wcs == NULL, 0))
181 return REG_ESPACE;
183 #endif /* RE_ENABLE_I18N */
184 if (MBS_ALLOCATED (pstr))
186 pstr->mbs = re_realloc (pstr->mbs, unsigned char, new_buf_len);
187 if (BE (pstr->mbs == NULL, 0))
188 return REG_ESPACE;
190 if (MBS_CASE_ALLOCATED (pstr))
192 pstr->mbs_case = re_realloc (pstr->mbs_case, unsigned char, new_buf_len);
193 if (BE (pstr->mbs_case == NULL, 0))
194 return REG_ESPACE;
195 if (!MBS_ALLOCATED (pstr))
196 pstr->mbs = pstr->mbs_case;
198 pstr->bufs_len = new_buf_len;
199 return REG_NOERROR;
203 static void
204 re_string_construct_common (str, len, pstr, trans, icase)
205 const char *str;
206 int len;
207 re_string_t *pstr;
208 RE_TRANSLATE_TYPE trans;
209 int icase;
211 memset (pstr, '\0', sizeof (re_string_t));
212 pstr->raw_mbs = (const unsigned char *) str;
213 pstr->len = len;
214 pstr->trans = trans;
215 pstr->icase = icase ? 1 : 0;
218 #ifdef RE_ENABLE_I18N
220 /* Build wide character buffer PSTR->WCS.
221 If the byte sequence of the string are:
222 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
223 Then wide character buffer will be:
224 <wc1> , WEOF , <wc2> , WEOF , <wc3>
225 We use WEOF for padding, they indicate that the position isn't
226 a first byte of a multibyte character.
228 Note that this function assumes PSTR->VALID_LEN elements are already
229 built and starts from PSTR->VALID_LEN. */
231 static void
232 build_wcs_buffer (pstr)
233 re_string_t *pstr;
235 mbstate_t prev_st;
236 int byte_idx, end_idx, mbclen, remain_len;
237 /* Build the buffers from pstr->valid_len to either pstr->len or
238 pstr->bufs_len. */
239 end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len;
240 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
242 wchar_t wc;
243 remain_len = end_idx - byte_idx;
244 prev_st = pstr->cur_state;
245 mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
246 + byte_idx), remain_len, &pstr->cur_state);
247 if (BE (mbclen == (size_t) -2, 0))
249 /* The buffer doesn't have enough space, finish to build. */
250 pstr->cur_state = prev_st;
251 break;
253 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
255 /* We treat these cases as a singlebyte character. */
256 mbclen = 1;
257 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
258 pstr->cur_state = prev_st;
261 /* Apply the translateion if we need. */
262 if (pstr->trans != NULL && mbclen == 1)
264 int ch = pstr->trans[pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]];
265 pstr->mbs_case[byte_idx] = ch;
267 /* Write wide character and padding. */
268 pstr->wcs[byte_idx++] = wc;
269 /* Write paddings. */
270 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
271 pstr->wcs[byte_idx++] = WEOF;
273 pstr->valid_len = byte_idx;
276 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
277 but for REG_ICASE. */
279 static void
280 build_wcs_upper_buffer (pstr)
281 re_string_t *pstr;
283 mbstate_t prev_st;
284 int byte_idx, end_idx, mbclen, remain_len;
285 /* Build the buffers from pstr->valid_len to either pstr->len or
286 pstr->bufs_len. */
287 end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len;
288 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
290 wchar_t wc;
291 remain_len = end_idx - byte_idx;
292 prev_st = pstr->cur_state;
293 mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
294 + byte_idx), remain_len, &pstr->cur_state);
295 if (BE (mbclen == (size_t) -2, 0))
297 /* The buffer doesn't have enough space, finish to build. */
298 pstr->cur_state = prev_st;
299 break;
301 else if (mbclen == 1 || mbclen == (size_t) -1 || mbclen == 0)
303 /* In case of a singlebyte character. */
304 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
305 /* Apply the translateion if we need. */
306 if (pstr->trans != NULL && mbclen == 1)
308 ch = pstr->trans[ch];
309 pstr->mbs_case[byte_idx] = ch;
311 pstr->wcs[byte_idx] = iswlower (wc) ? toupper (wc) : wc;
312 pstr->mbs[byte_idx++] = islower (ch) ? toupper (ch) : ch;
313 if (BE (mbclen == (size_t) -1, 0))
314 pstr->cur_state = prev_st;
316 else /* mbclen > 1 */
318 if (iswlower (wc))
319 wcrtomb ((char *) pstr->mbs + byte_idx, towupper (wc), &prev_st);
320 else
321 memcpy (pstr->mbs + byte_idx,
322 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
323 pstr->wcs[byte_idx++] = iswlower (wc) ? toupper (wc) : wc;
324 /* Write paddings. */
325 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
326 pstr->wcs[byte_idx++] = WEOF;
329 pstr->valid_len = byte_idx;
332 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
333 Return the index. */
335 static int
336 re_string_skip_chars (pstr, new_raw_idx)
337 re_string_t *pstr;
338 int new_raw_idx;
340 mbstate_t prev_st;
341 int rawbuf_idx, mbclen;
343 /* Skip the characters which are not necessary to check. */
344 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_len;
345 rawbuf_idx < new_raw_idx;)
347 int remain_len = pstr->len - rawbuf_idx;
348 prev_st = pstr->cur_state;
349 mbclen = mbrlen ((const char *) pstr->raw_mbs + rawbuf_idx, remain_len,
350 &pstr->cur_state);
351 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
353 /* We treat these cases as a singlebyte character. */
354 mbclen = 1;
355 pstr->cur_state = prev_st;
357 /* Then proceed the next character. */
358 rawbuf_idx += mbclen;
360 return rawbuf_idx;
362 #endif /* RE_ENABLE_I18N */
364 /* Build the buffer PSTR->MBS, and apply the translation if we need.
365 This function is used in case of REG_ICASE. */
367 static void
368 build_upper_buffer (pstr)
369 re_string_t *pstr;
371 int char_idx, end_idx;
372 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
374 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
376 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
377 if (pstr->trans != NULL)
379 ch = pstr->trans[ch];
380 pstr->mbs_case[char_idx] = ch;
382 if (islower (ch))
383 pstr->mbs[char_idx] = toupper (ch);
384 else
385 pstr->mbs[char_idx] = ch;
387 pstr->valid_len = char_idx;
390 /* Apply TRANS to the buffer in PSTR. */
392 static void
393 re_string_translate_buffer (pstr)
394 re_string_t *pstr;
396 int buf_idx, end_idx;
397 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
399 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
401 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
402 pstr->mbs_case[buf_idx] = pstr->trans[ch];
405 pstr->valid_len = buf_idx;
408 /* This function re-construct the buffers.
409 Concretely, convert to wide character in case of MB_CUR_MAX > 1,
410 convert to upper case in case of REG_ICASE, apply translation. */
412 static reg_errcode_t
413 re_string_reconstruct (pstr, idx, eflags, newline)
414 re_string_t *pstr;
415 int idx, eflags, newline;
417 int offset = idx - pstr->raw_mbs_idx;
418 if (offset < 0)
420 /* Reset buffer. */
421 #ifdef RE_ENABLE_I18N
422 if (MB_CUR_MAX > 1)
423 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
424 #endif /* RE_ENABLE_I18N */
425 pstr->len += pstr->raw_mbs_idx;
426 pstr->stop += pstr->raw_mbs_idx;
427 pstr->valid_len = pstr->raw_mbs_idx = 0;
428 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
429 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
430 if (!MBS_CASE_ALLOCATED (pstr))
431 pstr->mbs_case = (unsigned char *) pstr->raw_mbs;
432 if (!MBS_ALLOCATED (pstr) && !MBS_CASE_ALLOCATED (pstr))
433 pstr->mbs = (unsigned char *) pstr->raw_mbs;
434 offset = idx;
437 if (offset != 0)
439 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags,
440 newline);
441 /* Are the characters which are already checked remain? */
442 if (offset < pstr->valid_len)
444 /* Yes, move them to the front of the buffer. */
445 #ifdef RE_ENABLE_I18N
446 if (MB_CUR_MAX > 1)
447 memmove (pstr->wcs, pstr->wcs + offset,
448 (pstr->valid_len - offset) * sizeof (wint_t));
449 #endif /* RE_ENABLE_I18N */
450 if (MBS_ALLOCATED (pstr))
451 memmove (pstr->mbs, pstr->mbs + offset,
452 pstr->valid_len - offset);
453 if (MBS_CASE_ALLOCATED (pstr))
454 memmove (pstr->mbs_case, pstr->mbs_case + offset,
455 pstr->valid_len - offset);
456 pstr->valid_len -= offset;
457 #if DEBUG
458 assert (pstr->valid_len > 0);
459 #endif
461 else
463 /* No, skip all characters until IDX. */
464 pstr->valid_len = 0;
465 #ifdef RE_ENABLE_I18N
466 if (MB_CUR_MAX > 1)
468 int wcs_idx;
469 pstr->valid_len = re_string_skip_chars (pstr, idx) - idx;
470 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
471 pstr->wcs[wcs_idx] = WEOF;
473 #endif /* RE_ENABLE_I18N */
475 if (!MBS_CASE_ALLOCATED (pstr))
477 pstr->mbs_case += offset;
478 /* In case of !MBS_ALLOCATED && !MBS_CASE_ALLOCATED. */
479 if (!MBS_ALLOCATED (pstr))
480 pstr->mbs += offset;
483 pstr->raw_mbs_idx = idx;
484 pstr->len -= offset;
485 pstr->stop -= offset;
487 /* Then build the buffers. */
488 #ifdef RE_ENABLE_I18N
489 if (MB_CUR_MAX > 1)
491 if (pstr->icase)
492 build_wcs_upper_buffer (pstr);
493 else
494 build_wcs_buffer (pstr);
496 else
497 #endif /* RE_ENABLE_I18N */
499 if (pstr->icase)
500 build_upper_buffer (pstr);
501 else if (pstr->trans != NULL)
502 re_string_translate_buffer (pstr);
504 pstr->cur_idx = 0;
506 return REG_NOERROR;
509 static void
510 re_string_destruct (pstr)
511 re_string_t *pstr;
513 #ifdef RE_ENABLE_I18N
514 re_free (pstr->wcs);
515 #endif /* RE_ENABLE_I18N */
516 if (MBS_ALLOCATED (pstr))
517 re_free (pstr->mbs);
518 if (MBS_CASE_ALLOCATED (pstr))
519 re_free (pstr->mbs_case);
522 /* Return the context at IDX in INPUT. */
524 static unsigned int
525 re_string_context_at (input, idx, eflags, newline_anchor)
526 const re_string_t *input;
527 int idx, eflags, newline_anchor;
529 int c;
530 if (idx < 0 || idx == input->len)
532 if (idx < 0)
533 /* In this case, we use the value stored in input->tip_context,
534 since we can't know the character in input->mbs[-1] here. */
535 return input->tip_context;
536 else /* (idx == input->len) */
537 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
538 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
540 c = re_string_byte_at (input, idx);
541 if (IS_WORD_CHAR (c))
542 return CONTEXT_WORD;
543 return (newline_anchor && IS_NEWLINE (c)) ? CONTEXT_NEWLINE : 0;
546 /* Functions for set operation. */
548 static reg_errcode_t
549 re_node_set_alloc (set, size)
550 re_node_set *set;
551 int size;
553 set->alloc = size;
554 set->nelem = 0;
555 set->elems = re_malloc (int, size);
556 if (BE (set->elems == NULL, 0))
557 return REG_ESPACE;
558 return REG_NOERROR;
561 static reg_errcode_t
562 re_node_set_init_1 (set, elem)
563 re_node_set *set;
564 int elem;
566 set->alloc = 1;
567 set->nelem = 1;
568 set->elems = re_malloc (int, 1);
569 if (BE (set->elems == NULL, 0))
570 return REG_ESPACE;
571 set->elems[0] = elem;
572 return REG_NOERROR;
575 static reg_errcode_t
576 re_node_set_init_2 (set, elem1, elem2)
577 re_node_set *set;
578 int elem1, elem2;
580 set->alloc = 2;
581 set->elems = re_malloc (int, 2);
582 if (BE (set->elems == NULL, 0))
583 return REG_ESPACE;
584 if (elem1 == elem2)
586 set->nelem = 1;
587 set->elems[0] = elem1;
589 else
591 set->nelem = 2;
592 if (elem1 < elem2)
594 set->elems[0] = elem1;
595 set->elems[1] = elem2;
597 else
599 set->elems[0] = elem2;
600 set->elems[1] = elem1;
603 return REG_NOERROR;
606 static reg_errcode_t
607 re_node_set_init_copy (dest, src)
608 re_node_set *dest;
609 const re_node_set *src;
611 dest->nelem = src->nelem;
612 if (src->nelem > 0)
614 dest->alloc = dest->nelem;
615 dest->elems = re_malloc (int, dest->alloc);
616 if (BE (dest->elems == NULL, 0))
617 return REG_ESPACE;
618 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
620 else
621 re_node_set_init_empty (dest);
622 return REG_NOERROR;
625 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
626 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
627 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
629 static reg_errcode_t
630 re_node_set_add_intersect (dest, src1, src2)
631 re_node_set *dest;
632 const re_node_set *src1, *src2;
634 int i1, i2, id;
635 if (src1->nelem > 0 && src2->nelem > 0)
637 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
639 dest->alloc = src1->nelem + src2->nelem + dest->nelem;
640 dest->elems = re_realloc (dest->elems, int, dest->alloc);
641 if (BE (dest->elems == NULL, 0))
642 return REG_ESPACE;
645 else
646 return REG_NOERROR;
648 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
650 if (src1->elems[i1] > src2->elems[i2])
652 ++i2;
653 continue;
655 if (src1->elems[i1] == src2->elems[i2])
657 while (id < dest->nelem && dest->elems[id] < src2->elems[i2])
658 ++id;
659 if (id < dest->nelem && dest->elems[id] == src2->elems[i2])
660 ++id;
661 else
663 memmove (dest->elems + id + 1, dest->elems + id,
664 sizeof (int) * (dest->nelem - id));
665 dest->elems[id++] = src2->elems[i2++];
666 ++dest->nelem;
669 ++i1;
671 return REG_NOERROR;
674 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
675 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
677 static reg_errcode_t
678 re_node_set_init_union (dest, src1, src2)
679 re_node_set *dest;
680 const re_node_set *src1, *src2;
682 int i1, i2, id;
683 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
685 dest->alloc = src1->nelem + src2->nelem;
686 dest->elems = re_malloc (int, dest->alloc);
687 if (BE (dest->elems == NULL, 0))
688 return REG_ESPACE;
690 else
692 if (src1 != NULL && src1->nelem > 0)
693 return re_node_set_init_copy (dest, src1);
694 else if (src2 != NULL && src2->nelem > 0)
695 return re_node_set_init_copy (dest, src2);
696 else
697 re_node_set_init_empty (dest);
698 return REG_NOERROR;
700 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
702 if (src1->elems[i1] > src2->elems[i2])
704 dest->elems[id++] = src2->elems[i2++];
705 continue;
707 if (src1->elems[i1] == src2->elems[i2])
708 ++i2;
709 dest->elems[id++] = src1->elems[i1++];
711 if (i1 < src1->nelem)
713 memcpy (dest->elems + id, src1->elems + i1,
714 (src1->nelem - i1) * sizeof (int));
715 id += src1->nelem - i1;
717 else if (i2 < src2->nelem)
719 memcpy (dest->elems + id, src2->elems + i2,
720 (src2->nelem - i2) * sizeof (int));
721 id += src2->nelem - i2;
723 dest->nelem = id;
724 return REG_NOERROR;
727 /* Calculate the union set of the sets DEST and SRC. And store it to
728 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
730 static reg_errcode_t
731 re_node_set_merge (dest, src)
732 re_node_set *dest;
733 const re_node_set *src;
735 int si, di;
736 if (src == NULL || src->nelem == 0)
737 return REG_NOERROR;
738 if (dest->alloc < src->nelem + dest->nelem)
740 dest->alloc = 2 * (src->nelem + dest->alloc);
741 dest->elems = re_realloc (dest->elems, int, dest->alloc);
742 if (BE (dest->elems == NULL, 0))
743 return REG_ESPACE;
746 for (si = 0, di = 0 ; si < src->nelem && di < dest->nelem ;)
748 int cp_from, ncp, mid, right, src_elem = src->elems[si];
749 /* Binary search the spot we will add the new element. */
750 right = dest->nelem;
751 while (di < right)
753 mid = (di + right) / 2;
754 if (dest->elems[mid] < src_elem)
755 di = mid + 1;
756 else
757 right = mid;
759 if (di >= dest->nelem)
760 break;
762 if (dest->elems[di] == src_elem)
764 /* Skip since, DEST already has the element. */
765 ++di;
766 ++si;
767 continue;
770 /* Skip the src elements which are less than dest->elems[di]. */
771 cp_from = si;
772 while (si < src->nelem && src->elems[si] < dest->elems[di])
773 ++si;
774 /* Copy these src elements. */
775 ncp = si - cp_from;
776 memmove (dest->elems + di + ncp, dest->elems + di,
777 sizeof (int) * (dest->nelem - di));
778 memcpy (dest->elems + di, src->elems + cp_from,
779 sizeof (int) * ncp);
780 /* Update counters. */
781 di += ncp;
782 dest->nelem += ncp;
785 /* Copy remaining src elements. */
786 if (si < src->nelem)
788 memcpy (dest->elems + di, src->elems + si,
789 sizeof (int) * (src->nelem - si));
790 dest->nelem += src->nelem - si;
792 return REG_NOERROR;
795 /* Insert the new element ELEM to the re_node_set* SET.
796 return 0 if SET already has ELEM,
797 return -1 if an error is occured, return 1 otherwise. */
799 static int
800 re_node_set_insert (set, elem)
801 re_node_set *set;
802 int elem;
804 int idx, right, mid;
805 /* In case of the set is empty. */
806 if (set->elems == NULL || set->alloc == 0)
808 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
809 return 1;
810 else
811 return -1;
814 /* Binary search the spot we will add the new element. */
815 idx = 0;
816 right = set->nelem;
817 while (idx < right)
819 mid = (idx + right) / 2;
820 if (set->elems[mid] < elem)
821 idx = mid + 1;
822 else
823 right = mid;
826 /* Realloc if we need. */
827 if (set->alloc < set->nelem + 1)
829 int *new_array;
830 set->alloc = set->alloc * 2;
831 new_array = re_malloc (int, set->alloc);
832 if (BE (new_array == NULL, 0))
833 return -1;
834 /* Copy the elements they are followed by the new element. */
835 if (idx > 0)
836 memcpy (new_array, set->elems, sizeof (int) * (idx));
837 /* Copy the elements which follows the new element. */
838 if (set->nelem - idx > 0)
839 memcpy (new_array + idx + 1, set->elems + idx,
840 sizeof (int) * (set->nelem - idx));
841 re_free (set->elems);
842 set->elems = new_array;
844 else
846 /* Move the elements which follows the new element. */
847 if (set->nelem - idx > 0)
848 memmove (set->elems + idx + 1, set->elems + idx,
849 sizeof (int) * (set->nelem - idx));
851 /* Insert the new element. */
852 set->elems[idx] = elem;
853 ++set->nelem;
854 return 1;
857 /* Compare two node sets SET1 and SET2.
858 return 1 if SET1 and SET2 are equivalent, retrun 0 otherwise. */
860 static int
861 re_node_set_compare (set1, set2)
862 const re_node_set *set1, *set2;
864 int i;
865 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
866 return 0;
867 for (i = 0 ; i < set1->nelem ; i++)
868 if (set1->elems[i] != set2->elems[i])
869 return 0;
870 return 1;
873 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
875 static int
876 re_node_set_contains (set, elem)
877 const re_node_set *set;
878 int elem;
880 int idx, right, mid;
881 if (set->nelem <= 0)
882 return 0;
884 /* Binary search the element. */
885 idx = 0;
886 right = set->nelem - 1;
887 while (idx < right)
889 mid = (idx + right) / 2;
890 if (set->elems[mid] < elem)
891 idx = mid + 1;
892 else
893 right = mid;
895 return set->elems[idx] == elem ? idx + 1 : 0;
898 static void
899 re_node_set_remove_at (set, idx)
900 re_node_set *set;
901 int idx;
903 if (idx < 0 || idx >= set->nelem)
904 return;
905 if (idx < set->nelem - 1)
906 memmove (set->elems + idx, set->elems + idx + 1,
907 sizeof (int) * (set->nelem - idx - 1));
908 --set->nelem;
912 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
913 Or return -1, if an error will be occured. */
915 static int
916 re_dfa_add_node (dfa, token, mode)
917 re_dfa_t *dfa;
918 re_token_t token;
919 int mode;
921 if (dfa->nodes_len >= dfa->nodes_alloc)
923 re_token_t *new_array;
924 dfa->nodes_alloc *= 2;
925 new_array = re_realloc (dfa->nodes, re_token_t, dfa->nodes_alloc);
926 if (BE (new_array == NULL, 0))
927 return -1;
928 else
929 dfa->nodes = new_array;
930 if (mode)
932 int *new_firsts, *new_nexts;
933 re_node_set *new_edests, *new_eclosures, *new_inveclosures;
935 new_firsts = re_realloc (dfa->firsts, int, dfa->nodes_alloc);
936 new_nexts = re_realloc (dfa->nexts, int, dfa->nodes_alloc);
937 new_edests = re_realloc (dfa->edests, re_node_set, dfa->nodes_alloc);
938 new_eclosures = re_realloc (dfa->eclosures, re_node_set,
939 dfa->nodes_alloc);
940 new_inveclosures = re_realloc (dfa->inveclosures, re_node_set,
941 dfa->nodes_alloc);
942 if (BE (new_firsts == NULL || new_nexts == NULL || new_edests == NULL
943 || new_eclosures == NULL || new_inveclosures == NULL, 0))
944 return -1;
945 dfa->firsts = new_firsts;
946 dfa->nexts = new_nexts;
947 dfa->edests = new_edests;
948 dfa->eclosures = new_eclosures;
949 dfa->inveclosures = new_inveclosures;
952 dfa->nodes[dfa->nodes_len] = token;
953 dfa->nodes[dfa->nodes_len].duplicated = 0;
954 return dfa->nodes_len++;
957 static unsigned int inline
958 calc_state_hash (nodes, context)
959 const re_node_set *nodes;
960 unsigned int context;
962 unsigned int hash = nodes->nelem + context;
963 int i;
964 for (i = 0 ; i < nodes->nelem ; i++)
965 hash += nodes->elems[i];
966 return hash;
969 /* Search for the state whose node_set is equivalent to NODES.
970 Return the pointer to the state, if we found it in the DFA.
971 Otherwise create the new one and return it. In case of an error
972 return NULL and set the error code in ERR.
973 Note: - We assume NULL as the invalid state, then it is possible that
974 return value is NULL and ERR is REG_NOERROR.
975 - We never return non-NULL value in case of any errors, it is for
976 optimization. */
978 static re_dfastate_t*
979 re_acquire_state (err, dfa, nodes)
980 reg_errcode_t *err;
981 re_dfa_t *dfa;
982 const re_node_set *nodes;
984 unsigned int hash;
985 re_dfastate_t *new_state;
986 struct re_state_table_entry *spot;
987 int i;
988 if (BE (nodes->nelem == 0, 0))
990 *err = REG_NOERROR;
991 return NULL;
993 hash = calc_state_hash (nodes, 0);
994 spot = dfa->state_table + (hash & dfa->state_hash_mask);
996 for (i = 0 ; i < spot->num ; i++)
998 re_dfastate_t *state = spot->array[i];
999 if (hash != state->hash)
1000 continue;
1001 if (re_node_set_compare (&state->nodes, nodes))
1002 return state;
1005 /* There are no appropriate state in the dfa, create the new one. */
1006 new_state = create_ci_newstate (dfa, nodes, hash);
1007 if (BE (new_state != NULL, 1))
1008 return new_state;
1009 else
1011 *err = REG_ESPACE;
1012 return NULL;
1016 /* Search for the state whose node_set is equivalent to NODES and
1017 whose context is equivalent to CONTEXT.
1018 Return the pointer to the state, if we found it in the DFA.
1019 Otherwise create the new one and return it. In case of an error
1020 return NULL and set the error code in ERR.
1021 Note: - We assume NULL as the invalid state, then it is possible that
1022 return value is NULL and ERR is REG_NOERROR.
1023 - We never return non-NULL value in case of any errors, it is for
1024 optimization. */
1026 static re_dfastate_t*
1027 re_acquire_state_context (err, dfa, nodes, context)
1028 reg_errcode_t *err;
1029 re_dfa_t *dfa;
1030 const re_node_set *nodes;
1031 unsigned int context;
1033 unsigned int hash;
1034 re_dfastate_t *new_state;
1035 struct re_state_table_entry *spot;
1036 int i;
1037 if (nodes->nelem == 0)
1039 *err = REG_NOERROR;
1040 return NULL;
1042 hash = calc_state_hash (nodes, context);
1043 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1045 for (i = 0 ; i < spot->num ; i++)
1047 re_dfastate_t *state = spot->array[i];
1048 if (hash != state->hash)
1049 continue;
1050 if (re_node_set_compare (state->entrance_nodes, nodes)
1051 && state->context == context)
1052 return state;
1054 /* There are no appropriate state in `dfa', create the new one. */
1055 new_state = create_cd_newstate (dfa, nodes, context, hash);
1056 if (BE (new_state != NULL, 1))
1057 return new_state;
1058 else
1060 *err = REG_ESPACE;
1061 return NULL;
1065 /* Allocate memory for DFA state and initialize common properties.
1066 Return the new state if succeeded, otherwise return NULL. */
1068 static re_dfastate_t *
1069 create_newstate_common (dfa, nodes, hash)
1070 re_dfa_t *dfa;
1071 const re_node_set *nodes;
1072 unsigned int hash;
1074 re_dfastate_t *newstate;
1075 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1076 if (BE (newstate == NULL, 0))
1077 return NULL;
1078 re_node_set_init_copy (&newstate->nodes, nodes);
1079 newstate->trtable = NULL;
1080 newstate->trtable_search = NULL;
1081 newstate->hash = hash;
1082 return newstate;
1085 /* Store the new state NEWSTATE whose hash value is HASH in appropriate
1086 position. Return value indicate the error code if failed. */
1088 static reg_errcode_t
1089 register_state (dfa, newstate, hash)
1090 re_dfa_t *dfa;
1091 re_dfastate_t *newstate;
1092 unsigned int hash;
1094 struct re_state_table_entry *spot;
1095 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1097 if (spot->alloc <= spot->num)
1099 spot->alloc = 2 * spot->num + 2;
1100 spot->array = re_realloc (spot->array, re_dfastate_t *, spot->alloc);
1101 if (BE (spot->array == NULL, 0))
1102 return REG_ESPACE;
1104 spot->array[spot->num++] = newstate;
1105 return REG_NOERROR;
1108 /* Create the new state which is independ of contexts.
1109 Return the new state if succeeded, otherwise return NULL. */
1111 static re_dfastate_t *
1112 create_ci_newstate (dfa, nodes, hash)
1113 re_dfa_t *dfa;
1114 const re_node_set *nodes;
1115 unsigned int hash;
1117 int i;
1118 reg_errcode_t err;
1119 re_dfastate_t *newstate;
1120 newstate = create_newstate_common (dfa, nodes, hash);
1121 if (BE (newstate == NULL, 0))
1122 return NULL;
1123 newstate->entrance_nodes = &newstate->nodes;
1125 for (i = 0 ; i < nodes->nelem ; i++)
1127 re_token_t *node = dfa->nodes + nodes->elems[i];
1128 re_token_type_t type = node->type;
1129 if (type == CHARACTER)
1130 continue;
1132 /* If the state has the halt node, the state is a halt state. */
1133 else if (type == END_OF_RE)
1134 newstate->halt = 1;
1135 #ifdef RE_ENABLE_I18N
1136 else if (type == COMPLEX_BRACKET
1137 || (type == OP_PERIOD && MB_CUR_MAX > 1))
1138 newstate->accept_mb = 1;
1139 #endif /* RE_ENABLE_I18N */
1140 else if (type == OP_BACK_REF)
1141 newstate->has_backref = 1;
1142 else if (type == ANCHOR || OP_CONTEXT_NODE)
1144 newstate->has_constraint = 1;
1145 if (type == OP_CONTEXT_NODE
1146 && dfa->nodes[node->opr.ctx_info->entity].type == END_OF_RE)
1147 newstate->halt = 1;
1150 err = register_state (dfa, newstate, hash);
1151 return (err != REG_NOERROR) ? NULL : newstate;
1154 /* Create the new state which is depend on the context CONTEXT.
1155 Return the new state if succeeded, otherwise return NULL. */
1157 static re_dfastate_t *
1158 create_cd_newstate (dfa, nodes, context, hash)
1159 re_dfa_t *dfa;
1160 const re_node_set *nodes;
1161 unsigned int context, hash;
1163 int i, nctx_nodes = 0;
1164 reg_errcode_t err;
1165 re_dfastate_t *newstate;
1167 newstate = create_newstate_common (dfa, nodes, hash);
1168 if (BE (newstate == NULL, 0))
1169 return NULL;
1170 newstate->context = context;
1171 newstate->entrance_nodes = &newstate->nodes;
1173 for (i = 0 ; i < nodes->nelem ; i++)
1175 unsigned int constraint = 0;
1176 re_token_t *node = dfa->nodes + nodes->elems[i];
1177 re_token_type_t type = node->type;
1178 if (type == CHARACTER)
1179 continue;
1181 /* If the state has the halt node, the state is a halt state. */
1182 else if (type == END_OF_RE)
1183 newstate->halt = 1;
1184 #ifdef RE_ENABLE_I18N
1185 else if (type == COMPLEX_BRACKET
1186 || (type == OP_PERIOD && MB_CUR_MAX > 1))
1187 newstate->accept_mb = 1;
1188 #endif /* RE_ENABLE_I18N */
1189 else if (type == OP_BACK_REF)
1190 newstate->has_backref = 1;
1191 else if (type == ANCHOR)
1192 constraint = node->opr.ctx_type;
1193 else if (type == OP_CONTEXT_NODE)
1195 re_token_type_t ctype = dfa->nodes[node->opr.ctx_info->entity].type;
1196 constraint = node->constraint;
1197 if (ctype == END_OF_RE)
1198 newstate->halt = 1;
1199 else if (ctype == OP_BACK_REF)
1200 newstate->has_backref = 1;
1201 #ifdef RE_ENABLE_I18N
1202 else if (ctype == COMPLEX_BRACKET
1203 || (type == OP_PERIOD && MB_CUR_MAX > 1))
1204 newstate->accept_mb = 1;
1205 #endif /* RE_ENABLE_I18N */
1208 if (constraint)
1210 if (newstate->entrance_nodes == &newstate->nodes)
1212 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1213 if (BE (newstate->entrance_nodes == NULL, 0))
1214 return NULL;
1215 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1216 nctx_nodes = 0;
1217 newstate->has_constraint = 1;
1220 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1222 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1223 ++nctx_nodes;
1227 err = register_state (dfa, newstate, hash);
1228 return (err != REG_NOERROR) ? NULL : newstate;