1.4.15 prep
[monitoring-plugins.git] / gl / regcomp.c
blob86ca02b0c5b08b70f466ff16aea09790eb8826cc
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
3 Software Foundation, Inc.
4 This file is part of the GNU C Library.
5 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along
18 with this program; if not, write to the Free Software Foundation,
19 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 #ifdef RE_ENABLE_I18N
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 #ifdef RE_ENABLE_I18N
33 static void optimize_utf8 (re_dfa_t *dfa);
34 #endif
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
38 void *extra);
39 static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 bin_tree_t *node);
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
50 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
51 unsigned int constraint);
52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 Idx node, bool root);
55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56 static Idx fetch_number (re_string_t *input, re_token_t *token,
57 reg_syntax_t syntax);
58 static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 Idx nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 bool accept_hyphen);
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89 #ifdef RE_ENABLE_I18N
90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 re_charset_t *mbcset,
92 Idx *equiv_class_alloc,
93 const unsigned char *name);
94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 bitset_t sbcset,
96 re_charset_t *mbcset,
97 Idx *char_class_alloc,
98 const unsigned char *class_name,
99 reg_syntax_t syntax);
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 bitset_t sbcset,
105 const unsigned char *class_name,
106 reg_syntax_t syntax);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const unsigned char *class_name,
111 const unsigned char *extra,
112 bool non_match, reg_errcode_t *err);
113 static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120 static void free_token (re_token_t *node);
121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
129 static const char __re_error_msgid[] =
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
133 "\0"
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
136 "\0"
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 "\0"
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 "\0"
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 "\0"
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 "\0"
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 "\0"
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 "\0"
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 "\0"
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 "\0"
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 "\0"
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 "\0"
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 "\0"
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 "\0"
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 "\0"
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 "\0"
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 static const size_t __re_error_msgid_idx[] =
185 REG_NOERROR_IDX,
186 REG_NOMATCH_IDX,
187 REG_BADPAT_IDX,
188 REG_ECOLLATE_IDX,
189 REG_ECTYPE_IDX,
190 REG_EESCAPE_IDX,
191 REG_ESUBREG_IDX,
192 REG_EBRACK_IDX,
193 REG_EPAREN_IDX,
194 REG_EBRACE_IDX,
195 REG_BADBR_IDX,
196 REG_ERANGE_IDX,
197 REG_ESPACE_IDX,
198 REG_BADRPT_IDX,
199 REG_EEND_IDX,
200 REG_ESIZE_IDX,
201 REG_ERPAREN_IDX
204 /* Entry points for GNU code. */
206 /* re_compile_pattern is the GNU regular expression compiler: it
207 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208 Returns 0 if the pattern was valid, otherwise an error string.
210 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211 are set in BUFP on entry. */
213 #ifdef _LIBC
214 const char *
215 re_compile_pattern (pattern, length, bufp)
216 const char *pattern;
217 size_t length;
218 struct re_pattern_buffer *bufp;
219 #else /* size_t might promote */
220 const char *
221 re_compile_pattern (const char *pattern, size_t length,
222 struct re_pattern_buffer *bufp)
223 #endif
225 reg_errcode_t ret;
227 /* And GNU code determines whether or not to get register information
228 by passing null for the REGS argument to re_match, etc., not by
229 setting no_sub, unless RE_NO_SUB is set. */
230 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
232 /* Match anchors at newline. */
233 bufp->newline_anchor = 1;
235 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
237 if (!ret)
238 return NULL;
239 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
241 #ifdef _LIBC
242 weak_alias (__re_compile_pattern, re_compile_pattern)
243 #endif
245 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
246 also be assigned to arbitrarily: each pattern buffer stores its own
247 syntax, so it can be changed between regex compilations. */
248 /* This has no initializer because initialized variables in Emacs
249 become read-only after dumping. */
250 reg_syntax_t re_syntax_options;
253 /* Specify the precise syntax of regexps for compilation. This provides
254 for compatibility for various utilities which historically have
255 different, incompatible syntaxes.
257 The argument SYNTAX is a bit mask comprised of the various bits
258 defined in regex.h. We return the old syntax. */
260 reg_syntax_t
261 re_set_syntax (syntax)
262 reg_syntax_t syntax;
264 reg_syntax_t ret = re_syntax_options;
266 re_syntax_options = syntax;
267 return ret;
269 #ifdef _LIBC
270 weak_alias (__re_set_syntax, re_set_syntax)
271 #endif
274 re_compile_fastmap (bufp)
275 struct re_pattern_buffer *bufp;
277 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
278 char *fastmap = bufp->fastmap;
280 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
281 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
282 if (dfa->init_state != dfa->init_state_word)
283 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
284 if (dfa->init_state != dfa->init_state_nl)
285 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
286 if (dfa->init_state != dfa->init_state_begbuf)
287 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
288 bufp->fastmap_accurate = 1;
289 return 0;
291 #ifdef _LIBC
292 weak_alias (__re_compile_fastmap, re_compile_fastmap)
293 #endif
295 static inline void
296 __attribute ((always_inline))
297 re_set_fastmap (char *fastmap, bool icase, int ch)
299 fastmap[ch] = 1;
300 if (icase)
301 fastmap[tolower (ch)] = 1;
304 /* Helper function for re_compile_fastmap.
305 Compile fastmap for the initial_state INIT_STATE. */
307 static void
308 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
309 char *fastmap)
311 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
312 Idx node_cnt;
313 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
314 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
316 Idx node = init_state->nodes.elems[node_cnt];
317 re_token_type_t type = dfa->nodes[node].type;
319 if (type == CHARACTER)
321 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
322 #ifdef RE_ENABLE_I18N
323 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
325 unsigned char buf[MB_LEN_MAX];
326 unsigned char *p;
327 wchar_t wc;
328 mbstate_t state;
330 p = buf;
331 *p++ = dfa->nodes[node].opr.c;
332 while (++node < dfa->nodes_len
333 && dfa->nodes[node].type == CHARACTER
334 && dfa->nodes[node].mb_partial)
335 *p++ = dfa->nodes[node].opr.c;
336 memset (&state, '\0', sizeof (state));
337 if (__mbrtowc (&wc, (const char *) buf, p - buf,
338 &state) == p - buf
339 && (__wcrtomb ((char *) buf, towlower (wc), &state)
340 != (size_t) -1))
341 re_set_fastmap (fastmap, false, buf[0]);
343 #endif
345 else if (type == SIMPLE_BRACKET)
347 int i, ch;
348 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
350 int j;
351 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
352 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
353 if (w & ((bitset_word_t) 1 << j))
354 re_set_fastmap (fastmap, icase, ch);
357 #ifdef RE_ENABLE_I18N
358 else if (type == COMPLEX_BRACKET)
360 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
361 Idx i;
363 # ifdef _LIBC
364 /* See if we have to try all bytes which start multiple collation
365 elements.
366 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
367 collation element, and don't catch 'b' since 'b' is
368 the only collation element which starts from 'b' (and
369 it is caught by SIMPLE_BRACKET). */
370 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
371 && (cset->ncoll_syms || cset->nranges))
373 const int32_t *table = (const int32_t *)
374 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
375 for (i = 0; i < SBC_MAX; ++i)
376 if (table[i] < 0)
377 re_set_fastmap (fastmap, icase, i);
379 # endif /* _LIBC */
381 /* See if we have to start the match at all multibyte characters,
382 i.e. where we would not find an invalid sequence. This only
383 applies to multibyte character sets; for single byte character
384 sets, the SIMPLE_BRACKET again suffices. */
385 if (dfa->mb_cur_max > 1
386 && (cset->nchar_classes || cset->non_match || cset->nranges
387 # ifdef _LIBC
388 || cset->nequiv_classes
389 # endif /* _LIBC */
392 unsigned char c = 0;
395 mbstate_t mbs;
396 memset (&mbs, 0, sizeof (mbs));
397 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
398 re_set_fastmap (fastmap, false, (int) c);
400 while (++c != 0);
403 else
405 /* ... Else catch all bytes which can start the mbchars. */
406 for (i = 0; i < cset->nmbchars; ++i)
408 char buf[256];
409 mbstate_t state;
410 memset (&state, '\0', sizeof (state));
411 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
412 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
413 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
415 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
416 != (size_t) -1)
417 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
422 #endif /* RE_ENABLE_I18N */
423 else if (type == OP_PERIOD
424 #ifdef RE_ENABLE_I18N
425 || type == OP_UTF8_PERIOD
426 #endif /* RE_ENABLE_I18N */
427 || type == END_OF_RE)
429 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
430 if (type == END_OF_RE)
431 bufp->can_be_null = 1;
432 return;
437 /* Entry point for POSIX code. */
438 /* regcomp takes a regular expression as a string and compiles it.
440 PREG is a regex_t *. We do not expect any fields to be initialized,
441 since POSIX says we shouldn't. Thus, we set
443 `buffer' to the compiled pattern;
444 `used' to the length of the compiled pattern;
445 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
446 REG_EXTENDED bit in CFLAGS is set; otherwise, to
447 RE_SYNTAX_POSIX_BASIC;
448 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
449 `fastmap' to an allocated space for the fastmap;
450 `fastmap_accurate' to zero;
451 `re_nsub' to the number of subexpressions in PATTERN.
453 PATTERN is the address of the pattern string.
455 CFLAGS is a series of bits which affect compilation.
457 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
458 use POSIX basic syntax.
460 If REG_NEWLINE is set, then . and [^...] don't match newline.
461 Also, regexec will try a match beginning after every newline.
463 If REG_ICASE is set, then we considers upper- and lowercase
464 versions of letters to be equivalent when matching.
466 If REG_NOSUB is set, then when PREG is passed to regexec, that
467 routine will report only success or failure, and nothing about the
468 registers.
470 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
471 the return codes and their meanings.) */
474 regcomp (preg, pattern, cflags)
475 regex_t *_Restrict_ preg;
476 const char *_Restrict_ pattern;
477 int cflags;
479 reg_errcode_t ret;
480 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
481 : RE_SYNTAX_POSIX_BASIC);
483 preg->buffer = NULL;
484 preg->allocated = 0;
485 preg->used = 0;
487 /* Try to allocate space for the fastmap. */
488 preg->fastmap = re_malloc (char, SBC_MAX);
489 if (BE (preg->fastmap == NULL, 0))
490 return REG_ESPACE;
492 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
494 /* If REG_NEWLINE is set, newlines are treated differently. */
495 if (cflags & REG_NEWLINE)
496 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
497 syntax &= ~RE_DOT_NEWLINE;
498 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
499 /* It also changes the matching behavior. */
500 preg->newline_anchor = 1;
502 else
503 preg->newline_anchor = 0;
504 preg->no_sub = !!(cflags & REG_NOSUB);
505 preg->translate = NULL;
507 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
509 /* POSIX doesn't distinguish between an unmatched open-group and an
510 unmatched close-group: both are REG_EPAREN. */
511 if (ret == REG_ERPAREN)
512 ret = REG_EPAREN;
514 /* We have already checked preg->fastmap != NULL. */
515 if (BE (ret == REG_NOERROR, 1))
516 /* Compute the fastmap now, since regexec cannot modify the pattern
517 buffer. This function never fails in this implementation. */
518 (void) re_compile_fastmap (preg);
519 else
521 /* Some error occurred while compiling the expression. */
522 re_free (preg->fastmap);
523 preg->fastmap = NULL;
526 return (int) ret;
528 #ifdef _LIBC
529 weak_alias (__regcomp, regcomp)
530 #endif
532 /* Returns a message corresponding to an error code, ERRCODE, returned
533 from either regcomp or regexec. We don't use PREG here. */
535 #ifdef _LIBC
536 size_t
537 regerror (errcode, preg, errbuf, errbuf_size)
538 int errcode;
539 const regex_t *_Restrict_ preg;
540 char *_Restrict_ errbuf;
541 size_t errbuf_size;
542 #else /* size_t might promote */
543 size_t
544 regerror (int errcode, const regex_t *_Restrict_ preg,
545 char *_Restrict_ errbuf, size_t errbuf_size)
546 #endif
548 const char *msg;
549 size_t msg_size;
551 if (BE (errcode < 0
552 || errcode >= (int) (sizeof (__re_error_msgid_idx)
553 / sizeof (__re_error_msgid_idx[0])), 0))
554 /* Only error codes returned by the rest of the code should be passed
555 to this routine. If we are given anything else, or if other regex
556 code generates an invalid error code, then the program has a bug.
557 Dump core so we can fix it. */
558 abort ();
560 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
562 msg_size = strlen (msg) + 1; /* Includes the null. */
564 if (BE (errbuf_size != 0, 1))
566 size_t cpy_size = msg_size;
567 if (BE (msg_size > errbuf_size, 0))
569 cpy_size = errbuf_size - 1;
570 errbuf[cpy_size] = '\0';
572 memcpy (errbuf, msg, cpy_size);
575 return msg_size;
577 #ifdef _LIBC
578 weak_alias (__regerror, regerror)
579 #endif
582 #ifdef RE_ENABLE_I18N
583 /* This static array is used for the map to single-byte characters when
584 UTF-8 is used. Otherwise we would allocate memory just to initialize
585 it the same all the time. UTF-8 is the preferred encoding so this is
586 a worthwhile optimization. */
587 static const bitset_t utf8_sb_map =
589 /* Set the first 128 bits. */
590 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
591 # error "bitset_word_t is narrower than 32 bits"
592 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
593 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
594 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
595 BITSET_WORD_MAX, BITSET_WORD_MAX,
596 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
597 BITSET_WORD_MAX,
598 # endif
599 (BITSET_WORD_MAX
600 >> (SBC_MAX % BITSET_WORD_BITS == 0
602 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
604 #endif
607 static void
608 free_dfa_content (re_dfa_t *dfa)
610 Idx i, j;
612 if (dfa->nodes)
613 for (i = 0; i < dfa->nodes_len; ++i)
614 free_token (dfa->nodes + i);
615 re_free (dfa->nexts);
616 for (i = 0; i < dfa->nodes_len; ++i)
618 if (dfa->eclosures != NULL)
619 re_node_set_free (dfa->eclosures + i);
620 if (dfa->inveclosures != NULL)
621 re_node_set_free (dfa->inveclosures + i);
622 if (dfa->edests != NULL)
623 re_node_set_free (dfa->edests + i);
625 re_free (dfa->edests);
626 re_free (dfa->eclosures);
627 re_free (dfa->inveclosures);
628 re_free (dfa->nodes);
630 if (dfa->state_table)
631 for (i = 0; i <= dfa->state_hash_mask; ++i)
633 struct re_state_table_entry *entry = dfa->state_table + i;
634 for (j = 0; j < entry->num; ++j)
636 re_dfastate_t *state = entry->array[j];
637 free_state (state);
639 re_free (entry->array);
641 re_free (dfa->state_table);
642 #ifdef RE_ENABLE_I18N
643 if (dfa->sb_char != utf8_sb_map)
644 re_free (dfa->sb_char);
645 #endif
646 re_free (dfa->subexp_map);
647 #ifdef DEBUG
648 re_free (dfa->re_str);
649 #endif
651 re_free (dfa);
655 /* Free dynamically allocated space used by PREG. */
657 void
658 regfree (preg)
659 regex_t *preg;
661 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
662 if (BE (dfa != NULL, 1))
663 free_dfa_content (dfa);
664 preg->buffer = NULL;
665 preg->allocated = 0;
667 re_free (preg->fastmap);
668 preg->fastmap = NULL;
670 re_free (preg->translate);
671 preg->translate = NULL;
673 #ifdef _LIBC
674 weak_alias (__regfree, regfree)
675 #endif
677 /* Entry points compatible with 4.2 BSD regex library. We don't define
678 them unless specifically requested. */
680 #if defined _REGEX_RE_COMP || defined _LIBC
682 /* BSD has one and only one pattern buffer. */
683 static struct re_pattern_buffer re_comp_buf;
685 char *
686 # ifdef _LIBC
687 /* Make these definitions weak in libc, so POSIX programs can redefine
688 these names if they don't use our functions, and still use
689 regcomp/regexec above without link errors. */
690 weak_function
691 # endif
692 re_comp (s)
693 const char *s;
695 reg_errcode_t ret;
696 char *fastmap;
698 if (!s)
700 if (!re_comp_buf.buffer)
701 return gettext ("No previous regular expression");
702 return 0;
705 if (re_comp_buf.buffer)
707 fastmap = re_comp_buf.fastmap;
708 re_comp_buf.fastmap = NULL;
709 __regfree (&re_comp_buf);
710 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
711 re_comp_buf.fastmap = fastmap;
714 if (re_comp_buf.fastmap == NULL)
716 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
717 if (re_comp_buf.fastmap == NULL)
718 return (char *) gettext (__re_error_msgid
719 + __re_error_msgid_idx[(int) REG_ESPACE]);
722 /* Since `re_exec' always passes NULL for the `regs' argument, we
723 don't need to initialize the pattern buffer fields which affect it. */
725 /* Match anchors at newlines. */
726 re_comp_buf.newline_anchor = 1;
728 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
730 if (!ret)
731 return NULL;
733 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
734 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
737 #ifdef _LIBC
738 libc_freeres_fn (free_mem)
740 __regfree (&re_comp_buf);
742 #endif
744 #endif /* _REGEX_RE_COMP */
746 /* Internal entry point.
747 Compile the regular expression PATTERN, whose length is LENGTH.
748 SYNTAX indicate regular expression's syntax. */
750 static reg_errcode_t
751 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
752 reg_syntax_t syntax)
754 reg_errcode_t err = REG_NOERROR;
755 re_dfa_t *dfa;
756 re_string_t regexp;
758 /* Initialize the pattern buffer. */
759 preg->fastmap_accurate = 0;
760 preg->syntax = syntax;
761 preg->not_bol = preg->not_eol = 0;
762 preg->used = 0;
763 preg->re_nsub = 0;
764 preg->can_be_null = 0;
765 preg->regs_allocated = REGS_UNALLOCATED;
767 /* Initialize the dfa. */
768 dfa = (re_dfa_t *) preg->buffer;
769 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
771 /* If zero allocated, but buffer is non-null, try to realloc
772 enough space. This loses if buffer's address is bogus, but
773 that is the user's responsibility. If ->buffer is NULL this
774 is a simple allocation. */
775 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
776 if (dfa == NULL)
777 return REG_ESPACE;
778 preg->allocated = sizeof (re_dfa_t);
779 preg->buffer = (unsigned char *) dfa;
781 preg->used = sizeof (re_dfa_t);
783 err = init_dfa (dfa, length);
784 if (BE (err != REG_NOERROR, 0))
786 free_dfa_content (dfa);
787 preg->buffer = NULL;
788 preg->allocated = 0;
789 return err;
791 #ifdef DEBUG
792 /* Note: length+1 will not overflow since it is checked in init_dfa. */
793 dfa->re_str = re_malloc (char, length + 1);
794 strncpy (dfa->re_str, pattern, length + 1);
795 #endif
797 __libc_lock_init (dfa->lock);
799 err = re_string_construct (&regexp, pattern, length, preg->translate,
800 (syntax & RE_ICASE) != 0, dfa);
801 if (BE (err != REG_NOERROR, 0))
803 re_compile_internal_free_return:
804 free_workarea_compile (preg);
805 re_string_destruct (&regexp);
806 free_dfa_content (dfa);
807 preg->buffer = NULL;
808 preg->allocated = 0;
809 return err;
812 /* Parse the regular expression, and build a structure tree. */
813 preg->re_nsub = 0;
814 dfa->str_tree = parse (&regexp, preg, syntax, &err);
815 if (BE (dfa->str_tree == NULL, 0))
816 goto re_compile_internal_free_return;
818 /* Analyze the tree and create the nfa. */
819 err = analyze (preg);
820 if (BE (err != REG_NOERROR, 0))
821 goto re_compile_internal_free_return;
823 #ifdef RE_ENABLE_I18N
824 /* If possible, do searching in single byte encoding to speed things up. */
825 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
826 optimize_utf8 (dfa);
827 #endif
829 /* Then create the initial state of the dfa. */
830 err = create_initial_state (dfa);
832 /* Release work areas. */
833 free_workarea_compile (preg);
834 re_string_destruct (&regexp);
836 if (BE (err != REG_NOERROR, 0))
838 free_dfa_content (dfa);
839 preg->buffer = NULL;
840 preg->allocated = 0;
843 return err;
846 /* Initialize DFA. We use the length of the regular expression PAT_LEN
847 as the initial length of some arrays. */
849 static reg_errcode_t
850 init_dfa (re_dfa_t *dfa, size_t pat_len)
852 __re_size_t table_size;
853 #ifndef _LIBC
854 char *codeset_name;
855 #endif
856 #ifdef RE_ENABLE_I18N
857 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
858 #else
859 size_t max_i18n_object_size = 0;
860 #endif
861 size_t max_object_size =
862 MAX (sizeof (struct re_state_table_entry),
863 MAX (sizeof (re_token_t),
864 MAX (sizeof (re_node_set),
865 MAX (sizeof (regmatch_t),
866 max_i18n_object_size))));
868 memset (dfa, '\0', sizeof (re_dfa_t));
870 /* Force allocation of str_tree_storage the first time. */
871 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
873 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
874 calculation below, and for similar doubling calculations
875 elsewhere. And it's <= rather than <, because some of the
876 doubling calculations add 1 afterwards. */
877 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
878 return REG_ESPACE;
880 dfa->nodes_alloc = pat_len + 1;
881 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
883 /* table_size = 2 ^ ceil(log pat_len) */
884 for (table_size = 1; ; table_size <<= 1)
885 if (table_size > pat_len)
886 break;
888 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
889 dfa->state_hash_mask = table_size - 1;
891 dfa->mb_cur_max = MB_CUR_MAX;
892 #ifdef _LIBC
893 if (dfa->mb_cur_max == 6
894 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
895 dfa->is_utf8 = 1;
896 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
897 != 0);
898 #else
899 codeset_name = nl_langinfo (CODESET);
900 if (strcasecmp (codeset_name, "UTF-8") == 0
901 || strcasecmp (codeset_name, "UTF8") == 0)
902 dfa->is_utf8 = 1;
904 /* We check exhaustively in the loop below if this charset is a
905 superset of ASCII. */
906 dfa->map_notascii = 0;
907 #endif
909 #ifdef RE_ENABLE_I18N
910 if (dfa->mb_cur_max > 1)
912 if (dfa->is_utf8)
913 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
914 else
916 int i, j, ch;
918 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
919 if (BE (dfa->sb_char == NULL, 0))
920 return REG_ESPACE;
922 /* Set the bits corresponding to single byte chars. */
923 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
924 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
926 wint_t wch = __btowc (ch);
927 if (wch != WEOF)
928 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
929 # ifndef _LIBC
930 if (isascii (ch) && wch != ch)
931 dfa->map_notascii = 1;
932 # endif
936 #endif
938 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
939 return REG_ESPACE;
940 return REG_NOERROR;
943 /* Initialize WORD_CHAR table, which indicate which character is
944 "word". In this case "word" means that it is the word construction
945 character used by some operators like "\<", "\>", etc. */
947 static void
948 internal_function
949 init_word_char (re_dfa_t *dfa)
951 int i, j, ch;
952 dfa->word_ops_used = 1;
953 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
954 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
955 if (isalnum (ch) || ch == '_')
956 dfa->word_char[i] |= (bitset_word_t) 1 << j;
959 /* Free the work area which are only used while compiling. */
961 static void
962 free_workarea_compile (regex_t *preg)
964 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
965 bin_tree_storage_t *storage, *next;
966 for (storage = dfa->str_tree_storage; storage; storage = next)
968 next = storage->next;
969 re_free (storage);
971 dfa->str_tree_storage = NULL;
972 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
973 dfa->str_tree = NULL;
974 re_free (dfa->org_indices);
975 dfa->org_indices = NULL;
978 /* Create initial states for all contexts. */
980 static reg_errcode_t
981 create_initial_state (re_dfa_t *dfa)
983 Idx first, i;
984 reg_errcode_t err;
985 re_node_set init_nodes;
987 /* Initial states have the epsilon closure of the node which is
988 the first node of the regular expression. */
989 first = dfa->str_tree->first->node_idx;
990 dfa->init_node = first;
991 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
992 if (BE (err != REG_NOERROR, 0))
993 return err;
995 /* The back-references which are in initial states can epsilon transit,
996 since in this case all of the subexpressions can be null.
997 Then we add epsilon closures of the nodes which are the next nodes of
998 the back-references. */
999 if (dfa->nbackref > 0)
1000 for (i = 0; i < init_nodes.nelem; ++i)
1002 Idx node_idx = init_nodes.elems[i];
1003 re_token_type_t type = dfa->nodes[node_idx].type;
1005 Idx clexp_idx;
1006 if (type != OP_BACK_REF)
1007 continue;
1008 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1010 re_token_t *clexp_node;
1011 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1012 if (clexp_node->type == OP_CLOSE_SUBEXP
1013 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1014 break;
1016 if (clexp_idx == init_nodes.nelem)
1017 continue;
1019 if (type == OP_BACK_REF)
1021 Idx dest_idx = dfa->edests[node_idx].elems[0];
1022 if (!re_node_set_contains (&init_nodes, dest_idx))
1024 reg_errcode_t merge_err
1025 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1026 if (merge_err != REG_NOERROR)
1027 return merge_err;
1028 i = 0;
1033 /* It must be the first time to invoke acquire_state. */
1034 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1035 /* We don't check ERR here, since the initial state must not be NULL. */
1036 if (BE (dfa->init_state == NULL, 0))
1037 return err;
1038 if (dfa->init_state->has_constraint)
1040 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1041 CONTEXT_WORD);
1042 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1043 CONTEXT_NEWLINE);
1044 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1045 &init_nodes,
1046 CONTEXT_NEWLINE
1047 | CONTEXT_BEGBUF);
1048 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1049 || dfa->init_state_begbuf == NULL, 0))
1050 return err;
1052 else
1053 dfa->init_state_word = dfa->init_state_nl
1054 = dfa->init_state_begbuf = dfa->init_state;
1056 re_node_set_free (&init_nodes);
1057 return REG_NOERROR;
1060 #ifdef RE_ENABLE_I18N
1061 /* If it is possible to do searching in single byte encoding instead of UTF-8
1062 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1063 DFA nodes where needed. */
1065 static void
1066 optimize_utf8 (re_dfa_t *dfa)
1068 Idx node;
1069 int i;
1070 bool mb_chars = false;
1071 bool has_period = false;
1073 for (node = 0; node < dfa->nodes_len; ++node)
1074 switch (dfa->nodes[node].type)
1076 case CHARACTER:
1077 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1078 mb_chars = true;
1079 break;
1080 case ANCHOR:
1081 switch (dfa->nodes[node].opr.ctx_type)
1083 case LINE_FIRST:
1084 case LINE_LAST:
1085 case BUF_FIRST:
1086 case BUF_LAST:
1087 break;
1088 default:
1089 /* Word anchors etc. cannot be handled. It's okay to test
1090 opr.ctx_type since constraints (for all DFA nodes) are
1091 created by ORing one or more opr.ctx_type values. */
1092 return;
1094 break;
1095 case OP_PERIOD:
1096 has_period = true;
1097 break;
1098 case OP_BACK_REF:
1099 case OP_ALT:
1100 case END_OF_RE:
1101 case OP_DUP_ASTERISK:
1102 case OP_OPEN_SUBEXP:
1103 case OP_CLOSE_SUBEXP:
1104 break;
1105 case COMPLEX_BRACKET:
1106 return;
1107 case SIMPLE_BRACKET:
1108 /* Just double check. */
1110 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1112 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1113 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1115 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1116 return;
1117 rshift = 0;
1120 break;
1121 default:
1122 abort ();
1125 if (mb_chars || has_period)
1126 for (node = 0; node < dfa->nodes_len; ++node)
1128 if (dfa->nodes[node].type == CHARACTER
1129 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1130 dfa->nodes[node].mb_partial = 0;
1131 else if (dfa->nodes[node].type == OP_PERIOD)
1132 dfa->nodes[node].type = OP_UTF8_PERIOD;
1135 /* The search can be in single byte locale. */
1136 dfa->mb_cur_max = 1;
1137 dfa->is_utf8 = 0;
1138 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1140 #endif
1142 /* Analyze the structure tree, and calculate "first", "next", "edest",
1143 "eclosure", and "inveclosure". */
1145 static reg_errcode_t
1146 analyze (regex_t *preg)
1148 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1149 reg_errcode_t ret;
1151 /* Allocate arrays. */
1152 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1153 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1154 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1155 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1156 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1157 || dfa->eclosures == NULL, 0))
1158 return REG_ESPACE;
1160 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1161 if (dfa->subexp_map != NULL)
1163 Idx i;
1164 for (i = 0; i < preg->re_nsub; i++)
1165 dfa->subexp_map[i] = i;
1166 preorder (dfa->str_tree, optimize_subexps, dfa);
1167 for (i = 0; i < preg->re_nsub; i++)
1168 if (dfa->subexp_map[i] != i)
1169 break;
1170 if (i == preg->re_nsub)
1172 free (dfa->subexp_map);
1173 dfa->subexp_map = NULL;
1177 ret = postorder (dfa->str_tree, lower_subexps, preg);
1178 if (BE (ret != REG_NOERROR, 0))
1179 return ret;
1180 ret = postorder (dfa->str_tree, calc_first, dfa);
1181 if (BE (ret != REG_NOERROR, 0))
1182 return ret;
1183 preorder (dfa->str_tree, calc_next, dfa);
1184 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1185 if (BE (ret != REG_NOERROR, 0))
1186 return ret;
1187 ret = calc_eclosure (dfa);
1188 if (BE (ret != REG_NOERROR, 0))
1189 return ret;
1191 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1192 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1193 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1194 || dfa->nbackref)
1196 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1197 if (BE (dfa->inveclosures == NULL, 0))
1198 return REG_ESPACE;
1199 ret = calc_inveclosure (dfa);
1202 return ret;
1205 /* Our parse trees are very unbalanced, so we cannot use a stack to
1206 implement parse tree visits. Instead, we use parent pointers and
1207 some hairy code in these two functions. */
1208 static reg_errcode_t
1209 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1210 void *extra)
1212 bin_tree_t *node, *prev;
1214 for (node = root; ; )
1216 /* Descend down the tree, preferably to the left (or to the right
1217 if that's the only child). */
1218 while (node->left || node->right)
1219 if (node->left)
1220 node = node->left;
1221 else
1222 node = node->right;
1226 reg_errcode_t err = fn (extra, node);
1227 if (BE (err != REG_NOERROR, 0))
1228 return err;
1229 if (node->parent == NULL)
1230 return REG_NOERROR;
1231 prev = node;
1232 node = node->parent;
1234 /* Go up while we have a node that is reached from the right. */
1235 while (node->right == prev || node->right == NULL);
1236 node = node->right;
1240 static reg_errcode_t
1241 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1242 void *extra)
1244 bin_tree_t *node;
1246 for (node = root; ; )
1248 reg_errcode_t err = fn (extra, node);
1249 if (BE (err != REG_NOERROR, 0))
1250 return err;
1252 /* Go to the left node, or up and to the right. */
1253 if (node->left)
1254 node = node->left;
1255 else
1257 bin_tree_t *prev = NULL;
1258 while (node->right == prev || node->right == NULL)
1260 prev = node;
1261 node = node->parent;
1262 if (!node)
1263 return REG_NOERROR;
1265 node = node->right;
1270 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1271 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1272 backreferences as well. Requires a preorder visit. */
1273 static reg_errcode_t
1274 optimize_subexps (void *extra, bin_tree_t *node)
1276 re_dfa_t *dfa = (re_dfa_t *) extra;
1278 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1280 int idx = node->token.opr.idx;
1281 node->token.opr.idx = dfa->subexp_map[idx];
1282 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1285 else if (node->token.type == SUBEXP
1286 && node->left && node->left->token.type == SUBEXP)
1288 Idx other_idx = node->left->token.opr.idx;
1290 node->left = node->left->left;
1291 if (node->left)
1292 node->left->parent = node;
1294 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1295 if (other_idx < BITSET_WORD_BITS)
1296 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1299 return REG_NOERROR;
1302 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1303 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1304 static reg_errcode_t
1305 lower_subexps (void *extra, bin_tree_t *node)
1307 regex_t *preg = (regex_t *) extra;
1308 reg_errcode_t err = REG_NOERROR;
1310 if (node->left && node->left->token.type == SUBEXP)
1312 node->left = lower_subexp (&err, preg, node->left);
1313 if (node->left)
1314 node->left->parent = node;
1316 if (node->right && node->right->token.type == SUBEXP)
1318 node->right = lower_subexp (&err, preg, node->right);
1319 if (node->right)
1320 node->right->parent = node;
1323 return err;
1326 static bin_tree_t *
1327 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1329 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1330 bin_tree_t *body = node->left;
1331 bin_tree_t *op, *cls, *tree1, *tree;
1333 if (preg->no_sub
1334 /* We do not optimize empty subexpressions, because otherwise we may
1335 have bad CONCAT nodes with NULL children. This is obviously not
1336 very common, so we do not lose much. An example that triggers
1337 this case is the sed "script" /\(\)/x. */
1338 && node->left != NULL
1339 && (node->token.opr.idx >= BITSET_WORD_BITS
1340 || !(dfa->used_bkref_map
1341 & ((bitset_word_t) 1 << node->token.opr.idx))))
1342 return node->left;
1344 /* Convert the SUBEXP node to the concatenation of an
1345 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1346 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1347 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1348 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1349 tree = create_tree (dfa, op, tree1, CONCAT);
1350 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1352 *err = REG_ESPACE;
1353 return NULL;
1356 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1357 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1358 return tree;
1361 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1362 nodes. Requires a postorder visit. */
1363 static reg_errcode_t
1364 calc_first (void *extra, bin_tree_t *node)
1366 re_dfa_t *dfa = (re_dfa_t *) extra;
1367 if (node->token.type == CONCAT)
1369 node->first = node->left->first;
1370 node->node_idx = node->left->node_idx;
1372 else
1374 node->first = node;
1375 node->node_idx = re_dfa_add_node (dfa, node->token);
1376 if (BE (node->node_idx == REG_MISSING, 0))
1377 return REG_ESPACE;
1378 if (node->token.type == ANCHOR)
1379 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1381 return REG_NOERROR;
1384 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1385 static reg_errcode_t
1386 calc_next (void *extra, bin_tree_t *node)
1388 switch (node->token.type)
1390 case OP_DUP_ASTERISK:
1391 node->left->next = node;
1392 break;
1393 case CONCAT:
1394 node->left->next = node->right->first;
1395 node->right->next = node->next;
1396 break;
1397 default:
1398 if (node->left)
1399 node->left->next = node->next;
1400 if (node->right)
1401 node->right->next = node->next;
1402 break;
1404 return REG_NOERROR;
1407 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1408 static reg_errcode_t
1409 link_nfa_nodes (void *extra, bin_tree_t *node)
1411 re_dfa_t *dfa = (re_dfa_t *) extra;
1412 Idx idx = node->node_idx;
1413 reg_errcode_t err = REG_NOERROR;
1415 switch (node->token.type)
1417 case CONCAT:
1418 break;
1420 case END_OF_RE:
1421 assert (node->next == NULL);
1422 break;
1424 case OP_DUP_ASTERISK:
1425 case OP_ALT:
1427 Idx left, right;
1428 dfa->has_plural_match = 1;
1429 if (node->left != NULL)
1430 left = node->left->first->node_idx;
1431 else
1432 left = node->next->node_idx;
1433 if (node->right != NULL)
1434 right = node->right->first->node_idx;
1435 else
1436 right = node->next->node_idx;
1437 assert (REG_VALID_INDEX (left));
1438 assert (REG_VALID_INDEX (right));
1439 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1441 break;
1443 case ANCHOR:
1444 case OP_OPEN_SUBEXP:
1445 case OP_CLOSE_SUBEXP:
1446 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1447 break;
1449 case OP_BACK_REF:
1450 dfa->nexts[idx] = node->next->node_idx;
1451 if (node->token.type == OP_BACK_REF)
1452 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1453 break;
1455 default:
1456 assert (!IS_EPSILON_NODE (node->token.type));
1457 dfa->nexts[idx] = node->next->node_idx;
1458 break;
1461 return err;
1464 /* Duplicate the epsilon closure of the node ROOT_NODE.
1465 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1466 to their own constraint. */
1468 static reg_errcode_t
1469 internal_function
1470 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1471 Idx root_node, unsigned int init_constraint)
1473 Idx org_node, clone_node;
1474 bool ok;
1475 unsigned int constraint = init_constraint;
1476 for (org_node = top_org_node, clone_node = top_clone_node;;)
1478 Idx org_dest, clone_dest;
1479 if (dfa->nodes[org_node].type == OP_BACK_REF)
1481 /* If the back reference epsilon-transit, its destination must
1482 also have the constraint. Then duplicate the epsilon closure
1483 of the destination of the back reference, and store it in
1484 edests of the back reference. */
1485 org_dest = dfa->nexts[org_node];
1486 re_node_set_empty (dfa->edests + clone_node);
1487 clone_dest = duplicate_node (dfa, org_dest, constraint);
1488 if (BE (clone_dest == REG_MISSING, 0))
1489 return REG_ESPACE;
1490 dfa->nexts[clone_node] = dfa->nexts[org_node];
1491 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1492 if (BE (! ok, 0))
1493 return REG_ESPACE;
1495 else if (dfa->edests[org_node].nelem == 0)
1497 /* In case of the node can't epsilon-transit, don't duplicate the
1498 destination and store the original destination as the
1499 destination of the node. */
1500 dfa->nexts[clone_node] = dfa->nexts[org_node];
1501 break;
1503 else if (dfa->edests[org_node].nelem == 1)
1505 /* In case of the node can epsilon-transit, and it has only one
1506 destination. */
1507 org_dest = dfa->edests[org_node].elems[0];
1508 re_node_set_empty (dfa->edests + clone_node);
1509 /* If the node is root_node itself, it means the epsilon closure
1510 has a loop. Then tie it to the destination of the root_node. */
1511 if (org_node == root_node && clone_node != org_node)
1513 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1514 if (BE (! ok, 0))
1515 return REG_ESPACE;
1516 break;
1518 /* In case the node has another constraint, append it. */
1519 constraint |= dfa->nodes[org_node].constraint;
1520 clone_dest = duplicate_node (dfa, org_dest, constraint);
1521 if (BE (clone_dest == REG_MISSING, 0))
1522 return REG_ESPACE;
1523 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1524 if (BE (! ok, 0))
1525 return REG_ESPACE;
1527 else /* dfa->edests[org_node].nelem == 2 */
1529 /* In case of the node can epsilon-transit, and it has two
1530 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1531 org_dest = dfa->edests[org_node].elems[0];
1532 re_node_set_empty (dfa->edests + clone_node);
1533 /* Search for a duplicated node which satisfies the constraint. */
1534 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1535 if (clone_dest == REG_MISSING)
1537 /* There is no such duplicated node, create a new one. */
1538 reg_errcode_t err;
1539 clone_dest = duplicate_node (dfa, org_dest, constraint);
1540 if (BE (clone_dest == REG_MISSING, 0))
1541 return REG_ESPACE;
1542 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1543 if (BE (! ok, 0))
1544 return REG_ESPACE;
1545 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1546 root_node, constraint);
1547 if (BE (err != REG_NOERROR, 0))
1548 return err;
1550 else
1552 /* There is a duplicated node which satisfies the constraint,
1553 use it to avoid infinite loop. */
1554 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1555 if (BE (! ok, 0))
1556 return REG_ESPACE;
1559 org_dest = dfa->edests[org_node].elems[1];
1560 clone_dest = duplicate_node (dfa, org_dest, constraint);
1561 if (BE (clone_dest == REG_MISSING, 0))
1562 return REG_ESPACE;
1563 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1564 if (BE (! ok, 0))
1565 return REG_ESPACE;
1567 org_node = org_dest;
1568 clone_node = clone_dest;
1570 return REG_NOERROR;
1573 /* Search for a node which is duplicated from the node ORG_NODE, and
1574 satisfies the constraint CONSTRAINT. */
1576 static Idx
1577 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1578 unsigned int constraint)
1580 Idx idx;
1581 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1583 if (org_node == dfa->org_indices[idx]
1584 && constraint == dfa->nodes[idx].constraint)
1585 return idx; /* Found. */
1587 return REG_MISSING; /* Not found. */
1590 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1591 Return the index of the new node, or REG_MISSING if insufficient storage is
1592 available. */
1594 static Idx
1595 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1597 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1598 if (BE (dup_idx != REG_MISSING, 1))
1600 dfa->nodes[dup_idx].constraint = constraint;
1601 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1602 dfa->nodes[dup_idx].duplicated = 1;
1604 /* Store the index of the original node. */
1605 dfa->org_indices[dup_idx] = org_idx;
1607 return dup_idx;
1610 static reg_errcode_t
1611 calc_inveclosure (re_dfa_t *dfa)
1613 Idx src, idx;
1614 bool ok;
1615 for (idx = 0; idx < dfa->nodes_len; ++idx)
1616 re_node_set_init_empty (dfa->inveclosures + idx);
1618 for (src = 0; src < dfa->nodes_len; ++src)
1620 Idx *elems = dfa->eclosures[src].elems;
1621 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1623 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1624 if (BE (! ok, 0))
1625 return REG_ESPACE;
1629 return REG_NOERROR;
1632 /* Calculate "eclosure" for all the node in DFA. */
1634 static reg_errcode_t
1635 calc_eclosure (re_dfa_t *dfa)
1637 Idx node_idx;
1638 bool incomplete;
1639 #ifdef DEBUG
1640 assert (dfa->nodes_len > 0);
1641 #endif
1642 incomplete = false;
1643 /* For each nodes, calculate epsilon closure. */
1644 for (node_idx = 0; ; ++node_idx)
1646 reg_errcode_t err;
1647 re_node_set eclosure_elem;
1648 if (node_idx == dfa->nodes_len)
1650 if (!incomplete)
1651 break;
1652 incomplete = false;
1653 node_idx = 0;
1656 #ifdef DEBUG
1657 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1658 #endif
1660 /* If we have already calculated, skip it. */
1661 if (dfa->eclosures[node_idx].nelem != 0)
1662 continue;
1663 /* Calculate epsilon closure of `node_idx'. */
1664 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1665 if (BE (err != REG_NOERROR, 0))
1666 return err;
1668 if (dfa->eclosures[node_idx].nelem == 0)
1670 incomplete = true;
1671 re_node_set_free (&eclosure_elem);
1674 return REG_NOERROR;
1677 /* Calculate epsilon closure of NODE. */
1679 static reg_errcode_t
1680 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1682 reg_errcode_t err;
1683 Idx i;
1684 re_node_set eclosure;
1685 bool ok;
1686 bool incomplete = false;
1687 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1688 if (BE (err != REG_NOERROR, 0))
1689 return err;
1691 /* This indicates that we are calculating this node now.
1692 We reference this value to avoid infinite loop. */
1693 dfa->eclosures[node].nelem = REG_MISSING;
1695 /* If the current node has constraints, duplicate all nodes
1696 since they must inherit the constraints. */
1697 if (dfa->nodes[node].constraint
1698 && dfa->edests[node].nelem
1699 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1701 err = duplicate_node_closure (dfa, node, node, node,
1702 dfa->nodes[node].constraint);
1703 if (BE (err != REG_NOERROR, 0))
1704 return err;
1707 /* Expand each epsilon destination nodes. */
1708 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1709 for (i = 0; i < dfa->edests[node].nelem; ++i)
1711 re_node_set eclosure_elem;
1712 Idx edest = dfa->edests[node].elems[i];
1713 /* If calculating the epsilon closure of `edest' is in progress,
1714 return intermediate result. */
1715 if (dfa->eclosures[edest].nelem == REG_MISSING)
1717 incomplete = true;
1718 continue;
1720 /* If we haven't calculated the epsilon closure of `edest' yet,
1721 calculate now. Otherwise use calculated epsilon closure. */
1722 if (dfa->eclosures[edest].nelem == 0)
1724 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1725 if (BE (err != REG_NOERROR, 0))
1726 return err;
1728 else
1729 eclosure_elem = dfa->eclosures[edest];
1730 /* Merge the epsilon closure of `edest'. */
1731 err = re_node_set_merge (&eclosure, &eclosure_elem);
1732 if (BE (err != REG_NOERROR, 0))
1733 return err;
1734 /* If the epsilon closure of `edest' is incomplete,
1735 the epsilon closure of this node is also incomplete. */
1736 if (dfa->eclosures[edest].nelem == 0)
1738 incomplete = true;
1739 re_node_set_free (&eclosure_elem);
1743 /* An epsilon closure includes itself. */
1744 ok = re_node_set_insert (&eclosure, node);
1745 if (BE (! ok, 0))
1746 return REG_ESPACE;
1747 if (incomplete && !root)
1748 dfa->eclosures[node].nelem = 0;
1749 else
1750 dfa->eclosures[node] = eclosure;
1751 *new_set = eclosure;
1752 return REG_NOERROR;
1755 /* Functions for token which are used in the parser. */
1757 /* Fetch a token from INPUT.
1758 We must not use this function inside bracket expressions. */
1760 static void
1761 internal_function
1762 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1764 re_string_skip_bytes (input, peek_token (result, input, syntax));
1767 /* Peek a token from INPUT, and return the length of the token.
1768 We must not use this function inside bracket expressions. */
1770 static int
1771 internal_function
1772 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1774 unsigned char c;
1776 if (re_string_eoi (input))
1778 token->type = END_OF_RE;
1779 return 0;
1782 c = re_string_peek_byte (input, 0);
1783 token->opr.c = c;
1785 token->word_char = 0;
1786 #ifdef RE_ENABLE_I18N
1787 token->mb_partial = 0;
1788 if (input->mb_cur_max > 1 &&
1789 !re_string_first_byte (input, re_string_cur_idx (input)))
1791 token->type = CHARACTER;
1792 token->mb_partial = 1;
1793 return 1;
1795 #endif
1796 if (c == '\\')
1798 unsigned char c2;
1799 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1801 token->type = BACK_SLASH;
1802 return 1;
1805 c2 = re_string_peek_byte_case (input, 1);
1806 token->opr.c = c2;
1807 token->type = CHARACTER;
1808 #ifdef RE_ENABLE_I18N
1809 if (input->mb_cur_max > 1)
1811 wint_t wc = re_string_wchar_at (input,
1812 re_string_cur_idx (input) + 1);
1813 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1815 else
1816 #endif
1817 token->word_char = IS_WORD_CHAR (c2) != 0;
1819 switch (c2)
1821 case '|':
1822 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1823 token->type = OP_ALT;
1824 break;
1825 case '1': case '2': case '3': case '4': case '5':
1826 case '6': case '7': case '8': case '9':
1827 if (!(syntax & RE_NO_BK_REFS))
1829 token->type = OP_BACK_REF;
1830 token->opr.idx = c2 - '1';
1832 break;
1833 case '<':
1834 if (!(syntax & RE_NO_GNU_OPS))
1836 token->type = ANCHOR;
1837 token->opr.ctx_type = WORD_FIRST;
1839 break;
1840 case '>':
1841 if (!(syntax & RE_NO_GNU_OPS))
1843 token->type = ANCHOR;
1844 token->opr.ctx_type = WORD_LAST;
1846 break;
1847 case 'b':
1848 if (!(syntax & RE_NO_GNU_OPS))
1850 token->type = ANCHOR;
1851 token->opr.ctx_type = WORD_DELIM;
1853 break;
1854 case 'B':
1855 if (!(syntax & RE_NO_GNU_OPS))
1857 token->type = ANCHOR;
1858 token->opr.ctx_type = NOT_WORD_DELIM;
1860 break;
1861 case 'w':
1862 if (!(syntax & RE_NO_GNU_OPS))
1863 token->type = OP_WORD;
1864 break;
1865 case 'W':
1866 if (!(syntax & RE_NO_GNU_OPS))
1867 token->type = OP_NOTWORD;
1868 break;
1869 case 's':
1870 if (!(syntax & RE_NO_GNU_OPS))
1871 token->type = OP_SPACE;
1872 break;
1873 case 'S':
1874 if (!(syntax & RE_NO_GNU_OPS))
1875 token->type = OP_NOTSPACE;
1876 break;
1877 case '`':
1878 if (!(syntax & RE_NO_GNU_OPS))
1880 token->type = ANCHOR;
1881 token->opr.ctx_type = BUF_FIRST;
1883 break;
1884 case '\'':
1885 if (!(syntax & RE_NO_GNU_OPS))
1887 token->type = ANCHOR;
1888 token->opr.ctx_type = BUF_LAST;
1890 break;
1891 case '(':
1892 if (!(syntax & RE_NO_BK_PARENS))
1893 token->type = OP_OPEN_SUBEXP;
1894 break;
1895 case ')':
1896 if (!(syntax & RE_NO_BK_PARENS))
1897 token->type = OP_CLOSE_SUBEXP;
1898 break;
1899 case '+':
1900 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1901 token->type = OP_DUP_PLUS;
1902 break;
1903 case '?':
1904 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1905 token->type = OP_DUP_QUESTION;
1906 break;
1907 case '{':
1908 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1909 token->type = OP_OPEN_DUP_NUM;
1910 break;
1911 case '}':
1912 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1913 token->type = OP_CLOSE_DUP_NUM;
1914 break;
1915 default:
1916 break;
1918 return 2;
1921 token->type = CHARACTER;
1922 #ifdef RE_ENABLE_I18N
1923 if (input->mb_cur_max > 1)
1925 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1926 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1928 else
1929 #endif
1930 token->word_char = IS_WORD_CHAR (token->opr.c);
1932 switch (c)
1934 case '\n':
1935 if (syntax & RE_NEWLINE_ALT)
1936 token->type = OP_ALT;
1937 break;
1938 case '|':
1939 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1940 token->type = OP_ALT;
1941 break;
1942 case '*':
1943 token->type = OP_DUP_ASTERISK;
1944 break;
1945 case '+':
1946 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1947 token->type = OP_DUP_PLUS;
1948 break;
1949 case '?':
1950 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1951 token->type = OP_DUP_QUESTION;
1952 break;
1953 case '{':
1954 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1955 token->type = OP_OPEN_DUP_NUM;
1956 break;
1957 case '}':
1958 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1959 token->type = OP_CLOSE_DUP_NUM;
1960 break;
1961 case '(':
1962 if (syntax & RE_NO_BK_PARENS)
1963 token->type = OP_OPEN_SUBEXP;
1964 break;
1965 case ')':
1966 if (syntax & RE_NO_BK_PARENS)
1967 token->type = OP_CLOSE_SUBEXP;
1968 break;
1969 case '[':
1970 token->type = OP_OPEN_BRACKET;
1971 break;
1972 case '.':
1973 token->type = OP_PERIOD;
1974 break;
1975 case '^':
1976 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1977 re_string_cur_idx (input) != 0)
1979 char prev = re_string_peek_byte (input, -1);
1980 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1981 break;
1983 token->type = ANCHOR;
1984 token->opr.ctx_type = LINE_FIRST;
1985 break;
1986 case '$':
1987 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1988 re_string_cur_idx (input) + 1 != re_string_length (input))
1990 re_token_t next;
1991 re_string_skip_bytes (input, 1);
1992 peek_token (&next, input, syntax);
1993 re_string_skip_bytes (input, -1);
1994 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1995 break;
1997 token->type = ANCHOR;
1998 token->opr.ctx_type = LINE_LAST;
1999 break;
2000 default:
2001 break;
2003 return 1;
2006 /* Peek a token from INPUT, and return the length of the token.
2007 We must not use this function out of bracket expressions. */
2009 static int
2010 internal_function
2011 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2013 unsigned char c;
2014 if (re_string_eoi (input))
2016 token->type = END_OF_RE;
2017 return 0;
2019 c = re_string_peek_byte (input, 0);
2020 token->opr.c = c;
2022 #ifdef RE_ENABLE_I18N
2023 if (input->mb_cur_max > 1 &&
2024 !re_string_first_byte (input, re_string_cur_idx (input)))
2026 token->type = CHARACTER;
2027 return 1;
2029 #endif /* RE_ENABLE_I18N */
2031 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2032 && re_string_cur_idx (input) + 1 < re_string_length (input))
2034 /* In this case, '\' escape a character. */
2035 unsigned char c2;
2036 re_string_skip_bytes (input, 1);
2037 c2 = re_string_peek_byte (input, 0);
2038 token->opr.c = c2;
2039 token->type = CHARACTER;
2040 return 1;
2042 if (c == '[') /* '[' is a special char in a bracket exps. */
2044 unsigned char c2;
2045 int token_len;
2046 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2047 c2 = re_string_peek_byte (input, 1);
2048 else
2049 c2 = 0;
2050 token->opr.c = c2;
2051 token_len = 2;
2052 switch (c2)
2054 case '.':
2055 token->type = OP_OPEN_COLL_ELEM;
2056 break;
2057 case '=':
2058 token->type = OP_OPEN_EQUIV_CLASS;
2059 break;
2060 case ':':
2061 if (syntax & RE_CHAR_CLASSES)
2063 token->type = OP_OPEN_CHAR_CLASS;
2064 break;
2066 /* else fall through. */
2067 default:
2068 token->type = CHARACTER;
2069 token->opr.c = c;
2070 token_len = 1;
2071 break;
2073 return token_len;
2075 switch (c)
2077 case '-':
2078 token->type = OP_CHARSET_RANGE;
2079 break;
2080 case ']':
2081 token->type = OP_CLOSE_BRACKET;
2082 break;
2083 case '^':
2084 token->type = OP_NON_MATCH_LIST;
2085 break;
2086 default:
2087 token->type = CHARACTER;
2089 return 1;
2092 /* Functions for parser. */
2094 /* Entry point of the parser.
2095 Parse the regular expression REGEXP and return the structure tree.
2096 If an error is occured, ERR is set by error code, and return NULL.
2097 This function build the following tree, from regular expression <reg_exp>:
2101 <reg_exp> EOR
2103 CAT means concatenation.
2104 EOR means end of regular expression. */
2106 static bin_tree_t *
2107 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2108 reg_errcode_t *err)
2110 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2111 bin_tree_t *tree, *eor, *root;
2112 re_token_t current_token;
2113 dfa->syntax = syntax;
2114 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2115 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2116 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2117 return NULL;
2118 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2119 if (tree != NULL)
2120 root = create_tree (dfa, tree, eor, CONCAT);
2121 else
2122 root = eor;
2123 if (BE (eor == NULL || root == NULL, 0))
2125 *err = REG_ESPACE;
2126 return NULL;
2128 return root;
2131 /* This function build the following tree, from regular expression
2132 <branch1>|<branch2>:
2136 <branch1> <branch2>
2138 ALT means alternative, which represents the operator `|'. */
2140 static bin_tree_t *
2141 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2142 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2144 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2145 bin_tree_t *tree, *branch = NULL;
2146 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2147 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2148 return NULL;
2150 while (token->type == OP_ALT)
2152 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2153 if (token->type != OP_ALT && token->type != END_OF_RE
2154 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2156 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2157 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2158 return NULL;
2160 else
2161 branch = NULL;
2162 tree = create_tree (dfa, tree, branch, OP_ALT);
2163 if (BE (tree == NULL, 0))
2165 *err = REG_ESPACE;
2166 return NULL;
2169 return tree;
2172 /* This function build the following tree, from regular expression
2173 <exp1><exp2>:
2177 <exp1> <exp2>
2179 CAT means concatenation. */
2181 static bin_tree_t *
2182 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2183 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2185 bin_tree_t *tree, *expr;
2186 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2187 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2188 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2189 return NULL;
2191 while (token->type != OP_ALT && token->type != END_OF_RE
2192 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2194 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2195 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2197 return NULL;
2199 if (tree != NULL && expr != NULL)
2201 tree = create_tree (dfa, tree, expr, CONCAT);
2202 if (tree == NULL)
2204 *err = REG_ESPACE;
2205 return NULL;
2208 else if (tree == NULL)
2209 tree = expr;
2210 /* Otherwise expr == NULL, we don't need to create new tree. */
2212 return tree;
2215 /* This function build the following tree, from regular expression a*:
2221 static bin_tree_t *
2222 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2223 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2225 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2226 bin_tree_t *tree;
2227 switch (token->type)
2229 case CHARACTER:
2230 tree = create_token_tree (dfa, NULL, NULL, token);
2231 if (BE (tree == NULL, 0))
2233 *err = REG_ESPACE;
2234 return NULL;
2236 #ifdef RE_ENABLE_I18N
2237 if (dfa->mb_cur_max > 1)
2239 while (!re_string_eoi (regexp)
2240 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2242 bin_tree_t *mbc_remain;
2243 fetch_token (token, regexp, syntax);
2244 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2245 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2246 if (BE (mbc_remain == NULL || tree == NULL, 0))
2248 *err = REG_ESPACE;
2249 return NULL;
2253 #endif
2254 break;
2255 case OP_OPEN_SUBEXP:
2256 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2257 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2258 return NULL;
2259 break;
2260 case OP_OPEN_BRACKET:
2261 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2262 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2263 return NULL;
2264 break;
2265 case OP_BACK_REF:
2266 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2268 *err = REG_ESUBREG;
2269 return NULL;
2271 dfa->used_bkref_map |= 1 << token->opr.idx;
2272 tree = create_token_tree (dfa, NULL, NULL, token);
2273 if (BE (tree == NULL, 0))
2275 *err = REG_ESPACE;
2276 return NULL;
2278 ++dfa->nbackref;
2279 dfa->has_mb_node = 1;
2280 break;
2281 case OP_OPEN_DUP_NUM:
2282 if (syntax & RE_CONTEXT_INVALID_DUP)
2284 *err = REG_BADRPT;
2285 return NULL;
2287 /* FALLTHROUGH */
2288 case OP_DUP_ASTERISK:
2289 case OP_DUP_PLUS:
2290 case OP_DUP_QUESTION:
2291 if (syntax & RE_CONTEXT_INVALID_OPS)
2293 *err = REG_BADRPT;
2294 return NULL;
2296 else if (syntax & RE_CONTEXT_INDEP_OPS)
2298 fetch_token (token, regexp, syntax);
2299 return parse_expression (regexp, preg, token, syntax, nest, err);
2301 /* else fall through */
2302 case OP_CLOSE_SUBEXP:
2303 if ((token->type == OP_CLOSE_SUBEXP) &&
2304 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2306 *err = REG_ERPAREN;
2307 return NULL;
2309 /* else fall through */
2310 case OP_CLOSE_DUP_NUM:
2311 /* We treat it as a normal character. */
2313 /* Then we can these characters as normal characters. */
2314 token->type = CHARACTER;
2315 /* mb_partial and word_char bits should be initialized already
2316 by peek_token. */
2317 tree = create_token_tree (dfa, NULL, NULL, token);
2318 if (BE (tree == NULL, 0))
2320 *err = REG_ESPACE;
2321 return NULL;
2323 break;
2324 case ANCHOR:
2325 if ((token->opr.ctx_type
2326 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2327 && dfa->word_ops_used == 0)
2328 init_word_char (dfa);
2329 if (token->opr.ctx_type == WORD_DELIM
2330 || token->opr.ctx_type == NOT_WORD_DELIM)
2332 bin_tree_t *tree_first, *tree_last;
2333 if (token->opr.ctx_type == WORD_DELIM)
2335 token->opr.ctx_type = WORD_FIRST;
2336 tree_first = create_token_tree (dfa, NULL, NULL, token);
2337 token->opr.ctx_type = WORD_LAST;
2339 else
2341 token->opr.ctx_type = INSIDE_WORD;
2342 tree_first = create_token_tree (dfa, NULL, NULL, token);
2343 token->opr.ctx_type = INSIDE_NOTWORD;
2345 tree_last = create_token_tree (dfa, NULL, NULL, token);
2346 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2347 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2349 *err = REG_ESPACE;
2350 return NULL;
2353 else
2355 tree = create_token_tree (dfa, NULL, NULL, token);
2356 if (BE (tree == NULL, 0))
2358 *err = REG_ESPACE;
2359 return NULL;
2362 /* We must return here, since ANCHORs can't be followed
2363 by repetition operators.
2364 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2365 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2366 fetch_token (token, regexp, syntax);
2367 return tree;
2368 case OP_PERIOD:
2369 tree = create_token_tree (dfa, NULL, NULL, token);
2370 if (BE (tree == NULL, 0))
2372 *err = REG_ESPACE;
2373 return NULL;
2375 if (dfa->mb_cur_max > 1)
2376 dfa->has_mb_node = 1;
2377 break;
2378 case OP_WORD:
2379 case OP_NOTWORD:
2380 tree = build_charclass_op (dfa, regexp->trans,
2381 (const unsigned char *) "alnum",
2382 (const unsigned char *) "_",
2383 token->type == OP_NOTWORD, err);
2384 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2385 return NULL;
2386 break;
2387 case OP_SPACE:
2388 case OP_NOTSPACE:
2389 tree = build_charclass_op (dfa, regexp->trans,
2390 (const unsigned char *) "space",
2391 (const unsigned char *) "",
2392 token->type == OP_NOTSPACE, err);
2393 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2394 return NULL;
2395 break;
2396 case OP_ALT:
2397 case END_OF_RE:
2398 return NULL;
2399 case BACK_SLASH:
2400 *err = REG_EESCAPE;
2401 return NULL;
2402 default:
2403 /* Must not happen? */
2404 #ifdef DEBUG
2405 assert (0);
2406 #endif
2407 return NULL;
2409 fetch_token (token, regexp, syntax);
2411 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2412 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2414 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2415 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2416 return NULL;
2417 /* In BRE consecutive duplications are not allowed. */
2418 if ((syntax & RE_CONTEXT_INVALID_DUP)
2419 && (token->type == OP_DUP_ASTERISK
2420 || token->type == OP_OPEN_DUP_NUM))
2422 *err = REG_BADRPT;
2423 return NULL;
2427 return tree;
2430 /* This function build the following tree, from regular expression
2431 (<reg_exp>):
2432 SUBEXP
2434 <reg_exp>
2437 static bin_tree_t *
2438 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2439 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2441 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2442 bin_tree_t *tree;
2443 size_t cur_nsub;
2444 cur_nsub = preg->re_nsub++;
2446 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2448 /* The subexpression may be a null string. */
2449 if (token->type == OP_CLOSE_SUBEXP)
2450 tree = NULL;
2451 else
2453 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2454 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2455 *err = REG_EPAREN;
2456 if (BE (*err != REG_NOERROR, 0))
2457 return NULL;
2460 if (cur_nsub <= '9' - '1')
2461 dfa->completed_bkref_map |= 1 << cur_nsub;
2463 tree = create_tree (dfa, tree, NULL, SUBEXP);
2464 if (BE (tree == NULL, 0))
2466 *err = REG_ESPACE;
2467 return NULL;
2469 tree->token.opr.idx = cur_nsub;
2470 return tree;
2473 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2475 static bin_tree_t *
2476 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2477 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2479 bin_tree_t *tree = NULL, *old_tree = NULL;
2480 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2481 re_token_t start_token = *token;
2483 if (token->type == OP_OPEN_DUP_NUM)
2485 end = 0;
2486 start = fetch_number (regexp, token, syntax);
2487 if (start == REG_MISSING)
2489 if (token->type == CHARACTER && token->opr.c == ',')
2490 start = 0; /* We treat "{,m}" as "{0,m}". */
2491 else
2493 *err = REG_BADBR; /* <re>{} is invalid. */
2494 return NULL;
2497 if (BE (start != REG_ERROR, 1))
2499 /* We treat "{n}" as "{n,n}". */
2500 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2501 : ((token->type == CHARACTER && token->opr.c == ',')
2502 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2504 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2506 /* Invalid sequence. */
2507 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2509 if (token->type == END_OF_RE)
2510 *err = REG_EBRACE;
2511 else
2512 *err = REG_BADBR;
2514 return NULL;
2517 /* If the syntax bit is set, rollback. */
2518 re_string_set_index (regexp, start_idx);
2519 *token = start_token;
2520 token->type = CHARACTER;
2521 /* mb_partial and word_char bits should be already initialized by
2522 peek_token. */
2523 return elem;
2526 if (BE ((end != REG_MISSING && start > end)
2527 || token->type != OP_CLOSE_DUP_NUM, 0))
2529 /* First number greater than second. */
2530 *err = REG_BADBR;
2531 return NULL;
2534 else
2536 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2537 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2540 fetch_token (token, regexp, syntax);
2542 if (BE (elem == NULL, 0))
2543 return NULL;
2544 if (BE (start == 0 && end == 0, 0))
2546 postorder (elem, free_tree, NULL);
2547 return NULL;
2550 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2551 if (BE (start > 0, 0))
2553 tree = elem;
2554 for (i = 2; i <= start; ++i)
2556 elem = duplicate_tree (elem, dfa);
2557 tree = create_tree (dfa, tree, elem, CONCAT);
2558 if (BE (elem == NULL || tree == NULL, 0))
2559 goto parse_dup_op_espace;
2562 if (start == end)
2563 return tree;
2565 /* Duplicate ELEM before it is marked optional. */
2566 elem = duplicate_tree (elem, dfa);
2567 old_tree = tree;
2569 else
2570 old_tree = NULL;
2572 if (elem->token.type == SUBEXP)
2573 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2575 tree = create_tree (dfa, elem, NULL,
2576 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2577 if (BE (tree == NULL, 0))
2578 goto parse_dup_op_espace;
2580 /* From gnulib's "intprops.h":
2581 True if the arithmetic type T is signed. */
2582 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2584 /* This loop is actually executed only when end != REG_MISSING,
2585 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2586 already created the start+1-th copy. */
2587 if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2588 for (i = start + 2; i <= end; ++i)
2590 elem = duplicate_tree (elem, dfa);
2591 tree = create_tree (dfa, tree, elem, CONCAT);
2592 if (BE (elem == NULL || tree == NULL, 0))
2593 goto parse_dup_op_espace;
2595 tree = create_tree (dfa, tree, NULL, OP_ALT);
2596 if (BE (tree == NULL, 0))
2597 goto parse_dup_op_espace;
2600 if (old_tree)
2601 tree = create_tree (dfa, old_tree, tree, CONCAT);
2603 return tree;
2605 parse_dup_op_espace:
2606 *err = REG_ESPACE;
2607 return NULL;
2610 /* Size of the names for collating symbol/equivalence_class/character_class.
2611 I'm not sure, but maybe enough. */
2612 #define BRACKET_NAME_BUF_SIZE 32
2614 #ifndef _LIBC
2615 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2616 Build the range expression which starts from START_ELEM, and ends
2617 at END_ELEM. The result are written to MBCSET and SBCSET.
2618 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2619 mbcset->range_ends, is a pointer argument sinse we may
2620 update it. */
2622 static reg_errcode_t
2623 internal_function
2624 # ifdef RE_ENABLE_I18N
2625 build_range_exp (const reg_syntax_t syntax,
2626 bitset_t sbcset,
2627 re_charset_t *mbcset,
2628 Idx *range_alloc,
2629 const bracket_elem_t *start_elem,
2630 const bracket_elem_t *end_elem)
2631 # else /* not RE_ENABLE_I18N */
2632 build_range_exp (const reg_syntax_t syntax,
2633 bitset_t sbcset,
2634 const bracket_elem_t *start_elem,
2635 const bracket_elem_t *end_elem)
2636 # endif /* not RE_ENABLE_I18N */
2638 unsigned int start_ch, end_ch;
2639 /* Equivalence Classes and Character Classes can't be a range start/end. */
2640 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2641 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2643 return REG_ERANGE;
2645 /* We can handle no multi character collating elements without libc
2646 support. */
2647 if (BE ((start_elem->type == COLL_SYM
2648 && strlen ((char *) start_elem->opr.name) > 1)
2649 || (end_elem->type == COLL_SYM
2650 && strlen ((char *) end_elem->opr.name) > 1), 0))
2651 return REG_ECOLLATE;
2653 # ifdef RE_ENABLE_I18N
2655 wchar_t wc;
2656 wint_t start_wc;
2657 wint_t end_wc;
2658 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2660 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2661 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2662 : 0));
2663 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2664 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2665 : 0));
2666 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2667 ? __btowc (start_ch) : start_elem->opr.wch);
2668 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2669 ? __btowc (end_ch) : end_elem->opr.wch);
2670 if (start_wc == WEOF || end_wc == WEOF)
2671 return REG_ECOLLATE;
2672 cmp_buf[0] = start_wc;
2673 cmp_buf[4] = end_wc;
2675 if (BE ((syntax & RE_NO_EMPTY_RANGES)
2676 && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2677 return REG_ERANGE;
2679 /* Got valid collation sequence values, add them as a new entry.
2680 However, for !_LIBC we have no collation elements: if the
2681 character set is single byte, the single byte character set
2682 that we build below suffices. parse_bracket_exp passes
2683 no MBCSET if dfa->mb_cur_max == 1. */
2684 if (mbcset)
2686 /* Check the space of the arrays. */
2687 if (BE (*range_alloc == mbcset->nranges, 0))
2689 /* There is not enough space, need realloc. */
2690 wchar_t *new_array_start, *new_array_end;
2691 Idx new_nranges;
2693 /* +1 in case of mbcset->nranges is 0. */
2694 new_nranges = 2 * mbcset->nranges + 1;
2695 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2696 are NULL if *range_alloc == 0. */
2697 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2698 new_nranges);
2699 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2700 new_nranges);
2702 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2703 return REG_ESPACE;
2705 mbcset->range_starts = new_array_start;
2706 mbcset->range_ends = new_array_end;
2707 *range_alloc = new_nranges;
2710 mbcset->range_starts[mbcset->nranges] = start_wc;
2711 mbcset->range_ends[mbcset->nranges++] = end_wc;
2714 /* Build the table for single byte characters. */
2715 for (wc = 0; wc < SBC_MAX; ++wc)
2717 cmp_buf[2] = wc;
2718 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2719 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2720 bitset_set (sbcset, wc);
2723 # else /* not RE_ENABLE_I18N */
2725 unsigned int ch;
2726 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2727 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2728 : 0));
2729 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2730 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2731 : 0));
2732 if (start_ch > end_ch)
2733 return REG_ERANGE;
2734 /* Build the table for single byte characters. */
2735 for (ch = 0; ch < SBC_MAX; ++ch)
2736 if (start_ch <= ch && ch <= end_ch)
2737 bitset_set (sbcset, ch);
2739 # endif /* not RE_ENABLE_I18N */
2740 return REG_NOERROR;
2742 #endif /* not _LIBC */
2744 #ifndef _LIBC
2745 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2746 Build the collating element which is represented by NAME.
2747 The result are written to MBCSET and SBCSET.
2748 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2749 pointer argument since we may update it. */
2751 static reg_errcode_t
2752 internal_function
2753 build_collating_symbol (bitset_t sbcset,
2754 # ifdef RE_ENABLE_I18N
2755 re_charset_t *mbcset, Idx *coll_sym_alloc,
2756 # endif
2757 const unsigned char *name)
2759 size_t name_len = strlen ((const char *) name);
2760 if (BE (name_len != 1, 0))
2761 return REG_ECOLLATE;
2762 else
2764 bitset_set (sbcset, name[0]);
2765 return REG_NOERROR;
2768 #endif /* not _LIBC */
2770 /* This function parse bracket expression like "[abc]", "[a-c]",
2771 "[[.a-a.]]" etc. */
2773 static bin_tree_t *
2774 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2775 reg_syntax_t syntax, reg_errcode_t *err)
2777 #ifdef _LIBC
2778 const unsigned char *collseqmb;
2779 const char *collseqwc;
2780 uint32_t nrules;
2781 int32_t table_size;
2782 const int32_t *symb_table;
2783 const unsigned char *extra;
2785 /* Local function for parse_bracket_exp used in _LIBC environement.
2786 Seek the collating symbol entry correspondings to NAME.
2787 Return the index of the symbol in the SYMB_TABLE. */
2789 auto inline int32_t
2790 __attribute ((always_inline))
2791 seek_collating_symbol_entry (name, name_len)
2792 const unsigned char *name;
2793 size_t name_len;
2795 int32_t hash = elem_hash ((const char *) name, name_len);
2796 int32_t elem = hash % table_size;
2797 if (symb_table[2 * elem] != 0)
2799 int32_t second = hash % (table_size - 2) + 1;
2803 /* First compare the hashing value. */
2804 if (symb_table[2 * elem] == hash
2805 /* Compare the length of the name. */
2806 && name_len == extra[symb_table[2 * elem + 1]]
2807 /* Compare the name. */
2808 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2809 name_len) == 0)
2811 /* Yep, this is the entry. */
2812 break;
2815 /* Next entry. */
2816 elem += second;
2818 while (symb_table[2 * elem] != 0);
2820 return elem;
2823 /* Local function for parse_bracket_exp used in _LIBC environment.
2824 Look up the collation sequence value of BR_ELEM.
2825 Return the value if succeeded, UINT_MAX otherwise. */
2827 auto inline unsigned int
2828 __attribute ((always_inline))
2829 lookup_collation_sequence_value (br_elem)
2830 bracket_elem_t *br_elem;
2832 if (br_elem->type == SB_CHAR)
2835 if (MB_CUR_MAX == 1)
2837 if (nrules == 0)
2838 return collseqmb[br_elem->opr.ch];
2839 else
2841 wint_t wc = __btowc (br_elem->opr.ch);
2842 return __collseq_table_lookup (collseqwc, wc);
2845 else if (br_elem->type == MB_CHAR)
2847 if (nrules != 0)
2848 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2850 else if (br_elem->type == COLL_SYM)
2852 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2853 if (nrules != 0)
2855 int32_t elem, idx;
2856 elem = seek_collating_symbol_entry (br_elem->opr.name,
2857 sym_name_len);
2858 if (symb_table[2 * elem] != 0)
2860 /* We found the entry. */
2861 idx = symb_table[2 * elem + 1];
2862 /* Skip the name of collating element name. */
2863 idx += 1 + extra[idx];
2864 /* Skip the byte sequence of the collating element. */
2865 idx += 1 + extra[idx];
2866 /* Adjust for the alignment. */
2867 idx = (idx + 3) & ~3;
2868 /* Skip the multibyte collation sequence value. */
2869 idx += sizeof (unsigned int);
2870 /* Skip the wide char sequence of the collating element. */
2871 idx += sizeof (unsigned int) *
2872 (1 + *(unsigned int *) (extra + idx));
2873 /* Return the collation sequence value. */
2874 return *(unsigned int *) (extra + idx);
2876 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2878 /* No valid character. Match it as a single byte
2879 character. */
2880 return collseqmb[br_elem->opr.name[0]];
2883 else if (sym_name_len == 1)
2884 return collseqmb[br_elem->opr.name[0]];
2886 return UINT_MAX;
2889 /* Local function for parse_bracket_exp used in _LIBC environement.
2890 Build the range expression which starts from START_ELEM, and ends
2891 at END_ELEM. The result are written to MBCSET and SBCSET.
2892 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2893 mbcset->range_ends, is a pointer argument sinse we may
2894 update it. */
2896 auto inline reg_errcode_t
2897 __attribute ((always_inline))
2898 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2899 re_charset_t *mbcset;
2900 Idx *range_alloc;
2901 bitset_t sbcset;
2902 bracket_elem_t *start_elem, *end_elem;
2904 unsigned int ch;
2905 uint32_t start_collseq;
2906 uint32_t end_collseq;
2908 /* Equivalence Classes and Character Classes can't be a range
2909 start/end. */
2910 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2911 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2913 return REG_ERANGE;
2915 start_collseq = lookup_collation_sequence_value (start_elem);
2916 end_collseq = lookup_collation_sequence_value (end_elem);
2917 /* Check start/end collation sequence values. */
2918 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2919 return REG_ECOLLATE;
2920 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2921 return REG_ERANGE;
2923 /* Got valid collation sequence values, add them as a new entry.
2924 However, if we have no collation elements, and the character set
2925 is single byte, the single byte character set that we
2926 build below suffices. */
2927 if (nrules > 0 || dfa->mb_cur_max > 1)
2929 /* Check the space of the arrays. */
2930 if (BE (*range_alloc == mbcset->nranges, 0))
2932 /* There is not enough space, need realloc. */
2933 uint32_t *new_array_start;
2934 uint32_t *new_array_end;
2935 Idx new_nranges;
2937 /* +1 in case of mbcset->nranges is 0. */
2938 new_nranges = 2 * mbcset->nranges + 1;
2939 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2940 new_nranges);
2941 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2942 new_nranges);
2944 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2945 return REG_ESPACE;
2947 mbcset->range_starts = new_array_start;
2948 mbcset->range_ends = new_array_end;
2949 *range_alloc = new_nranges;
2952 mbcset->range_starts[mbcset->nranges] = start_collseq;
2953 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2956 /* Build the table for single byte characters. */
2957 for (ch = 0; ch < SBC_MAX; ch++)
2959 uint32_t ch_collseq;
2961 if (MB_CUR_MAX == 1)
2963 if (nrules == 0)
2964 ch_collseq = collseqmb[ch];
2965 else
2966 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2967 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2968 bitset_set (sbcset, ch);
2970 return REG_NOERROR;
2973 /* Local function for parse_bracket_exp used in _LIBC environement.
2974 Build the collating element which is represented by NAME.
2975 The result are written to MBCSET and SBCSET.
2976 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2977 pointer argument sinse we may update it. */
2979 auto inline reg_errcode_t
2980 __attribute ((always_inline))
2981 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2982 re_charset_t *mbcset;
2983 Idx *coll_sym_alloc;
2984 bitset_t sbcset;
2985 const unsigned char *name;
2987 int32_t elem, idx;
2988 size_t name_len = strlen ((const char *) name);
2989 if (nrules != 0)
2991 elem = seek_collating_symbol_entry (name, name_len);
2992 if (symb_table[2 * elem] != 0)
2994 /* We found the entry. */
2995 idx = symb_table[2 * elem + 1];
2996 /* Skip the name of collating element name. */
2997 idx += 1 + extra[idx];
2999 else if (symb_table[2 * elem] == 0 && name_len == 1)
3001 /* No valid character, treat it as a normal
3002 character. */
3003 bitset_set (sbcset, name[0]);
3004 return REG_NOERROR;
3006 else
3007 return REG_ECOLLATE;
3009 /* Got valid collation sequence, add it as a new entry. */
3010 /* Check the space of the arrays. */
3011 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3013 /* Not enough, realloc it. */
3014 /* +1 in case of mbcset->ncoll_syms is 0. */
3015 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3016 /* Use realloc since mbcset->coll_syms is NULL
3017 if *alloc == 0. */
3018 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3019 new_coll_sym_alloc);
3020 if (BE (new_coll_syms == NULL, 0))
3021 return REG_ESPACE;
3022 mbcset->coll_syms = new_coll_syms;
3023 *coll_sym_alloc = new_coll_sym_alloc;
3025 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3026 return REG_NOERROR;
3028 else
3030 if (BE (name_len != 1, 0))
3031 return REG_ECOLLATE;
3032 else
3034 bitset_set (sbcset, name[0]);
3035 return REG_NOERROR;
3039 #endif
3041 re_token_t br_token;
3042 re_bitset_ptr_t sbcset;
3043 #ifdef RE_ENABLE_I18N
3044 re_charset_t *mbcset;
3045 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3046 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3047 #endif /* not RE_ENABLE_I18N */
3048 bool non_match = false;
3049 bin_tree_t *work_tree;
3050 int token_len;
3051 bool first_round = true;
3052 #ifdef _LIBC
3053 collseqmb = (const unsigned char *)
3054 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3055 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3056 if (nrules)
3059 if (MB_CUR_MAX > 1)
3061 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3062 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3063 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3064 _NL_COLLATE_SYMB_TABLEMB);
3065 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3066 _NL_COLLATE_SYMB_EXTRAMB);
3068 #endif
3069 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3070 #ifdef RE_ENABLE_I18N
3071 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3072 #endif /* RE_ENABLE_I18N */
3073 #ifdef RE_ENABLE_I18N
3074 if (BE (sbcset == NULL || mbcset == NULL, 0))
3075 #else
3076 if (BE (sbcset == NULL, 0))
3077 #endif /* RE_ENABLE_I18N */
3079 *err = REG_ESPACE;
3080 return NULL;
3083 token_len = peek_token_bracket (token, regexp, syntax);
3084 if (BE (token->type == END_OF_RE, 0))
3086 *err = REG_BADPAT;
3087 goto parse_bracket_exp_free_return;
3089 if (token->type == OP_NON_MATCH_LIST)
3091 #ifdef RE_ENABLE_I18N
3092 mbcset->non_match = 1;
3093 #endif /* not RE_ENABLE_I18N */
3094 non_match = true;
3095 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3096 bitset_set (sbcset, '\n');
3097 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3098 token_len = peek_token_bracket (token, regexp, syntax);
3099 if (BE (token->type == END_OF_RE, 0))
3101 *err = REG_BADPAT;
3102 goto parse_bracket_exp_free_return;
3106 /* We treat the first ']' as a normal character. */
3107 if (token->type == OP_CLOSE_BRACKET)
3108 token->type = CHARACTER;
3110 while (1)
3112 bracket_elem_t start_elem, end_elem;
3113 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3114 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3115 reg_errcode_t ret;
3116 int token_len2 = 0;
3117 bool is_range_exp = false;
3118 re_token_t token2;
3120 start_elem.opr.name = start_name_buf;
3121 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3122 syntax, first_round);
3123 if (BE (ret != REG_NOERROR, 0))
3125 *err = ret;
3126 goto parse_bracket_exp_free_return;
3128 first_round = false;
3130 /* Get information about the next token. We need it in any case. */
3131 token_len = peek_token_bracket (token, regexp, syntax);
3133 /* Do not check for ranges if we know they are not allowed. */
3134 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3136 if (BE (token->type == END_OF_RE, 0))
3138 *err = REG_EBRACK;
3139 goto parse_bracket_exp_free_return;
3141 if (token->type == OP_CHARSET_RANGE)
3143 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3144 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3145 if (BE (token2.type == END_OF_RE, 0))
3147 *err = REG_EBRACK;
3148 goto parse_bracket_exp_free_return;
3150 if (token2.type == OP_CLOSE_BRACKET)
3152 /* We treat the last '-' as a normal character. */
3153 re_string_skip_bytes (regexp, -token_len);
3154 token->type = CHARACTER;
3156 else
3157 is_range_exp = true;
3161 if (is_range_exp == true)
3163 end_elem.opr.name = end_name_buf;
3164 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3165 dfa, syntax, true);
3166 if (BE (ret != REG_NOERROR, 0))
3168 *err = ret;
3169 goto parse_bracket_exp_free_return;
3172 token_len = peek_token_bracket (token, regexp, syntax);
3174 #ifdef _LIBC
3175 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3176 &start_elem, &end_elem);
3177 #else
3178 # ifdef RE_ENABLE_I18N
3179 *err = build_range_exp (syntax, sbcset,
3180 dfa->mb_cur_max > 1 ? mbcset : NULL,
3181 &range_alloc, &start_elem, &end_elem);
3182 # else
3183 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3184 # endif
3185 #endif /* RE_ENABLE_I18N */
3186 if (BE (*err != REG_NOERROR, 0))
3187 goto parse_bracket_exp_free_return;
3189 else
3191 switch (start_elem.type)
3193 case SB_CHAR:
3194 bitset_set (sbcset, start_elem.opr.ch);
3195 break;
3196 #ifdef RE_ENABLE_I18N
3197 case MB_CHAR:
3198 /* Check whether the array has enough space. */
3199 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3201 wchar_t *new_mbchars;
3202 /* Not enough, realloc it. */
3203 /* +1 in case of mbcset->nmbchars is 0. */
3204 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3205 /* Use realloc since array is NULL if *alloc == 0. */
3206 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3207 mbchar_alloc);
3208 if (BE (new_mbchars == NULL, 0))
3209 goto parse_bracket_exp_espace;
3210 mbcset->mbchars = new_mbchars;
3212 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3213 break;
3214 #endif /* RE_ENABLE_I18N */
3215 case EQUIV_CLASS:
3216 *err = build_equiv_class (sbcset,
3217 #ifdef RE_ENABLE_I18N
3218 mbcset, &equiv_class_alloc,
3219 #endif /* RE_ENABLE_I18N */
3220 start_elem.opr.name);
3221 if (BE (*err != REG_NOERROR, 0))
3222 goto parse_bracket_exp_free_return;
3223 break;
3224 case COLL_SYM:
3225 *err = build_collating_symbol (sbcset,
3226 #ifdef RE_ENABLE_I18N
3227 mbcset, &coll_sym_alloc,
3228 #endif /* RE_ENABLE_I18N */
3229 start_elem.opr.name);
3230 if (BE (*err != REG_NOERROR, 0))
3231 goto parse_bracket_exp_free_return;
3232 break;
3233 case CHAR_CLASS:
3234 *err = build_charclass (regexp->trans, sbcset,
3235 #ifdef RE_ENABLE_I18N
3236 mbcset, &char_class_alloc,
3237 #endif /* RE_ENABLE_I18N */
3238 start_elem.opr.name, syntax);
3239 if (BE (*err != REG_NOERROR, 0))
3240 goto parse_bracket_exp_free_return;
3241 break;
3242 default:
3243 assert (0);
3244 break;
3247 if (BE (token->type == END_OF_RE, 0))
3249 *err = REG_EBRACK;
3250 goto parse_bracket_exp_free_return;
3252 if (token->type == OP_CLOSE_BRACKET)
3253 break;
3256 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3258 /* If it is non-matching list. */
3259 if (non_match)
3260 bitset_not (sbcset);
3262 #ifdef RE_ENABLE_I18N
3263 /* Ensure only single byte characters are set. */
3264 if (dfa->mb_cur_max > 1)
3265 bitset_mask (sbcset, dfa->sb_char);
3267 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3268 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3269 || mbcset->non_match)))
3271 bin_tree_t *mbc_tree;
3272 int sbc_idx;
3273 /* Build a tree for complex bracket. */
3274 dfa->has_mb_node = 1;
3275 br_token.type = COMPLEX_BRACKET;
3276 br_token.opr.mbcset = mbcset;
3277 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3278 if (BE (mbc_tree == NULL, 0))
3279 goto parse_bracket_exp_espace;
3280 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3281 if (sbcset[sbc_idx])
3282 break;
3283 /* If there are no bits set in sbcset, there is no point
3284 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3285 if (sbc_idx < BITSET_WORDS)
3287 /* Build a tree for simple bracket. */
3288 br_token.type = SIMPLE_BRACKET;
3289 br_token.opr.sbcset = sbcset;
3290 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3291 if (BE (work_tree == NULL, 0))
3292 goto parse_bracket_exp_espace;
3294 /* Then join them by ALT node. */
3295 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3296 if (BE (work_tree == NULL, 0))
3297 goto parse_bracket_exp_espace;
3299 else
3301 re_free (sbcset);
3302 work_tree = mbc_tree;
3305 else
3306 #endif /* not RE_ENABLE_I18N */
3308 #ifdef RE_ENABLE_I18N
3309 free_charset (mbcset);
3310 #endif
3311 /* Build a tree for simple bracket. */
3312 br_token.type = SIMPLE_BRACKET;
3313 br_token.opr.sbcset = sbcset;
3314 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3315 if (BE (work_tree == NULL, 0))
3316 goto parse_bracket_exp_espace;
3318 return work_tree;
3320 parse_bracket_exp_espace:
3321 *err = REG_ESPACE;
3322 parse_bracket_exp_free_return:
3323 re_free (sbcset);
3324 #ifdef RE_ENABLE_I18N
3325 free_charset (mbcset);
3326 #endif /* RE_ENABLE_I18N */
3327 return NULL;
3330 /* Parse an element in the bracket expression. */
3332 static reg_errcode_t
3333 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3334 re_token_t *token, int token_len, re_dfa_t *dfa,
3335 reg_syntax_t syntax, bool accept_hyphen)
3337 #ifdef RE_ENABLE_I18N
3338 int cur_char_size;
3339 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3340 if (cur_char_size > 1)
3342 elem->type = MB_CHAR;
3343 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3344 re_string_skip_bytes (regexp, cur_char_size);
3345 return REG_NOERROR;
3347 #endif /* RE_ENABLE_I18N */
3348 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3349 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3350 || token->type == OP_OPEN_EQUIV_CLASS)
3351 return parse_bracket_symbol (elem, regexp, token);
3352 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3354 /* A '-' must only appear as anything but a range indicator before
3355 the closing bracket. Everything else is an error. */
3356 re_token_t token2;
3357 (void) peek_token_bracket (&token2, regexp, syntax);
3358 if (token2.type != OP_CLOSE_BRACKET)
3359 /* The actual error value is not standardized since this whole
3360 case is undefined. But ERANGE makes good sense. */
3361 return REG_ERANGE;
3363 elem->type = SB_CHAR;
3364 elem->opr.ch = token->opr.c;
3365 return REG_NOERROR;
3368 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3369 such as [:<character_class>:], [.<collating_element>.], and
3370 [=<equivalent_class>=]. */
3372 static reg_errcode_t
3373 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3374 re_token_t *token)
3376 unsigned char ch, delim = token->opr.c;
3377 int i = 0;
3378 if (re_string_eoi(regexp))
3379 return REG_EBRACK;
3380 for (;; ++i)
3382 if (i >= BRACKET_NAME_BUF_SIZE)
3383 return REG_EBRACK;
3384 if (token->type == OP_OPEN_CHAR_CLASS)
3385 ch = re_string_fetch_byte_case (regexp);
3386 else
3387 ch = re_string_fetch_byte (regexp);
3388 if (re_string_eoi(regexp))
3389 return REG_EBRACK;
3390 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3391 break;
3392 elem->opr.name[i] = ch;
3394 re_string_skip_bytes (regexp, 1);
3395 elem->opr.name[i] = '\0';
3396 switch (token->type)
3398 case OP_OPEN_COLL_ELEM:
3399 elem->type = COLL_SYM;
3400 break;
3401 case OP_OPEN_EQUIV_CLASS:
3402 elem->type = EQUIV_CLASS;
3403 break;
3404 case OP_OPEN_CHAR_CLASS:
3405 elem->type = CHAR_CLASS;
3406 break;
3407 default:
3408 break;
3410 return REG_NOERROR;
3413 /* Helper function for parse_bracket_exp.
3414 Build the equivalence class which is represented by NAME.
3415 The result are written to MBCSET and SBCSET.
3416 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3417 is a pointer argument sinse we may update it. */
3419 static reg_errcode_t
3420 #ifdef RE_ENABLE_I18N
3421 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3422 Idx *equiv_class_alloc, const unsigned char *name)
3423 #else /* not RE_ENABLE_I18N */
3424 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3425 #endif /* not RE_ENABLE_I18N */
3427 #ifdef _LIBC
3428 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3429 if (nrules != 0)
3431 const int32_t *table, *indirect;
3432 const unsigned char *weights, *extra, *cp;
3433 unsigned char char_buf[2];
3434 int32_t idx1, idx2;
3435 unsigned int ch;
3436 size_t len;
3437 /* This #include defines a local function! */
3438 # include <locale/weight.h>
3439 /* Calculate the index for equivalence class. */
3440 cp = name;
3441 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3442 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3443 _NL_COLLATE_WEIGHTMB);
3444 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3445 _NL_COLLATE_EXTRAMB);
3446 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3447 _NL_COLLATE_INDIRECTMB);
3448 idx1 = findidx (&cp);
3449 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3450 /* This isn't a valid character. */
3451 return REG_ECOLLATE;
3453 /* Build single byte matcing table for this equivalence class. */
3454 char_buf[1] = (unsigned char) '\0';
3455 len = weights[idx1 & 0xffffff];
3456 for (ch = 0; ch < SBC_MAX; ++ch)
3458 char_buf[0] = ch;
3459 cp = char_buf;
3460 idx2 = findidx (&cp);
3462 idx2 = table[ch];
3464 if (idx2 == 0)
3465 /* This isn't a valid character. */
3466 continue;
3467 /* Compare only if the length matches and the collation rule
3468 index is the same. */
3469 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3471 int cnt = 0;
3473 while (cnt <= len &&
3474 weights[(idx1 & 0xffffff) + 1 + cnt]
3475 == weights[(idx2 & 0xffffff) + 1 + cnt])
3476 ++cnt;
3478 if (cnt > len)
3479 bitset_set (sbcset, ch);
3482 /* Check whether the array has enough space. */
3483 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3485 /* Not enough, realloc it. */
3486 /* +1 in case of mbcset->nequiv_classes is 0. */
3487 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3488 /* Use realloc since the array is NULL if *alloc == 0. */
3489 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3490 int32_t,
3491 new_equiv_class_alloc);
3492 if (BE (new_equiv_classes == NULL, 0))
3493 return REG_ESPACE;
3494 mbcset->equiv_classes = new_equiv_classes;
3495 *equiv_class_alloc = new_equiv_class_alloc;
3497 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3499 else
3500 #endif /* _LIBC */
3502 if (BE (strlen ((const char *) name) != 1, 0))
3503 return REG_ECOLLATE;
3504 bitset_set (sbcset, *name);
3506 return REG_NOERROR;
3509 /* Helper function for parse_bracket_exp.
3510 Build the character class which is represented by NAME.
3511 The result are written to MBCSET and SBCSET.
3512 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3513 is a pointer argument sinse we may update it. */
3515 static reg_errcode_t
3516 #ifdef RE_ENABLE_I18N
3517 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3518 re_charset_t *mbcset, Idx *char_class_alloc,
3519 const unsigned char *class_name, reg_syntax_t syntax)
3520 #else /* not RE_ENABLE_I18N */
3521 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3522 const unsigned char *class_name, reg_syntax_t syntax)
3523 #endif /* not RE_ENABLE_I18N */
3525 int i;
3526 const char *name = (const char *) class_name;
3528 /* In case of REG_ICASE "upper" and "lower" match the both of
3529 upper and lower cases. */
3530 if ((syntax & RE_ICASE)
3531 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3532 name = "alpha";
3534 #ifdef RE_ENABLE_I18N
3535 /* Check the space of the arrays. */
3536 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3538 /* Not enough, realloc it. */
3539 /* +1 in case of mbcset->nchar_classes is 0. */
3540 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3541 /* Use realloc since array is NULL if *alloc == 0. */
3542 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3543 new_char_class_alloc);
3544 if (BE (new_char_classes == NULL, 0))
3545 return REG_ESPACE;
3546 mbcset->char_classes = new_char_classes;
3547 *char_class_alloc = new_char_class_alloc;
3549 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3550 #endif /* RE_ENABLE_I18N */
3552 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3553 do { \
3554 if (BE (trans != NULL, 0)) \
3556 for (i = 0; i < SBC_MAX; ++i) \
3557 if (ctype_func (i)) \
3558 bitset_set (sbcset, trans[i]); \
3560 else \
3562 for (i = 0; i < SBC_MAX; ++i) \
3563 if (ctype_func (i)) \
3564 bitset_set (sbcset, i); \
3566 } while (0)
3568 if (strcmp (name, "alnum") == 0)
3569 BUILD_CHARCLASS_LOOP (isalnum);
3570 else if (strcmp (name, "cntrl") == 0)
3571 BUILD_CHARCLASS_LOOP (iscntrl);
3572 else if (strcmp (name, "lower") == 0)
3573 BUILD_CHARCLASS_LOOP (islower);
3574 else if (strcmp (name, "space") == 0)
3575 BUILD_CHARCLASS_LOOP (isspace);
3576 else if (strcmp (name, "alpha") == 0)
3577 BUILD_CHARCLASS_LOOP (isalpha);
3578 else if (strcmp (name, "digit") == 0)
3579 BUILD_CHARCLASS_LOOP (isdigit);
3580 else if (strcmp (name, "print") == 0)
3581 BUILD_CHARCLASS_LOOP (isprint);
3582 else if (strcmp (name, "upper") == 0)
3583 BUILD_CHARCLASS_LOOP (isupper);
3584 else if (strcmp (name, "blank") == 0)
3585 BUILD_CHARCLASS_LOOP (isblank);
3586 else if (strcmp (name, "graph") == 0)
3587 BUILD_CHARCLASS_LOOP (isgraph);
3588 else if (strcmp (name, "punct") == 0)
3589 BUILD_CHARCLASS_LOOP (ispunct);
3590 else if (strcmp (name, "xdigit") == 0)
3591 BUILD_CHARCLASS_LOOP (isxdigit);
3592 else
3593 return REG_ECTYPE;
3595 return REG_NOERROR;
3598 static bin_tree_t *
3599 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3600 const unsigned char *class_name,
3601 const unsigned char *extra, bool non_match,
3602 reg_errcode_t *err)
3604 re_bitset_ptr_t sbcset;
3605 #ifdef RE_ENABLE_I18N
3606 re_charset_t *mbcset;
3607 Idx alloc = 0;
3608 #endif /* not RE_ENABLE_I18N */
3609 reg_errcode_t ret;
3610 re_token_t br_token;
3611 bin_tree_t *tree;
3613 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3614 #ifdef RE_ENABLE_I18N
3615 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3616 #endif /* RE_ENABLE_I18N */
3618 #ifdef RE_ENABLE_I18N
3619 if (BE (sbcset == NULL || mbcset == NULL, 0))
3620 #else /* not RE_ENABLE_I18N */
3621 if (BE (sbcset == NULL, 0))
3622 #endif /* not RE_ENABLE_I18N */
3624 *err = REG_ESPACE;
3625 return NULL;
3628 if (non_match)
3630 #ifdef RE_ENABLE_I18N
3631 mbcset->non_match = 1;
3632 #endif /* not RE_ENABLE_I18N */
3635 /* We don't care the syntax in this case. */
3636 ret = build_charclass (trans, sbcset,
3637 #ifdef RE_ENABLE_I18N
3638 mbcset, &alloc,
3639 #endif /* RE_ENABLE_I18N */
3640 class_name, 0);
3642 if (BE (ret != REG_NOERROR, 0))
3644 re_free (sbcset);
3645 #ifdef RE_ENABLE_I18N
3646 free_charset (mbcset);
3647 #endif /* RE_ENABLE_I18N */
3648 *err = ret;
3649 return NULL;
3651 /* \w match '_' also. */
3652 for (; *extra; extra++)
3653 bitset_set (sbcset, *extra);
3655 /* If it is non-matching list. */
3656 if (non_match)
3657 bitset_not (sbcset);
3659 #ifdef RE_ENABLE_I18N
3660 /* Ensure only single byte characters are set. */
3661 if (dfa->mb_cur_max > 1)
3662 bitset_mask (sbcset, dfa->sb_char);
3663 #endif
3665 /* Build a tree for simple bracket. */
3666 br_token.type = SIMPLE_BRACKET;
3667 br_token.opr.sbcset = sbcset;
3668 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3669 if (BE (tree == NULL, 0))
3670 goto build_word_op_espace;
3672 #ifdef RE_ENABLE_I18N
3673 if (dfa->mb_cur_max > 1)
3675 bin_tree_t *mbc_tree;
3676 /* Build a tree for complex bracket. */
3677 br_token.type = COMPLEX_BRACKET;
3678 br_token.opr.mbcset = mbcset;
3679 dfa->has_mb_node = 1;
3680 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3681 if (BE (mbc_tree == NULL, 0))
3682 goto build_word_op_espace;
3683 /* Then join them by ALT node. */
3684 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3685 if (BE (mbc_tree != NULL, 1))
3686 return tree;
3688 else
3690 free_charset (mbcset);
3691 return tree;
3693 #else /* not RE_ENABLE_I18N */
3694 return tree;
3695 #endif /* not RE_ENABLE_I18N */
3697 build_word_op_espace:
3698 re_free (sbcset);
3699 #ifdef RE_ENABLE_I18N
3700 free_charset (mbcset);
3701 #endif /* RE_ENABLE_I18N */
3702 *err = REG_ESPACE;
3703 return NULL;
3706 /* This is intended for the expressions like "a{1,3}".
3707 Fetch a number from `input', and return the number.
3708 Return REG_MISSING if the number field is empty like "{,1}".
3709 Return REG_ERROR if an error occurred. */
3711 static Idx
3712 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3714 Idx num = REG_MISSING;
3715 unsigned char c;
3716 while (1)
3718 fetch_token (token, input, syntax);
3719 c = token->opr.c;
3720 if (BE (token->type == END_OF_RE, 0))
3721 return REG_ERROR;
3722 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3723 break;
3724 num = ((token->type != CHARACTER || c < '0' || '9' < c
3725 || num == REG_ERROR)
3726 ? REG_ERROR
3727 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3728 num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3730 return num;
3733 #ifdef RE_ENABLE_I18N
3734 static void
3735 free_charset (re_charset_t *cset)
3737 re_free (cset->mbchars);
3738 # ifdef _LIBC
3739 re_free (cset->coll_syms);
3740 re_free (cset->equiv_classes);
3741 re_free (cset->range_starts);
3742 re_free (cset->range_ends);
3743 # endif
3744 re_free (cset->char_classes);
3745 re_free (cset);
3747 #endif /* RE_ENABLE_I18N */
3749 /* Functions for binary tree operation. */
3751 /* Create a tree node. */
3753 static bin_tree_t *
3754 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3755 re_token_type_t type)
3757 re_token_t t;
3758 t.type = type;
3759 return create_token_tree (dfa, left, right, &t);
3762 static bin_tree_t *
3763 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3764 const re_token_t *token)
3766 bin_tree_t *tree;
3767 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3769 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3771 if (storage == NULL)
3772 return NULL;
3773 storage->next = dfa->str_tree_storage;
3774 dfa->str_tree_storage = storage;
3775 dfa->str_tree_storage_idx = 0;
3777 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3779 tree->parent = NULL;
3780 tree->left = left;
3781 tree->right = right;
3782 tree->token = *token;
3783 tree->token.duplicated = 0;
3784 tree->token.opt_subexp = 0;
3785 tree->first = NULL;
3786 tree->next = NULL;
3787 tree->node_idx = REG_MISSING;
3789 if (left != NULL)
3790 left->parent = tree;
3791 if (right != NULL)
3792 right->parent = tree;
3793 return tree;
3796 /* Mark the tree SRC as an optional subexpression.
3797 To be called from preorder or postorder. */
3799 static reg_errcode_t
3800 mark_opt_subexp (void *extra, bin_tree_t *node)
3802 Idx idx = (Idx) (long) extra;
3803 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3804 node->token.opt_subexp = 1;
3806 return REG_NOERROR;
3809 /* Free the allocated memory inside NODE. */
3811 static void
3812 free_token (re_token_t *node)
3814 #ifdef RE_ENABLE_I18N
3815 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3816 free_charset (node->opr.mbcset);
3817 else
3818 #endif /* RE_ENABLE_I18N */
3819 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3820 re_free (node->opr.sbcset);
3823 /* Worker function for tree walking. Free the allocated memory inside NODE
3824 and its children. */
3826 static reg_errcode_t
3827 free_tree (void *extra, bin_tree_t *node)
3829 free_token (&node->token);
3830 return REG_NOERROR;
3834 /* Duplicate the node SRC, and return new node. This is a preorder
3835 visit similar to the one implemented by the generic visitor, but
3836 we need more infrastructure to maintain two parallel trees --- so,
3837 it's easier to duplicate. */
3839 static bin_tree_t *
3840 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3842 const bin_tree_t *node;
3843 bin_tree_t *dup_root;
3844 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3846 for (node = root; ; )
3848 /* Create a new tree and link it back to the current parent. */
3849 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3850 if (*p_new == NULL)
3851 return NULL;
3852 (*p_new)->parent = dup_node;
3853 (*p_new)->token.duplicated = 1;
3854 dup_node = *p_new;
3856 /* Go to the left node, or up and to the right. */
3857 if (node->left)
3859 node = node->left;
3860 p_new = &dup_node->left;
3862 else
3864 const bin_tree_t *prev = NULL;
3865 while (node->right == prev || node->right == NULL)
3867 prev = node;
3868 node = node->parent;
3869 dup_node = dup_node->parent;
3870 if (!node)
3871 return dup_root;
3873 node = node->right;
3874 p_new = &dup_node->right;