Interested in helping open source friends on HP-UX?
[git/mjg.git] / compat / regex / regcomp.c
blob06f30887080bc293994ed470ff9b313ae048dcfb
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA. */
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 #ifdef RE_ENABLE_I18N
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 #ifdef RE_ENABLE_I18N
33 static void optimize_utf8 (re_dfa_t *dfa);
34 #endif
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
38 void *extra);
39 static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 bin_tree_t *node);
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 unsigned int constraint);
52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 int node, int root);
55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56 static int fetch_number (re_string_t *input, re_token_t *token,
57 reg_syntax_t syntax);
58 static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 int accept_hyphen);
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89 #ifdef RE_ENABLE_I18N
90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 re_charset_t *mbcset,
92 int *equiv_class_alloc,
93 const unsigned char *name);
94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *char_class_alloc,
98 const char *class_name,
99 reg_syntax_t syntax);
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 bitset_t sbcset,
105 const char *class_name,
106 reg_syntax_t syntax);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const char *class_name,
111 const char *extra,
112 int non_match, reg_errcode_t *err);
113 static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120 static void free_token (re_token_t *node);
121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
129 const char __re_error_msgid[] attribute_hidden =
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
133 "\0"
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
136 "\0"
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 "\0"
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 "\0"
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 "\0"
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 "\0"
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 "\0"
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 "\0"
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 "\0"
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 "\0"
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 "\0"
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 "\0"
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 "\0"
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 "\0"
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 "\0"
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 "\0"
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 const size_t __re_error_msgid_idx[] attribute_hidden =
185 REG_NOERROR_IDX,
186 REG_NOMATCH_IDX,
187 REG_BADPAT_IDX,
188 REG_ECOLLATE_IDX,
189 REG_ECTYPE_IDX,
190 REG_EESCAPE_IDX,
191 REG_ESUBREG_IDX,
192 REG_EBRACK_IDX,
193 REG_EPAREN_IDX,
194 REG_EBRACE_IDX,
195 REG_BADBR_IDX,
196 REG_ERANGE_IDX,
197 REG_ESPACE_IDX,
198 REG_BADRPT_IDX,
199 REG_EEND_IDX,
200 REG_ESIZE_IDX,
201 REG_ERPAREN_IDX
204 /* Entry points for GNU code. */
207 #ifdef ZOS_USS
209 /* For ZOS USS we must define btowc */
211 wchar_t
212 btowc (int c)
214 wchar_t wtmp[2];
215 char tmp[2];
217 tmp[0] = c;
218 tmp[1] = 0;
220 mbtowc (wtmp, tmp, 1);
221 return wtmp[0];
223 #endif
225 /* re_compile_pattern is the GNU regular expression compiler: it
226 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
227 Returns 0 if the pattern was valid, otherwise an error string.
229 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
230 are set in BUFP on entry. */
232 const char *
233 re_compile_pattern (const char *pattern,
234 size_t length,
235 struct re_pattern_buffer *bufp)
237 reg_errcode_t ret;
239 /* And GNU code determines whether or not to get register information
240 by passing null for the REGS argument to re_match, etc., not by
241 setting no_sub, unless RE_NO_SUB is set. */
242 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
244 /* Match anchors at newline. */
245 bufp->newline_anchor = 1;
247 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
249 if (!ret)
250 return NULL;
251 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
253 #ifdef _LIBC
254 weak_alias (__re_compile_pattern, re_compile_pattern)
255 #endif
257 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
258 also be assigned to arbitrarily: each pattern buffer stores its own
259 syntax, so it can be changed between regex compilations. */
260 /* This has no initializer because initialized variables in Emacs
261 become read-only after dumping. */
262 reg_syntax_t re_syntax_options;
265 /* Specify the precise syntax of regexps for compilation. This provides
266 for compatibility for various utilities which historically have
267 different, incompatible syntaxes.
269 The argument SYNTAX is a bit mask comprised of the various bits
270 defined in regex.h. We return the old syntax. */
272 reg_syntax_t
273 re_set_syntax (reg_syntax_t syntax)
275 reg_syntax_t ret = re_syntax_options;
277 re_syntax_options = syntax;
278 return ret;
280 #ifdef _LIBC
281 weak_alias (__re_set_syntax, re_set_syntax)
282 #endif
285 re_compile_fastmap (struct re_pattern_buffer *bufp)
287 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
288 char *fastmap = bufp->fastmap;
290 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
291 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
292 if (dfa->init_state != dfa->init_state_word)
293 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
294 if (dfa->init_state != dfa->init_state_nl)
295 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
296 if (dfa->init_state != dfa->init_state_begbuf)
297 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
298 bufp->fastmap_accurate = 1;
299 return 0;
301 #ifdef _LIBC
302 weak_alias (__re_compile_fastmap, re_compile_fastmap)
303 #endif
305 static inline void
306 __attribute ((always_inline))
307 re_set_fastmap (char *fastmap, int icase, int ch)
309 fastmap[ch] = 1;
310 if (icase)
311 fastmap[tolower (ch)] = 1;
314 /* Helper function for re_compile_fastmap.
315 Compile fastmap for the initial_state INIT_STATE. */
317 static void
318 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
319 char *fastmap)
321 volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
322 int node_cnt;
323 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
324 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
326 int node = init_state->nodes.elems[node_cnt];
327 re_token_type_t type = dfa->nodes[node].type;
329 if (type == CHARACTER)
331 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
332 #ifdef RE_ENABLE_I18N
333 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
335 unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
336 wchar_t wc;
337 mbstate_t state;
339 p = buf;
340 *p++ = dfa->nodes[node].opr.c;
341 while (++node < dfa->nodes_len
342 && dfa->nodes[node].type == CHARACTER
343 && dfa->nodes[node].mb_partial)
344 *p++ = dfa->nodes[node].opr.c;
345 memset (&state, '\0', sizeof (state));
346 if (__mbrtowc (&wc, (const char *) buf, p - buf,
347 &state) == p - buf
348 && (__wcrtomb ((char *) buf, towlower (wc), &state)
349 != (size_t) -1))
350 re_set_fastmap (fastmap, 0, buf[0]);
351 re_free (buf);
353 #endif
355 else if (type == SIMPLE_BRACKET)
357 int i, ch;
358 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
360 int j;
361 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
362 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
363 if (w & ((bitset_word_t) 1 << j))
364 re_set_fastmap (fastmap, icase, ch);
367 #ifdef RE_ENABLE_I18N
368 else if (type == COMPLEX_BRACKET)
370 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
371 int i;
373 # ifdef _LIBC
374 /* See if we have to try all bytes which start multiple collation
375 elements.
376 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
377 collation element, and don't catch 'b' since 'b' is
378 the only collation element which starts from 'b' (and
379 it is caught by SIMPLE_BRACKET). */
380 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
381 && (cset->ncoll_syms || cset->nranges))
383 const int32_t *table = (const int32_t *)
384 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
385 for (i = 0; i < SBC_MAX; ++i)
386 if (table[i] < 0)
387 re_set_fastmap (fastmap, icase, i);
389 # endif /* _LIBC */
391 /* See if we have to start the match at all multibyte characters,
392 i.e. where we would not find an invalid sequence. This only
393 applies to multibyte character sets; for single byte character
394 sets, the SIMPLE_BRACKET again suffices. */
395 if (dfa->mb_cur_max > 1
396 && (cset->nchar_classes || cset->non_match || cset->nranges
397 # ifdef _LIBC
398 || cset->nequiv_classes
399 # endif /* _LIBC */
402 unsigned char c = 0;
405 mbstate_t mbs;
406 memset (&mbs, 0, sizeof (mbs));
407 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
408 re_set_fastmap (fastmap, false, (int) c);
410 while (++c != 0);
413 else
415 /* ... Else catch all bytes which can start the mbchars. */
416 for (i = 0; i < cset->nmbchars; ++i)
418 char buf[256];
419 mbstate_t state;
420 memset (&state, '\0', sizeof (state));
421 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
422 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
423 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
425 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
426 != (size_t) -1)
427 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
432 #endif /* RE_ENABLE_I18N */
433 else if (type == OP_PERIOD
434 #ifdef RE_ENABLE_I18N
435 || type == OP_UTF8_PERIOD
436 #endif /* RE_ENABLE_I18N */
437 || type == END_OF_RE)
439 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
440 if (type == END_OF_RE)
441 bufp->can_be_null = 1;
442 return;
447 /* Entry point for POSIX code. */
448 /* regcomp takes a regular expression as a string and compiles it.
450 PREG is a regex_t *. We do not expect any fields to be initialized,
451 since POSIX says we shouldn't. Thus, we set
453 `buffer' to the compiled pattern;
454 `used' to the length of the compiled pattern;
455 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
456 REG_EXTENDED bit in CFLAGS is set; otherwise, to
457 RE_SYNTAX_POSIX_BASIC;
458 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
459 `fastmap' to an allocated space for the fastmap;
460 `fastmap_accurate' to zero;
461 `re_nsub' to the number of subexpressions in PATTERN.
463 PATTERN is the address of the pattern string.
465 CFLAGS is a series of bits which affect compilation.
467 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
468 use POSIX basic syntax.
470 If REG_NEWLINE is set, then . and [^...] don't match newline.
471 Also, regexec will try a match beginning after every newline.
473 If REG_ICASE is set, then we considers upper- and lowercase
474 versions of letters to be equivalent when matching.
476 If REG_NOSUB is set, then when PREG is passed to regexec, that
477 routine will report only success or failure, and nothing about the
478 registers.
480 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
481 the return codes and their meanings.) */
484 regcomp (regex_t *__restrict preg,
485 const char *__restrict pattern,
486 int cflags)
488 reg_errcode_t ret;
489 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
490 : RE_SYNTAX_POSIX_BASIC);
492 preg->buffer = NULL;
493 preg->allocated = 0;
494 preg->used = 0;
496 /* Try to allocate space for the fastmap. */
497 preg->fastmap = re_malloc (char, SBC_MAX);
498 if (BE (preg->fastmap == NULL, 0))
499 return REG_ESPACE;
501 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
503 /* If REG_NEWLINE is set, newlines are treated differently. */
504 if (cflags & REG_NEWLINE)
505 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
506 syntax &= ~RE_DOT_NEWLINE;
507 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
508 /* It also changes the matching behavior. */
509 preg->newline_anchor = 1;
511 else
512 preg->newline_anchor = 0;
513 preg->no_sub = !!(cflags & REG_NOSUB);
514 preg->translate = NULL;
516 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
518 /* POSIX doesn't distinguish between an unmatched open-group and an
519 unmatched close-group: both are REG_EPAREN. */
520 if (ret == REG_ERPAREN)
521 ret = REG_EPAREN;
523 /* We have already checked preg->fastmap != NULL. */
524 if (BE (ret == REG_NOERROR, 1))
525 /* Compute the fastmap now, since regexec cannot modify the pattern
526 buffer. This function never fails in this implementation. */
527 (void) re_compile_fastmap (preg);
528 else
530 /* Some error occurred while compiling the expression. */
531 re_free (preg->fastmap);
532 preg->fastmap = NULL;
535 return (int) ret;
537 #ifdef _LIBC
538 weak_alias (__regcomp, regcomp)
539 #endif
541 /* Returns a message corresponding to an error code, ERRCODE, returned
542 from either regcomp or regexec. We don't use PREG here. */
544 size_t
545 regerror(int errcode, const regex_t *__restrict preg,
546 char *__restrict errbuf, size_t errbuf_size)
548 const char *msg;
549 size_t msg_size;
551 if (BE (errcode < 0
552 || errcode >= (int) (sizeof (__re_error_msgid_idx)
553 / sizeof (__re_error_msgid_idx[0])), 0))
554 /* Only error codes returned by the rest of the code should be passed
555 to this routine. If we are given anything else, or if other regex
556 code generates an invalid error code, then the program has a bug.
557 Dump core so we can fix it. */
558 abort ();
560 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
562 msg_size = strlen (msg) + 1; /* Includes the null. */
564 if (BE (errbuf_size != 0, 1))
566 if (BE (msg_size > errbuf_size, 0))
568 memcpy (errbuf, msg, errbuf_size - 1);
569 errbuf[errbuf_size - 1] = 0;
571 else
572 memcpy (errbuf, msg, msg_size);
575 return msg_size;
577 #ifdef _LIBC
578 weak_alias (__regerror, regerror)
579 #endif
582 #ifdef RE_ENABLE_I18N
583 /* This static array is used for the map to single-byte characters when
584 UTF-8 is used. Otherwise we would allocate memory just to initialize
585 it the same all the time. UTF-8 is the preferred encoding so this is
586 a worthwhile optimization. */
587 #if __GNUC__ >= 3
588 static const bitset_t utf8_sb_map = {
589 /* Set the first 128 bits. */
590 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
592 #else /* ! (__GNUC__ >= 3) */
593 static bitset_t utf8_sb_map;
594 #endif /* __GNUC__ >= 3 */
595 #endif /* RE_ENABLE_I18N */
598 static void
599 free_dfa_content (re_dfa_t *dfa)
601 int i, j;
603 if (dfa->nodes)
604 for (i = 0; i < dfa->nodes_len; ++i)
605 free_token (dfa->nodes + i);
606 re_free (dfa->nexts);
607 for (i = 0; i < dfa->nodes_len; ++i)
609 if (dfa->eclosures != NULL)
610 re_node_set_free (dfa->eclosures + i);
611 if (dfa->inveclosures != NULL)
612 re_node_set_free (dfa->inveclosures + i);
613 if (dfa->edests != NULL)
614 re_node_set_free (dfa->edests + i);
616 re_free (dfa->edests);
617 re_free (dfa->eclosures);
618 re_free (dfa->inveclosures);
619 re_free (dfa->nodes);
621 if (dfa->state_table)
622 for (i = 0; i <= dfa->state_hash_mask; ++i)
624 struct re_state_table_entry *entry = dfa->state_table + i;
625 for (j = 0; j < entry->num; ++j)
627 re_dfastate_t *state = entry->array[j];
628 free_state (state);
630 re_free (entry->array);
632 re_free (dfa->state_table);
633 #ifdef RE_ENABLE_I18N
634 if (dfa->sb_char != utf8_sb_map)
635 re_free (dfa->sb_char);
636 #endif
637 re_free (dfa->subexp_map);
638 #ifdef DEBUG
639 re_free (dfa->re_str);
640 #endif
642 re_free (dfa);
646 /* Free dynamically allocated space used by PREG. */
648 void
649 regfree (regex_t *preg)
651 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
652 if (BE (dfa != NULL, 1))
653 free_dfa_content (dfa);
654 preg->buffer = NULL;
655 preg->allocated = 0;
657 re_free (preg->fastmap);
658 preg->fastmap = NULL;
660 re_free (preg->translate);
661 preg->translate = NULL;
663 #ifdef _LIBC
664 weak_alias (__regfree, regfree)
665 #endif
667 /* Entry points compatible with 4.2 BSD regex library. We don't define
668 them unless specifically requested. */
670 #if defined _REGEX_RE_COMP || defined _LIBC
672 /* BSD has one and only one pattern buffer. */
673 static struct re_pattern_buffer re_comp_buf;
675 char *
676 # ifdef _LIBC
677 /* Make these definitions weak in libc, so POSIX programs can redefine
678 these names if they don't use our functions, and still use
679 regcomp/regexec above without link errors. */
680 weak_function
681 # endif
682 re_comp (s)
683 const char *s;
685 reg_errcode_t ret;
686 char *fastmap;
688 if (!s)
690 if (!re_comp_buf.buffer)
691 return gettext ("No previous regular expression");
692 return 0;
695 if (re_comp_buf.buffer)
697 fastmap = re_comp_buf.fastmap;
698 re_comp_buf.fastmap = NULL;
699 __regfree (&re_comp_buf);
700 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
701 re_comp_buf.fastmap = fastmap;
704 if (re_comp_buf.fastmap == NULL)
706 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
707 if (re_comp_buf.fastmap == NULL)
708 return (char *) gettext (__re_error_msgid
709 + __re_error_msgid_idx[(int) REG_ESPACE]);
712 /* Since `re_exec' always passes NULL for the `regs' argument, we
713 don't need to initialize the pattern buffer fields which affect it. */
715 /* Match anchors at newlines. */
716 re_comp_buf.newline_anchor = 1;
718 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
720 if (!ret)
721 return NULL;
723 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
724 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
727 #ifdef _LIBC
728 libc_freeres_fn (free_mem)
730 __regfree (&re_comp_buf);
732 #endif
734 #endif /* _REGEX_RE_COMP */
736 /* Internal entry point.
737 Compile the regular expression PATTERN, whose length is LENGTH.
738 SYNTAX indicate regular expression's syntax. */
740 static reg_errcode_t
741 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
742 reg_syntax_t syntax)
744 reg_errcode_t err = REG_NOERROR;
745 re_dfa_t *dfa;
746 re_string_t regexp;
748 /* Initialize the pattern buffer. */
749 preg->fastmap_accurate = 0;
750 preg->syntax = syntax;
751 preg->not_bol = preg->not_eol = 0;
752 preg->used = 0;
753 preg->re_nsub = 0;
754 preg->can_be_null = 0;
755 preg->regs_allocated = REGS_UNALLOCATED;
757 /* Initialize the dfa. */
758 dfa = (re_dfa_t *) preg->buffer;
759 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
761 /* If zero allocated, but buffer is non-null, try to realloc
762 enough space. This loses if buffer's address is bogus, but
763 that is the user's responsibility. If ->buffer is NULL this
764 is a simple allocation. */
765 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
766 if (dfa == NULL)
767 return REG_ESPACE;
768 preg->allocated = sizeof (re_dfa_t);
769 preg->buffer = (unsigned char *) dfa;
771 preg->used = sizeof (re_dfa_t);
773 err = init_dfa (dfa, length);
774 if (BE (err != REG_NOERROR, 0))
776 free_dfa_content (dfa);
777 preg->buffer = NULL;
778 preg->allocated = 0;
779 return err;
781 #ifdef DEBUG
782 /* Note: length+1 will not overflow since it is checked in init_dfa. */
783 dfa->re_str = re_malloc (char, length + 1);
784 strncpy (dfa->re_str, pattern, length + 1);
785 #endif
787 __libc_lock_init (dfa->lock);
789 err = re_string_construct (&regexp, pattern, length, preg->translate,
790 syntax & RE_ICASE, dfa);
791 if (BE (err != REG_NOERROR, 0))
793 re_compile_internal_free_return:
794 free_workarea_compile (preg);
795 re_string_destruct (&regexp);
796 free_dfa_content (dfa);
797 preg->buffer = NULL;
798 preg->allocated = 0;
799 return err;
802 /* Parse the regular expression, and build a structure tree. */
803 preg->re_nsub = 0;
804 dfa->str_tree = parse (&regexp, preg, syntax, &err);
805 if (BE (dfa->str_tree == NULL, 0))
806 goto re_compile_internal_free_return;
808 /* Analyze the tree and create the nfa. */
809 err = analyze (preg);
810 if (BE (err != REG_NOERROR, 0))
811 goto re_compile_internal_free_return;
813 #ifdef RE_ENABLE_I18N
814 /* If possible, do searching in single byte encoding to speed things up. */
815 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
816 optimize_utf8 (dfa);
817 #endif
819 /* Then create the initial state of the dfa. */
820 err = create_initial_state (dfa);
822 /* Release work areas. */
823 free_workarea_compile (preg);
824 re_string_destruct (&regexp);
826 if (BE (err != REG_NOERROR, 0))
828 free_dfa_content (dfa);
829 preg->buffer = NULL;
830 preg->allocated = 0;
833 return err;
836 /* Initialize DFA. We use the length of the regular expression PAT_LEN
837 as the initial length of some arrays. */
839 static reg_errcode_t
840 init_dfa (re_dfa_t *dfa, size_t pat_len)
842 unsigned int table_size;
843 #ifndef _LIBC
844 char *codeset_name;
845 #endif
847 memset (dfa, '\0', sizeof (re_dfa_t));
849 /* Force allocation of str_tree_storage the first time. */
850 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
852 /* Avoid overflows. */
853 if (pat_len == SIZE_MAX)
854 return REG_ESPACE;
856 dfa->nodes_alloc = pat_len + 1;
857 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
859 /* table_size = 2 ^ ceil(log pat_len) */
860 for (table_size = 1; ; table_size <<= 1)
861 if (table_size > pat_len)
862 break;
864 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
865 dfa->state_hash_mask = table_size - 1;
867 dfa->mb_cur_max = MB_CUR_MAX;
868 #ifdef _LIBC
869 if (dfa->mb_cur_max == 6
870 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
871 dfa->is_utf8 = 1;
872 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
873 != 0);
874 #else
875 # ifdef HAVE_LANGINFO_CODESET
876 codeset_name = nl_langinfo (CODESET);
877 # else
878 codeset_name = getenv ("LC_ALL");
879 if (codeset_name == NULL || codeset_name[0] == '\0')
880 codeset_name = getenv ("LC_CTYPE");
881 if (codeset_name == NULL || codeset_name[0] == '\0')
882 codeset_name = getenv ("LANG");
883 if (codeset_name == NULL)
884 codeset_name = "";
885 else if (strchr (codeset_name, '.') != NULL)
886 codeset_name = strchr (codeset_name, '.') + 1;
887 # endif
889 /* strcasecmp isn't a standard interface. brute force check */
890 #if 0
891 if (strcasecmp (codeset_name, "UTF-8") == 0
892 || strcasecmp (codeset_name, "UTF8") == 0)
893 dfa->is_utf8 = 1;
894 #else
895 if ( (codeset_name[0] == 'U' || codeset_name[0] == 'u')
896 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
897 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
898 && (codeset_name[3] == '-'
899 ? codeset_name[4] == '8' && codeset_name[5] == '\0'
900 : codeset_name[3] == '8' && codeset_name[4] == '\0'))
901 dfa->is_utf8 = 1;
902 #endif
904 /* We check exhaustively in the loop below if this charset is a
905 superset of ASCII. */
906 dfa->map_notascii = 0;
907 #endif
909 #ifdef RE_ENABLE_I18N
910 if (dfa->mb_cur_max > 1)
912 if (dfa->is_utf8)
914 #if !defined(__GNUC__) || __GNUC__ < 3
915 static short utf8_sb_map_inited = 0;
917 if (! utf8_sb_map_inited)
919 int i;
921 utf8_sb_map_inited = 0;
922 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
923 utf8_sb_map[i] = BITSET_WORD_MAX;
925 #endif
926 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
928 else
930 int i, j, ch;
932 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
933 if (BE (dfa->sb_char == NULL, 0))
934 return REG_ESPACE;
936 /* Set the bits corresponding to single byte chars. */
937 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
938 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
940 wint_t wch = __btowc (ch);
941 if (wch != WEOF)
942 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
943 # ifndef _LIBC
944 if (isascii (ch) && wch != ch)
945 dfa->map_notascii = 1;
946 # endif
950 #endif
952 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
953 return REG_ESPACE;
954 return REG_NOERROR;
957 /* Initialize WORD_CHAR table, which indicate which character is
958 "word". In this case "word" means that it is the word construction
959 character used by some operators like "\<", "\>", etc. */
961 static void
962 internal_function
963 init_word_char (re_dfa_t *dfa)
965 int i, j, ch;
966 dfa->word_ops_used = 1;
967 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
968 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
969 if (isalnum (ch) || ch == '_')
970 dfa->word_char[i] |= (bitset_word_t) 1 << j;
973 /* Free the work area which are only used while compiling. */
975 static void
976 free_workarea_compile (regex_t *preg)
978 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
979 bin_tree_storage_t *storage, *next;
980 for (storage = dfa->str_tree_storage; storage; storage = next)
982 next = storage->next;
983 re_free (storage);
985 dfa->str_tree_storage = NULL;
986 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
987 dfa->str_tree = NULL;
988 re_free (dfa->org_indices);
989 dfa->org_indices = NULL;
992 /* Create initial states for all contexts. */
994 static reg_errcode_t
995 create_initial_state (re_dfa_t *dfa)
997 int first, i;
998 reg_errcode_t err;
999 re_node_set init_nodes;
1001 /* Initial states have the epsilon closure of the node which is
1002 the first node of the regular expression. */
1003 first = dfa->str_tree->first->node_idx;
1004 dfa->init_node = first;
1005 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1006 if (BE (err != REG_NOERROR, 0))
1007 return err;
1009 /* The back-references which are in initial states can epsilon transit,
1010 since in this case all of the subexpressions can be null.
1011 Then we add epsilon closures of the nodes which are the next nodes of
1012 the back-references. */
1013 if (dfa->nbackref > 0)
1014 for (i = 0; i < init_nodes.nelem; ++i)
1016 int node_idx = init_nodes.elems[i];
1017 re_token_type_t type = dfa->nodes[node_idx].type;
1019 int clexp_idx;
1020 if (type != OP_BACK_REF)
1021 continue;
1022 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1024 re_token_t *clexp_node;
1025 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1026 if (clexp_node->type == OP_CLOSE_SUBEXP
1027 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1028 break;
1030 if (clexp_idx == init_nodes.nelem)
1031 continue;
1033 if (type == OP_BACK_REF)
1035 int dest_idx = dfa->edests[node_idx].elems[0];
1036 if (!re_node_set_contains (&init_nodes, dest_idx))
1038 reg_errcode_t err = re_node_set_merge (&init_nodes,
1039 dfa->eclosures
1040 + dest_idx);
1041 if (err != REG_NOERROR)
1042 return err;
1043 i = 0;
1048 /* It must be the first time to invoke acquire_state. */
1049 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1050 /* We don't check ERR here, since the initial state must not be NULL. */
1051 if (BE (dfa->init_state == NULL, 0))
1052 return err;
1053 if (dfa->init_state->has_constraint)
1055 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1056 CONTEXT_WORD);
1057 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1058 CONTEXT_NEWLINE);
1059 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1060 &init_nodes,
1061 CONTEXT_NEWLINE
1062 | CONTEXT_BEGBUF);
1063 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1064 || dfa->init_state_begbuf == NULL, 0))
1065 return err;
1067 else
1068 dfa->init_state_word = dfa->init_state_nl
1069 = dfa->init_state_begbuf = dfa->init_state;
1071 re_node_set_free (&init_nodes);
1072 return REG_NOERROR;
1075 #ifdef RE_ENABLE_I18N
1076 /* If it is possible to do searching in single byte encoding instead of UTF-8
1077 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1078 DFA nodes where needed. */
1080 static void
1081 optimize_utf8 (re_dfa_t *dfa)
1083 int node, i, mb_chars = 0, has_period = 0;
1085 for (node = 0; node < dfa->nodes_len; ++node)
1086 switch (dfa->nodes[node].type)
1088 case CHARACTER:
1089 if (dfa->nodes[node].opr.c >= 0x80)
1090 mb_chars = 1;
1091 break;
1092 case ANCHOR:
1093 switch (dfa->nodes[node].opr.ctx_type)
1095 case LINE_FIRST:
1096 case LINE_LAST:
1097 case BUF_FIRST:
1098 case BUF_LAST:
1099 break;
1100 default:
1101 /* Word anchors etc. cannot be handled. It's okay to test
1102 opr.ctx_type since constraints (for all DFA nodes) are
1103 created by ORing one or more opr.ctx_type values. */
1104 return;
1106 break;
1107 case OP_PERIOD:
1108 has_period = 1;
1109 break;
1110 case OP_BACK_REF:
1111 case OP_ALT:
1112 case END_OF_RE:
1113 case OP_DUP_ASTERISK:
1114 case OP_OPEN_SUBEXP:
1115 case OP_CLOSE_SUBEXP:
1116 break;
1117 case COMPLEX_BRACKET:
1118 return;
1119 case SIMPLE_BRACKET:
1120 /* Just double check. The non-ASCII range starts at 0x80. */
1121 assert (0x80 % BITSET_WORD_BITS == 0);
1122 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1123 if (dfa->nodes[node].opr.sbcset[i])
1124 return;
1125 break;
1126 default:
1127 abort ();
1130 if (mb_chars || has_period)
1131 for (node = 0; node < dfa->nodes_len; ++node)
1133 if (dfa->nodes[node].type == CHARACTER
1134 && dfa->nodes[node].opr.c >= 0x80)
1135 dfa->nodes[node].mb_partial = 0;
1136 else if (dfa->nodes[node].type == OP_PERIOD)
1137 dfa->nodes[node].type = OP_UTF8_PERIOD;
1140 /* The search can be in single byte locale. */
1141 dfa->mb_cur_max = 1;
1142 dfa->is_utf8 = 0;
1143 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1145 #endif
1147 /* Analyze the structure tree, and calculate "first", "next", "edest",
1148 "eclosure", and "inveclosure". */
1150 static reg_errcode_t
1151 analyze (regex_t *preg)
1153 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1154 reg_errcode_t ret;
1156 /* Allocate arrays. */
1157 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1158 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1159 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1160 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1161 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1162 || dfa->eclosures == NULL, 0))
1163 return REG_ESPACE;
1165 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1166 if (dfa->subexp_map != NULL)
1168 int i;
1169 for (i = 0; i < preg->re_nsub; i++)
1170 dfa->subexp_map[i] = i;
1171 preorder (dfa->str_tree, optimize_subexps, dfa);
1172 for (i = 0; i < preg->re_nsub; i++)
1173 if (dfa->subexp_map[i] != i)
1174 break;
1175 if (i == preg->re_nsub)
1177 free (dfa->subexp_map);
1178 dfa->subexp_map = NULL;
1182 ret = postorder (dfa->str_tree, lower_subexps, preg);
1183 if (BE (ret != REG_NOERROR, 0))
1184 return ret;
1185 ret = postorder (dfa->str_tree, calc_first, dfa);
1186 if (BE (ret != REG_NOERROR, 0))
1187 return ret;
1188 preorder (dfa->str_tree, calc_next, dfa);
1189 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1190 if (BE (ret != REG_NOERROR, 0))
1191 return ret;
1192 ret = calc_eclosure (dfa);
1193 if (BE (ret != REG_NOERROR, 0))
1194 return ret;
1196 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1197 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1198 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1199 || dfa->nbackref)
1201 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1202 if (BE (dfa->inveclosures == NULL, 0))
1203 return REG_ESPACE;
1204 ret = calc_inveclosure (dfa);
1207 return ret;
1210 /* Our parse trees are very unbalanced, so we cannot use a stack to
1211 implement parse tree visits. Instead, we use parent pointers and
1212 some hairy code in these two functions. */
1213 static reg_errcode_t
1214 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1215 void *extra)
1217 bin_tree_t *node, *prev;
1219 for (node = root; ; )
1221 /* Descend down the tree, preferably to the left (or to the right
1222 if that's the only child). */
1223 while (node->left || node->right)
1224 if (node->left)
1225 node = node->left;
1226 else
1227 node = node->right;
1231 reg_errcode_t err = fn (extra, node);
1232 if (BE (err != REG_NOERROR, 0))
1233 return err;
1234 if (node->parent == NULL)
1235 return REG_NOERROR;
1236 prev = node;
1237 node = node->parent;
1239 /* Go up while we have a node that is reached from the right. */
1240 while (node->right == prev || node->right == NULL);
1241 node = node->right;
1245 static reg_errcode_t
1246 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1247 void *extra)
1249 bin_tree_t *node;
1251 for (node = root; ; )
1253 reg_errcode_t err = fn (extra, node);
1254 if (BE (err != REG_NOERROR, 0))
1255 return err;
1257 /* Go to the left node, or up and to the right. */
1258 if (node->left)
1259 node = node->left;
1260 else
1262 bin_tree_t *prev = NULL;
1263 while (node->right == prev || node->right == NULL)
1265 prev = node;
1266 node = node->parent;
1267 if (!node)
1268 return REG_NOERROR;
1270 node = node->right;
1275 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1276 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1277 backreferences as well. Requires a preorder visit. */
1278 static reg_errcode_t
1279 optimize_subexps (void *extra, bin_tree_t *node)
1281 re_dfa_t *dfa = (re_dfa_t *) extra;
1283 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1285 int idx = node->token.opr.idx;
1286 node->token.opr.idx = dfa->subexp_map[idx];
1287 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1290 else if (node->token.type == SUBEXP
1291 && node->left && node->left->token.type == SUBEXP)
1293 int other_idx = node->left->token.opr.idx;
1295 node->left = node->left->left;
1296 if (node->left)
1297 node->left->parent = node;
1299 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1300 if (other_idx < BITSET_WORD_BITS)
1301 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1304 return REG_NOERROR;
1307 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1308 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1309 static reg_errcode_t
1310 lower_subexps (void *extra, bin_tree_t *node)
1312 regex_t *preg = (regex_t *) extra;
1313 reg_errcode_t err = REG_NOERROR;
1315 if (node->left && node->left->token.type == SUBEXP)
1317 node->left = lower_subexp (&err, preg, node->left);
1318 if (node->left)
1319 node->left->parent = node;
1321 if (node->right && node->right->token.type == SUBEXP)
1323 node->right = lower_subexp (&err, preg, node->right);
1324 if (node->right)
1325 node->right->parent = node;
1328 return err;
1331 static bin_tree_t *
1332 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1334 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1335 bin_tree_t *body = node->left;
1336 bin_tree_t *op, *cls, *tree1, *tree;
1338 if (preg->no_sub
1339 /* We do not optimize empty subexpressions, because otherwise we may
1340 have bad CONCAT nodes with NULL children. This is obviously not
1341 very common, so we do not lose much. An example that triggers
1342 this case is the sed "script" /\(\)/x. */
1343 && node->left != NULL
1344 && (node->token.opr.idx >= BITSET_WORD_BITS
1345 || !(dfa->used_bkref_map
1346 & ((bitset_word_t) 1 << node->token.opr.idx))))
1347 return node->left;
1349 /* Convert the SUBEXP node to the concatenation of an
1350 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1351 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1352 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1353 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1354 tree = create_tree (dfa, op, tree1, CONCAT);
1355 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1357 *err = REG_ESPACE;
1358 return NULL;
1361 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1362 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1363 return tree;
1366 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1367 nodes. Requires a postorder visit. */
1368 static reg_errcode_t
1369 calc_first (void *extra, bin_tree_t *node)
1371 re_dfa_t *dfa = (re_dfa_t *) extra;
1372 if (node->token.type == CONCAT)
1374 node->first = node->left->first;
1375 node->node_idx = node->left->node_idx;
1377 else
1379 node->first = node;
1380 node->node_idx = re_dfa_add_node (dfa, node->token);
1381 if (BE (node->node_idx == -1, 0))
1382 return REG_ESPACE;
1383 if (node->token.type == ANCHOR)
1384 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1386 return REG_NOERROR;
1389 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1390 static reg_errcode_t
1391 calc_next (void *extra, bin_tree_t *node)
1393 switch (node->token.type)
1395 case OP_DUP_ASTERISK:
1396 node->left->next = node;
1397 break;
1398 case CONCAT:
1399 node->left->next = node->right->first;
1400 node->right->next = node->next;
1401 break;
1402 default:
1403 if (node->left)
1404 node->left->next = node->next;
1405 if (node->right)
1406 node->right->next = node->next;
1407 break;
1409 return REG_NOERROR;
1412 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1413 static reg_errcode_t
1414 link_nfa_nodes (void *extra, bin_tree_t *node)
1416 re_dfa_t *dfa = (re_dfa_t *) extra;
1417 int idx = node->node_idx;
1418 reg_errcode_t err = REG_NOERROR;
1420 switch (node->token.type)
1422 case CONCAT:
1423 break;
1425 case END_OF_RE:
1426 assert (node->next == NULL);
1427 break;
1429 case OP_DUP_ASTERISK:
1430 case OP_ALT:
1432 int left, right;
1433 dfa->has_plural_match = 1;
1434 if (node->left != NULL)
1435 left = node->left->first->node_idx;
1436 else
1437 left = node->next->node_idx;
1438 if (node->right != NULL)
1439 right = node->right->first->node_idx;
1440 else
1441 right = node->next->node_idx;
1442 assert (left > -1);
1443 assert (right > -1);
1444 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1446 break;
1448 case ANCHOR:
1449 case OP_OPEN_SUBEXP:
1450 case OP_CLOSE_SUBEXP:
1451 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1452 break;
1454 case OP_BACK_REF:
1455 dfa->nexts[idx] = node->next->node_idx;
1456 if (node->token.type == OP_BACK_REF)
1457 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1458 break;
1460 default:
1461 assert (!IS_EPSILON_NODE (node->token.type));
1462 dfa->nexts[idx] = node->next->node_idx;
1463 break;
1466 return err;
1469 /* Duplicate the epsilon closure of the node ROOT_NODE.
1470 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1471 to their own constraint. */
1473 static reg_errcode_t
1474 internal_function
1475 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1476 int root_node, unsigned int init_constraint)
1478 int org_node, clone_node, ret;
1479 unsigned int constraint = init_constraint;
1480 for (org_node = top_org_node, clone_node = top_clone_node;;)
1482 int org_dest, clone_dest;
1483 if (dfa->nodes[org_node].type == OP_BACK_REF)
1485 /* If the back reference epsilon-transit, its destination must
1486 also have the constraint. Then duplicate the epsilon closure
1487 of the destination of the back reference, and store it in
1488 edests of the back reference. */
1489 org_dest = dfa->nexts[org_node];
1490 re_node_set_empty (dfa->edests + clone_node);
1491 clone_dest = duplicate_node (dfa, org_dest, constraint);
1492 if (BE (clone_dest == -1, 0))
1493 return REG_ESPACE;
1494 dfa->nexts[clone_node] = dfa->nexts[org_node];
1495 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1496 if (BE (ret < 0, 0))
1497 return REG_ESPACE;
1499 else if (dfa->edests[org_node].nelem == 0)
1501 /* In case of the node can't epsilon-transit, don't duplicate the
1502 destination and store the original destination as the
1503 destination of the node. */
1504 dfa->nexts[clone_node] = dfa->nexts[org_node];
1505 break;
1507 else if (dfa->edests[org_node].nelem == 1)
1509 /* In case of the node can epsilon-transit, and it has only one
1510 destination. */
1511 org_dest = dfa->edests[org_node].elems[0];
1512 re_node_set_empty (dfa->edests + clone_node);
1513 /* If the node is root_node itself, it means the epsilon clsoure
1514 has a loop. Then tie it to the destination of the root_node. */
1515 if (org_node == root_node && clone_node != org_node)
1517 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1518 if (BE (ret < 0, 0))
1519 return REG_ESPACE;
1520 break;
1522 /* In case of the node has another constraint, add it. */
1523 constraint |= dfa->nodes[org_node].constraint;
1524 clone_dest = duplicate_node (dfa, org_dest, constraint);
1525 if (BE (clone_dest == -1, 0))
1526 return REG_ESPACE;
1527 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1528 if (BE (ret < 0, 0))
1529 return REG_ESPACE;
1531 else /* dfa->edests[org_node].nelem == 2 */
1533 /* In case of the node can epsilon-transit, and it has two
1534 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1535 org_dest = dfa->edests[org_node].elems[0];
1536 re_node_set_empty (dfa->edests + clone_node);
1537 /* Search for a duplicated node which satisfies the constraint. */
1538 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1539 if (clone_dest == -1)
1541 /* There is no such duplicated node, create a new one. */
1542 reg_errcode_t err;
1543 clone_dest = duplicate_node (dfa, org_dest, constraint);
1544 if (BE (clone_dest == -1, 0))
1545 return REG_ESPACE;
1546 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1547 if (BE (ret < 0, 0))
1548 return REG_ESPACE;
1549 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1550 root_node, constraint);
1551 if (BE (err != REG_NOERROR, 0))
1552 return err;
1554 else
1556 /* There is a duplicated node which satisfies the constraint,
1557 use it to avoid infinite loop. */
1558 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1559 if (BE (ret < 0, 0))
1560 return REG_ESPACE;
1563 org_dest = dfa->edests[org_node].elems[1];
1564 clone_dest = duplicate_node (dfa, org_dest, constraint);
1565 if (BE (clone_dest == -1, 0))
1566 return REG_ESPACE;
1567 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1568 if (BE (ret < 0, 0))
1569 return REG_ESPACE;
1571 org_node = org_dest;
1572 clone_node = clone_dest;
1574 return REG_NOERROR;
1577 /* Search for a node which is duplicated from the node ORG_NODE, and
1578 satisfies the constraint CONSTRAINT. */
1580 static int
1581 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1582 unsigned int constraint)
1584 int idx;
1585 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1587 if (org_node == dfa->org_indices[idx]
1588 && constraint == dfa->nodes[idx].constraint)
1589 return idx; /* Found. */
1591 return -1; /* Not found. */
1594 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1595 Return the index of the new node, or -1 if insufficient storage is
1596 available. */
1598 static int
1599 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1601 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1602 if (BE (dup_idx != -1, 1))
1604 dfa->nodes[dup_idx].constraint = constraint;
1605 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1606 dfa->nodes[dup_idx].duplicated = 1;
1608 /* Store the index of the original node. */
1609 dfa->org_indices[dup_idx] = org_idx;
1611 return dup_idx;
1614 static reg_errcode_t
1615 calc_inveclosure (re_dfa_t *dfa)
1617 int src, idx, ret;
1618 for (idx = 0; idx < dfa->nodes_len; ++idx)
1619 re_node_set_init_empty (dfa->inveclosures + idx);
1621 for (src = 0; src < dfa->nodes_len; ++src)
1623 int *elems = dfa->eclosures[src].elems;
1624 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1626 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1627 if (BE (ret == -1, 0))
1628 return REG_ESPACE;
1632 return REG_NOERROR;
1635 /* Calculate "eclosure" for all the node in DFA. */
1637 static reg_errcode_t
1638 calc_eclosure (re_dfa_t *dfa)
1640 int node_idx, incomplete;
1641 #ifdef DEBUG
1642 assert (dfa->nodes_len > 0);
1643 #endif
1644 incomplete = 0;
1645 /* For each nodes, calculate epsilon closure. */
1646 for (node_idx = 0; ; ++node_idx)
1648 reg_errcode_t err;
1649 re_node_set eclosure_elem;
1650 if (node_idx == dfa->nodes_len)
1652 if (!incomplete)
1653 break;
1654 incomplete = 0;
1655 node_idx = 0;
1658 #ifdef DEBUG
1659 assert (dfa->eclosures[node_idx].nelem != -1);
1660 #endif
1662 /* If we have already calculated, skip it. */
1663 if (dfa->eclosures[node_idx].nelem != 0)
1664 continue;
1665 /* Calculate epsilon closure of `node_idx'. */
1666 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1667 if (BE (err != REG_NOERROR, 0))
1668 return err;
1670 if (dfa->eclosures[node_idx].nelem == 0)
1672 incomplete = 1;
1673 re_node_set_free (&eclosure_elem);
1676 return REG_NOERROR;
1679 /* Calculate epsilon closure of NODE. */
1681 static reg_errcode_t
1682 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1684 reg_errcode_t err;
1685 int i;
1686 re_node_set eclosure;
1687 int ret;
1688 int incomplete = 0;
1689 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1690 if (BE (err != REG_NOERROR, 0))
1691 return err;
1693 /* This indicates that we are calculating this node now.
1694 We reference this value to avoid infinite loop. */
1695 dfa->eclosures[node].nelem = -1;
1697 /* If the current node has constraints, duplicate all nodes
1698 since they must inherit the constraints. */
1699 if (dfa->nodes[node].constraint
1700 && dfa->edests[node].nelem
1701 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1703 err = duplicate_node_closure (dfa, node, node, node,
1704 dfa->nodes[node].constraint);
1705 if (BE (err != REG_NOERROR, 0))
1706 return err;
1709 /* Expand each epsilon destination nodes. */
1710 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1711 for (i = 0; i < dfa->edests[node].nelem; ++i)
1713 re_node_set eclosure_elem;
1714 int edest = dfa->edests[node].elems[i];
1715 /* If calculating the epsilon closure of `edest' is in progress,
1716 return intermediate result. */
1717 if (dfa->eclosures[edest].nelem == -1)
1719 incomplete = 1;
1720 continue;
1722 /* If we haven't calculated the epsilon closure of `edest' yet,
1723 calculate now. Otherwise use calculated epsilon closure. */
1724 if (dfa->eclosures[edest].nelem == 0)
1726 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1727 if (BE (err != REG_NOERROR, 0))
1728 return err;
1730 else
1731 eclosure_elem = dfa->eclosures[edest];
1732 /* Merge the epsilon closure of `edest'. */
1733 err = re_node_set_merge (&eclosure, &eclosure_elem);
1734 if (BE (err != REG_NOERROR, 0))
1735 return err;
1736 /* If the epsilon closure of `edest' is incomplete,
1737 the epsilon closure of this node is also incomplete. */
1738 if (dfa->eclosures[edest].nelem == 0)
1740 incomplete = 1;
1741 re_node_set_free (&eclosure_elem);
1745 /* An epsilon closure includes itself. */
1746 ret = re_node_set_insert (&eclosure, node);
1747 if (BE (ret < 0, 0))
1748 return REG_ESPACE;
1749 if (incomplete && !root)
1750 dfa->eclosures[node].nelem = 0;
1751 else
1752 dfa->eclosures[node] = eclosure;
1753 *new_set = eclosure;
1754 return REG_NOERROR;
1757 /* Functions for token which are used in the parser. */
1759 /* Fetch a token from INPUT.
1760 We must not use this function inside bracket expressions. */
1762 static void
1763 internal_function
1764 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1766 re_string_skip_bytes (input, peek_token (result, input, syntax));
1769 /* Peek a token from INPUT, and return the length of the token.
1770 We must not use this function inside bracket expressions. */
1772 static int
1773 internal_function
1774 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1776 unsigned char c;
1778 if (re_string_eoi (input))
1780 token->type = END_OF_RE;
1781 return 0;
1784 c = re_string_peek_byte (input, 0);
1785 token->opr.c = c;
1787 token->word_char = 0;
1788 #ifdef RE_ENABLE_I18N
1789 token->mb_partial = 0;
1790 if (input->mb_cur_max > 1 &&
1791 !re_string_first_byte (input, re_string_cur_idx (input)))
1793 token->type = CHARACTER;
1794 token->mb_partial = 1;
1795 return 1;
1797 #endif
1798 if (c == '\\')
1800 unsigned char c2;
1801 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1803 token->type = BACK_SLASH;
1804 return 1;
1807 c2 = re_string_peek_byte_case (input, 1);
1808 token->opr.c = c2;
1809 token->type = CHARACTER;
1810 #ifdef RE_ENABLE_I18N
1811 if (input->mb_cur_max > 1)
1813 wint_t wc = re_string_wchar_at (input,
1814 re_string_cur_idx (input) + 1);
1815 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1817 else
1818 #endif
1819 token->word_char = IS_WORD_CHAR (c2) != 0;
1821 switch (c2)
1823 case '|':
1824 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1825 token->type = OP_ALT;
1826 break;
1827 case '1': case '2': case '3': case '4': case '5':
1828 case '6': case '7': case '8': case '9':
1829 if (!(syntax & RE_NO_BK_REFS))
1831 token->type = OP_BACK_REF;
1832 token->opr.idx = c2 - '1';
1834 break;
1835 case '<':
1836 if (!(syntax & RE_NO_GNU_OPS))
1838 token->type = ANCHOR;
1839 token->opr.ctx_type = WORD_FIRST;
1841 break;
1842 case '>':
1843 if (!(syntax & RE_NO_GNU_OPS))
1845 token->type = ANCHOR;
1846 token->opr.ctx_type = WORD_LAST;
1848 break;
1849 case 'b':
1850 if (!(syntax & RE_NO_GNU_OPS))
1852 token->type = ANCHOR;
1853 token->opr.ctx_type = WORD_DELIM;
1855 break;
1856 case 'B':
1857 if (!(syntax & RE_NO_GNU_OPS))
1859 token->type = ANCHOR;
1860 token->opr.ctx_type = NOT_WORD_DELIM;
1862 break;
1863 case 'w':
1864 if (!(syntax & RE_NO_GNU_OPS))
1865 token->type = OP_WORD;
1866 break;
1867 case 'W':
1868 if (!(syntax & RE_NO_GNU_OPS))
1869 token->type = OP_NOTWORD;
1870 break;
1871 case 's':
1872 if (!(syntax & RE_NO_GNU_OPS))
1873 token->type = OP_SPACE;
1874 break;
1875 case 'S':
1876 if (!(syntax & RE_NO_GNU_OPS))
1877 token->type = OP_NOTSPACE;
1878 break;
1879 case '`':
1880 if (!(syntax & RE_NO_GNU_OPS))
1882 token->type = ANCHOR;
1883 token->opr.ctx_type = BUF_FIRST;
1885 break;
1886 case '\'':
1887 if (!(syntax & RE_NO_GNU_OPS))
1889 token->type = ANCHOR;
1890 token->opr.ctx_type = BUF_LAST;
1892 break;
1893 case '(':
1894 if (!(syntax & RE_NO_BK_PARENS))
1895 token->type = OP_OPEN_SUBEXP;
1896 break;
1897 case ')':
1898 if (!(syntax & RE_NO_BK_PARENS))
1899 token->type = OP_CLOSE_SUBEXP;
1900 break;
1901 case '+':
1902 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1903 token->type = OP_DUP_PLUS;
1904 break;
1905 case '?':
1906 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1907 token->type = OP_DUP_QUESTION;
1908 break;
1909 case '{':
1910 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1911 token->type = OP_OPEN_DUP_NUM;
1912 break;
1913 case '}':
1914 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1915 token->type = OP_CLOSE_DUP_NUM;
1916 break;
1917 default:
1918 break;
1920 return 2;
1923 token->type = CHARACTER;
1924 #ifdef RE_ENABLE_I18N
1925 if (input->mb_cur_max > 1)
1927 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1928 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1930 else
1931 #endif
1932 token->word_char = IS_WORD_CHAR (token->opr.c);
1934 switch (c)
1936 case '\n':
1937 if (syntax & RE_NEWLINE_ALT)
1938 token->type = OP_ALT;
1939 break;
1940 case '|':
1941 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1942 token->type = OP_ALT;
1943 break;
1944 case '*':
1945 token->type = OP_DUP_ASTERISK;
1946 break;
1947 case '+':
1948 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1949 token->type = OP_DUP_PLUS;
1950 break;
1951 case '?':
1952 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1953 token->type = OP_DUP_QUESTION;
1954 break;
1955 case '{':
1956 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1957 token->type = OP_OPEN_DUP_NUM;
1958 break;
1959 case '}':
1960 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1961 token->type = OP_CLOSE_DUP_NUM;
1962 break;
1963 case '(':
1964 if (syntax & RE_NO_BK_PARENS)
1965 token->type = OP_OPEN_SUBEXP;
1966 break;
1967 case ')':
1968 if (syntax & RE_NO_BK_PARENS)
1969 token->type = OP_CLOSE_SUBEXP;
1970 break;
1971 case '[':
1972 token->type = OP_OPEN_BRACKET;
1973 break;
1974 case '.':
1975 token->type = OP_PERIOD;
1976 break;
1977 case '^':
1978 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1979 re_string_cur_idx (input) != 0)
1981 char prev = re_string_peek_byte (input, -1);
1982 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1983 break;
1985 token->type = ANCHOR;
1986 token->opr.ctx_type = LINE_FIRST;
1987 break;
1988 case '$':
1989 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1990 re_string_cur_idx (input) + 1 != re_string_length (input))
1992 re_token_t next;
1993 re_string_skip_bytes (input, 1);
1994 peek_token (&next, input, syntax);
1995 re_string_skip_bytes (input, -1);
1996 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1997 break;
1999 token->type = ANCHOR;
2000 token->opr.ctx_type = LINE_LAST;
2001 break;
2002 default:
2003 break;
2005 return 1;
2008 /* Peek a token from INPUT, and return the length of the token.
2009 We must not use this function out of bracket expressions. */
2011 static int
2012 internal_function
2013 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2015 unsigned char c;
2016 if (re_string_eoi (input))
2018 token->type = END_OF_RE;
2019 return 0;
2021 c = re_string_peek_byte (input, 0);
2022 token->opr.c = c;
2024 #ifdef RE_ENABLE_I18N
2025 if (input->mb_cur_max > 1 &&
2026 !re_string_first_byte (input, re_string_cur_idx (input)))
2028 token->type = CHARACTER;
2029 return 1;
2031 #endif /* RE_ENABLE_I18N */
2033 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2034 && re_string_cur_idx (input) + 1 < re_string_length (input))
2036 /* In this case, '\' escape a character. */
2037 unsigned char c2;
2038 re_string_skip_bytes (input, 1);
2039 c2 = re_string_peek_byte (input, 0);
2040 token->opr.c = c2;
2041 token->type = CHARACTER;
2042 return 1;
2044 if (c == '[') /* '[' is a special char in a bracket exps. */
2046 unsigned char c2;
2047 int token_len;
2048 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2049 c2 = re_string_peek_byte (input, 1);
2050 else
2051 c2 = 0;
2052 token->opr.c = c2;
2053 token_len = 2;
2054 switch (c2)
2056 case '.':
2057 token->type = OP_OPEN_COLL_ELEM;
2058 break;
2059 case '=':
2060 token->type = OP_OPEN_EQUIV_CLASS;
2061 break;
2062 case ':':
2063 if (syntax & RE_CHAR_CLASSES)
2065 token->type = OP_OPEN_CHAR_CLASS;
2066 break;
2068 /* else fall through. */
2069 default:
2070 token->type = CHARACTER;
2071 token->opr.c = c;
2072 token_len = 1;
2073 break;
2075 return token_len;
2077 switch (c)
2079 case '-':
2080 token->type = OP_CHARSET_RANGE;
2081 break;
2082 case ']':
2083 token->type = OP_CLOSE_BRACKET;
2084 break;
2085 case '^':
2086 token->type = OP_NON_MATCH_LIST;
2087 break;
2088 default:
2089 token->type = CHARACTER;
2091 return 1;
2094 /* Functions for parser. */
2096 /* Entry point of the parser.
2097 Parse the regular expression REGEXP and return the structure tree.
2098 If an error has occurred, ERR is set by error code, and return NULL.
2099 This function build the following tree, from regular expression <reg_exp>:
2103 <reg_exp> EOR
2105 CAT means concatenation.
2106 EOR means end of regular expression. */
2108 static bin_tree_t *
2109 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2110 reg_errcode_t *err)
2112 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2113 bin_tree_t *tree, *eor, *root;
2114 re_token_t current_token;
2115 dfa->syntax = syntax;
2116 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2117 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2118 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2119 return NULL;
2120 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2121 if (tree != NULL)
2122 root = create_tree (dfa, tree, eor, CONCAT);
2123 else
2124 root = eor;
2125 if (BE (eor == NULL || root == NULL, 0))
2127 *err = REG_ESPACE;
2128 return NULL;
2130 return root;
2133 /* This function build the following tree, from regular expression
2134 <branch1>|<branch2>:
2138 <branch1> <branch2>
2140 ALT means alternative, which represents the operator `|'. */
2142 static bin_tree_t *
2143 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2144 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2146 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2147 bin_tree_t *tree, *branch = NULL;
2148 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2149 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2150 return NULL;
2152 while (token->type == OP_ALT)
2154 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2155 if (token->type != OP_ALT && token->type != END_OF_RE
2156 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2158 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2159 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2160 return NULL;
2162 else
2163 branch = NULL;
2164 tree = create_tree (dfa, tree, branch, OP_ALT);
2165 if (BE (tree == NULL, 0))
2167 *err = REG_ESPACE;
2168 return NULL;
2171 return tree;
2174 /* This function build the following tree, from regular expression
2175 <exp1><exp2>:
2179 <exp1> <exp2>
2181 CAT means concatenation. */
2183 static bin_tree_t *
2184 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2185 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2187 bin_tree_t *tree, *exp;
2188 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2189 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2190 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2191 return NULL;
2193 while (token->type != OP_ALT && token->type != END_OF_RE
2194 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2196 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2197 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2199 return NULL;
2201 if (tree != NULL && exp != NULL)
2203 tree = create_tree (dfa, tree, exp, CONCAT);
2204 if (tree == NULL)
2206 *err = REG_ESPACE;
2207 return NULL;
2210 else if (tree == NULL)
2211 tree = exp;
2212 /* Otherwise exp == NULL, we don't need to create new tree. */
2214 return tree;
2217 /* This function build the following tree, from regular expression a*:
2223 static bin_tree_t *
2224 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2225 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2227 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2228 bin_tree_t *tree;
2229 switch (token->type)
2231 case CHARACTER:
2232 tree = create_token_tree (dfa, NULL, NULL, token);
2233 if (BE (tree == NULL, 0))
2235 *err = REG_ESPACE;
2236 return NULL;
2238 #ifdef RE_ENABLE_I18N
2239 if (dfa->mb_cur_max > 1)
2241 while (!re_string_eoi (regexp)
2242 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2244 bin_tree_t *mbc_remain;
2245 fetch_token (token, regexp, syntax);
2246 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2247 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2248 if (BE (mbc_remain == NULL || tree == NULL, 0))
2250 *err = REG_ESPACE;
2251 return NULL;
2255 #endif
2256 break;
2257 case OP_OPEN_SUBEXP:
2258 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2259 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2260 return NULL;
2261 break;
2262 case OP_OPEN_BRACKET:
2263 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2264 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2265 return NULL;
2266 break;
2267 case OP_BACK_REF:
2268 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2270 *err = REG_ESUBREG;
2271 return NULL;
2273 dfa->used_bkref_map |= 1 << token->opr.idx;
2274 tree = create_token_tree (dfa, NULL, NULL, token);
2275 if (BE (tree == NULL, 0))
2277 *err = REG_ESPACE;
2278 return NULL;
2280 ++dfa->nbackref;
2281 dfa->has_mb_node = 1;
2282 break;
2283 case OP_OPEN_DUP_NUM:
2284 if (syntax & RE_CONTEXT_INVALID_DUP)
2286 *err = REG_BADRPT;
2287 return NULL;
2289 /* FALLTHROUGH */
2290 case OP_DUP_ASTERISK:
2291 case OP_DUP_PLUS:
2292 case OP_DUP_QUESTION:
2293 if (syntax & RE_CONTEXT_INVALID_OPS)
2295 *err = REG_BADRPT;
2296 return NULL;
2298 else if (syntax & RE_CONTEXT_INDEP_OPS)
2300 fetch_token (token, regexp, syntax);
2301 return parse_expression (regexp, preg, token, syntax, nest, err);
2303 /* else fall through */
2304 case OP_CLOSE_SUBEXP:
2305 if ((token->type == OP_CLOSE_SUBEXP) &&
2306 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2308 *err = REG_ERPAREN;
2309 return NULL;
2311 /* else fall through */
2312 case OP_CLOSE_DUP_NUM:
2313 /* We treat it as a normal character. */
2315 /* Then we can these characters as normal characters. */
2316 token->type = CHARACTER;
2317 /* mb_partial and word_char bits should be initialized already
2318 by peek_token. */
2319 tree = create_token_tree (dfa, NULL, NULL, token);
2320 if (BE (tree == NULL, 0))
2322 *err = REG_ESPACE;
2323 return NULL;
2325 break;
2326 case ANCHOR:
2327 if ((token->opr.ctx_type
2328 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2329 && dfa->word_ops_used == 0)
2330 init_word_char (dfa);
2331 if (token->opr.ctx_type == WORD_DELIM
2332 || token->opr.ctx_type == NOT_WORD_DELIM)
2334 bin_tree_t *tree_first, *tree_last;
2335 if (token->opr.ctx_type == WORD_DELIM)
2337 token->opr.ctx_type = WORD_FIRST;
2338 tree_first = create_token_tree (dfa, NULL, NULL, token);
2339 token->opr.ctx_type = WORD_LAST;
2341 else
2343 token->opr.ctx_type = INSIDE_WORD;
2344 tree_first = create_token_tree (dfa, NULL, NULL, token);
2345 token->opr.ctx_type = INSIDE_NOTWORD;
2347 tree_last = create_token_tree (dfa, NULL, NULL, token);
2348 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2349 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2351 *err = REG_ESPACE;
2352 return NULL;
2355 else
2357 tree = create_token_tree (dfa, NULL, NULL, token);
2358 if (BE (tree == NULL, 0))
2360 *err = REG_ESPACE;
2361 return NULL;
2364 /* We must return here, since ANCHORs can't be followed
2365 by repetition operators.
2366 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2367 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2368 fetch_token (token, regexp, syntax);
2369 return tree;
2370 case OP_PERIOD:
2371 tree = create_token_tree (dfa, NULL, NULL, token);
2372 if (BE (tree == NULL, 0))
2374 *err = REG_ESPACE;
2375 return NULL;
2377 if (dfa->mb_cur_max > 1)
2378 dfa->has_mb_node = 1;
2379 break;
2380 case OP_WORD:
2381 case OP_NOTWORD:
2382 tree = build_charclass_op (dfa, regexp->trans,
2383 "alnum",
2384 "_",
2385 token->type == OP_NOTWORD, err);
2386 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2387 return NULL;
2388 break;
2389 case OP_SPACE:
2390 case OP_NOTSPACE:
2391 tree = build_charclass_op (dfa, regexp->trans,
2392 "space",
2394 token->type == OP_NOTSPACE, err);
2395 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2396 return NULL;
2397 break;
2398 case OP_ALT:
2399 case END_OF_RE:
2400 return NULL;
2401 case BACK_SLASH:
2402 *err = REG_EESCAPE;
2403 return NULL;
2404 default:
2405 /* Must not happen? */
2406 #ifdef DEBUG
2407 assert (0);
2408 #endif
2409 return NULL;
2411 fetch_token (token, regexp, syntax);
2413 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2414 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2416 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2417 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2418 return NULL;
2419 /* In BRE consecutive duplications are not allowed. */
2420 if ((syntax & RE_CONTEXT_INVALID_DUP)
2421 && (token->type == OP_DUP_ASTERISK
2422 || token->type == OP_OPEN_DUP_NUM))
2424 *err = REG_BADRPT;
2425 return NULL;
2429 return tree;
2432 /* This function build the following tree, from regular expression
2433 (<reg_exp>):
2434 SUBEXP
2436 <reg_exp>
2439 static bin_tree_t *
2440 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2441 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2443 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2444 bin_tree_t *tree;
2445 size_t cur_nsub;
2446 cur_nsub = preg->re_nsub++;
2448 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2450 /* The subexpression may be a null string. */
2451 if (token->type == OP_CLOSE_SUBEXP)
2452 tree = NULL;
2453 else
2455 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2456 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2457 *err = REG_EPAREN;
2458 if (BE (*err != REG_NOERROR, 0))
2459 return NULL;
2462 if (cur_nsub <= '9' - '1')
2463 dfa->completed_bkref_map |= 1 << cur_nsub;
2465 tree = create_tree (dfa, tree, NULL, SUBEXP);
2466 if (BE (tree == NULL, 0))
2468 *err = REG_ESPACE;
2469 return NULL;
2471 tree->token.opr.idx = cur_nsub;
2472 return tree;
2475 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2477 static bin_tree_t *
2478 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2479 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2481 bin_tree_t *tree = NULL, *old_tree = NULL;
2482 int i, start, end, start_idx = re_string_cur_idx (regexp);
2483 #ifndef RE_TOKEN_INIT_BUG
2484 re_token_t start_token = *token;
2485 #else
2486 re_token_t start_token;
2488 memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2489 #endif
2491 if (token->type == OP_OPEN_DUP_NUM)
2493 end = 0;
2494 start = fetch_number (regexp, token, syntax);
2495 if (start == -1)
2497 if (token->type == CHARACTER && token->opr.c == ',')
2498 start = 0; /* We treat "{,m}" as "{0,m}". */
2499 else
2501 *err = REG_BADBR; /* <re>{} is invalid. */
2502 return NULL;
2505 if (BE (start != -2, 1))
2507 /* We treat "{n}" as "{n,n}". */
2508 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2509 : ((token->type == CHARACTER && token->opr.c == ',')
2510 ? fetch_number (regexp, token, syntax) : -2));
2512 if (BE (start == -2 || end == -2, 0))
2514 /* Invalid sequence. */
2515 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2517 if (token->type == END_OF_RE)
2518 *err = REG_EBRACE;
2519 else
2520 *err = REG_BADBR;
2522 return NULL;
2525 /* If the syntax bit is set, rollback. */
2526 re_string_set_index (regexp, start_idx);
2527 *token = start_token;
2528 token->type = CHARACTER;
2529 /* mb_partial and word_char bits should be already initialized by
2530 peek_token. */
2531 return elem;
2534 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2536 /* First number greater than second. */
2537 *err = REG_BADBR;
2538 return NULL;
2541 else
2543 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2544 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2547 fetch_token (token, regexp, syntax);
2549 if (BE (elem == NULL, 0))
2550 return NULL;
2551 if (BE (start == 0 && end == 0, 0))
2553 postorder (elem, free_tree, NULL);
2554 return NULL;
2557 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2558 if (BE (start > 0, 0))
2560 tree = elem;
2561 for (i = 2; i <= start; ++i)
2563 elem = duplicate_tree (elem, dfa);
2564 tree = create_tree (dfa, tree, elem, CONCAT);
2565 if (BE (elem == NULL || tree == NULL, 0))
2566 goto parse_dup_op_espace;
2569 if (start == end)
2570 return tree;
2572 /* Duplicate ELEM before it is marked optional. */
2573 elem = duplicate_tree (elem, dfa);
2574 old_tree = tree;
2576 else
2577 old_tree = NULL;
2579 if (elem->token.type == SUBEXP)
2580 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2582 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2583 if (BE (tree == NULL, 0))
2584 goto parse_dup_op_espace;
2586 /* This loop is actually executed only when end != -1,
2587 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2588 already created the start+1-th copy. */
2589 for (i = start + 2; i <= end; ++i)
2591 elem = duplicate_tree (elem, dfa);
2592 tree = create_tree (dfa, tree, elem, CONCAT);
2593 if (BE (elem == NULL || tree == NULL, 0))
2594 goto parse_dup_op_espace;
2596 tree = create_tree (dfa, tree, NULL, OP_ALT);
2597 if (BE (tree == NULL, 0))
2598 goto parse_dup_op_espace;
2601 if (old_tree)
2602 tree = create_tree (dfa, old_tree, tree, CONCAT);
2604 return tree;
2606 parse_dup_op_espace:
2607 *err = REG_ESPACE;
2608 return NULL;
2611 /* Size of the names for collating symbol/equivalence_class/character_class.
2612 I'm not sure, but maybe enough. */
2613 #define BRACKET_NAME_BUF_SIZE 32
2615 #ifndef _LIBC
2616 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2617 Build the range expression which starts from START_ELEM, and ends
2618 at END_ELEM. The result are written to MBCSET and SBCSET.
2619 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2620 mbcset->range_ends, is a pointer argument since we may
2621 update it. */
2623 static reg_errcode_t
2624 internal_function
2625 # ifdef RE_ENABLE_I18N
2626 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2627 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2628 # else /* not RE_ENABLE_I18N */
2629 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2630 bracket_elem_t *end_elem)
2631 # endif /* not RE_ENABLE_I18N */
2633 unsigned int start_ch, end_ch;
2634 /* Equivalence Classes and Character Classes can't be a range start/end. */
2635 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2636 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2638 return REG_ERANGE;
2640 /* We can handle no multi character collating elements without libc
2641 support. */
2642 if (BE ((start_elem->type == COLL_SYM
2643 && strlen ((char *) start_elem->opr.name) > 1)
2644 || (end_elem->type == COLL_SYM
2645 && strlen ((char *) end_elem->opr.name) > 1), 0))
2646 return REG_ECOLLATE;
2648 # ifdef RE_ENABLE_I18N
2650 wchar_t wc;
2651 wint_t start_wc;
2652 wint_t end_wc;
2653 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2655 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2656 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2657 : 0));
2658 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2659 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2660 : 0));
2661 #ifdef GAWK
2663 * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2664 * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2665 * unsigned, so we don't have sign extension problems.
2667 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2668 ? start_ch : start_elem->opr.wch);
2669 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2670 ? end_ch : end_elem->opr.wch);
2671 #else
2672 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2673 ? __btowc (start_ch) : start_elem->opr.wch);
2674 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2675 ? __btowc (end_ch) : end_elem->opr.wch);
2676 #endif
2677 if (start_wc == WEOF || end_wc == WEOF)
2678 return REG_ECOLLATE;
2679 cmp_buf[0] = start_wc;
2680 cmp_buf[4] = end_wc;
2681 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2682 return REG_ERANGE;
2684 /* Got valid collation sequence values, add them as a new entry.
2685 However, for !_LIBC we have no collation elements: if the
2686 character set is single byte, the single byte character set
2687 that we build below suffices. parse_bracket_exp passes
2688 no MBCSET if dfa->mb_cur_max == 1. */
2689 if (mbcset)
2691 /* Check the space of the arrays. */
2692 if (BE (*range_alloc == mbcset->nranges, 0))
2694 /* There is not enough space, need realloc. */
2695 wchar_t *new_array_start, *new_array_end;
2696 int new_nranges;
2698 /* +1 in case of mbcset->nranges is 0. */
2699 new_nranges = 2 * mbcset->nranges + 1;
2700 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2701 are NULL if *range_alloc == 0. */
2702 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2703 new_nranges);
2704 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2705 new_nranges);
2707 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2708 return REG_ESPACE;
2710 mbcset->range_starts = new_array_start;
2711 mbcset->range_ends = new_array_end;
2712 *range_alloc = new_nranges;
2715 mbcset->range_starts[mbcset->nranges] = start_wc;
2716 mbcset->range_ends[mbcset->nranges++] = end_wc;
2719 /* Build the table for single byte characters. */
2720 for (wc = 0; wc < SBC_MAX; ++wc)
2722 cmp_buf[2] = wc;
2723 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2724 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2725 bitset_set (sbcset, wc);
2728 # else /* not RE_ENABLE_I18N */
2730 unsigned int ch;
2731 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2732 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2733 : 0));
2734 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2735 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2736 : 0));
2737 if (start_ch > end_ch)
2738 return REG_ERANGE;
2739 /* Build the table for single byte characters. */
2740 for (ch = 0; ch < SBC_MAX; ++ch)
2741 if (start_ch <= ch && ch <= end_ch)
2742 bitset_set (sbcset, ch);
2744 # endif /* not RE_ENABLE_I18N */
2745 return REG_NOERROR;
2747 #endif /* not _LIBC */
2749 #ifndef _LIBC
2750 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2751 Build the collating element which is represented by NAME.
2752 The result are written to MBCSET and SBCSET.
2753 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2754 pointer argument since we may update it. */
2756 static reg_errcode_t
2757 internal_function
2758 # ifdef RE_ENABLE_I18N
2759 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2760 int *coll_sym_alloc, const unsigned char *name)
2761 # else /* not RE_ENABLE_I18N */
2762 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2763 # endif /* not RE_ENABLE_I18N */
2765 size_t name_len = strlen ((const char *) name);
2766 if (BE (name_len != 1, 0))
2767 return REG_ECOLLATE;
2768 else
2770 bitset_set (sbcset, name[0]);
2771 return REG_NOERROR;
2774 #endif /* not _LIBC */
2776 /* This function parse bracket expression like "[abc]", "[a-c]",
2777 "[[.a-a.]]" etc. */
2779 static bin_tree_t *
2780 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2781 reg_syntax_t syntax, reg_errcode_t *err)
2783 #ifdef _LIBC
2784 const unsigned char *collseqmb;
2785 const char *collseqwc;
2786 uint32_t nrules;
2787 int32_t table_size;
2788 const int32_t *symb_table;
2789 const unsigned char *extra;
2791 /* Local function for parse_bracket_exp used in _LIBC environment.
2792 Seek the collating symbol entry correspondings to NAME.
2793 Return the index of the symbol in the SYMB_TABLE. */
2795 auto inline int32_t
2796 __attribute ((always_inline))
2797 seek_collating_symbol_entry (name, name_len)
2798 const unsigned char *name;
2799 size_t name_len;
2801 int32_t hash = elem_hash ((const char *) name, name_len);
2802 int32_t elem = hash % table_size;
2803 if (symb_table[2 * elem] != 0)
2805 int32_t second = hash % (table_size - 2) + 1;
2809 /* First compare the hashing value. */
2810 if (symb_table[2 * elem] == hash
2811 /* Compare the length of the name. */
2812 && name_len == extra[symb_table[2 * elem + 1]]
2813 /* Compare the name. */
2814 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2815 name_len) == 0)
2817 /* Yep, this is the entry. */
2818 break;
2821 /* Next entry. */
2822 elem += second;
2824 while (symb_table[2 * elem] != 0);
2826 return elem;
2829 /* Local function for parse_bracket_exp used in _LIBC environment.
2830 Look up the collation sequence value of BR_ELEM.
2831 Return the value if succeeded, UINT_MAX otherwise. */
2833 auto inline unsigned int
2834 __attribute ((always_inline))
2835 lookup_collation_sequence_value (br_elem)
2836 bracket_elem_t *br_elem;
2838 if (br_elem->type == SB_CHAR)
2841 if (MB_CUR_MAX == 1)
2843 if (nrules == 0)
2844 return collseqmb[br_elem->opr.ch];
2845 else
2847 wint_t wc = __btowc (br_elem->opr.ch);
2848 return __collseq_table_lookup (collseqwc, wc);
2851 else if (br_elem->type == MB_CHAR)
2853 if (nrules != 0)
2854 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2856 else if (br_elem->type == COLL_SYM)
2858 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2859 if (nrules != 0)
2861 int32_t elem, idx;
2862 elem = seek_collating_symbol_entry (br_elem->opr.name,
2863 sym_name_len);
2864 if (symb_table[2 * elem] != 0)
2866 /* We found the entry. */
2867 idx = symb_table[2 * elem + 1];
2868 /* Skip the name of collating element name. */
2869 idx += 1 + extra[idx];
2870 /* Skip the byte sequence of the collating element. */
2871 idx += 1 + extra[idx];
2872 /* Adjust for the alignment. */
2873 idx = (idx + 3) & ~3;
2874 /* Skip the multibyte collation sequence value. */
2875 idx += sizeof (unsigned int);
2876 /* Skip the wide char sequence of the collating element. */
2877 idx += sizeof (unsigned int) *
2878 (1 + *(unsigned int *) (extra + idx));
2879 /* Return the collation sequence value. */
2880 return *(unsigned int *) (extra + idx);
2882 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2884 /* No valid character. Match it as a single byte
2885 character. */
2886 return collseqmb[br_elem->opr.name[0]];
2889 else if (sym_name_len == 1)
2890 return collseqmb[br_elem->opr.name[0]];
2892 return UINT_MAX;
2895 /* Local function for parse_bracket_exp used in _LIBC environment.
2896 Build the range expression which starts from START_ELEM, and ends
2897 at END_ELEM. The result are written to MBCSET and SBCSET.
2898 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2899 mbcset->range_ends, is a pointer argument since we may
2900 update it. */
2902 auto inline reg_errcode_t
2903 __attribute ((always_inline))
2904 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2905 re_charset_t *mbcset;
2906 int *range_alloc;
2907 bitset_t sbcset;
2908 bracket_elem_t *start_elem, *end_elem;
2910 unsigned int ch;
2911 uint32_t start_collseq;
2912 uint32_t end_collseq;
2914 /* Equivalence Classes and Character Classes can't be a range
2915 start/end. */
2916 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2917 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2919 return REG_ERANGE;
2921 start_collseq = lookup_collation_sequence_value (start_elem);
2922 end_collseq = lookup_collation_sequence_value (end_elem);
2923 /* Check start/end collation sequence values. */
2924 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2925 return REG_ECOLLATE;
2926 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2927 return REG_ERANGE;
2929 /* Got valid collation sequence values, add them as a new entry.
2930 However, if we have no collation elements, and the character set
2931 is single byte, the single byte character set that we
2932 build below suffices. */
2933 if (nrules > 0 || dfa->mb_cur_max > 1)
2935 /* Check the space of the arrays. */
2936 if (BE (*range_alloc == mbcset->nranges, 0))
2938 /* There is not enough space, need realloc. */
2939 uint32_t *new_array_start;
2940 uint32_t *new_array_end;
2941 int new_nranges;
2943 /* +1 in case of mbcset->nranges is 0. */
2944 new_nranges = 2 * mbcset->nranges + 1;
2945 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2946 new_nranges);
2947 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2948 new_nranges);
2950 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2951 return REG_ESPACE;
2953 mbcset->range_starts = new_array_start;
2954 mbcset->range_ends = new_array_end;
2955 *range_alloc = new_nranges;
2958 mbcset->range_starts[mbcset->nranges] = start_collseq;
2959 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2962 /* Build the table for single byte characters. */
2963 for (ch = 0; ch < SBC_MAX; ch++)
2965 uint32_t ch_collseq;
2967 if (MB_CUR_MAX == 1)
2969 if (nrules == 0)
2970 ch_collseq = collseqmb[ch];
2971 else
2972 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2973 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2974 bitset_set (sbcset, ch);
2976 return REG_NOERROR;
2979 /* Local function for parse_bracket_exp used in _LIBC environment.
2980 Build the collating element which is represented by NAME.
2981 The result are written to MBCSET and SBCSET.
2982 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2983 pointer argument since we may update it. */
2985 auto inline reg_errcode_t
2986 __attribute ((always_inline))
2987 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2988 re_charset_t *mbcset;
2989 int *coll_sym_alloc;
2990 bitset_t sbcset;
2991 const unsigned char *name;
2993 int32_t elem, idx;
2994 size_t name_len = strlen ((const char *) name);
2995 if (nrules != 0)
2997 elem = seek_collating_symbol_entry (name, name_len);
2998 if (symb_table[2 * elem] != 0)
3000 /* We found the entry. */
3001 idx = symb_table[2 * elem + 1];
3002 /* Skip the name of collating element name. */
3003 idx += 1 + extra[idx];
3005 else if (symb_table[2 * elem] == 0 && name_len == 1)
3007 /* No valid character, treat it as a normal
3008 character. */
3009 bitset_set (sbcset, name[0]);
3010 return REG_NOERROR;
3012 else
3013 return REG_ECOLLATE;
3015 /* Got valid collation sequence, add it as a new entry. */
3016 /* Check the space of the arrays. */
3017 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3019 /* Not enough, realloc it. */
3020 /* +1 in case of mbcset->ncoll_syms is 0. */
3021 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3022 /* Use realloc since mbcset->coll_syms is NULL
3023 if *alloc == 0. */
3024 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3025 new_coll_sym_alloc);
3026 if (BE (new_coll_syms == NULL, 0))
3027 return REG_ESPACE;
3028 mbcset->coll_syms = new_coll_syms;
3029 *coll_sym_alloc = new_coll_sym_alloc;
3031 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3032 return REG_NOERROR;
3034 else
3036 if (BE (name_len != 1, 0))
3037 return REG_ECOLLATE;
3038 else
3040 bitset_set (sbcset, name[0]);
3041 return REG_NOERROR;
3045 #endif
3047 re_token_t br_token;
3048 re_bitset_ptr_t sbcset;
3049 #ifdef RE_ENABLE_I18N
3050 re_charset_t *mbcset;
3051 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3052 int equiv_class_alloc = 0, char_class_alloc = 0;
3053 #endif /* not RE_ENABLE_I18N */
3054 int non_match = 0;
3055 bin_tree_t *work_tree;
3056 int token_len;
3057 int first_round = 1;
3058 #ifdef _LIBC
3059 collseqmb = (const unsigned char *)
3060 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3061 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3062 if (nrules)
3065 if (MB_CUR_MAX > 1)
3067 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3068 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3069 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3070 _NL_COLLATE_SYMB_TABLEMB);
3071 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3072 _NL_COLLATE_SYMB_EXTRAMB);
3074 #endif
3075 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3076 #ifdef RE_ENABLE_I18N
3077 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3078 #endif /* RE_ENABLE_I18N */
3079 #ifdef RE_ENABLE_I18N
3080 if (BE (sbcset == NULL || mbcset == NULL, 0))
3081 #else
3082 if (BE (sbcset == NULL, 0))
3083 #endif /* RE_ENABLE_I18N */
3085 *err = REG_ESPACE;
3086 return NULL;
3089 token_len = peek_token_bracket (token, regexp, syntax);
3090 if (BE (token->type == END_OF_RE, 0))
3092 *err = REG_BADPAT;
3093 goto parse_bracket_exp_free_return;
3095 if (token->type == OP_NON_MATCH_LIST)
3097 #ifdef RE_ENABLE_I18N
3098 mbcset->non_match = 1;
3099 #endif /* not RE_ENABLE_I18N */
3100 non_match = 1;
3101 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3102 bitset_set (sbcset, '\n');
3103 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3104 token_len = peek_token_bracket (token, regexp, syntax);
3105 if (BE (token->type == END_OF_RE, 0))
3107 *err = REG_BADPAT;
3108 goto parse_bracket_exp_free_return;
3112 /* We treat the first ']' as a normal character. */
3113 if (token->type == OP_CLOSE_BRACKET)
3114 token->type = CHARACTER;
3116 while (1)
3118 bracket_elem_t start_elem, end_elem;
3119 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3120 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3121 reg_errcode_t ret;
3122 int token_len2 = 0, is_range_exp = 0;
3123 re_token_t token2;
3125 start_elem.opr.name = start_name_buf;
3126 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3127 syntax, first_round);
3128 if (BE (ret != REG_NOERROR, 0))
3130 *err = ret;
3131 goto parse_bracket_exp_free_return;
3133 first_round = 0;
3135 /* Get information about the next token. We need it in any case. */
3136 token_len = peek_token_bracket (token, regexp, syntax);
3138 /* Do not check for ranges if we know they are not allowed. */
3139 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3141 if (BE (token->type == END_OF_RE, 0))
3143 *err = REG_EBRACK;
3144 goto parse_bracket_exp_free_return;
3146 if (token->type == OP_CHARSET_RANGE)
3148 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3149 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3150 if (BE (token2.type == END_OF_RE, 0))
3152 *err = REG_EBRACK;
3153 goto parse_bracket_exp_free_return;
3155 if (token2.type == OP_CLOSE_BRACKET)
3157 /* We treat the last '-' as a normal character. */
3158 re_string_skip_bytes (regexp, -token_len);
3159 token->type = CHARACTER;
3161 else
3162 is_range_exp = 1;
3166 if (is_range_exp == 1)
3168 end_elem.opr.name = end_name_buf;
3169 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3170 dfa, syntax, 1);
3171 if (BE (ret != REG_NOERROR, 0))
3173 *err = ret;
3174 goto parse_bracket_exp_free_return;
3177 token_len = peek_token_bracket (token, regexp, syntax);
3179 #ifdef _LIBC
3180 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3181 &start_elem, &end_elem);
3182 #else
3183 # ifdef RE_ENABLE_I18N
3184 *err = build_range_exp (sbcset,
3185 dfa->mb_cur_max > 1 ? mbcset : NULL,
3186 &range_alloc, &start_elem, &end_elem);
3187 # else
3188 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3189 # endif
3190 #endif /* RE_ENABLE_I18N */
3191 if (BE (*err != REG_NOERROR, 0))
3192 goto parse_bracket_exp_free_return;
3194 else
3196 switch (start_elem.type)
3198 case SB_CHAR:
3199 bitset_set (sbcset, start_elem.opr.ch);
3200 break;
3201 #ifdef RE_ENABLE_I18N
3202 case MB_CHAR:
3203 /* Check whether the array has enough space. */
3204 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3206 wchar_t *new_mbchars;
3207 /* Not enough, realloc it. */
3208 /* +1 in case of mbcset->nmbchars is 0. */
3209 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3210 /* Use realloc since array is NULL if *alloc == 0. */
3211 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3212 mbchar_alloc);
3213 if (BE (new_mbchars == NULL, 0))
3214 goto parse_bracket_exp_espace;
3215 mbcset->mbchars = new_mbchars;
3217 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3218 break;
3219 #endif /* RE_ENABLE_I18N */
3220 case EQUIV_CLASS:
3221 *err = build_equiv_class (sbcset,
3222 #ifdef RE_ENABLE_I18N
3223 mbcset, &equiv_class_alloc,
3224 #endif /* RE_ENABLE_I18N */
3225 start_elem.opr.name);
3226 if (BE (*err != REG_NOERROR, 0))
3227 goto parse_bracket_exp_free_return;
3228 break;
3229 case COLL_SYM:
3230 *err = build_collating_symbol (sbcset,
3231 #ifdef RE_ENABLE_I18N
3232 mbcset, &coll_sym_alloc,
3233 #endif /* RE_ENABLE_I18N */
3234 start_elem.opr.name);
3235 if (BE (*err != REG_NOERROR, 0))
3236 goto parse_bracket_exp_free_return;
3237 break;
3238 case CHAR_CLASS:
3239 *err = build_charclass (regexp->trans, sbcset,
3240 #ifdef RE_ENABLE_I18N
3241 mbcset, &char_class_alloc,
3242 #endif /* RE_ENABLE_I18N */
3243 (const char *) start_elem.opr.name, syntax);
3244 if (BE (*err != REG_NOERROR, 0))
3245 goto parse_bracket_exp_free_return;
3246 break;
3247 default:
3248 assert (0);
3249 break;
3252 if (BE (token->type == END_OF_RE, 0))
3254 *err = REG_EBRACK;
3255 goto parse_bracket_exp_free_return;
3257 if (token->type == OP_CLOSE_BRACKET)
3258 break;
3261 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3263 /* If it is non-matching list. */
3264 if (non_match)
3265 bitset_not (sbcset);
3267 #ifdef RE_ENABLE_I18N
3268 /* Ensure only single byte characters are set. */
3269 if (dfa->mb_cur_max > 1)
3270 bitset_mask (sbcset, dfa->sb_char);
3272 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3273 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3274 || mbcset->non_match)))
3276 bin_tree_t *mbc_tree;
3277 int sbc_idx;
3278 /* Build a tree for complex bracket. */
3279 dfa->has_mb_node = 1;
3280 br_token.type = COMPLEX_BRACKET;
3281 br_token.opr.mbcset = mbcset;
3282 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3283 if (BE (mbc_tree == NULL, 0))
3284 goto parse_bracket_exp_espace;
3285 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3286 if (sbcset[sbc_idx])
3287 break;
3288 /* If there are no bits set in sbcset, there is no point
3289 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3290 if (sbc_idx < BITSET_WORDS)
3292 /* Build a tree for simple bracket. */
3293 br_token.type = SIMPLE_BRACKET;
3294 br_token.opr.sbcset = sbcset;
3295 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3296 if (BE (work_tree == NULL, 0))
3297 goto parse_bracket_exp_espace;
3299 /* Then join them by ALT node. */
3300 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3301 if (BE (work_tree == NULL, 0))
3302 goto parse_bracket_exp_espace;
3304 else
3306 re_free (sbcset);
3307 work_tree = mbc_tree;
3310 else
3311 #endif /* not RE_ENABLE_I18N */
3313 #ifdef RE_ENABLE_I18N
3314 free_charset (mbcset);
3315 #endif
3316 /* Build a tree for simple bracket. */
3317 br_token.type = SIMPLE_BRACKET;
3318 br_token.opr.sbcset = sbcset;
3319 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3320 if (BE (work_tree == NULL, 0))
3321 goto parse_bracket_exp_espace;
3323 return work_tree;
3325 parse_bracket_exp_espace:
3326 *err = REG_ESPACE;
3327 parse_bracket_exp_free_return:
3328 re_free (sbcset);
3329 #ifdef RE_ENABLE_I18N
3330 free_charset (mbcset);
3331 #endif /* RE_ENABLE_I18N */
3332 return NULL;
3335 /* Parse an element in the bracket expression. */
3337 static reg_errcode_t
3338 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3339 re_token_t *token, int token_len, re_dfa_t *dfa,
3340 reg_syntax_t syntax, int accept_hyphen)
3342 #ifdef RE_ENABLE_I18N
3343 int cur_char_size;
3344 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3345 if (cur_char_size > 1)
3347 elem->type = MB_CHAR;
3348 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3349 re_string_skip_bytes (regexp, cur_char_size);
3350 return REG_NOERROR;
3352 #endif /* RE_ENABLE_I18N */
3353 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3354 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3355 || token->type == OP_OPEN_EQUIV_CLASS)
3356 return parse_bracket_symbol (elem, regexp, token);
3357 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3359 /* A '-' must only appear as anything but a range indicator before
3360 the closing bracket. Everything else is an error. */
3361 re_token_t token2;
3362 (void) peek_token_bracket (&token2, regexp, syntax);
3363 if (token2.type != OP_CLOSE_BRACKET)
3364 /* The actual error value is not standardized since this whole
3365 case is undefined. But ERANGE makes good sense. */
3366 return REG_ERANGE;
3368 elem->type = SB_CHAR;
3369 elem->opr.ch = token->opr.c;
3370 return REG_NOERROR;
3373 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3374 such as [:<character_class>:], [.<collating_element>.], and
3375 [=<equivalent_class>=]. */
3377 static reg_errcode_t
3378 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3379 re_token_t *token)
3381 unsigned char ch, delim = token->opr.c;
3382 int i = 0;
3383 if (re_string_eoi(regexp))
3384 return REG_EBRACK;
3385 for (;; ++i)
3387 if (i >= BRACKET_NAME_BUF_SIZE)
3388 return REG_EBRACK;
3389 if (token->type == OP_OPEN_CHAR_CLASS)
3390 ch = re_string_fetch_byte_case (regexp);
3391 else
3392 ch = re_string_fetch_byte (regexp);
3393 if (re_string_eoi(regexp))
3394 return REG_EBRACK;
3395 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3396 break;
3397 elem->opr.name[i] = ch;
3399 re_string_skip_bytes (regexp, 1);
3400 elem->opr.name[i] = '\0';
3401 switch (token->type)
3403 case OP_OPEN_COLL_ELEM:
3404 elem->type = COLL_SYM;
3405 break;
3406 case OP_OPEN_EQUIV_CLASS:
3407 elem->type = EQUIV_CLASS;
3408 break;
3409 case OP_OPEN_CHAR_CLASS:
3410 elem->type = CHAR_CLASS;
3411 break;
3412 default:
3413 break;
3415 return REG_NOERROR;
3418 /* Helper function for parse_bracket_exp.
3419 Build the equivalence class which is represented by NAME.
3420 The result are written to MBCSET and SBCSET.
3421 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3422 is a pointer argument since we may update it. */
3424 static reg_errcode_t
3425 #ifdef RE_ENABLE_I18N
3426 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3427 int *equiv_class_alloc, const unsigned char *name)
3428 #else /* not RE_ENABLE_I18N */
3429 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3430 #endif /* not RE_ENABLE_I18N */
3432 #ifdef _LIBC
3433 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3434 if (nrules != 0)
3436 const int32_t *table, *indirect;
3437 const unsigned char *weights, *extra, *cp;
3438 unsigned char char_buf[2];
3439 int32_t idx1, idx2;
3440 unsigned int ch;
3441 size_t len;
3442 /* This #include defines a local function! */
3443 # include <locale/weight.h>
3444 /* Calculate the index for equivalence class. */
3445 cp = name;
3446 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3447 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3448 _NL_COLLATE_WEIGHTMB);
3449 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3450 _NL_COLLATE_EXTRAMB);
3451 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3452 _NL_COLLATE_INDIRECTMB);
3453 idx1 = findidx (&cp);
3454 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3455 /* This isn't a valid character. */
3456 return REG_ECOLLATE;
3458 /* Build single byte matcing table for this equivalence class. */
3459 char_buf[1] = (unsigned char) '\0';
3460 len = weights[idx1 & 0xffffff];
3461 for (ch = 0; ch < SBC_MAX; ++ch)
3463 char_buf[0] = ch;
3464 cp = char_buf;
3465 idx2 = findidx (&cp);
3467 idx2 = table[ch];
3469 if (idx2 == 0)
3470 /* This isn't a valid character. */
3471 continue;
3472 /* Compare only if the length matches and the collation rule
3473 index is the same. */
3474 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3476 int cnt = 0;
3478 while (cnt <= len &&
3479 weights[(idx1 & 0xffffff) + 1 + cnt]
3480 == weights[(idx2 & 0xffffff) + 1 + cnt])
3481 ++cnt;
3483 if (cnt > len)
3484 bitset_set (sbcset, ch);
3487 /* Check whether the array has enough space. */
3488 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3490 /* Not enough, realloc it. */
3491 /* +1 in case of mbcset->nequiv_classes is 0. */
3492 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3493 /* Use realloc since the array is NULL if *alloc == 0. */
3494 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3495 int32_t,
3496 new_equiv_class_alloc);
3497 if (BE (new_equiv_classes == NULL, 0))
3498 return REG_ESPACE;
3499 mbcset->equiv_classes = new_equiv_classes;
3500 *equiv_class_alloc = new_equiv_class_alloc;
3502 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3504 else
3505 #endif /* _LIBC */
3507 if (BE (strlen ((const char *) name) != 1, 0))
3508 return REG_ECOLLATE;
3509 bitset_set (sbcset, *name);
3511 return REG_NOERROR;
3514 /* Helper function for parse_bracket_exp.
3515 Build the character class which is represented by NAME.
3516 The result are written to MBCSET and SBCSET.
3517 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3518 is a pointer argument since we may update it. */
3520 static reg_errcode_t
3521 #ifdef RE_ENABLE_I18N
3522 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3523 re_charset_t *mbcset, int *char_class_alloc,
3524 const char *class_name, reg_syntax_t syntax)
3525 #else /* not RE_ENABLE_I18N */
3526 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3527 const char *class_name, reg_syntax_t syntax)
3528 #endif /* not RE_ENABLE_I18N */
3530 int i;
3532 /* In case of REG_ICASE "upper" and "lower" match the both of
3533 upper and lower cases. */
3534 if ((syntax & RE_ICASE)
3535 && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3536 class_name = "alpha";
3538 #ifdef RE_ENABLE_I18N
3539 /* Check the space of the arrays. */
3540 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3542 /* Not enough, realloc it. */
3543 /* +1 in case of mbcset->nchar_classes is 0. */
3544 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3545 /* Use realloc since array is NULL if *alloc == 0. */
3546 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3547 new_char_class_alloc);
3548 if (BE (new_char_classes == NULL, 0))
3549 return REG_ESPACE;
3550 mbcset->char_classes = new_char_classes;
3551 *char_class_alloc = new_char_class_alloc;
3553 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3554 #endif /* RE_ENABLE_I18N */
3556 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3557 do { \
3558 if (BE (trans != NULL, 0)) \
3560 for (i = 0; i < SBC_MAX; ++i) \
3561 if (ctype_func (i)) \
3562 bitset_set (sbcset, trans[i]); \
3564 else \
3566 for (i = 0; i < SBC_MAX; ++i) \
3567 if (ctype_func (i)) \
3568 bitset_set (sbcset, i); \
3570 } while (0)
3572 if (strcmp (class_name, "alnum") == 0)
3573 BUILD_CHARCLASS_LOOP (isalnum);
3574 else if (strcmp (class_name, "cntrl") == 0)
3575 BUILD_CHARCLASS_LOOP (iscntrl);
3576 else if (strcmp (class_name, "lower") == 0)
3577 BUILD_CHARCLASS_LOOP (islower);
3578 else if (strcmp (class_name, "space") == 0)
3579 BUILD_CHARCLASS_LOOP (isspace);
3580 else if (strcmp (class_name, "alpha") == 0)
3581 BUILD_CHARCLASS_LOOP (isalpha);
3582 else if (strcmp (class_name, "digit") == 0)
3583 BUILD_CHARCLASS_LOOP (isdigit);
3584 else if (strcmp (class_name, "print") == 0)
3585 BUILD_CHARCLASS_LOOP (isprint);
3586 else if (strcmp (class_name, "upper") == 0)
3587 BUILD_CHARCLASS_LOOP (isupper);
3588 else if (strcmp (class_name, "blank") == 0)
3589 #ifndef GAWK
3590 BUILD_CHARCLASS_LOOP (isblank);
3591 #else
3592 /* see comments above */
3593 BUILD_CHARCLASS_LOOP (is_blank);
3594 #endif
3595 else if (strcmp (class_name, "graph") == 0)
3596 BUILD_CHARCLASS_LOOP (isgraph);
3597 else if (strcmp (class_name, "punct") == 0)
3598 BUILD_CHARCLASS_LOOP (ispunct);
3599 else if (strcmp (class_name, "xdigit") == 0)
3600 BUILD_CHARCLASS_LOOP (isxdigit);
3601 else
3602 return REG_ECTYPE;
3604 return REG_NOERROR;
3607 static bin_tree_t *
3608 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3609 const char *class_name,
3610 const char *extra, int non_match,
3611 reg_errcode_t *err)
3613 re_bitset_ptr_t sbcset;
3614 #ifdef RE_ENABLE_I18N
3615 re_charset_t *mbcset;
3616 int alloc = 0;
3617 #endif /* not RE_ENABLE_I18N */
3618 reg_errcode_t ret;
3619 re_token_t br_token;
3620 bin_tree_t *tree;
3622 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3623 #ifdef RE_ENABLE_I18N
3624 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3625 #endif /* RE_ENABLE_I18N */
3627 #ifdef RE_ENABLE_I18N
3628 if (BE (sbcset == NULL || mbcset == NULL, 0))
3629 #else /* not RE_ENABLE_I18N */
3630 if (BE (sbcset == NULL, 0))
3631 #endif /* not RE_ENABLE_I18N */
3633 *err = REG_ESPACE;
3634 return NULL;
3637 if (non_match)
3639 #ifdef RE_ENABLE_I18N
3640 mbcset->non_match = 1;
3641 #endif /* not RE_ENABLE_I18N */
3644 /* We don't care the syntax in this case. */
3645 ret = build_charclass (trans, sbcset,
3646 #ifdef RE_ENABLE_I18N
3647 mbcset, &alloc,
3648 #endif /* RE_ENABLE_I18N */
3649 class_name, 0);
3651 if (BE (ret != REG_NOERROR, 0))
3653 re_free (sbcset);
3654 #ifdef RE_ENABLE_I18N
3655 free_charset (mbcset);
3656 #endif /* RE_ENABLE_I18N */
3657 *err = ret;
3658 return NULL;
3660 /* \w match '_' also. */
3661 for (; *extra; extra++)
3662 bitset_set (sbcset, *extra);
3664 /* If it is non-matching list. */
3665 if (non_match)
3666 bitset_not (sbcset);
3668 #ifdef RE_ENABLE_I18N
3669 /* Ensure only single byte characters are set. */
3670 if (dfa->mb_cur_max > 1)
3671 bitset_mask (sbcset, dfa->sb_char);
3672 #endif
3674 /* Build a tree for simple bracket. */
3675 br_token.type = SIMPLE_BRACKET;
3676 br_token.opr.sbcset = sbcset;
3677 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3678 if (BE (tree == NULL, 0))
3679 goto build_word_op_espace;
3681 #ifdef RE_ENABLE_I18N
3682 if (dfa->mb_cur_max > 1)
3684 bin_tree_t *mbc_tree;
3685 /* Build a tree for complex bracket. */
3686 br_token.type = COMPLEX_BRACKET;
3687 br_token.opr.mbcset = mbcset;
3688 dfa->has_mb_node = 1;
3689 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3690 if (BE (mbc_tree == NULL, 0))
3691 goto build_word_op_espace;
3692 /* Then join them by ALT node. */
3693 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3694 if (BE (mbc_tree != NULL, 1))
3695 return tree;
3697 else
3699 free_charset (mbcset);
3700 return tree;
3702 #else /* not RE_ENABLE_I18N */
3703 return tree;
3704 #endif /* not RE_ENABLE_I18N */
3706 build_word_op_espace:
3707 re_free (sbcset);
3708 #ifdef RE_ENABLE_I18N
3709 free_charset (mbcset);
3710 #endif /* RE_ENABLE_I18N */
3711 *err = REG_ESPACE;
3712 return NULL;
3715 /* This is intended for the expressions like "a{1,3}".
3716 Fetch a number from `input', and return the number.
3717 Return -1, if the number field is empty like "{,1}".
3718 Return -2, if an error has occurred. */
3720 static int
3721 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3723 int num = -1;
3724 unsigned char c;
3725 while (1)
3727 fetch_token (token, input, syntax);
3728 c = token->opr.c;
3729 if (BE (token->type == END_OF_RE, 0))
3730 return -2;
3731 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3732 break;
3733 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3734 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3735 num = (num > RE_DUP_MAX) ? -2 : num;
3737 return num;
3740 #ifdef RE_ENABLE_I18N
3741 static void
3742 free_charset (re_charset_t *cset)
3744 re_free (cset->mbchars);
3745 # ifdef _LIBC
3746 re_free (cset->coll_syms);
3747 re_free (cset->equiv_classes);
3748 re_free (cset->range_starts);
3749 re_free (cset->range_ends);
3750 # endif
3751 re_free (cset->char_classes);
3752 re_free (cset);
3754 #endif /* RE_ENABLE_I18N */
3756 /* Functions for binary tree operation. */
3758 /* Create a tree node. */
3760 static bin_tree_t *
3761 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3762 re_token_type_t type)
3764 re_token_t t;
3765 t.type = type;
3766 return create_token_tree (dfa, left, right, &t);
3769 static bin_tree_t *
3770 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3771 const re_token_t *token)
3773 bin_tree_t *tree;
3774 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3776 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3778 if (storage == NULL)
3779 return NULL;
3780 storage->next = dfa->str_tree_storage;
3781 dfa->str_tree_storage = storage;
3782 dfa->str_tree_storage_idx = 0;
3784 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3786 tree->parent = NULL;
3787 tree->left = left;
3788 tree->right = right;
3789 tree->token = *token;
3790 tree->token.duplicated = 0;
3791 tree->token.opt_subexp = 0;
3792 tree->first = NULL;
3793 tree->next = NULL;
3794 tree->node_idx = -1;
3796 if (left != NULL)
3797 left->parent = tree;
3798 if (right != NULL)
3799 right->parent = tree;
3800 return tree;
3803 /* Mark the tree SRC as an optional subexpression.
3804 To be called from preorder or postorder. */
3806 static reg_errcode_t
3807 mark_opt_subexp (void *extra, bin_tree_t *node)
3809 int idx = (int) (long) extra;
3810 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3811 node->token.opt_subexp = 1;
3813 return REG_NOERROR;
3816 /* Free the allocated memory inside NODE. */
3818 static void
3819 free_token (re_token_t *node)
3821 #ifdef RE_ENABLE_I18N
3822 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3823 free_charset (node->opr.mbcset);
3824 else
3825 #endif /* RE_ENABLE_I18N */
3826 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3827 re_free (node->opr.sbcset);
3830 /* Worker function for tree walking. Free the allocated memory inside NODE
3831 and its children. */
3833 static reg_errcode_t
3834 free_tree (void *extra, bin_tree_t *node)
3836 free_token (&node->token);
3837 return REG_NOERROR;
3841 /* Duplicate the node SRC, and return new node. This is a preorder
3842 visit similar to the one implemented by the generic visitor, but
3843 we need more infrastructure to maintain two parallel trees --- so,
3844 it's easier to duplicate. */
3846 static bin_tree_t *
3847 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3849 const bin_tree_t *node;
3850 bin_tree_t *dup_root;
3851 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3853 for (node = root; ; )
3855 /* Create a new tree and link it back to the current parent. */
3856 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3857 if (*p_new == NULL)
3858 return NULL;
3859 (*p_new)->parent = dup_node;
3860 (*p_new)->token.duplicated = 1;
3861 dup_node = *p_new;
3863 /* Go to the left node, or up and to the right. */
3864 if (node->left)
3866 node = node->left;
3867 p_new = &dup_node->left;
3869 else
3871 const bin_tree_t *prev = NULL;
3872 while (node->right == prev || node->right == NULL)
3874 prev = node;
3875 node = node->parent;
3876 dup_node = dup_node->parent;
3877 if (!node)
3878 return dup_root;
3880 node = node->right;
3881 p_new = &dup_node->right;