1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
21 # include <locale/weight.h>
24 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
25 size_t length
, reg_syntax_t syntax
);
26 static void re_compile_fastmap_iter (regex_t
*bufp
,
27 const re_dfastate_t
*init_state
,
29 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
30 static void free_charset (re_charset_t
*cset
);
31 static void free_workarea_compile (regex_t
*preg
);
32 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
33 static void optimize_utf8 (re_dfa_t
*dfa
);
34 static reg_errcode_t
analyze (regex_t
*preg
);
35 static reg_errcode_t
preorder (bin_tree_t
*root
,
36 reg_errcode_t (fn (void *, bin_tree_t
*)),
38 static reg_errcode_t
postorder (bin_tree_t
*root
,
39 reg_errcode_t (fn (void *, bin_tree_t
*)),
41 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
42 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
43 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
45 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
46 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
47 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
48 static Idx
duplicate_node (re_dfa_t
*dfa
, Idx org_idx
, unsigned int constraint
);
49 static Idx
search_duplicated_node (const re_dfa_t
*dfa
, Idx org_node
,
50 unsigned int constraint
);
51 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
52 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
54 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
55 static Idx
fetch_number (re_string_t
*input
, re_token_t
*token
,
57 static int peek_token (re_token_t
*token
, re_string_t
*input
,
59 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
60 reg_syntax_t syntax
, reg_errcode_t
*err
);
61 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
62 re_token_t
*token
, reg_syntax_t syntax
,
63 Idx nest
, reg_errcode_t
*err
);
64 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
65 re_token_t
*token
, reg_syntax_t syntax
,
66 Idx nest
, reg_errcode_t
*err
);
67 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
68 re_token_t
*token
, reg_syntax_t syntax
,
69 Idx nest
, reg_errcode_t
*err
);
70 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
71 re_token_t
*token
, reg_syntax_t syntax
,
72 Idx nest
, reg_errcode_t
*err
);
73 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
74 re_dfa_t
*dfa
, re_token_t
*token
,
75 reg_syntax_t syntax
, reg_errcode_t
*err
);
76 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
77 re_token_t
*token
, reg_syntax_t syntax
,
79 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
81 re_token_t
*token
, int token_len
,
85 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
88 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
90 Idx
*equiv_class_alloc
,
91 const unsigned char *name
);
92 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
95 Idx
*char_class_alloc
,
96 const char *class_name
,
98 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
99 RE_TRANSLATE_TYPE trans
,
100 const char *class_name
,
102 bool non_match
, reg_errcode_t
*err
);
103 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
104 bin_tree_t
*left
, bin_tree_t
*right
,
105 re_token_type_t type
);
106 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
107 bin_tree_t
*left
, bin_tree_t
*right
,
108 const re_token_t
*token
);
109 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
110 static void free_token (re_token_t
*node
);
111 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
112 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
114 /* This table gives an error message for each of the error codes listed
115 in regex.h. Obviously the order here has to be same as there.
116 POSIX doesn't require that we do anything for REG_NOERROR,
117 but why not be nice? */
119 static const char __re_error_msgid
[] =
121 #define REG_NOERROR_IDX 0
122 gettext_noop ("Success") /* REG_NOERROR */
124 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
125 gettext_noop ("No match") /* REG_NOMATCH */
127 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
128 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
130 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
131 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
133 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
134 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
136 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
137 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
139 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
140 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
142 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
143 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
145 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
146 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
148 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
149 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
151 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
152 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
154 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
155 gettext_noop ("Invalid range end") /* REG_ERANGE */
157 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
158 gettext_noop ("Memory exhausted") /* REG_ESPACE */
160 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
161 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
163 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
164 gettext_noop ("Premature end of regular expression") /* REG_EEND */
166 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
167 gettext_noop ("Regular expression too big") /* REG_ESIZE */
169 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
170 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
173 static const size_t __re_error_msgid_idx
[] =
194 /* Entry points for GNU code. */
196 /* re_compile_pattern is the GNU regular expression compiler: it
197 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
198 Returns 0 if the pattern was valid, otherwise an error string.
200 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
201 are set in BUFP on entry. */
204 re_compile_pattern (const char *pattern
, size_t length
,
205 struct re_pattern_buffer
*bufp
)
209 /* And GNU code determines whether or not to get register information
210 by passing null for the REGS argument to re_match, etc., not by
211 setting no_sub, unless RE_NO_SUB is set. */
212 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
214 /* Match anchors at newline. */
215 bufp
->newline_anchor
= 1;
217 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
221 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
223 weak_alias (__re_compile_pattern
, re_compile_pattern
)
225 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
226 also be assigned to arbitrarily: each pattern buffer stores its own
227 syntax, so it can be changed between regex compilations. */
228 /* This has no initializer because initialized variables in Emacs
229 become read-only after dumping. */
230 reg_syntax_t re_syntax_options
;
233 /* Specify the precise syntax of regexps for compilation. This provides
234 for compatibility for various utilities which historically have
235 different, incompatible syntaxes.
237 The argument SYNTAX is a bit mask comprised of the various bits
238 defined in regex.h. We return the old syntax. */
241 re_set_syntax (reg_syntax_t syntax
)
243 reg_syntax_t ret
= re_syntax_options
;
245 re_syntax_options
= syntax
;
248 weak_alias (__re_set_syntax
, re_set_syntax
)
251 re_compile_fastmap (struct re_pattern_buffer
*bufp
)
253 re_dfa_t
*dfa
= bufp
->buffer
;
254 char *fastmap
= bufp
->fastmap
;
256 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
257 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
258 if (dfa
->init_state
!= dfa
->init_state_word
)
259 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
260 if (dfa
->init_state
!= dfa
->init_state_nl
)
261 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
262 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
263 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
264 bufp
->fastmap_accurate
= 1;
267 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
269 static __always_inline
void
270 re_set_fastmap (char *fastmap
, bool icase
, int ch
)
274 fastmap
[tolower (ch
)] = 1;
277 /* Helper function for re_compile_fastmap.
278 Compile fastmap for the initial_state INIT_STATE. */
281 re_compile_fastmap_iter (regex_t
*bufp
, const re_dfastate_t
*init_state
,
284 re_dfa_t
*dfa
= bufp
->buffer
;
286 bool icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
287 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
289 Idx node
= init_state
->nodes
.elems
[node_cnt
];
290 re_token_type_t type
= dfa
->nodes
[node
].type
;
292 if (type
== CHARACTER
)
294 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
295 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
297 unsigned char buf
[MB_LEN_MAX
];
303 *p
++ = dfa
->nodes
[node
].opr
.c
;
304 while (++node
< dfa
->nodes_len
305 && dfa
->nodes
[node
].type
== CHARACTER
306 && dfa
->nodes
[node
].mb_partial
)
307 *p
++ = dfa
->nodes
[node
].opr
.c
;
308 memset (&state
, '\0', sizeof (state
));
309 if (__mbrtowc (&wc
, (const char *) buf
, p
- buf
,
311 && (__wcrtomb ((char *) buf
, __towlower (wc
), &state
)
313 re_set_fastmap (fastmap
, false, buf
[0]);
316 else if (type
== SIMPLE_BRACKET
)
319 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
322 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
323 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
324 if (w
& ((bitset_word_t
) 1 << j
))
325 re_set_fastmap (fastmap
, icase
, ch
);
328 else if (type
== COMPLEX_BRACKET
)
330 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
334 /* See if we have to try all bytes which start multiple collation
336 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
337 collation element, and don't catch 'b' since 'b' is
338 the only collation element which starts from 'b' (and
339 it is caught by SIMPLE_BRACKET). */
340 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0
341 && (cset
->ncoll_syms
|| cset
->nranges
))
343 const int32_t *table
= (const int32_t *)
344 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
345 for (i
= 0; i
< SBC_MAX
; ++i
)
347 re_set_fastmap (fastmap
, icase
, i
);
351 /* See if we have to start the match at all multibyte characters,
352 i.e. where we would not find an invalid sequence. This only
353 applies to multibyte character sets; for single byte character
354 sets, the SIMPLE_BRACKET again suffices. */
355 if (dfa
->mb_cur_max
> 1
356 && (cset
->nchar_classes
|| cset
->non_match
|| cset
->nranges
358 || cset
->nequiv_classes
366 memset (&mbs
, 0, sizeof (mbs
));
367 if (__mbrtowc (NULL
, (char *) &c
, 1, &mbs
) == (size_t) -2)
368 re_set_fastmap (fastmap
, false, (int) c
);
375 /* ... Else catch all bytes which can start the mbchars. */
376 for (i
= 0; i
< cset
->nmbchars
; ++i
)
380 memset (&state
, '\0', sizeof (state
));
381 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
382 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
383 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
385 if (__wcrtomb (buf
, __towlower (cset
->mbchars
[i
]), &state
)
387 re_set_fastmap (fastmap
, false, *(unsigned char *) buf
);
392 else if (type
== OP_PERIOD
|| type
== OP_UTF8_PERIOD
|| type
== END_OF_RE
)
394 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
395 if (type
== END_OF_RE
)
396 bufp
->can_be_null
= 1;
402 /* Entry point for POSIX code. */
403 /* regcomp takes a regular expression as a string and compiles it.
405 PREG is a regex_t *. We do not expect any fields to be initialized,
406 since POSIX says we shouldn't. Thus, we set
408 'buffer' to the compiled pattern;
409 'used' to the length of the compiled pattern;
410 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
411 REG_EXTENDED bit in CFLAGS is set; otherwise, to
412 RE_SYNTAX_POSIX_BASIC;
413 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
414 'fastmap' to an allocated space for the fastmap;
415 'fastmap_accurate' to zero;
416 're_nsub' to the number of subexpressions in PATTERN.
418 PATTERN is the address of the pattern string.
420 CFLAGS is a series of bits which affect compilation.
422 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
423 use POSIX basic syntax.
425 If REG_NEWLINE is set, then . and [^...] don't match newline.
426 Also, regexec will try a match beginning after every newline.
428 If REG_ICASE is set, then we considers upper- and lowercase
429 versions of letters to be equivalent when matching.
431 If REG_NOSUB is set, then when PREG is passed to regexec, that
432 routine will report only success or failure, and nothing about the
435 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
436 the return codes and their meanings.) */
439 regcomp (regex_t
*__restrict preg
, const char *__restrict pattern
, int cflags
)
442 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
443 : RE_SYNTAX_POSIX_BASIC
);
449 /* Try to allocate space for the fastmap. */
450 preg
->fastmap
= re_malloc (char, SBC_MAX
);
451 if (__glibc_unlikely (preg
->fastmap
== NULL
))
454 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
456 /* If REG_NEWLINE is set, newlines are treated differently. */
457 if (cflags
& REG_NEWLINE
)
458 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
459 syntax
&= ~RE_DOT_NEWLINE
;
460 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
461 /* It also changes the matching behavior. */
462 preg
->newline_anchor
= 1;
465 preg
->newline_anchor
= 0;
466 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
467 preg
->translate
= NULL
;
469 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
471 /* POSIX doesn't distinguish between an unmatched open-group and an
472 unmatched close-group: both are REG_EPAREN. */
473 if (ret
== REG_ERPAREN
)
476 /* We have already checked preg->fastmap != NULL. */
477 if (__glibc_likely (ret
== REG_NOERROR
))
478 /* Compute the fastmap now, since regexec cannot modify the pattern
479 buffer. This function never fails in this implementation. */
480 (void) re_compile_fastmap (preg
);
483 /* Some error occurred while compiling the expression. */
484 re_free (preg
->fastmap
);
485 preg
->fastmap
= NULL
;
490 libc_hidden_def (__regcomp
)
491 weak_alias (__regcomp
, regcomp
)
493 /* Returns a message corresponding to an error code, ERRCODE, returned
494 from either regcomp or regexec. We don't use PREG here. */
497 regerror (int errcode
, const regex_t
*__restrict preg
, char *__restrict errbuf
,
502 int nerrcodes
= sizeof __re_error_msgid_idx
/ sizeof __re_error_msgid_idx
[0];
504 if (__glibc_unlikely (errcode
< 0 || errcode
>= nerrcodes
))
505 /* Only error codes returned by the rest of the code should be passed
506 to this routine. If we are given anything else, or if other regex
507 code generates an invalid error code, then the program has a bug.
508 Dump core so we can fix it. */
511 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
513 msg_size
= strlen (msg
) + 1; /* Includes the null. */
515 if (__glibc_likely (errbuf_size
!= 0))
517 size_t cpy_size
= msg_size
;
518 if (__glibc_unlikely (msg_size
> errbuf_size
))
520 cpy_size
= errbuf_size
- 1;
521 errbuf
[cpy_size
] = '\0';
523 memcpy (errbuf
, msg
, cpy_size
);
528 weak_alias (__regerror
, regerror
)
531 /* This static array is used for the map to single-byte characters when
532 UTF-8 is used. Otherwise we would allocate memory just to initialize
533 it the same all the time. UTF-8 is the preferred encoding so this is
534 a worthwhile optimization. */
535 static const bitset_t utf8_sb_map
=
537 /* Set the first 128 bits. */
538 #if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
539 [0 ... 0x80 / BITSET_WORD_BITS
- 1] = BITSET_WORD_MAX
541 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
542 # error "bitset_word_t is narrower than 32 bits"
543 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
544 BITSET_WORD_MAX
, BITSET_WORD_MAX
, BITSET_WORD_MAX
,
545 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
546 BITSET_WORD_MAX
, BITSET_WORD_MAX
,
547 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
551 >> (SBC_MAX
% BITSET_WORD_BITS
== 0
553 : BITSET_WORD_BITS
- SBC_MAX
% BITSET_WORD_BITS
))
559 free_dfa_content (re_dfa_t
*dfa
)
564 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
565 free_token (dfa
->nodes
+ i
);
566 re_free (dfa
->nexts
);
567 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
569 if (dfa
->eclosures
!= NULL
)
570 re_node_set_free (dfa
->eclosures
+ i
);
571 if (dfa
->inveclosures
!= NULL
)
572 re_node_set_free (dfa
->inveclosures
+ i
);
573 if (dfa
->edests
!= NULL
)
574 re_node_set_free (dfa
->edests
+ i
);
576 re_free (dfa
->edests
);
577 re_free (dfa
->eclosures
);
578 re_free (dfa
->inveclosures
);
579 re_free (dfa
->nodes
);
581 if (dfa
->state_table
)
582 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
584 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
585 for (j
= 0; j
< entry
->num
; ++j
)
587 re_dfastate_t
*state
= entry
->array
[j
];
590 re_free (entry
->array
);
592 re_free (dfa
->state_table
);
593 if (dfa
->sb_char
!= utf8_sb_map
)
594 re_free (dfa
->sb_char
);
595 re_free (dfa
->subexp_map
);
597 re_free (dfa
->re_str
);
604 /* Free dynamically allocated space used by PREG. */
607 regfree (regex_t
*preg
)
609 re_dfa_t
*dfa
= preg
->buffer
;
610 if (__glibc_likely (dfa
!= NULL
))
612 lock_fini (dfa
->lock
);
613 free_dfa_content (dfa
);
618 re_free (preg
->fastmap
);
619 preg
->fastmap
= NULL
;
621 re_free (preg
->translate
);
622 preg
->translate
= NULL
;
624 libc_hidden_def (__regfree
)
625 weak_alias (__regfree
, regfree
)
627 /* Entry points compatible with 4.2 BSD regex library. We don't define
628 them unless specifically requested. */
630 #if defined _REGEX_RE_COMP || defined _LIBC
632 /* BSD has one and only one pattern buffer. */
633 static struct re_pattern_buffer re_comp_buf
;
637 /* Make these definitions weak in libc, so POSIX programs can redefine
638 these names if they don't use our functions, and still use
639 regcomp/regexec above without link errors. */
642 re_comp (const char *s
)
649 if (!re_comp_buf
.buffer
)
650 return gettext ("No previous regular expression");
654 if (re_comp_buf
.buffer
)
656 fastmap
= re_comp_buf
.fastmap
;
657 re_comp_buf
.fastmap
= NULL
;
658 __regfree (&re_comp_buf
);
659 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
660 re_comp_buf
.fastmap
= fastmap
;
663 if (re_comp_buf
.fastmap
== NULL
)
665 re_comp_buf
.fastmap
= re_malloc (char, SBC_MAX
);
666 if (re_comp_buf
.fastmap
== NULL
)
667 return (char *) gettext (__re_error_msgid
668 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
671 /* Since 're_exec' always passes NULL for the 'regs' argument, we
672 don't need to initialize the pattern buffer fields which affect it. */
674 /* Match anchors at newlines. */
675 re_comp_buf
.newline_anchor
= 1;
677 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
682 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
683 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
687 libc_freeres_fn (free_mem
)
689 __regfree (&re_comp_buf
);
693 #endif /* _REGEX_RE_COMP */
695 /* Internal entry point.
696 Compile the regular expression PATTERN, whose length is LENGTH.
697 SYNTAX indicate regular expression's syntax. */
700 re_compile_internal (regex_t
*preg
, const char * pattern
, size_t length
,
703 reg_errcode_t err
= REG_NOERROR
;
707 /* Initialize the pattern buffer. */
708 preg
->fastmap_accurate
= 0;
709 preg
->syntax
= syntax
;
710 preg
->not_bol
= preg
->not_eol
= 0;
713 preg
->can_be_null
= 0;
714 preg
->regs_allocated
= REGS_UNALLOCATED
;
716 /* Initialize the dfa. */
718 if (__glibc_unlikely (preg
->allocated
< sizeof (re_dfa_t
)))
720 /* If zero allocated, but buffer is non-null, try to realloc
721 enough space. This loses if buffer's address is bogus, but
722 that is the user's responsibility. If ->buffer is NULL this
723 is a simple allocation. */
724 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
727 preg
->allocated
= sizeof (re_dfa_t
);
730 preg
->used
= sizeof (re_dfa_t
);
732 err
= init_dfa (dfa
, length
);
733 if (__glibc_unlikely (err
== REG_NOERROR
&& lock_init (dfa
->lock
) != 0))
735 if (__glibc_unlikely (err
!= REG_NOERROR
))
737 free_dfa_content (dfa
);
743 /* Note: length+1 will not overflow since it is checked in init_dfa. */
744 dfa
->re_str
= re_malloc (char, length
+ 1);
745 strncpy (dfa
->re_str
, pattern
, length
+ 1);
748 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
749 (syntax
& RE_ICASE
) != 0, dfa
);
750 if (__glibc_unlikely (err
!= REG_NOERROR
))
752 re_compile_internal_free_return
:
753 free_workarea_compile (preg
);
754 re_string_destruct (®exp
);
755 lock_fini (dfa
->lock
);
756 free_dfa_content (dfa
);
762 /* Parse the regular expression, and build a structure tree. */
764 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
765 if (__glibc_unlikely (dfa
->str_tree
== NULL
))
766 goto re_compile_internal_free_return
;
768 /* Analyze the tree and create the nfa. */
769 err
= analyze (preg
);
770 if (__glibc_unlikely (err
!= REG_NOERROR
))
771 goto re_compile_internal_free_return
;
773 /* If possible, do searching in single byte encoding to speed things up. */
774 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
777 /* Then create the initial state of the dfa. */
778 err
= create_initial_state (dfa
);
780 /* Release work areas. */
781 free_workarea_compile (preg
);
782 re_string_destruct (®exp
);
784 if (__glibc_unlikely (err
!= REG_NOERROR
))
786 lock_fini (dfa
->lock
);
787 free_dfa_content (dfa
);
795 /* Initialize DFA. We use the length of the regular expression PAT_LEN
796 as the initial length of some arrays. */
799 init_dfa (re_dfa_t
*dfa
, size_t pat_len
)
801 __re_size_t table_size
;
803 const char *codeset_name
;
805 size_t max_i18n_object_size
= MAX (sizeof (wchar_t), sizeof (wctype_t));
806 size_t max_object_size
=
807 MAX (sizeof (struct re_state_table_entry
),
808 MAX (sizeof (re_token_t
),
809 MAX (sizeof (re_node_set
),
810 MAX (sizeof (regmatch_t
),
811 max_i18n_object_size
))));
813 memset (dfa
, '\0', sizeof (re_dfa_t
));
815 /* Force allocation of str_tree_storage the first time. */
816 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
818 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
819 calculation below, and for similar doubling calculations
820 elsewhere. And it's <= rather than <, because some of the
821 doubling calculations add 1 afterwards. */
822 if (__glibc_unlikely (MIN (IDX_MAX
, SIZE_MAX
/ max_object_size
) / 2
826 dfa
->nodes_alloc
= pat_len
+ 1;
827 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
829 /* table_size = 2 ^ ceil(log pat_len) */
830 for (table_size
= 1; ; table_size
<<= 1)
831 if (table_size
> pat_len
)
834 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
835 dfa
->state_hash_mask
= table_size
- 1;
837 dfa
->mb_cur_max
= MB_CUR_MAX
;
839 if (dfa
->mb_cur_max
== 6
840 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
842 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
845 codeset_name
= nl_langinfo (CODESET
);
846 if ((codeset_name
[0] == 'U' || codeset_name
[0] == 'u')
847 && (codeset_name
[1] == 'T' || codeset_name
[1] == 't')
848 && (codeset_name
[2] == 'F' || codeset_name
[2] == 'f')
849 && strcmp (codeset_name
+ 3 + (codeset_name
[3] == '-'), "8") == 0)
852 /* We check exhaustively in the loop below if this charset is a
853 superset of ASCII. */
854 dfa
->map_notascii
= 0;
857 if (dfa
->mb_cur_max
> 1)
860 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
865 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
866 if (__glibc_unlikely (dfa
->sb_char
== NULL
))
869 /* Set the bits corresponding to single byte chars. */
870 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
871 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
873 wint_t wch
= __btowc (ch
);
875 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
877 if (isascii (ch
) && wch
!= ch
)
878 dfa
->map_notascii
= 1;
884 if (__glibc_unlikely (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
))
889 /* Initialize WORD_CHAR table, which indicate which character is
890 "word". In this case "word" means that it is the word construction
891 character used by some operators like "\<", "\>", etc. */
894 init_word_char (re_dfa_t
*dfa
)
899 dfa
->word_ops_used
= 1;
900 if (__glibc_likely (dfa
->map_notascii
== 0))
902 bitset_word_t bits0
= 0x00000000;
903 bitset_word_t bits1
= 0x03ff0000;
904 bitset_word_t bits2
= 0x87fffffe;
905 bitset_word_t bits3
= 0x07fffffe;
906 if (BITSET_WORD_BITS
== 64)
908 /* Pacify gcc -Woverflow on 32-bit platforms. */
909 dfa
->word_char
[0] = bits1
<< 31 << 1 | bits0
;
910 dfa
->word_char
[1] = bits3
<< 31 << 1 | bits2
;
913 else if (BITSET_WORD_BITS
== 32)
915 dfa
->word_char
[0] = bits0
;
916 dfa
->word_char
[1] = bits1
;
917 dfa
->word_char
[2] = bits2
;
918 dfa
->word_char
[3] = bits3
;
925 if (__glibc_likely (dfa
->is_utf8
))
927 memset (&dfa
->word_char
[i
], '\0', (SBC_MAX
- ch
) / 8);
933 for (; i
< BITSET_WORDS
; ++i
)
934 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
935 if (isalnum (ch
) || ch
== '_')
936 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
939 /* Free the work area which are only used while compiling. */
942 free_workarea_compile (regex_t
*preg
)
944 re_dfa_t
*dfa
= preg
->buffer
;
945 bin_tree_storage_t
*storage
, *next
;
946 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
948 next
= storage
->next
;
951 dfa
->str_tree_storage
= NULL
;
952 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
953 dfa
->str_tree
= NULL
;
954 re_free (dfa
->org_indices
);
955 dfa
->org_indices
= NULL
;
958 /* Create initial states for all contexts. */
961 create_initial_state (re_dfa_t
*dfa
)
965 re_node_set init_nodes
;
967 /* Initial states have the epsilon closure of the node which is
968 the first node of the regular expression. */
969 first
= dfa
->str_tree
->first
->node_idx
;
970 dfa
->init_node
= first
;
971 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
972 if (__glibc_unlikely (err
!= REG_NOERROR
))
975 /* The back-references which are in initial states can epsilon transit,
976 since in this case all of the subexpressions can be null.
977 Then we add epsilon closures of the nodes which are the next nodes of
978 the back-references. */
979 if (dfa
->nbackref
> 0)
980 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
982 Idx node_idx
= init_nodes
.elems
[i
];
983 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
986 if (type
!= OP_BACK_REF
)
988 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
990 re_token_t
*clexp_node
;
991 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
992 if (clexp_node
->type
== OP_CLOSE_SUBEXP
993 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
996 if (clexp_idx
== init_nodes
.nelem
)
999 if (type
== OP_BACK_REF
)
1001 Idx dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1002 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1004 reg_errcode_t merge_err
1005 = re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
1006 if (merge_err
!= REG_NOERROR
)
1013 /* It must be the first time to invoke acquire_state. */
1014 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1015 /* We don't check ERR here, since the initial state must not be NULL. */
1016 if (__glibc_unlikely (dfa
->init_state
== NULL
))
1018 if (dfa
->init_state
->has_constraint
)
1020 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1022 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1024 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1028 if (__glibc_unlikely (dfa
->init_state_word
== NULL
1029 || dfa
->init_state_nl
== NULL
1030 || dfa
->init_state_begbuf
== NULL
))
1034 dfa
->init_state_word
= dfa
->init_state_nl
1035 = dfa
->init_state_begbuf
= dfa
->init_state
;
1037 re_node_set_free (&init_nodes
);
1041 /* If it is possible to do searching in single byte encoding instead of UTF-8
1042 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1043 DFA nodes where needed. */
1046 optimize_utf8 (re_dfa_t
*dfa
)
1050 bool mb_chars
= false;
1051 bool has_period
= false;
1053 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1054 switch (dfa
->nodes
[node
].type
)
1057 if (dfa
->nodes
[node
].opr
.c
>= ASCII_CHARS
)
1061 switch (dfa
->nodes
[node
].opr
.ctx_type
)
1069 /* Word anchors etc. cannot be handled. It's okay to test
1070 opr.ctx_type since constraints (for all DFA nodes) are
1071 created by ORing one or more opr.ctx_type values. */
1081 case OP_DUP_ASTERISK
:
1082 case OP_OPEN_SUBEXP
:
1083 case OP_CLOSE_SUBEXP
:
1085 case COMPLEX_BRACKET
:
1087 case SIMPLE_BRACKET
:
1088 /* Just double check. */
1090 int rshift
= (ASCII_CHARS
% BITSET_WORD_BITS
== 0
1092 : BITSET_WORD_BITS
- ASCII_CHARS
% BITSET_WORD_BITS
);
1093 for (i
= ASCII_CHARS
/ BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1095 if (dfa
->nodes
[node
].opr
.sbcset
[i
] >> rshift
!= 0)
1105 if (mb_chars
|| has_period
)
1106 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1108 if (dfa
->nodes
[node
].type
== CHARACTER
1109 && dfa
->nodes
[node
].opr
.c
>= ASCII_CHARS
)
1110 dfa
->nodes
[node
].mb_partial
= 0;
1111 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1112 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1115 /* The search can be in single byte locale. */
1116 dfa
->mb_cur_max
= 1;
1118 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1121 /* Analyze the structure tree, and calculate "first", "next", "edest",
1122 "eclosure", and "inveclosure". */
1124 static reg_errcode_t
1125 analyze (regex_t
*preg
)
1127 re_dfa_t
*dfa
= preg
->buffer
;
1130 /* Allocate arrays. */
1131 dfa
->nexts
= re_malloc (Idx
, dfa
->nodes_alloc
);
1132 dfa
->org_indices
= re_malloc (Idx
, dfa
->nodes_alloc
);
1133 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1134 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1135 if (__glibc_unlikely (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
1136 || dfa
->edests
== NULL
|| dfa
->eclosures
== NULL
))
1139 dfa
->subexp_map
= re_malloc (Idx
, preg
->re_nsub
);
1140 if (dfa
->subexp_map
!= NULL
)
1143 for (i
= 0; i
< preg
->re_nsub
; i
++)
1144 dfa
->subexp_map
[i
] = i
;
1145 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1146 for (i
= 0; i
< preg
->re_nsub
; i
++)
1147 if (dfa
->subexp_map
[i
] != i
)
1149 if (i
== preg
->re_nsub
)
1151 re_free (dfa
->subexp_map
);
1152 dfa
->subexp_map
= NULL
;
1156 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1157 if (__glibc_unlikely (ret
!= REG_NOERROR
))
1159 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1160 if (__glibc_unlikely (ret
!= REG_NOERROR
))
1162 preorder (dfa
->str_tree
, calc_next
, dfa
);
1163 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1164 if (__glibc_unlikely (ret
!= REG_NOERROR
))
1166 ret
= calc_eclosure (dfa
);
1167 if (__glibc_unlikely (ret
!= REG_NOERROR
))
1170 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1171 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1172 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1175 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1176 if (__glibc_unlikely (dfa
->inveclosures
== NULL
))
1178 ret
= calc_inveclosure (dfa
);
1184 /* Our parse trees are very unbalanced, so we cannot use a stack to
1185 implement parse tree visits. Instead, we use parent pointers and
1186 some hairy code in these two functions. */
1187 static reg_errcode_t
1188 postorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1191 bin_tree_t
*node
, *prev
;
1193 for (node
= root
; ; )
1195 /* Descend down the tree, preferably to the left (or to the right
1196 if that's the only child). */
1197 while (node
->left
|| node
->right
)
1205 reg_errcode_t err
= fn (extra
, node
);
1206 if (__glibc_unlikely (err
!= REG_NOERROR
))
1208 if (node
->parent
== NULL
)
1211 node
= node
->parent
;
1213 /* Go up while we have a node that is reached from the right. */
1214 while (node
->right
== prev
|| node
->right
== NULL
);
1219 static reg_errcode_t
1220 preorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1225 for (node
= root
; ; )
1227 reg_errcode_t err
= fn (extra
, node
);
1228 if (__glibc_unlikely (err
!= REG_NOERROR
))
1231 /* Go to the left node, or up and to the right. */
1236 bin_tree_t
*prev
= NULL
;
1237 while (node
->right
== prev
|| node
->right
== NULL
)
1240 node
= node
->parent
;
1249 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1250 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1251 backreferences as well. Requires a preorder visit. */
1252 static reg_errcode_t
1253 optimize_subexps (void *extra
, bin_tree_t
*node
)
1255 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1257 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1259 int idx
= node
->token
.opr
.idx
;
1260 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1261 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1264 else if (node
->token
.type
== SUBEXP
1265 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1267 Idx other_idx
= node
->left
->token
.opr
.idx
;
1269 node
->left
= node
->left
->left
;
1271 node
->left
->parent
= node
;
1273 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1274 if (other_idx
< BITSET_WORD_BITS
)
1275 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1281 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1282 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1283 static reg_errcode_t
1284 lower_subexps (void *extra
, bin_tree_t
*node
)
1286 regex_t
*preg
= (regex_t
*) extra
;
1287 reg_errcode_t err
= REG_NOERROR
;
1289 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1291 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1293 node
->left
->parent
= node
;
1295 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1297 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1299 node
->right
->parent
= node
;
1306 lower_subexp (reg_errcode_t
*err
, regex_t
*preg
, bin_tree_t
*node
)
1308 re_dfa_t
*dfa
= preg
->buffer
;
1309 bin_tree_t
*body
= node
->left
;
1310 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1313 /* We do not optimize empty subexpressions, because otherwise we may
1314 have bad CONCAT nodes with NULL children. This is obviously not
1315 very common, so we do not lose much. An example that triggers
1316 this case is the sed "script" /\(\)/x. */
1317 && node
->left
!= NULL
1318 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1319 || !(dfa
->used_bkref_map
1320 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1323 /* Convert the SUBEXP node to the concatenation of an
1324 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1325 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1326 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1327 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1328 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1329 if (__glibc_unlikely (tree
== NULL
|| tree1
== NULL
1330 || op
== NULL
|| cls
== NULL
))
1336 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1337 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1341 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1342 nodes. Requires a postorder visit. */
1343 static reg_errcode_t
1344 calc_first (void *extra
, bin_tree_t
*node
)
1346 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1347 if (node
->token
.type
== CONCAT
)
1349 node
->first
= node
->left
->first
;
1350 node
->node_idx
= node
->left
->node_idx
;
1355 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1356 if (__glibc_unlikely (node
->node_idx
== -1))
1358 if (node
->token
.type
== ANCHOR
)
1359 dfa
->nodes
[node
->node_idx
].constraint
= node
->token
.opr
.ctx_type
;
1364 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1365 static reg_errcode_t
1366 calc_next (void *extra
, bin_tree_t
*node
)
1368 switch (node
->token
.type
)
1370 case OP_DUP_ASTERISK
:
1371 node
->left
->next
= node
;
1374 node
->left
->next
= node
->right
->first
;
1375 node
->right
->next
= node
->next
;
1379 node
->left
->next
= node
->next
;
1381 node
->right
->next
= node
->next
;
1387 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1388 static reg_errcode_t
1389 link_nfa_nodes (void *extra
, bin_tree_t
*node
)
1391 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1392 Idx idx
= node
->node_idx
;
1393 reg_errcode_t err
= REG_NOERROR
;
1395 switch (node
->token
.type
)
1401 DEBUG_ASSERT (node
->next
== NULL
);
1404 case OP_DUP_ASTERISK
:
1408 dfa
->has_plural_match
= 1;
1409 if (node
->left
!= NULL
)
1410 left
= node
->left
->first
->node_idx
;
1412 left
= node
->next
->node_idx
;
1413 if (node
->right
!= NULL
)
1414 right
= node
->right
->first
->node_idx
;
1416 right
= node
->next
->node_idx
;
1417 DEBUG_ASSERT (left
> -1);
1418 DEBUG_ASSERT (right
> -1);
1419 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1424 case OP_OPEN_SUBEXP
:
1425 case OP_CLOSE_SUBEXP
:
1426 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1430 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1431 if (node
->token
.type
== OP_BACK_REF
)
1432 err
= re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1436 DEBUG_ASSERT (!IS_EPSILON_NODE (node
->token
.type
));
1437 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1444 /* Duplicate the epsilon closure of the node ROOT_NODE.
1445 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1446 to their own constraint. */
1448 static reg_errcode_t
1449 duplicate_node_closure (re_dfa_t
*dfa
, Idx top_org_node
, Idx top_clone_node
,
1450 Idx root_node
, unsigned int init_constraint
)
1452 Idx org_node
, clone_node
;
1454 unsigned int constraint
= init_constraint
;
1455 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1457 Idx org_dest
, clone_dest
;
1458 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1460 /* If the back reference epsilon-transit, its destination must
1461 also have the constraint. Then duplicate the epsilon closure
1462 of the destination of the back reference, and store it in
1463 edests of the back reference. */
1464 org_dest
= dfa
->nexts
[org_node
];
1465 re_node_set_empty (dfa
->edests
+ clone_node
);
1466 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1467 if (__glibc_unlikely (clone_dest
== -1))
1469 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1470 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1471 if (__glibc_unlikely (! ok
))
1474 else if (dfa
->edests
[org_node
].nelem
== 0)
1476 /* In case of the node can't epsilon-transit, don't duplicate the
1477 destination and store the original destination as the
1478 destination of the node. */
1479 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1482 else if (dfa
->edests
[org_node
].nelem
== 1)
1484 /* In case of the node can epsilon-transit, and it has only one
1486 org_dest
= dfa
->edests
[org_node
].elems
[0];
1487 re_node_set_empty (dfa
->edests
+ clone_node
);
1488 /* If the node is root_node itself, it means the epsilon closure
1489 has a loop. Then tie it to the destination of the root_node. */
1490 if (org_node
== root_node
&& clone_node
!= org_node
)
1492 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, org_dest
);
1493 if (__glibc_unlikely (! ok
))
1497 /* In case the node has another constraint, append it. */
1498 constraint
|= dfa
->nodes
[org_node
].constraint
;
1499 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1500 if (__glibc_unlikely (clone_dest
== -1))
1502 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1503 if (__glibc_unlikely (! ok
))
1506 else /* dfa->edests[org_node].nelem == 2 */
1508 /* In case of the node can epsilon-transit, and it has two
1509 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1510 org_dest
= dfa
->edests
[org_node
].elems
[0];
1511 re_node_set_empty (dfa
->edests
+ clone_node
);
1512 /* Search for a duplicated node which satisfies the constraint. */
1513 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1514 if (clone_dest
== -1)
1516 /* There is no such duplicated node, create a new one. */
1518 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1519 if (__glibc_unlikely (clone_dest
== -1))
1521 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1522 if (__glibc_unlikely (! ok
))
1524 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1525 root_node
, constraint
);
1526 if (__glibc_unlikely (err
!= REG_NOERROR
))
1531 /* There is a duplicated node which satisfies the constraint,
1532 use it to avoid infinite loop. */
1533 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1534 if (__glibc_unlikely (! ok
))
1538 org_dest
= dfa
->edests
[org_node
].elems
[1];
1539 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1540 if (__glibc_unlikely (clone_dest
== -1))
1542 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1543 if (__glibc_unlikely (! ok
))
1546 org_node
= org_dest
;
1547 clone_node
= clone_dest
;
1552 /* Search for a node which is duplicated from the node ORG_NODE, and
1553 satisfies the constraint CONSTRAINT. */
1556 search_duplicated_node (const re_dfa_t
*dfa
, Idx org_node
,
1557 unsigned int constraint
)
1560 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1562 if (org_node
== dfa
->org_indices
[idx
]
1563 && constraint
== dfa
->nodes
[idx
].constraint
)
1564 return idx
; /* Found. */
1566 return -1; /* Not found. */
1569 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1570 Return the index of the new node, or -1 if insufficient storage is
1574 duplicate_node (re_dfa_t
*dfa
, Idx org_idx
, unsigned int constraint
)
1576 Idx dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1577 if (__glibc_likely (dup_idx
!= -1))
1579 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1580 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].constraint
;
1581 dfa
->nodes
[dup_idx
].duplicated
= 1;
1583 /* Store the index of the original node. */
1584 dfa
->org_indices
[dup_idx
] = org_idx
;
1589 static reg_errcode_t
1590 calc_inveclosure (re_dfa_t
*dfa
)
1594 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1595 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1597 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1599 Idx
*elems
= dfa
->eclosures
[src
].elems
;
1600 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1602 ok
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1603 if (__glibc_unlikely (! ok
))
1611 /* Calculate "eclosure" for all the node in DFA. */
1613 static reg_errcode_t
1614 calc_eclosure (re_dfa_t
*dfa
)
1618 DEBUG_ASSERT (dfa
->nodes_len
> 0);
1620 /* For each nodes, calculate epsilon closure. */
1621 for (node_idx
= 0; ; ++node_idx
)
1624 re_node_set eclosure_elem
;
1625 if (node_idx
== dfa
->nodes_len
)
1633 DEBUG_ASSERT (dfa
->eclosures
[node_idx
].nelem
!= -1);
1635 /* If we have already calculated, skip it. */
1636 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1638 /* Calculate epsilon closure of 'node_idx'. */
1639 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, true);
1640 if (__glibc_unlikely (err
!= REG_NOERROR
))
1643 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1646 re_node_set_free (&eclosure_elem
);
1652 /* Calculate epsilon closure of NODE. */
1654 static reg_errcode_t
1655 calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
, Idx node
, bool root
)
1659 re_node_set eclosure
;
1660 bool incomplete
= false;
1661 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1662 if (__glibc_unlikely (err
!= REG_NOERROR
))
1665 /* An epsilon closure includes itself. */
1666 eclosure
.elems
[eclosure
.nelem
++] = node
;
1668 /* This indicates that we are calculating this node now.
1669 We reference this value to avoid infinite loop. */
1670 dfa
->eclosures
[node
].nelem
= -1;
1672 /* If the current node has constraints, duplicate all nodes
1673 since they must inherit the constraints. */
1674 if (dfa
->nodes
[node
].constraint
1675 && dfa
->edests
[node
].nelem
1676 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1678 err
= duplicate_node_closure (dfa
, node
, node
, node
,
1679 dfa
->nodes
[node
].constraint
);
1680 if (__glibc_unlikely (err
!= REG_NOERROR
))
1684 /* Expand each epsilon destination nodes. */
1685 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1686 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1688 re_node_set eclosure_elem
;
1689 Idx edest
= dfa
->edests
[node
].elems
[i
];
1690 /* If calculating the epsilon closure of 'edest' is in progress,
1691 return intermediate result. */
1692 if (dfa
->eclosures
[edest
].nelem
== -1)
1697 /* If we haven't calculated the epsilon closure of 'edest' yet,
1698 calculate now. Otherwise use calculated epsilon closure. */
1699 if (dfa
->eclosures
[edest
].nelem
== 0)
1701 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, false);
1702 if (__glibc_unlikely (err
!= REG_NOERROR
))
1706 eclosure_elem
= dfa
->eclosures
[edest
];
1707 /* Merge the epsilon closure of 'edest'. */
1708 err
= re_node_set_merge (&eclosure
, &eclosure_elem
);
1709 if (__glibc_unlikely (err
!= REG_NOERROR
))
1711 /* If the epsilon closure of 'edest' is incomplete,
1712 the epsilon closure of this node is also incomplete. */
1713 if (dfa
->eclosures
[edest
].nelem
== 0)
1716 re_node_set_free (&eclosure_elem
);
1720 if (incomplete
&& !root
)
1721 dfa
->eclosures
[node
].nelem
= 0;
1723 dfa
->eclosures
[node
] = eclosure
;
1724 *new_set
= eclosure
;
1728 /* Functions for token which are used in the parser. */
1730 /* Fetch a token from INPUT.
1731 We must not use this function inside bracket expressions. */
1734 fetch_token (re_token_t
*result
, re_string_t
*input
, reg_syntax_t syntax
)
1736 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1739 /* Peek a token from INPUT, and return the length of the token.
1740 We must not use this function inside bracket expressions. */
1743 peek_token (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1747 if (re_string_eoi (input
))
1749 token
->type
= END_OF_RE
;
1753 c
= re_string_peek_byte (input
, 0);
1756 token
->word_char
= 0;
1757 token
->mb_partial
= 0;
1758 if (input
->mb_cur_max
> 1
1759 && !re_string_first_byte (input
, re_string_cur_idx (input
)))
1761 token
->type
= CHARACTER
;
1762 token
->mb_partial
= 1;
1768 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1770 token
->type
= BACK_SLASH
;
1774 c2
= re_string_peek_byte_case (input
, 1);
1776 token
->type
= CHARACTER
;
1777 if (input
->mb_cur_max
> 1)
1779 wint_t wc
= re_string_wchar_at (input
,
1780 re_string_cur_idx (input
) + 1);
1781 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1784 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1789 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1790 token
->type
= OP_ALT
;
1792 case '1': case '2': case '3': case '4': case '5':
1793 case '6': case '7': case '8': case '9':
1794 if (!(syntax
& RE_NO_BK_REFS
))
1796 token
->type
= OP_BACK_REF
;
1797 token
->opr
.idx
= c2
- '1';
1801 if (!(syntax
& RE_NO_GNU_OPS
))
1803 token
->type
= ANCHOR
;
1804 token
->opr
.ctx_type
= WORD_FIRST
;
1808 if (!(syntax
& RE_NO_GNU_OPS
))
1810 token
->type
= ANCHOR
;
1811 token
->opr
.ctx_type
= WORD_LAST
;
1815 if (!(syntax
& RE_NO_GNU_OPS
))
1817 token
->type
= ANCHOR
;
1818 token
->opr
.ctx_type
= WORD_DELIM
;
1822 if (!(syntax
& RE_NO_GNU_OPS
))
1824 token
->type
= ANCHOR
;
1825 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1829 if (!(syntax
& RE_NO_GNU_OPS
))
1830 token
->type
= OP_WORD
;
1833 if (!(syntax
& RE_NO_GNU_OPS
))
1834 token
->type
= OP_NOTWORD
;
1837 if (!(syntax
& RE_NO_GNU_OPS
))
1838 token
->type
= OP_SPACE
;
1841 if (!(syntax
& RE_NO_GNU_OPS
))
1842 token
->type
= OP_NOTSPACE
;
1845 if (!(syntax
& RE_NO_GNU_OPS
))
1847 token
->type
= ANCHOR
;
1848 token
->opr
.ctx_type
= BUF_FIRST
;
1852 if (!(syntax
& RE_NO_GNU_OPS
))
1854 token
->type
= ANCHOR
;
1855 token
->opr
.ctx_type
= BUF_LAST
;
1859 if (!(syntax
& RE_NO_BK_PARENS
))
1860 token
->type
= OP_OPEN_SUBEXP
;
1863 if (!(syntax
& RE_NO_BK_PARENS
))
1864 token
->type
= OP_CLOSE_SUBEXP
;
1867 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1868 token
->type
= OP_DUP_PLUS
;
1871 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1872 token
->type
= OP_DUP_QUESTION
;
1875 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1876 token
->type
= OP_OPEN_DUP_NUM
;
1879 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1880 token
->type
= OP_CLOSE_DUP_NUM
;
1888 token
->type
= CHARACTER
;
1889 if (input
->mb_cur_max
> 1)
1891 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1892 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1895 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1900 if (syntax
& RE_NEWLINE_ALT
)
1901 token
->type
= OP_ALT
;
1904 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1905 token
->type
= OP_ALT
;
1908 token
->type
= OP_DUP_ASTERISK
;
1911 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1912 token
->type
= OP_DUP_PLUS
;
1915 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1916 token
->type
= OP_DUP_QUESTION
;
1919 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1920 token
->type
= OP_OPEN_DUP_NUM
;
1923 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1924 token
->type
= OP_CLOSE_DUP_NUM
;
1927 if (syntax
& RE_NO_BK_PARENS
)
1928 token
->type
= OP_OPEN_SUBEXP
;
1931 if (syntax
& RE_NO_BK_PARENS
)
1932 token
->type
= OP_CLOSE_SUBEXP
;
1935 token
->type
= OP_OPEN_BRACKET
;
1938 token
->type
= OP_PERIOD
;
1941 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
))
1942 && re_string_cur_idx (input
) != 0)
1944 char prev
= re_string_peek_byte (input
, -1);
1945 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1948 token
->type
= ANCHOR
;
1949 token
->opr
.ctx_type
= LINE_FIRST
;
1952 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
)
1953 && re_string_cur_idx (input
) + 1 != re_string_length (input
))
1956 re_string_skip_bytes (input
, 1);
1957 peek_token (&next
, input
, syntax
);
1958 re_string_skip_bytes (input
, -1);
1959 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1962 token
->type
= ANCHOR
;
1963 token
->opr
.ctx_type
= LINE_LAST
;
1971 /* Peek a token from INPUT, and return the length of the token.
1972 We must not use this function out of bracket expressions. */
1975 peek_token_bracket (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1978 if (re_string_eoi (input
))
1980 token
->type
= END_OF_RE
;
1983 c
= re_string_peek_byte (input
, 0);
1986 if (input
->mb_cur_max
> 1
1987 && !re_string_first_byte (input
, re_string_cur_idx (input
)))
1989 token
->type
= CHARACTER
;
1993 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
1994 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
1996 /* In this case, '\' escape a character. */
1998 re_string_skip_bytes (input
, 1);
1999 c2
= re_string_peek_byte (input
, 0);
2001 token
->type
= CHARACTER
;
2004 if (c
== '[') /* '[' is a special char in a bracket exps. */
2008 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2009 c2
= re_string_peek_byte (input
, 1);
2017 token
->type
= OP_OPEN_COLL_ELEM
;
2021 token
->type
= OP_OPEN_EQUIV_CLASS
;
2025 if (syntax
& RE_CHAR_CLASSES
)
2027 token
->type
= OP_OPEN_CHAR_CLASS
;
2032 token
->type
= CHARACTER
;
2042 token
->type
= OP_CLOSE_BRACKET
;
2045 token
->type
= OP_NON_MATCH_LIST
;
2048 /* In V7 Unix grep and Unix awk and mawk, [...---...]
2049 (3 adjacent minus signs) stands for a single minus sign.
2050 Support that without breaking anything else. */
2051 if (! (re_string_cur_idx (input
) + 2 < re_string_length (input
)
2052 && re_string_peek_byte (input
, 1) == '-'
2053 && re_string_peek_byte (input
, 2) == '-'))
2055 token
->type
= OP_CHARSET_RANGE
;
2058 re_string_skip_bytes (input
, 2);
2061 token
->type
= CHARACTER
;
2066 /* Functions for parser. */
2068 /* Entry point of the parser.
2069 Parse the regular expression REGEXP and return the structure tree.
2070 If an error occurs, ERR is set by error code, and return NULL.
2071 This function build the following tree, from regular expression <reg_exp>:
2077 CAT means concatenation.
2078 EOR means end of regular expression. */
2081 parse (re_string_t
*regexp
, regex_t
*preg
, reg_syntax_t syntax
,
2084 re_dfa_t
*dfa
= preg
->buffer
;
2085 bin_tree_t
*tree
, *eor
, *root
;
2086 re_token_t current_token
;
2087 dfa
->syntax
= syntax
;
2088 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2089 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2090 if (__glibc_unlikely (*err
!= REG_NOERROR
&& tree
== NULL
))
2092 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2094 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2097 if (__glibc_unlikely (eor
== NULL
|| root
== NULL
))
2105 /* This function build the following tree, from regular expression
2106 <branch1>|<branch2>:
2112 ALT means alternative, which represents the operator '|'. */
2115 parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2116 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2118 re_dfa_t
*dfa
= preg
->buffer
;
2119 bin_tree_t
*tree
, *branch
= NULL
;
2120 bitset_word_t initial_bkref_map
= dfa
->completed_bkref_map
;
2121 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2122 if (__glibc_unlikely (*err
!= REG_NOERROR
&& tree
== NULL
))
2125 while (token
->type
== OP_ALT
)
2127 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2128 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2129 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2131 bitset_word_t accumulated_bkref_map
= dfa
->completed_bkref_map
;
2132 dfa
->completed_bkref_map
= initial_bkref_map
;
2133 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2134 if (__glibc_unlikely (*err
!= REG_NOERROR
&& branch
== NULL
))
2137 postorder (tree
, free_tree
, NULL
);
2140 dfa
->completed_bkref_map
|= accumulated_bkref_map
;
2144 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2145 if (__glibc_unlikely (tree
== NULL
))
2154 /* This function build the following tree, from regular expression
2161 CAT means concatenation. */
2164 parse_branch (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2165 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2167 bin_tree_t
*tree
, *expr
;
2168 re_dfa_t
*dfa
= preg
->buffer
;
2169 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2170 if (__glibc_unlikely (*err
!= REG_NOERROR
&& tree
== NULL
))
2173 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2174 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2176 expr
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2177 if (__glibc_unlikely (*err
!= REG_NOERROR
&& expr
== NULL
))
2180 postorder (tree
, free_tree
, NULL
);
2183 if (tree
!= NULL
&& expr
!= NULL
)
2185 bin_tree_t
*newtree
= create_tree (dfa
, tree
, expr
, CONCAT
);
2186 if (newtree
== NULL
)
2188 postorder (expr
, free_tree
, NULL
);
2189 postorder (tree
, free_tree
, NULL
);
2195 else if (tree
== NULL
)
2197 /* Otherwise expr == NULL, we don't need to create new tree. */
2202 /* This function build the following tree, from regular expression a*:
2209 parse_expression (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2210 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2212 re_dfa_t
*dfa
= preg
->buffer
;
2214 switch (token
->type
)
2217 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2218 if (__glibc_unlikely (tree
== NULL
))
2223 if (dfa
->mb_cur_max
> 1)
2225 while (!re_string_eoi (regexp
)
2226 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2228 bin_tree_t
*mbc_remain
;
2229 fetch_token (token
, regexp
, syntax
);
2230 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2231 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2232 if (__glibc_unlikely (mbc_remain
== NULL
|| tree
== NULL
))
2241 case OP_OPEN_SUBEXP
:
2242 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2243 if (__glibc_unlikely (*err
!= REG_NOERROR
&& tree
== NULL
))
2247 case OP_OPEN_BRACKET
:
2248 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2249 if (__glibc_unlikely (*err
!= REG_NOERROR
&& tree
== NULL
))
2254 if (!__glibc_likely (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
)))
2259 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2260 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2261 if (__glibc_unlikely (tree
== NULL
))
2267 dfa
->has_mb_node
= 1;
2270 case OP_OPEN_DUP_NUM
:
2271 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2277 case OP_DUP_ASTERISK
:
2279 case OP_DUP_QUESTION
:
2280 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2285 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2287 fetch_token (token
, regexp
, syntax
);
2288 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2291 case OP_CLOSE_SUBEXP
:
2292 if ((token
->type
== OP_CLOSE_SUBEXP
)
2293 && !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2299 case OP_CLOSE_DUP_NUM
:
2300 /* We treat it as a normal character. */
2302 /* Then we can these characters as normal characters. */
2303 token
->type
= CHARACTER
;
2304 /* mb_partial and word_char bits should be initialized already
2306 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2307 if (__glibc_unlikely (tree
== NULL
))
2315 if ((token
->opr
.ctx_type
2316 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2317 && dfa
->word_ops_used
== 0)
2318 init_word_char (dfa
);
2319 if (token
->opr
.ctx_type
== WORD_DELIM
2320 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2322 bin_tree_t
*tree_first
, *tree_last
;
2323 if (token
->opr
.ctx_type
== WORD_DELIM
)
2325 token
->opr
.ctx_type
= WORD_FIRST
;
2326 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2327 token
->opr
.ctx_type
= WORD_LAST
;
2331 token
->opr
.ctx_type
= INSIDE_WORD
;
2332 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2333 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2335 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2336 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2337 if (__glibc_unlikely (tree_first
== NULL
|| tree_last
== NULL
2346 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2347 if (__glibc_unlikely (tree
== NULL
))
2353 /* We must return here, since ANCHORs can't be followed
2354 by repetition operators.
2355 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2356 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2357 fetch_token (token
, regexp
, syntax
);
2361 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2362 if (__glibc_unlikely (tree
== NULL
))
2367 if (dfa
->mb_cur_max
> 1)
2368 dfa
->has_mb_node
= 1;
2373 tree
= build_charclass_op (dfa
, regexp
->trans
,
2376 token
->type
== OP_NOTWORD
, err
);
2377 if (__glibc_unlikely (*err
!= REG_NOERROR
&& tree
== NULL
))
2383 tree
= build_charclass_op (dfa
, regexp
->trans
,
2386 token
->type
== OP_NOTSPACE
, err
);
2387 if (__glibc_unlikely (*err
!= REG_NOERROR
&& tree
== NULL
))
2400 /* Must not happen? */
2401 DEBUG_ASSERT (false);
2404 fetch_token (token
, regexp
, syntax
);
2406 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2407 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2409 bin_tree_t
*dup_tree
= parse_dup_op (tree
, regexp
, dfa
, token
,
2411 if (__glibc_unlikely (*err
!= REG_NOERROR
&& dup_tree
== NULL
))
2414 postorder (tree
, free_tree
, NULL
);
2418 /* In BRE consecutive duplications are not allowed. */
2419 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2420 && (token
->type
== OP_DUP_ASTERISK
2421 || token
->type
== OP_OPEN_DUP_NUM
))
2424 postorder (tree
, free_tree
, NULL
);
2433 /* This function build the following tree, from regular expression
2441 parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2442 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2444 re_dfa_t
*dfa
= preg
->buffer
;
2447 cur_nsub
= preg
->re_nsub
++;
2449 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2451 /* The subexpression may be a null string. */
2452 if (token
->type
== OP_CLOSE_SUBEXP
)
2456 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2457 if (__glibc_unlikely (*err
== REG_NOERROR
2458 && token
->type
!= OP_CLOSE_SUBEXP
))
2461 postorder (tree
, free_tree
, NULL
);
2464 if (__glibc_unlikely (*err
!= REG_NOERROR
))
2468 if (cur_nsub
<= '9' - '1')
2469 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2471 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2472 if (__glibc_unlikely (tree
== NULL
))
2477 tree
->token
.opr
.idx
= cur_nsub
;
2481 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2484 parse_dup_op (bin_tree_t
*elem
, re_string_t
*regexp
, re_dfa_t
*dfa
,
2485 re_token_t
*token
, reg_syntax_t syntax
, reg_errcode_t
*err
)
2487 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2488 Idx i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2489 re_token_t start_token
= *token
;
2491 if (token
->type
== OP_OPEN_DUP_NUM
)
2494 start
= fetch_number (regexp
, token
, syntax
);
2497 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2498 start
= 0; /* We treat "{,m}" as "{0,m}". */
2501 *err
= REG_BADBR
; /* <re>{} is invalid. */
2505 if (__glibc_likely (start
!= -2))
2507 /* We treat "{n}" as "{n,n}". */
2508 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2509 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2510 ? fetch_number (regexp
, token
, syntax
) : -2));
2512 if (__glibc_unlikely (start
== -2 || end
== -2))
2514 /* Invalid sequence. */
2515 if (__glibc_unlikely (!(syntax
& RE_INVALID_INTERVAL_ORD
)))
2517 if (token
->type
== END_OF_RE
)
2525 /* If the syntax bit is set, rollback. */
2526 re_string_set_index (regexp
, start_idx
);
2527 *token
= start_token
;
2528 token
->type
= CHARACTER
;
2529 /* mb_partial and word_char bits should be already initialized by
2534 if (__glibc_unlikely ((end
!= -1 && start
> end
)
2535 || token
->type
!= OP_CLOSE_DUP_NUM
))
2537 /* First number greater than second. */
2542 if (__glibc_unlikely (RE_DUP_MAX
< (end
== -1 ? start
: end
)))
2550 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2551 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2554 fetch_token (token
, regexp
, syntax
);
2556 if (__glibc_unlikely (elem
== NULL
))
2558 if (__glibc_unlikely (start
== 0 && end
== 0))
2560 postorder (elem
, free_tree
, NULL
);
2564 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2565 if (__glibc_unlikely (start
> 0))
2568 for (i
= 2; i
<= start
; ++i
)
2570 elem
= duplicate_tree (elem
, dfa
);
2571 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2572 if (__glibc_unlikely (elem
== NULL
|| tree
== NULL
))
2573 goto parse_dup_op_espace
;
2579 /* Duplicate ELEM before it is marked optional. */
2580 elem
= duplicate_tree (elem
, dfa
);
2581 if (__glibc_unlikely (elem
== NULL
))
2582 goto parse_dup_op_espace
;
2588 if (elem
->token
.type
== SUBEXP
)
2590 uintptr_t subidx
= elem
->token
.opr
.idx
;
2591 postorder (elem
, mark_opt_subexp
, (void *) subidx
);
2594 tree
= create_tree (dfa
, elem
, NULL
,
2595 (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2596 if (__glibc_unlikely (tree
== NULL
))
2597 goto parse_dup_op_espace
;
2599 /* This loop is actually executed only when end != -1,
2600 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2601 already created the start+1-th copy. */
2602 if (TYPE_SIGNED (Idx
) || end
!= -1)
2603 for (i
= start
+ 2; i
<= end
; ++i
)
2605 elem
= duplicate_tree (elem
, dfa
);
2606 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2607 if (__glibc_unlikely (elem
== NULL
|| tree
== NULL
))
2608 goto parse_dup_op_espace
;
2610 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2611 if (__glibc_unlikely (tree
== NULL
))
2612 goto parse_dup_op_espace
;
2616 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2620 parse_dup_op_espace
:
2625 /* Size of the names for collating symbol/equivalence_class/character_class.
2626 I'm not sure, but maybe enough. */
2627 #define BRACKET_NAME_BUF_SIZE 32
2631 /* Convert the byte B to the corresponding wide character. In a
2632 unibyte locale, treat B as itself. In a multibyte locale, return
2633 WEOF if B is an encoding error. */
2635 parse_byte (unsigned char b
, re_dfa_t
const *dfa
)
2637 return dfa
->mb_cur_max
> 1 ? __btowc (b
) : b
;
2640 /* Local function for parse_bracket_exp used in _LIBC environment.
2641 Build the range expression which starts from START_ELEM, and ends
2642 at END_ELEM. The result are written to MBCSET and SBCSET.
2643 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2644 mbcset->range_ends, is a pointer argument since we may
2647 static reg_errcode_t
2648 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, Idx
*range_alloc
,
2649 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
,
2650 re_dfa_t
*dfa
, reg_syntax_t syntax
, uint_fast32_t nrules
,
2651 const unsigned char *collseqmb
, const char *collseqwc
,
2652 int_fast32_t table_size
, const void *symb_table
,
2653 const unsigned char *extra
)
2655 /* Equivalence Classes and Character Classes can't be a range start/end. */
2656 if (__glibc_unlikely (start_elem
->type
== EQUIV_CLASS
2657 || start_elem
->type
== CHAR_CLASS
2658 || end_elem
->type
== EQUIV_CLASS
2659 || end_elem
->type
== CHAR_CLASS
))
2662 /* We can handle no multi character collating elements without libc
2664 if (__glibc_unlikely ((start_elem
->type
== COLL_SYM
2665 && strlen ((char *) start_elem
->opr
.name
) > 1)
2666 || (end_elem
->type
== COLL_SYM
2667 && strlen ((char *) end_elem
->opr
.name
) > 1)))
2668 return REG_ECOLLATE
;
2671 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2672 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2674 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2675 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2678 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2679 ? parse_byte (start_ch
, dfa
) : start_elem
->opr
.wch
),
2680 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2681 ? parse_byte (end_ch
, dfa
) : end_elem
->opr
.wch
);
2683 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2684 return REG_ECOLLATE
;
2685 else if (__glibc_unlikely ((syntax
& RE_NO_EMPTY_RANGES
)
2686 && start_wc
> end_wc
))
2689 /* Got valid collation sequence values, add them as a new entry.
2690 However, for !_LIBC we have no collation elements: if the
2691 character set is single byte, the single byte character set
2692 that we build below suffices. parse_bracket_exp passes
2693 no MBCSET if dfa->mb_cur_max == 1. */
2694 if (dfa
->mb_cur_max
> 1)
2696 /* Check the space of the arrays. */
2697 if (__glibc_unlikely (*range_alloc
== mbcset
->nranges
))
2699 /* There is not enough space, need realloc. */
2700 wchar_t *new_array_start
, *new_array_end
;
2703 /* +1 in case of mbcset->nranges is 0. */
2704 new_nranges
= 2 * mbcset
->nranges
+ 1;
2705 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2706 are NULL if *range_alloc == 0. */
2707 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2709 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2712 if (__glibc_unlikely (new_array_start
== NULL
2713 || new_array_end
== NULL
))
2715 re_free (new_array_start
);
2716 re_free (new_array_end
);
2720 mbcset
->range_starts
= new_array_start
;
2721 mbcset
->range_ends
= new_array_end
;
2722 *range_alloc
= new_nranges
;
2725 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2726 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2729 /* Build the table for single byte characters. */
2730 for (wchar_t wc
= 0; wc
< SBC_MAX
; ++wc
)
2732 if (start_wc
<= wc
&& wc
<= end_wc
)
2733 bitset_set (sbcset
, wc
);
2738 #endif /* not _LIBC */
2741 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC.
2742 Build the collating element which is represented by NAME.
2743 The result are written to MBCSET and SBCSET.
2744 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2745 pointer argument since we may update it. */
2747 static reg_errcode_t
2748 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2749 Idx
*coll_sym_alloc
, const unsigned char *name
,
2750 uint_fast32_t nrules
, int_fast32_t table_size
,
2751 const void *symb_table
, const unsigned char *extra
)
2753 size_t name_len
= strlen ((const char *) name
);
2754 if (__glibc_unlikely (name_len
!= 1))
2755 return REG_ECOLLATE
;
2758 bitset_set (sbcset
, name
[0]);
2762 #endif /* not _LIBC */
2765 /* Local function for parse_bracket_exp used in _LIBC environment.
2766 Seek the collating symbol entry corresponding to NAME.
2767 Return the index of the symbol in the SYMB_TABLE,
2768 or -1 if not found. */
2770 static __always_inline
int32_t
2771 seek_collating_symbol_entry (const unsigned char *name
, size_t name_len
,
2772 const int32_t *symb_table
,
2773 int_fast32_t table_size
,
2774 const unsigned char *extra
)
2778 for (elem
= 0; elem
< table_size
; elem
++)
2779 if (symb_table
[2 * elem
] != 0)
2781 int32_t idx
= symb_table
[2 * elem
+ 1];
2782 /* Skip the name of collating element name. */
2783 idx
+= 1 + extra
[idx
];
2784 if (/* Compare the length of the name. */
2785 name_len
== extra
[idx
]
2786 /* Compare the name. */
2787 && memcmp (name
, &extra
[idx
+ 1], name_len
) == 0)
2788 /* Yep, this is the entry. */
2794 /* Local function for parse_bracket_exp used in _LIBC environment.
2795 Look up the collation sequence value of BR_ELEM.
2796 Return the value if succeeded, UINT_MAX otherwise. */
2798 static __always_inline
unsigned int
2799 lookup_collation_sequence_value (bracket_elem_t
*br_elem
, uint32_t nrules
,
2800 const unsigned char *collseqmb
,
2801 const char *collseqwc
,
2802 int_fast32_t table_size
,
2803 const int32_t *symb_table
,
2804 const unsigned char *extra
)
2806 if (br_elem
->type
== SB_CHAR
)
2808 /* if (MB_CUR_MAX == 1) */
2810 return collseqmb
[br_elem
->opr
.ch
];
2813 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2814 return __collseq_table_lookup (collseqwc
, wc
);
2817 else if (br_elem
->type
== MB_CHAR
)
2820 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2822 else if (br_elem
->type
== COLL_SYM
)
2824 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2828 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2830 symb_table
, table_size
,
2834 /* We found the entry. */
2835 idx
= symb_table
[2 * elem
+ 1];
2836 /* Skip the name of collating element name. */
2837 idx
+= 1 + extra
[idx
];
2838 /* Skip the byte sequence of the collating element. */
2839 idx
+= 1 + extra
[idx
];
2840 /* Adjust for the alignment. */
2841 idx
= (idx
+ 3) & ~3;
2842 /* Skip the multibyte collation sequence value. */
2843 idx
+= sizeof (unsigned int);
2844 /* Skip the wide char sequence of the collating element. */
2845 idx
+= sizeof (unsigned int) *
2846 (1 + *(unsigned int *) (extra
+ idx
));
2847 /* Return the collation sequence value. */
2848 return *(unsigned int *) (extra
+ idx
);
2850 else if (sym_name_len
== 1)
2852 /* No valid character. Match it as a single byte
2854 return collseqmb
[br_elem
->opr
.name
[0]];
2857 else if (sym_name_len
== 1)
2858 return collseqmb
[br_elem
->opr
.name
[0]];
2863 /* Local function for parse_bracket_exp used in _LIBC environment.
2864 Build the range expression which starts from START_ELEM, and ends
2865 at END_ELEM. The result are written to MBCSET and SBCSET.
2866 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2867 mbcset->range_ends, is a pointer argument since we may
2870 static __always_inline reg_errcode_t
2871 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, Idx
*range_alloc
,
2872 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
,
2873 re_dfa_t
*dfa
, reg_syntax_t syntax
, uint32_t nrules
,
2874 const unsigned char *collseqmb
, const char *collseqwc
,
2875 int_fast32_t table_size
, const int32_t *symb_table
,
2876 const unsigned char *extra
)
2879 uint32_t start_collseq
;
2880 uint32_t end_collseq
;
2882 /* Equivalence Classes and Character Classes can't be a range
2884 if (__glibc_unlikely (start_elem
->type
== EQUIV_CLASS
2885 || start_elem
->type
== CHAR_CLASS
2886 || end_elem
->type
== EQUIV_CLASS
2887 || end_elem
->type
== CHAR_CLASS
))
2890 /* FIXME: Implement rational ranges here, too. */
2891 start_collseq
= lookup_collation_sequence_value (start_elem
, nrules
, collseqmb
, collseqwc
,
2892 table_size
, symb_table
, extra
);
2893 end_collseq
= lookup_collation_sequence_value (end_elem
, nrules
, collseqmb
, collseqwc
,
2894 table_size
, symb_table
, extra
);
2895 /* Check start/end collation sequence values. */
2896 if (__glibc_unlikely (start_collseq
== UINT_MAX
2897 || end_collseq
== UINT_MAX
))
2898 return REG_ECOLLATE
;
2899 if (__glibc_unlikely ((syntax
& RE_NO_EMPTY_RANGES
)
2900 && start_collseq
> end_collseq
))
2903 /* Got valid collation sequence values, add them as a new entry.
2904 However, if we have no collation elements, and the character set
2905 is single byte, the single byte character set that we
2906 build below suffices. */
2907 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2909 /* Check the space of the arrays. */
2910 if (__glibc_unlikely (*range_alloc
== mbcset
->nranges
))
2912 /* There is not enough space, need realloc. */
2913 uint32_t *new_array_start
;
2914 uint32_t *new_array_end
;
2917 /* +1 in case of mbcset->nranges is 0. */
2918 new_nranges
= 2 * mbcset
->nranges
+ 1;
2919 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2921 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2924 if (__glibc_unlikely (new_array_start
== NULL
2925 || new_array_end
== NULL
))
2928 mbcset
->range_starts
= new_array_start
;
2929 mbcset
->range_ends
= new_array_end
;
2930 *range_alloc
= new_nranges
;
2933 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2934 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2937 /* Build the table for single byte characters. */
2938 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2940 uint32_t ch_collseq
;
2941 /* if (MB_CUR_MAX == 1) */
2943 ch_collseq
= collseqmb
[ch
];
2945 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2946 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2947 bitset_set (sbcset
, ch
);
2952 /* Local function for parse_bracket_exp used in _LIBC environment.
2953 Build the collating element which is represented by NAME.
2954 The result are written to MBCSET and SBCSET.
2955 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2956 pointer argument since we may update it. */
2958 static __always_inline reg_errcode_t
2959 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2960 Idx
*coll_sym_alloc
, const unsigned char *name
,
2961 uint_fast32_t nrules
, int_fast32_t table_size
,
2962 const int32_t *symb_table
, const unsigned char *extra
)
2965 size_t name_len
= strlen ((const char *) name
);
2968 elem
= seek_collating_symbol_entry (name
, name_len
, symb_table
,
2972 /* We found the entry. */
2973 idx
= symb_table
[2 * elem
+ 1];
2974 /* Skip the name of collating element name. */
2975 idx
+= 1 + extra
[idx
];
2977 else if (name_len
== 1)
2979 /* No valid character, treat it as a normal
2981 bitset_set (sbcset
, name
[0]);
2985 return REG_ECOLLATE
;
2987 /* Got valid collation sequence, add it as a new entry. */
2988 /* Check the space of the arrays. */
2989 if (__glibc_unlikely (*coll_sym_alloc
== mbcset
->ncoll_syms
))
2991 /* Not enough, realloc it. */
2992 /* +1 in case of mbcset->ncoll_syms is 0. */
2993 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
2994 /* Use realloc since mbcset->coll_syms is NULL
2996 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
2997 new_coll_sym_alloc
);
2998 if (__glibc_unlikely (new_coll_syms
== NULL
))
3000 mbcset
->coll_syms
= new_coll_syms
;
3001 *coll_sym_alloc
= new_coll_sym_alloc
;
3003 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3008 if (__glibc_unlikely (name_len
!= 1))
3009 return REG_ECOLLATE
;
3012 bitset_set (sbcset
, name
[0]);
3019 /* This function parse bracket expression like "[abc]", "[a-c]",
3023 parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
, re_token_t
*token
,
3024 reg_syntax_t syntax
, reg_errcode_t
*err
)
3026 const unsigned char *collseqmb
= NULL
;
3027 const char *collseqwc
= NULL
;
3028 uint_fast32_t nrules
= 0;
3029 int_fast32_t table_size
= 0;
3030 const void *symb_table
= NULL
;
3031 const unsigned char *extra
= NULL
;
3033 re_token_t br_token
;
3034 re_bitset_ptr_t sbcset
;
3035 re_charset_t
*mbcset
;
3036 Idx coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3037 Idx equiv_class_alloc
= 0, char_class_alloc
= 0;
3038 bool non_match
= false;
3039 bin_tree_t
*work_tree
;
3041 bool first_round
= true;
3043 collseqmb
= (const unsigned char *)
3044 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3045 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3051 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3052 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3053 symb_table
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_TABLEMB
);
3054 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3055 _NL_COLLATE_SYMB_EXTRAMB
);
3058 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3059 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3060 if (__glibc_unlikely (sbcset
== NULL
|| mbcset
== NULL
))
3068 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3069 if (__glibc_unlikely (token
->type
== END_OF_RE
))
3072 goto parse_bracket_exp_free_return
;
3074 if (token
->type
== OP_NON_MATCH_LIST
)
3076 mbcset
->non_match
= 1;
3078 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3079 bitset_set (sbcset
, '\n');
3080 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3081 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3082 if (__glibc_unlikely (token
->type
== END_OF_RE
))
3085 goto parse_bracket_exp_free_return
;
3089 /* We treat the first ']' as a normal character. */
3090 if (token
->type
== OP_CLOSE_BRACKET
)
3091 token
->type
= CHARACTER
;
3095 bracket_elem_t start_elem
, end_elem
;
3096 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3097 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3100 bool is_range_exp
= false;
3103 start_elem
.opr
.name
= start_name_buf
;
3104 start_elem
.type
= COLL_SYM
;
3105 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3106 syntax
, first_round
);
3107 if (__glibc_unlikely (ret
!= REG_NOERROR
))
3110 goto parse_bracket_exp_free_return
;
3112 first_round
= false;
3114 /* Get information about the next token. We need it in any case. */
3115 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3117 /* Do not check for ranges if we know they are not allowed. */
3118 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3120 if (__glibc_unlikely (token
->type
== END_OF_RE
))
3123 goto parse_bracket_exp_free_return
;
3125 if (token
->type
== OP_CHARSET_RANGE
)
3127 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3128 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3129 if (__glibc_unlikely (token2
.type
== END_OF_RE
))
3132 goto parse_bracket_exp_free_return
;
3134 if (token2
.type
== OP_CLOSE_BRACKET
)
3136 /* We treat the last '-' as a normal character. */
3137 re_string_skip_bytes (regexp
, -token_len
);
3138 token
->type
= CHARACTER
;
3141 is_range_exp
= true;
3145 if (is_range_exp
== true)
3147 end_elem
.opr
.name
= end_name_buf
;
3148 end_elem
.type
= COLL_SYM
;
3149 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3151 if (__glibc_unlikely (ret
!= REG_NOERROR
))
3154 goto parse_bracket_exp_free_return
;
3157 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3159 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3160 &start_elem
, &end_elem
,
3161 dfa
, syntax
, nrules
, collseqmb
, collseqwc
,
3162 table_size
, symb_table
, extra
);
3163 if (__glibc_unlikely (*err
!= REG_NOERROR
))
3164 goto parse_bracket_exp_free_return
;
3168 switch (start_elem
.type
)
3171 bitset_set (sbcset
, start_elem
.opr
.ch
);
3174 /* Check whether the array has enough space. */
3175 if (__glibc_unlikely (mbchar_alloc
== mbcset
->nmbchars
))
3177 wchar_t *new_mbchars
;
3178 /* Not enough, realloc it. */
3179 /* +1 in case of mbcset->nmbchars is 0. */
3180 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3181 /* Use realloc since array is NULL if *alloc == 0. */
3182 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3184 if (__glibc_unlikely (new_mbchars
== NULL
))
3185 goto parse_bracket_exp_espace
;
3186 mbcset
->mbchars
= new_mbchars
;
3188 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3191 *err
= build_equiv_class (sbcset
,
3192 mbcset
, &equiv_class_alloc
,
3193 start_elem
.opr
.name
);
3194 if (__glibc_unlikely (*err
!= REG_NOERROR
))
3195 goto parse_bracket_exp_free_return
;
3198 *err
= build_collating_symbol (sbcset
,
3199 mbcset
, &coll_sym_alloc
,
3200 start_elem
.opr
.name
,
3201 nrules
, table_size
, symb_table
, extra
);
3202 if (__glibc_unlikely (*err
!= REG_NOERROR
))
3203 goto parse_bracket_exp_free_return
;
3206 *err
= build_charclass (regexp
->trans
, sbcset
,
3207 mbcset
, &char_class_alloc
,
3208 (const char *) start_elem
.opr
.name
,
3210 if (__glibc_unlikely (*err
!= REG_NOERROR
))
3211 goto parse_bracket_exp_free_return
;
3214 DEBUG_ASSERT (false);
3218 if (__glibc_unlikely (token
->type
== END_OF_RE
))
3221 goto parse_bracket_exp_free_return
;
3223 if (token
->type
== OP_CLOSE_BRACKET
)
3227 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3229 /* If it is non-matching list. */
3231 bitset_not (sbcset
);
3233 /* Ensure only single byte characters are set. */
3234 if (dfa
->mb_cur_max
> 1)
3235 bitset_mask (sbcset
, dfa
->sb_char
);
3237 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3238 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3239 || mbcset
->non_match
)))
3241 bin_tree_t
*mbc_tree
;
3243 /* Build a tree for complex bracket. */
3244 dfa
->has_mb_node
= 1;
3245 br_token
.type
= COMPLEX_BRACKET
;
3246 br_token
.opr
.mbcset
= mbcset
;
3247 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3248 if (__glibc_unlikely (mbc_tree
== NULL
))
3249 goto parse_bracket_exp_espace
;
3250 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3251 if (sbcset
[sbc_idx
])
3253 /* If there are no bits set in sbcset, there is no point
3254 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3255 if (sbc_idx
< BITSET_WORDS
)
3257 /* Build a tree for simple bracket. */
3258 br_token
.type
= SIMPLE_BRACKET
;
3259 br_token
.opr
.sbcset
= sbcset
;
3260 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3261 if (__glibc_unlikely (work_tree
== NULL
))
3262 goto parse_bracket_exp_espace
;
3264 /* Then join them by ALT node. */
3265 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3266 if (__glibc_unlikely (work_tree
== NULL
))
3267 goto parse_bracket_exp_espace
;
3272 work_tree
= mbc_tree
;
3277 free_charset (mbcset
);
3278 /* Build a tree for simple bracket. */
3279 br_token
.type
= SIMPLE_BRACKET
;
3280 br_token
.opr
.sbcset
= sbcset
;
3281 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3282 if (__glibc_unlikely (work_tree
== NULL
))
3283 goto parse_bracket_exp_espace
;
3287 parse_bracket_exp_espace
:
3289 parse_bracket_exp_free_return
:
3291 free_charset (mbcset
);
3295 /* Parse an element in the bracket expression. */
3297 static reg_errcode_t
3298 parse_bracket_element (bracket_elem_t
*elem
, re_string_t
*regexp
,
3299 re_token_t
*token
, int token_len
, re_dfa_t
*dfa
,
3300 reg_syntax_t syntax
, bool accept_hyphen
)
3303 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3304 if (cur_char_size
> 1)
3306 elem
->type
= MB_CHAR
;
3307 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3308 re_string_skip_bytes (regexp
, cur_char_size
);
3311 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3312 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3313 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3314 return parse_bracket_symbol (elem
, regexp
, token
);
3315 if (__glibc_unlikely (token
->type
== OP_CHARSET_RANGE
) && !accept_hyphen
)
3317 /* A '-' must only appear as anything but a range indicator before
3318 the closing bracket. Everything else is an error. */
3320 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3321 if (token2
.type
!= OP_CLOSE_BRACKET
)
3322 /* The actual error value is not standardized since this whole
3323 case is undefined. But ERANGE makes good sense. */
3326 elem
->type
= SB_CHAR
;
3327 elem
->opr
.ch
= token
->opr
.c
;
3331 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3332 such as [:<character_class>:], [.<collating_element>.], and
3333 [=<equivalent_class>=]. */
3335 static reg_errcode_t
3336 parse_bracket_symbol (bracket_elem_t
*elem
, re_string_t
*regexp
,
3339 unsigned char ch
, delim
= token
->opr
.c
;
3341 if (re_string_eoi(regexp
))
3345 if (i
>= BRACKET_NAME_BUF_SIZE
)
3347 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3348 ch
= re_string_fetch_byte_case (regexp
);
3350 ch
= re_string_fetch_byte (regexp
);
3351 if (re_string_eoi(regexp
))
3353 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3355 elem
->opr
.name
[i
] = ch
;
3357 re_string_skip_bytes (regexp
, 1);
3358 elem
->opr
.name
[i
] = '\0';
3359 switch (token
->type
)
3361 case OP_OPEN_COLL_ELEM
:
3362 elem
->type
= COLL_SYM
;
3364 case OP_OPEN_EQUIV_CLASS
:
3365 elem
->type
= EQUIV_CLASS
;
3367 case OP_OPEN_CHAR_CLASS
:
3368 elem
->type
= CHAR_CLASS
;
3376 /* Helper function for parse_bracket_exp.
3377 Build the equivalence class which is represented by NAME.
3378 The result are written to MBCSET and SBCSET.
3379 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3380 is a pointer argument since we may update it. */
3382 static reg_errcode_t
3383 build_equiv_class (bitset_t sbcset
, re_charset_t
*mbcset
,
3384 Idx
*equiv_class_alloc
, const unsigned char *name
)
3387 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3390 const int32_t *table
, *indirect
;
3391 const unsigned char *weights
, *extra
, *cp
;
3392 unsigned char char_buf
[2];
3396 /* Calculate the index for equivalence class. */
3398 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3399 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3400 _NL_COLLATE_WEIGHTMB
);
3401 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3402 _NL_COLLATE_EXTRAMB
);
3403 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3404 _NL_COLLATE_INDIRECTMB
);
3405 idx1
= findidx (table
, indirect
, extra
, &cp
, -1);
3406 if (__glibc_unlikely (idx1
== 0 || *cp
!= '\0'))
3407 /* This isn't a valid character. */
3408 return REG_ECOLLATE
;
3410 /* Build single byte matching table for this equivalence class. */
3411 len
= weights
[idx1
& 0xffffff];
3412 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3416 idx2
= findidx (table
, indirect
, extra
, &cp
, 1);
3421 /* This isn't a valid character. */
3423 /* Compare only if the length matches and the collation rule
3424 index is the same. */
3425 if (len
== weights
[idx2
& 0xffffff] && (idx1
>> 24) == (idx2
>> 24)
3426 && memcmp (weights
+ (idx1
& 0xffffff) + 1,
3427 weights
+ (idx2
& 0xffffff) + 1, len
) == 0)
3428 bitset_set (sbcset
, ch
);
3430 /* Check whether the array has enough space. */
3431 if (__glibc_unlikely (*equiv_class_alloc
== mbcset
->nequiv_classes
))
3433 /* Not enough, realloc it. */
3434 /* +1 in case of mbcset->nequiv_classes is 0. */
3435 Idx new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3436 /* Use realloc since the array is NULL if *alloc == 0. */
3437 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3439 new_equiv_class_alloc
);
3440 if (__glibc_unlikely (new_equiv_classes
== NULL
))
3442 mbcset
->equiv_classes
= new_equiv_classes
;
3443 *equiv_class_alloc
= new_equiv_class_alloc
;
3445 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3450 if (__glibc_unlikely (strlen ((const char *) name
) != 1))
3451 return REG_ECOLLATE
;
3452 bitset_set (sbcset
, *name
);
3457 /* Helper function for parse_bracket_exp.
3458 Build the character class which is represented by NAME.
3459 The result are written to MBCSET and SBCSET.
3460 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3461 is a pointer argument since we may update it. */
3463 static reg_errcode_t
3464 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3465 re_charset_t
*mbcset
, Idx
*char_class_alloc
,
3466 const char *class_name
, reg_syntax_t syntax
)
3469 const char *name
= class_name
;
3471 /* In case of REG_ICASE "upper" and "lower" match the both of
3472 upper and lower cases. */
3473 if ((syntax
& RE_ICASE
)
3474 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3477 /* Check the space of the arrays. */
3478 if (__glibc_unlikely (*char_class_alloc
== mbcset
->nchar_classes
))
3480 /* Not enough, realloc it. */
3481 /* +1 in case of mbcset->nchar_classes is 0. */
3482 Idx new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3483 /* Use realloc since array is NULL if *alloc == 0. */
3484 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3485 new_char_class_alloc
);
3486 if (__glibc_unlikely (new_char_classes
== NULL
))
3488 mbcset
->char_classes
= new_char_classes
;
3489 *char_class_alloc
= new_char_class_alloc
;
3491 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3493 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3495 if (__glibc_unlikely (trans != NULL)) \
3497 for (i = 0; i < SBC_MAX; ++i) \
3498 if (ctype_func (i)) \
3499 bitset_set (sbcset, trans[i]); \
3503 for (i = 0; i < SBC_MAX; ++i) \
3504 if (ctype_func (i)) \
3505 bitset_set (sbcset, i); \
3509 if (strcmp (name
, "alnum") == 0)
3510 BUILD_CHARCLASS_LOOP (isalnum
);
3511 else if (strcmp (name
, "cntrl") == 0)
3512 BUILD_CHARCLASS_LOOP (iscntrl
);
3513 else if (strcmp (name
, "lower") == 0)
3514 BUILD_CHARCLASS_LOOP (islower
);
3515 else if (strcmp (name
, "space") == 0)
3516 BUILD_CHARCLASS_LOOP (isspace
);
3517 else if (strcmp (name
, "alpha") == 0)
3518 BUILD_CHARCLASS_LOOP (isalpha
);
3519 else if (strcmp (name
, "digit") == 0)
3520 BUILD_CHARCLASS_LOOP (isdigit
);
3521 else if (strcmp (name
, "print") == 0)
3522 BUILD_CHARCLASS_LOOP (isprint
);
3523 else if (strcmp (name
, "upper") == 0)
3524 BUILD_CHARCLASS_LOOP (isupper
);
3525 else if (strcmp (name
, "blank") == 0)
3526 BUILD_CHARCLASS_LOOP (isblank
);
3527 else if (strcmp (name
, "graph") == 0)
3528 BUILD_CHARCLASS_LOOP (isgraph
);
3529 else if (strcmp (name
, "punct") == 0)
3530 BUILD_CHARCLASS_LOOP (ispunct
);
3531 else if (strcmp (name
, "xdigit") == 0)
3532 BUILD_CHARCLASS_LOOP (isxdigit
);
3540 build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
3541 const char *class_name
,
3542 const char *extra
, bool non_match
,
3545 re_bitset_ptr_t sbcset
;
3546 re_charset_t
*mbcset
;
3551 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3552 if (__glibc_unlikely (sbcset
== NULL
))
3557 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3558 if (__glibc_unlikely (mbcset
== NULL
))
3564 mbcset
->non_match
= non_match
;
3566 /* We don't care the syntax in this case. */
3567 ret
= build_charclass (trans
, sbcset
, mbcset
, &alloc
, class_name
, 0);
3569 if (__glibc_unlikely (ret
!= REG_NOERROR
))
3572 free_charset (mbcset
);
3576 /* \w match '_' also. */
3577 for (; *extra
; extra
++)
3578 bitset_set (sbcset
, *extra
);
3580 /* If it is non-matching list. */
3582 bitset_not (sbcset
);
3584 /* Ensure only single byte characters are set. */
3585 if (dfa
->mb_cur_max
> 1)
3586 bitset_mask (sbcset
, dfa
->sb_char
);
3588 /* Build a tree for simple bracket. */
3589 re_token_t br_token
= { .type
= SIMPLE_BRACKET
, .opr
.sbcset
= sbcset
};
3590 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3591 if (__glibc_unlikely (tree
== NULL
))
3592 goto build_word_op_espace
;
3594 if (dfa
->mb_cur_max
> 1)
3596 bin_tree_t
*mbc_tree
;
3597 /* Build a tree for complex bracket. */
3598 br_token
.type
= COMPLEX_BRACKET
;
3599 br_token
.opr
.mbcset
= mbcset
;
3600 dfa
->has_mb_node
= 1;
3601 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3602 if (__glibc_unlikely (mbc_tree
== NULL
))
3603 goto build_word_op_espace
;
3604 /* Then join them by ALT node. */
3605 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3606 if (__glibc_likely (mbc_tree
!= NULL
))
3611 free_charset (mbcset
);
3615 build_word_op_espace
:
3617 free_charset (mbcset
);
3622 /* This is intended for the expressions like "a{1,3}".
3623 Fetch a number from 'input', and return the number.
3624 Return -1 if the number field is empty like "{,1}".
3625 Return RE_DUP_MAX + 1 if the number field is too large.
3626 Return -2 if an error occurred. */
3629 fetch_number (re_string_t
*input
, re_token_t
*token
, reg_syntax_t syntax
)
3635 fetch_token (token
, input
, syntax
);
3637 if (__glibc_unlikely (token
->type
== END_OF_RE
))
3639 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3641 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3645 : MIN (RE_DUP_MAX
+ 1, num
* 10 + c
- '0'));
3651 free_charset (re_charset_t
*cset
)
3653 re_free (cset
->mbchars
);
3655 re_free (cset
->coll_syms
);
3656 re_free (cset
->equiv_classes
);
3658 re_free (cset
->range_starts
);
3659 re_free (cset
->range_ends
);
3660 re_free (cset
->char_classes
);
3664 /* Functions for binary tree operation. */
3666 /* Create a tree node. */
3669 create_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3670 re_token_type_t type
)
3672 re_token_t t
= { .type
= type
};
3673 return create_token_tree (dfa
, left
, right
, &t
);
3677 create_token_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3678 const re_token_t
*token
)
3681 if (__glibc_unlikely (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
))
3683 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3685 if (storage
== NULL
)
3687 storage
->next
= dfa
->str_tree_storage
;
3688 dfa
->str_tree_storage
= storage
;
3689 dfa
->str_tree_storage_idx
= 0;
3691 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3693 tree
->parent
= NULL
;
3695 tree
->right
= right
;
3696 tree
->token
= *token
;
3697 tree
->token
.duplicated
= 0;
3698 tree
->token
.opt_subexp
= 0;
3701 tree
->node_idx
= -1;
3704 left
->parent
= tree
;
3706 right
->parent
= tree
;
3710 /* Mark the tree SRC as an optional subexpression.
3711 To be called from preorder or postorder. */
3713 static reg_errcode_t
3714 mark_opt_subexp (void *extra
, bin_tree_t
*node
)
3716 Idx idx
= (uintptr_t) extra
;
3717 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3718 node
->token
.opt_subexp
= 1;
3723 /* Free the allocated memory inside NODE. */
3726 free_token (re_token_t
*node
)
3728 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3729 free_charset (node
->opr
.mbcset
);
3730 else if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3731 re_free (node
->opr
.sbcset
);
3734 /* Worker function for tree walking. Free the allocated memory inside NODE
3735 and its children. */
3737 static reg_errcode_t
3738 free_tree (void *extra
, bin_tree_t
*node
)
3740 free_token (&node
->token
);
3745 /* Duplicate the node SRC, and return new node. This is a preorder
3746 visit similar to the one implemented by the generic visitor, but
3747 we need more infrastructure to maintain two parallel trees --- so,
3748 it's easier to duplicate. */
3751 duplicate_tree (const bin_tree_t
*root
, re_dfa_t
*dfa
)
3753 const bin_tree_t
*node
;
3754 bin_tree_t
*dup_root
;
3755 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3757 for (node
= root
; ; )
3759 /* Create a new tree and link it back to the current parent. */
3760 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3763 (*p_new
)->parent
= dup_node
;
3764 (*p_new
)->token
.duplicated
= 1;
3767 /* Go to the left node, or up and to the right. */
3771 p_new
= &dup_node
->left
;
3775 const bin_tree_t
*prev
= NULL
;
3776 while (node
->right
== prev
|| node
->right
== NULL
)
3779 node
= node
->parent
;
3780 dup_node
= dup_node
->parent
;
3785 p_new
= &dup_node
->right
;