2 * SPDX-License-Identifier: BSD-3-Clause
4 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5 * Copyright (c) 1992, 1993, 1994
6 * The Regents of the University of California. All rights reserved.
8 * Copyright (c) 2011 The FreeBSD Foundation
10 * Portions of this software were developed by David Chisnall
11 * under sponsorship from the FreeBSD Foundation.
13 * This code is derived from software contributed to Berkeley by
16 * Redistribution and use in source and binary forms, with or without
17 * modification, are permitted provided that the following conditions
19 * 1. Redistributions of source code must retain the above copyright
20 * notice, this list of conditions and the following disclaimer.
21 * 2. Redistributions in binary form must reproduce the above copyright
22 * notice, this list of conditions and the following disclaimer in the
23 * documentation and/or other materials provided with the distribution.
24 * 3. Neither the name of the University nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
43 #if defined(LIBC_SCCS) && !defined(lint)
44 static char sccsid
[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
45 #endif /* LIBC_SCCS and not lint */
46 #include <sys/cdefs.h>
47 __FBSDID("$FreeBSD$");
49 #include <sys/types.h>
60 /* We want the extensions implemented with LIBREGEX... */
65 /* ...but we also want to use the collation functions from nlsfuncs.cc. */
77 * Branching context, used to keep track of branch state for all of the branch-
78 * aware functions. In addition to keeping track of branch positions for the
79 * p_branch_* functions, we use this to simplify some clumsiness in BREs for
80 * detection of whether ^ is acting as an anchor or being used erroneously and
81 * also for whether we're in a sub-expression or not.
95 * parse structure, passed up and down to avoid global variables and
99 const char *next
; /* next character in RE */
100 const char *end
; /* end of string (-> NUL normally) */
101 int error
; /* has an error been seen? */
103 sop
*strip
; /* malloced strip */
104 sopno ssize
; /* malloced strip size (allocated) */
105 sopno slen
; /* malloced strip length (used) */
106 int ncsalloc
; /* number of csets allocated */
108 # define NPAREN 10 /* we need to remember () 1-9 for back refs */
109 sopno pbegin
[NPAREN
]; /* -> ( ([0] unused) */
110 sopno pend
[NPAREN
]; /* -> ) ([0] unused) */
111 bool allowbranch
; /* can this expression branch? */
112 bool bre
; /* convenience; is this a BRE? */
113 int pflags
; /* other parsing flags -- legacy escapes? */
114 bool (*parse_expr
)(struct parse
*, struct branchc
*);
115 void (*pre_parse
)(struct parse
*, struct branchc
*);
116 void (*post_parse
)(struct parse
*, struct branchc
*);
119 #define PFLAG_LEGACY_ESC 0x00000001
121 /* ========= begin header generated by ./mkh ========= */
126 /* === regcomp.c === */
127 static bool p_ere_exp(struct parse
*p
, struct branchc
*bc
);
128 static void p_str(struct parse
*p
);
129 static int p_branch_eat_delim(struct parse
*p
, struct branchc
*bc
);
130 static void p_branch_ins_offset(struct parse
*p
, struct branchc
*bc
);
131 static void p_branch_fix_tail(struct parse
*p
, struct branchc
*bc
);
132 static bool p_branch_empty(struct parse
*p
, struct branchc
*bc
);
133 static bool p_branch_do(struct parse
*p
, struct branchc
*bc
);
134 static void p_bre_pre_parse(struct parse
*p
, struct branchc
*bc
);
135 static void p_bre_post_parse(struct parse
*p
, struct branchc
*bc
);
136 static void p_re(struct parse
*p
, int end1
, int end2
);
137 static bool p_simp_re(struct parse
*p
, struct branchc
*bc
);
138 static int p_count(struct parse
*p
);
139 static void p_bracket(struct parse
*p
);
140 static int p_range_cmp(wint_t c1
, wint_t c2
);
141 static void p_b_term(struct parse
*p
, cset
*cs
);
142 static int p_b_pseudoclass(struct parse
*p
, char c
);
143 static void p_b_cclass(struct parse
*p
, cset
*cs
);
144 static void p_b_cclass_named(struct parse
*p
, cset
*cs
, const char[]);
145 static void p_b_eclass(struct parse
*p
, cset
*cs
);
146 static wint_t p_b_symbol(struct parse
*p
);
147 static wint_t p_b_coll_elem(struct parse
*p
, wint_t endc
);
148 static bool may_escape(struct parse
*p
, const wint_t ch
);
149 static wint_t othercase(wint_t ch
);
150 static void bothcases(struct parse
*p
, wint_t ch
);
151 static void ordinary(struct parse
*p
, wint_t ch
);
152 static void nonnewline(struct parse
*p
);
153 static void repeat(struct parse
*p
, sopno start
, int from
, int to
);
154 static int seterr(struct parse
*p
, int e
);
155 static cset
*allocset(struct parse
*p
);
156 static void freeset(struct parse
*p
, cset
*cs
);
157 static void CHadd(struct parse
*p
, cset
*cs
, wint_t ch
);
158 static void CHaddrange(struct parse
*p
, cset
*cs
, wint_t min
, wint_t max
);
159 static void CHaddtype(struct parse
*p
, cset
*cs
, wctype_t wct
);
160 static wint_t singleton(cset
*cs
);
161 static sopno
dupl(struct parse
*p
, sopno start
, sopno finish
);
162 static void doemit(struct parse
*p
, sop op
, size_t opnd
);
163 static void doinsert(struct parse
*p
, sop op
, size_t opnd
, sopno pos
);
164 static void dofwd(struct parse
*p
, sopno pos
, sop value
);
165 static int enlarge(struct parse
*p
, sopno size
);
166 static void stripsnug(struct parse
*p
, struct re_guts
*g
);
167 static void findmust(struct parse
*p
, struct re_guts
*g
);
168 static int altoffset(sop
*scan
, int offset
);
169 static void computejumps(struct parse
*p
, struct re_guts
*g
);
170 static void computematchjumps(struct parse
*p
, struct re_guts
*g
);
171 static sopno
pluscount(struct parse
*p
, struct re_guts
*g
);
172 static wint_t wgetnext(struct parse
*p
);
177 /* ========= end header generated by ./mkh ========= */
179 static char nuls
[10]; /* place to point scanner in event of error */
182 * macros for use with parse structure
183 * BEWARE: these know that the parse structure is named `p' !!!
185 #define PEEK() (*p->next)
186 #define PEEK2() (*(p->next+1))
187 #define MORE() (p->end - p->next > 0)
188 #define MORE2() (p->end - p->next > 1)
189 #define SEE(c) (MORE() && PEEK() == (c))
190 #define SEETWO(a, b) (MORE2() && PEEK() == (a) && PEEK2() == (b))
191 #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a))
192 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
193 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
194 #define EATSPEC(a) (p->bre ? EATTWO('\\', a) : EAT(a))
195 #define NEXT() (p->next++)
196 #define NEXT2() (p->next += 2)
197 #define NEXTn(n) (p->next += (n))
198 #define GETNEXT() (*p->next++)
199 #define WGETNEXT() wgetnext(p)
200 #define SETERROR(e) seterr(p, (e))
201 #define REQUIRE(co, e) ((co) || SETERROR(e))
202 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
203 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
204 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
205 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
206 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
207 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
208 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
209 #define HERE() (p->slen)
210 #define THERE() (p->slen - 1)
211 #define THERETHERE() (p->slen - 2)
212 #define DROP(n) (p->slen -= (n))
214 /* Macro used by computejump()/computematchjump() */
215 #define MIN(a,b) ((a)<(b)?(a):(b))
217 static int /* 0 success, otherwise REG_something */
218 regcomp_internal(regex_t
* __restrict preg
,
219 const char * __restrict pattern
,
220 int cflags
, int pflags
)
224 struct parse
*p
= &pa
;
229 # define GOODFLAGS(f) (f)
231 # define GOODFLAGS(f) ((f)&~REG_DUMP)
234 cflags
= GOODFLAGS(cflags
);
235 if ((cflags
®_EXTENDED
) && (cflags
®_NOSPEC
))
238 if (cflags
®_PEND
) {
239 if (preg
->re_endp
< pattern
)
241 len
= preg
->re_endp
- pattern
;
243 len
= strlen(pattern
);
245 /* do the mallocs early so failure handling is easy */
246 g
= (struct re_guts
*)malloc(sizeof(struct re_guts
));
250 * Limit the pattern space to avoid a 32-bit overflow on buffer
251 * extension. Also avoid any signed overflow in case of conversion
252 * so make the real limit based on a 31-bit overflow.
254 * Likely not applicable on 64-bit systems but handle the case
255 * generically (who are we to stop people from using ~715MB+
258 maxlen
= ((size_t)-1 >> 1) / sizeof(sop
) * 2 / 3;
263 p
->ssize
= len
/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
264 assert(p
->ssize
>= len
);
266 p
->strip
= (sop
*)malloc(p
->ssize
* sizeof(sop
));
268 if (p
->strip
== NULL
) {
275 p
->next
= pattern
; /* convenience; we do not modify it */
276 p
->end
= p
->next
+ len
;
280 for (i
= 0; i
< NPAREN
; i
++) {
285 if (cflags
®_POSIX
) {
287 p
->allowbranch
= (cflags
& REG_EXTENDED
) != 0;
289 p
->gnuext
= p
->allowbranch
= true;
292 p
->allowbranch
= (cflags
& REG_EXTENDED
) != 0;
294 if (cflags
& REG_EXTENDED
) {
296 p
->parse_expr
= p_ere_exp
;
298 p
->post_parse
= NULL
;
301 p
->parse_expr
= p_simp_re
;
302 p
->pre_parse
= p_bre_pre_parse
;
303 p
->post_parse
= p_bre_post_parse
;
321 g
->firststate
= THERE();
322 if (cflags
& REG_NOSPEC
)
327 g
->laststate
= THERE();
329 /* tidy up loose ends and fill things in */
332 /* only use Boyer-Moore algorithm if the pattern is bigger
333 * than three characters
337 computematchjumps(p
, g
);
338 if(g
->matchjump
== NULL
&& g
->charjump
!= NULL
) {
339 free(&g
->charjump
[CHAR_MIN
]);
343 g
->nplus
= pluscount(p
, g
);
345 preg
->re_nsub
= g
->nsub
;
347 preg
->re_magic
= MAGIC1
;
349 /* not debugging, so can't rely on the assert() in regexec() */
351 SETERROR(REG_ASSERT
);
354 /* win or lose, we're done */
355 if (p
->error
!= 0) /* lose */
361 - regcomp - interface for parser and compilation
362 = extern int regcomp(regex_t *, const char *, int);
363 = #define REG_BASIC 0000
364 = #define REG_EXTENDED 0001
365 = #define REG_ICASE 0002
366 = #define REG_NOSUB 0004
367 = #define REG_NEWLINE 0010
368 = #define REG_NOSPEC 0020
369 = #define REG_PEND 0040
370 = #define REG_DUMP 0200
372 int /* 0 success, otherwise REG_something */
373 regcomp(regex_t
* __restrict preg
,
374 const char * __restrict pattern
,
378 return (regcomp_internal(preg
, pattern
, cflags
, 0));
383 * Legacy interface that requires more lax escaping behavior.
386 freebsd12_regcomp(regex_t
* __restrict preg
,
387 const char * __restrict pattern
,
388 int cflags
, int pflags
)
391 return (regcomp_internal(preg
, pattern
, cflags
, PFLAG_LEGACY_ESC
));
394 __sym_compat(regcomp
, freebsd12_regcomp
, FBSD_1
.0
);
395 #endif /* !LIBREGEX */
398 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
399 - return whether we should terminate or not
400 == static bool p_ere_exp(struct parse *p);
403 p_ere_exp(struct parse
*p
, struct branchc
*bc
)
418 assert(MORE()); /* caller should have ensured this */
427 (void)REQUIRE(MORE(), REG_EPAREN
);
431 p
->pbegin
[subno
] = HERE();
432 EMIT(OLPAREN
, subno
);
435 if (subno
< NPAREN
) {
436 p
->pend
[subno
] = HERE();
437 assert(p
->pend
[subno
] != 0);
439 EMIT(ORPAREN
, subno
);
440 (void)MUSTEAT(')', REG_EPAREN
);
442 #ifndef POSIX_MISTAKE
443 case ')': /* happens only if no current unmatched ( */
445 * You may ask, why the ifndef? Because I didn't notice
446 * this until slightly too late for 1003.2, and none of the
447 * other 1003.2 regular-expression reviewers noticed it at
448 * all. So an unmatched ) is legal POSIX, at least until
449 * we can get it fixed.
451 SETERROR(REG_EPAREN
);
456 p
->g
->iflags
|= USEBOL
;
462 p
->g
->iflags
|= USEEOL
;
472 SETERROR(REG_BADRPT
);
475 if (p
->g
->cflags
®_NEWLINE
)
484 (void)REQUIRE(MORE(), REG_EESCAPE
);
506 p_b_pseudoclass(p
, wc
);
519 if (p
->pend
[i
] != 0) {
520 assert(i
<= p
->g
->nsub
);
522 assert(p
->pbegin
[i
] != 0);
523 assert(OP(p
->strip
[p
->pbegin
[i
]]) == OLPAREN
);
524 assert(OP(p
->strip
[p
->pend
[i
]]) == ORPAREN
);
525 (void) dupl(p
, p
->pbegin
[i
]+1, p
->pend
[i
]);
528 SETERROR(REG_ESUBREG
);
534 /* Don't proceed to the POSIX bits if we've already handled it */
547 if (may_escape(p
, wc
))
550 SETERROR(REG_EESCAPE
);
566 /* we call { a repetition if followed by a digit */
567 if (!( c
== '*' || c
== '+' || c
== '?' || c
== '{'))
568 return (false); /* no repetition, we're done */
570 (void)REQUIRE(MORE2() && \
571 (isdigit((uch
)PEEK2()) || PEEK2() == ','), REG_BADRPT
);
574 (void)REQUIRE(!wascaret
, REG_BADRPT
);
576 case '*': /* implemented as +? */
577 /* this case does not require the (y|) trick, noKLUDGE */
580 INSERT(OQUEST_
, pos
);
581 ASTERN(O_QUEST
, pos
);
588 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
589 INSERT(OCH_
, pos
); /* offset slightly wrong */
590 ASTERN(OOR1
, pos
); /* this one's right */
591 AHEAD(pos
); /* fix the OCH_ */
592 EMIT(OOR2
, 0); /* offset very wrong... */
593 AHEAD(THERE()); /* ...so fix it */
594 ASTERN(O_CH
, THERETHERE());
599 if (isdigit((uch
)PEEK())) {
601 (void)REQUIRE(count
<= count2
, REG_BADBR
);
602 } else /* single number with comma */
604 } else /* just a single number */
606 repeat(p
, pos
, count
, count2
);
607 if (!EAT('}')) { /* error heuristics */
608 while (MORE() && PEEK() != '}')
610 (void)REQUIRE(MORE(), REG_EBRACE
);
619 if (!( c
== '*' || c
== '+' || c
== '?' ||
620 (c
== '{' && MORE2() && isdigit((uch
)PEEK2())) ) )
622 SETERROR(REG_BADRPT
);
627 - p_str - string (no metacharacters) "parser"
628 == static void p_str(struct parse *p);
631 p_str(struct parse
*p
)
633 (void)REQUIRE(MORE(), REG_EMPTY
);
635 ordinary(p
, WGETNEXT());
639 * Eat consecutive branch delimiters for the kind of expression that we are
640 * parsing, return the number of delimiters that we ate.
643 p_branch_eat_delim(struct parse
*p
, struct branchc
*bc
)
655 * Insert necessary branch book-keeping operations. This emits a
656 * bogus 'next' offset, since we still have more to parse
659 p_branch_ins_offset(struct parse
*p
, struct branchc
*bc
)
662 if (bc
->nbranch
== 0) {
663 INSERT(OCH_
, bc
->start
); /* offset is wrong */
665 bc
->back
= bc
->start
;
668 ASTERN(OOR1
, bc
->back
);
670 AHEAD(bc
->fwd
); /* fix previous offset */
672 EMIT(OOR2
, 0); /* offset is very wrong */
677 * Fix the offset of the tail branch, if we actually had any branches.
678 * This is to correct the bogus placeholder offset that we use.
681 p_branch_fix_tail(struct parse
*p
, struct branchc
*bc
)
684 /* Fix bogus offset at the tail if we actually have branches */
685 if (bc
->nbranch
> 0) {
687 ASTERN(O_CH
, bc
->back
);
692 * Signal to the parser that an empty branch has been encountered; this will,
693 * in the future, be used to allow for more permissive behavior with empty
694 * branches. The return value should indicate whether parsing may continue
698 p_branch_empty(struct parse
*p
, struct branchc
*bc
)
707 * Take care of any branching requirements. This includes inserting the
708 * appropriate branching instructions as well as eating all of the branch
709 * delimiters until we either run out of pattern or need to parse more pattern.
712 p_branch_do(struct parse
*p
, struct branchc
*bc
)
716 ate
= p_branch_eat_delim(p
, bc
);
719 else if ((ate
> 1 || (bc
->outer
&& !MORE())) && !p_branch_empty(p
, bc
))
721 * Halt parsing only if we have an empty branch and p_branch_empty
722 * indicates that we must not continue. In the future, this will not
723 * necessarily be an error.
726 p_branch_ins_offset(p
, bc
);
732 p_bre_pre_parse(struct parse
*p
, struct branchc
*bc
)
737 * Does not move cleanly into expression parser because of
738 * ordinary interpration of * at the beginning position of
743 p
->g
->iflags
|= USEBOL
;
749 p_bre_post_parse(struct parse
*p
, struct branchc
*bc
)
752 /* Expression is terminating due to EOL token */
756 p
->g
->iflags
|= USEEOL
;
762 - p_re - Top level parser, concatenation and BRE anchoring
763 == static void p_re(struct parse *p, int end1, int end2);
764 * Giving end1 as OUT essentially eliminates the end1/end2 check.
766 * This implementation is a bit of a kludge, in that a trailing $ is first
767 * taken as an ordinary character and then revised to be an anchor.
768 * The amount of lookahead needed to avoid this kludge is excessive.
771 p_re(struct parse
*p
,
772 int end1
, /* first terminating character */
773 int end2
) /* second terminating character; ignored for EREs */
778 if (end1
== OUT
&& end2
== OUT
)
782 #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2))
786 bc
.terminate
= false;
787 if (p
->pre_parse
!= NULL
)
788 p
->pre_parse(p
, &bc
);
789 while (MORE() && (!p
->allowbranch
|| !SEESPEC('|')) && !SEEEND()) {
790 bc
.terminate
= p
->parse_expr(p
, &bc
);
793 if (p
->post_parse
!= NULL
)
794 p
->post_parse(p
, &bc
);
795 (void) REQUIRE(p
->gnuext
|| HERE() != bc
.start
, REG_EMPTY
);
797 if (HERE() == bc
.start
&& !p_branch_empty(p
, &bc
))
803 * p_branch_do's return value indicates whether we should
804 * continue parsing or not. This is both for correctness and
805 * a slight optimization, because it will check if we've
806 * encountered an empty branch or the end of the string
807 * immediately following a branch delimiter.
809 if (!p_branch_do(p
, &bc
))
814 p_branch_fix_tail(p
, &bc
);
815 assert(!MORE() || SEE(end1
));
819 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
820 == static bool p_simp_re(struct parse *p, struct branchc *bc);
822 static bool /* was the simple RE an unbackslashed $? */
823 p_simp_re(struct parse
*p
, struct branchc
*bc
)
826 int cc
; /* convenient/control character */
834 # define BACKSL (1<<CHAR_BIT)
836 pos
= HERE(); /* repetition op, if any, covers from here */
839 assert(MORE()); /* caller should have ensured this */
842 (void)REQUIRE(MORE(), REG_EESCAPE
);
865 p_b_pseudoclass(p
, cc
);
876 if (p
->g
->cflags
®_NEWLINE
)
891 SETERROR(REG_BADRPT
);
897 p
->pbegin
[subno
] = HERE();
898 EMIT(OLPAREN
, subno
);
899 /* the MORE here is an error heuristic */
900 if (MORE() && !SEETWO('\\', ')'))
902 if (subno
< NPAREN
) {
903 p
->pend
[subno
] = HERE();
904 assert(p
->pend
[subno
] != 0);
906 EMIT(ORPAREN
, subno
);
907 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN
);
909 case BACKSL
|')': /* should not get here -- must be user */
910 SETERROR(REG_EPAREN
);
921 i
= (c
&~BACKSL
) - '0';
923 if (p
->pend
[i
] != 0) {
924 assert(i
<= p
->g
->nsub
);
926 assert(p
->pbegin
[i
] != 0);
927 assert(OP(p
->strip
[p
->pbegin
[i
]]) == OLPAREN
);
928 assert(OP(p
->strip
[p
->pend
[i
]]) == ORPAREN
);
929 (void) dupl(p
, p
->pbegin
[i
]+1, p
->pend
[i
]);
932 SETERROR(REG_ESUBREG
);
937 * Ordinary if used as the first character beyond BOL anchor of
938 * a (sub-)expression, counts as a bad repetition operator if it
941 (void)REQUIRE(bc
->nchain
== 0, REG_BADRPT
);
945 return (false); /* Definitely not $... */
948 if ((c
& BACKSL
) == 0 || may_escape(p
, wc
))
951 SETERROR(REG_EESCAPE
);
956 if (EAT('*')) { /* implemented as +? */
957 /* this case does not require the (y|) trick, noKLUDGE */
960 INSERT(OQUEST_
, pos
);
961 ASTERN(O_QUEST
, pos
);
963 } else if (p
->gnuext
&& EATTWO('\\', '?')) {
964 INSERT(OQUEST_
, pos
);
965 ASTERN(O_QUEST
, pos
);
966 } else if (p
->gnuext
&& EATTWO('\\', '+')) {
970 } else if (EATTWO('\\', '{')) {
973 if (MORE() && isdigit((uch
)PEEK())) {
975 (void)REQUIRE(count
<= count2
, REG_BADBR
);
976 } else /* single number with comma */
978 } else /* just a single number */
980 repeat(p
, pos
, count
, count2
);
981 if (!EATTWO('\\', '}')) { /* error heuristics */
982 while (MORE() && !SEETWO('\\', '}'))
984 (void)REQUIRE(MORE(), REG_EBRACE
);
987 } else if (c
== '$') /* $ (but not \$) ends it */
994 - p_count - parse a repetition count
995 == static int p_count(struct parse *p);
997 static int /* the value */
998 p_count(struct parse
*p
)
1003 while (MORE() && isdigit((uch
)PEEK()) && count
<= DUPMAX
) {
1004 count
= count
*10 + (GETNEXT() - '0');
1008 (void)REQUIRE(ndigits
> 0 && count
<= DUPMAX
, REG_BADBR
);
1013 - p_bracket - parse a bracketed character list
1014 == static void p_bracket(struct parse *p);
1017 p_bracket(struct parse
*p
)
1022 /* Dept of Truly Sickening Special-Case Kludges */
1023 if (p
->end
- p
->next
> 5) {
1024 if (strncmp(p
->next
, "[:<:]]", 6) == 0) {
1029 if (strncmp(p
->next
, "[:>:]]", 6) == 0) {
1036 if ((cs
= allocset(p
)) == NULL
)
1039 if (p
->g
->cflags
®_ICASE
)
1047 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
1051 (void)MUSTEAT(']', REG_EBRACK
);
1053 if (p
->error
!= 0) /* don't mess things up further */
1056 if (cs
->invert
&& p
->g
->cflags
®_NEWLINE
)
1057 cs
->bmp
['\n' >> 3] |= 1 << ('\n' & 7);
1059 if ((ch
= singleton(cs
)) != OUT
) { /* optimize singleton sets */
1063 EMIT(OANYOF
, (int)(cs
- p
->g
->sets
));
1067 p_range_cmp(wint_t c1
, wint_t c2
)
1069 #if 1//ndef LIBREGEX
1070 return __wcollate_range_cmp(c1
, c2
);
1072 /* Copied from libc/collate __wcollate_range_cmp */
1073 wint_t s1
[2], s2
[2];
1079 return (wcscoll(s1
, s2
));
1084 - p_b_term - parse one term of a bracketed character list
1085 == static void p_b_term(struct parse *p, cset *cs);
1088 p_b_term(struct parse
*p
, cset
*cs
)
1091 wint_t start
, finish
;
1094 struct xlocale_collate
*table
=
1095 (struct xlocale_collate
*)__get_locale()->components
[XLC_COLLATE
];
1097 /* classify what we've got */
1098 switch ((MORE()) ? PEEK() : '\0') {
1100 c
= (MORE2()) ? PEEK2() : '\0';
1103 SETERROR(REG_ERANGE
);
1104 return; /* NOTE RETURN */
1111 case ':': /* character class */
1113 (void)REQUIRE(MORE(), REG_EBRACK
);
1115 (void)REQUIRE(c
!= '-' && c
!= ']', REG_ECTYPE
);
1117 (void)REQUIRE(MORE(), REG_EBRACK
);
1118 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE
);
1120 case '=': /* equivalence class */
1122 (void)REQUIRE(MORE(), REG_EBRACK
);
1124 (void)REQUIRE(c
!= '-' && c
!= ']', REG_ECOLLATE
);
1126 (void)REQUIRE(MORE(), REG_EBRACK
);
1127 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE
);
1129 default: /* symbol, ordinary character, or range */
1130 start
= p_b_symbol(p
);
1131 if (SEE('-') && MORE2() && PEEK2() != ']') {
1137 finish
= p_b_symbol(p
);
1140 if (start
== finish
)
1141 CHadd(p
, cs
, start
);
1144 if (table
->__collate_load_error
|| MB_CUR_MAX
> 1) {
1146 if (MB_CUR_MAX
> 1) {
1148 (void)REQUIRE(p_range_cmp(start
, finish
) <= 0, REG_ERANGE
);
1149 CHaddrange(p
, cs
, start
, finish
);
1151 (void)REQUIRE(p_range_cmp(start
, finish
) <= 0, REG_ERANGE
);
1152 for (i
= 0; i
<= UCHAR_MAX
; i
++) {
1153 if (p_range_cmp(start
, i
) <= 0 &&
1154 p_range_cmp(i
, finish
) <= 0 )
1164 - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S)
1165 == static int p_b_pseudoclass(struct parse *p, char c)
1168 p_b_pseudoclass(struct parse
*p
, char c
) {
1171 if ((cs
= allocset(p
)) == NULL
)
1174 if (p
->g
->cflags
®_ICASE
)
1182 p_b_cclass_named(p
, cs
, "alnum");
1188 p_b_cclass_named(p
, cs
, "space");
1194 EMIT(OANYOF
, (int)(cs
- p
->g
->sets
));
1199 - p_b_cclass - parse a character-class name and deal with it
1200 == static void p_b_cclass(struct parse *p, cset *cs);
1203 p_b_cclass(struct parse
*p
, cset
*cs
)
1205 const char *sp
= p
->next
;
1209 while (MORE() && isalpha((uch
)PEEK()))
1212 if (len
>= sizeof(clname
) - 1) {
1213 SETERROR(REG_ECTYPE
);
1216 memcpy(clname
, sp
, len
);
1219 p_b_cclass_named(p
, cs
, clname
);
1222 - p_b_cclass_named - deal with a named character class
1223 == static void p_b_cclass_named(struct parse *p, cset *cs, const char []);
1226 p_b_cclass_named(struct parse
*p
, cset
*cs
, const char clname
[]) {
1229 if ((wct
= wctype(clname
)) == 0) {
1230 SETERROR(REG_ECTYPE
);
1233 CHaddtype(p
, cs
, wct
);
1237 - p_b_eclass - parse an equivalence-class name and deal with it
1238 == static void p_b_eclass(struct parse *p, cset *cs);
1240 * This implementation is incomplete. xxx
1243 p_b_eclass(struct parse
*p
, cset
*cs
)
1247 c
= p_b_coll_elem(p
, '=');
1252 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1253 == static wint_t p_b_symbol(struct parse *p);
1255 static wint_t /* value of symbol */
1256 p_b_symbol(struct parse
*p
)
1260 (void)REQUIRE(MORE(), REG_EBRACK
);
1261 if (!EATTWO('[', '.'))
1264 /* collating symbol */
1265 value
= p_b_coll_elem(p
, '.');
1266 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE
);
1271 - p_b_coll_elem - parse a collating-element name and look it up
1272 == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
1274 static wint_t /* value of collating element */
1275 p_b_coll_elem(struct parse
*p
,
1276 wint_t endc
) /* name ended by endc,']' */
1278 const char *sp
= p
->next
;
1284 while (MORE() && !SEETWO(endc
, ']'))
1287 SETERROR(REG_EBRACK
);
1291 for (cp
= cnames
; cp
->name
!= NULL
; cp
++)
1292 if (strncmp(cp
->name
, sp
, len
) == 0 && strlen(cp
->name
) == len
)
1293 return(cp
->code
); /* known name */
1294 memset(&mbs
, 0, sizeof(mbs
));
1295 if ((clen
= mbrtowi(&wc
, sp
, len
, &mbs
)) == len
)
1296 return (wc
); /* single character */
1297 else if (clen
== (size_t)-1 || clen
== (size_t)-2)
1298 SETERROR(REG_ILLSEQ
);
1300 SETERROR(REG_ECOLLATE
); /* neither */
1305 - may_escape - determine whether 'ch' is escape-able in the current context
1306 == static int may_escape(struct parse *p, const wint_t ch)
1309 may_escape(struct parse
*p
, const wint_t ch
)
1312 if ((p
->pflags
& PFLAG_LEGACY_ESC
) != 0)
1314 if (isalpha(ch
) || ch
== '\'' || ch
== '`')
1319 * Build a whitelist of characters that may be escaped to produce an
1320 * ordinary in the current context. This assumes that these have not
1321 * been otherwise interpreted as a special character. Escaping an
1322 * ordinary character yields undefined results according to
1323 * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take
1324 * advantage of this and use escaped ordinary characters to provide
1325 * special meaning, e.g. \b, \B, \w, \W, \s, \S.
1331 /* The above characters may not be escaped in BREs */
1332 if (!(p
->g
->cflags
®_EXTENDED
))
1354 - othercase - return the case counterpart of an alphabetic
1355 == static wint_t othercase(wint_t ch);
1357 static wint_t /* if no counterpart, return ch */
1358 othercase(wint_t ch
)
1360 assert(iswalpha(ch
));
1362 return(towlower(ch
));
1363 else if (iswlower(ch
))
1364 return(towupper(ch
));
1365 else /* peculiar, but could happen */
1370 - bothcases - emit a dualcase version of a two-case character
1371 == static void bothcases(struct parse *p, wint_t ch);
1373 * Boy, is this implementation ever a kludge...
1376 bothcases(struct parse
*p
, wint_t ch
)
1378 const char *oldnext
= p
->next
;
1379 const char *oldend
= p
->end
;
1380 char bracket
[3 + MB_LEN_MAX
];
1384 assert(othercase(ch
) != ch
); /* p_bracket() would recurse */
1386 memset(&mbs
, 0, sizeof(mbs
));
1387 n
= wirtomb(bracket
, ch
, &mbs
);
1388 assert(n
!= (size_t)-1);
1390 bracket
[n
+ 1] = '\0';
1391 p
->end
= bracket
+n
+1;
1393 assert(p
->next
== p
->end
);
1399 - ordinary - emit an ordinary character
1400 == static void ordinary(struct parse *p, wint_t ch);
1403 ordinary(struct parse
*p
, wint_t ch
)
1407 if ((p
->g
->cflags
®_ICASE
) && iswalpha(ch
) && othercase(ch
) != ch
)
1409 else if ((ch
& OPDMASK
) == ch
)
1413 * Kludge: character is too big to fit into an OCHAR operand.
1414 * Emit a singleton set.
1416 if ((cs
= allocset(p
)) == NULL
)
1419 EMIT(OANYOF
, (int)(cs
- p
->g
->sets
));
1424 - nonnewline - emit REG_NEWLINE version of OANY
1425 == static void nonnewline(struct parse *p);
1427 * Boy, is this implementation ever a kludge...
1430 nonnewline(struct parse
*p
)
1432 const char *oldnext
= p
->next
;
1433 const char *oldend
= p
->end
;
1443 assert(p
->next
== bracket
+3);
1449 - repeat - generate code for a bounded repetition, recursively if needed
1450 == static void repeat(struct parse *p, sopno start, int from, int to);
1453 repeat(struct parse
*p
,
1454 sopno start
, /* operand from here to end of strip */
1455 int from
, /* repeated from this number */
1456 int to
) /* to this number of times (maybe INFINITY) */
1458 sopno finish
= HERE();
1461 # define REP(f, t) ((f)*8 + (t))
1462 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1465 if (p
->error
!= 0) /* head off possible runaway recursion */
1470 switch (REP(MAP(from
), MAP(to
))) {
1471 case REP(0, 0): /* must be user doing this */
1472 DROP(finish
-start
); /* drop the operand */
1474 case REP(0, 1): /* as x{1,1}? */
1475 case REP(0, N
): /* as x{1,n}? */
1476 case REP(0, INF
): /* as x{1,}? */
1477 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1478 INSERT(OCH_
, start
); /* offset is wrong... */
1479 repeat(p
, start
+1, 1, to
);
1480 ASTERN(OOR1
, start
);
1481 AHEAD(start
); /* ... fix it */
1484 ASTERN(O_CH
, THERETHERE());
1486 case REP(1, 1): /* trivial case */
1489 case REP(1, N
): /* as x?x{1,n-1} */
1490 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1491 INSERT(OCH_
, start
);
1492 ASTERN(OOR1
, start
);
1494 EMIT(OOR2
, 0); /* offset very wrong... */
1495 AHEAD(THERE()); /* ...so fix it */
1496 ASTERN(O_CH
, THERETHERE());
1497 copy
= dupl(p
, start
+1, finish
+1);
1498 assert(copy
== finish
+4);
1499 repeat(p
, copy
, 1, to
-1);
1501 case REP(1, INF
): /* as x+ */
1502 INSERT(OPLUS_
, start
);
1503 ASTERN(O_PLUS
, start
);
1505 case REP(N
, N
): /* as xx{m-1,n-1} */
1506 copy
= dupl(p
, start
, finish
);
1507 repeat(p
, copy
, from
-1, to
-1);
1509 case REP(N
, INF
): /* as xx{n-1,INF} */
1510 copy
= dupl(p
, start
, finish
);
1511 repeat(p
, copy
, from
-1, to
);
1513 default: /* "can't happen" */
1514 SETERROR(REG_ASSERT
); /* just in case */
1520 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1521 - character from the parse struct, signals a REG_ILLSEQ error if the
1522 - character can't be converted. Returns the number of bytes consumed.
1525 wgetnext(struct parse
*p
)
1532 /* Kludge for more glibc compatibility. On Cygwin as well as on
1533 Linux, mbrtowc returns -1 if the current local's codeset is ASCII
1534 and the character is >= 0x80. Nevertheless, glibc's regcomp allows
1535 any char value, even stuff like [\xc0-\xff], if the locale's codeset
1536 is ASCII, so in regcomp it ignores the fact that chars >= 0x80 are
1537 invalid ASCII chars. To be more Linux-compatible, we align the
1538 behaviour to glibc here. Allow any character value if the current
1539 local's codeset is ASCII. */
1540 if (*__current_locale_charset () == 'A') /* SCII */
1541 return (wint_t) (unsigned char) *p
->next
++;
1543 memset(&mbs
, 0, sizeof(mbs
));
1544 n
= mbrtowi(&wc
, p
->next
, p
->end
- p
->next
, &mbs
);
1545 if (n
== (size_t)-1 || n
== (size_t)-2) {
1546 SETERROR(REG_ILLSEQ
);
1556 - seterr - set an error condition
1557 == static int seterr(struct parse *p, int e);
1559 static int /* useless but makes type checking happy */
1560 seterr(struct parse
*p
, int e
)
1562 if (p
->error
== 0) /* keep earliest error condition */
1564 p
->next
= nuls
; /* try to bring things to a halt */
1566 return(0); /* make the return value well-defined */
1570 - allocset - allocate a set of characters for []
1571 == static cset *allocset(struct parse *p);
1574 allocset(struct parse
*p
)
1578 ncs
= reallocarray(p
->g
->sets
, p
->g
->ncsets
+ 1, sizeof(*ncs
));
1580 SETERROR(REG_ESPACE
);
1584 cs
= &p
->g
->sets
[p
->g
->ncsets
++];
1585 memset(cs
, 0, sizeof(*cs
));
1591 - freeset - free a now-unused set
1592 == static void freeset(struct parse *p, cset *cs);
1595 freeset(struct parse
*p
, cset
*cs
)
1597 cset
*top
= &p
->g
->sets
[p
->g
->ncsets
];
1602 memset(cs
, 0, sizeof(*cs
));
1603 if (cs
== top
-1) /* recover only the easy case */
1608 - singleton - Determine whether a set contains only one character,
1609 - returning it if so, otherwise returning OUT.
1616 for (i
= n
= 0; i
< NC
; i
++)
1623 if (cs
->nwides
== 1 && cs
->nranges
== 0 && cs
->ntypes
== 0 &&
1625 return (cs
->wides
[0]);
1626 /* Don't bother handling the other cases. */
1631 - CHadd - add character to character set.
1634 CHadd(struct parse
*p
, cset
*cs
, wint_t ch
)
1636 wint_t nch
, *newwides
;
1639 cs
->bmp
[ch
>> 3] |= 1 << (ch
& 7);
1641 newwides
= reallocarray(cs
->wides
, cs
->nwides
+ 1,
1642 sizeof(*cs
->wides
));
1643 if (newwides
== NULL
) {
1644 SETERROR(REG_ESPACE
);
1647 cs
->wides
= newwides
;
1648 cs
->wides
[cs
->nwides
++] = ch
;
1651 if ((nch
= towlower(ch
)) < NC
)
1652 cs
->bmp
[nch
>> 3] |= 1 << (nch
& 7);
1653 if ((nch
= towupper(ch
)) < NC
)
1654 cs
->bmp
[nch
>> 3] |= 1 << (nch
& 7);
1659 - CHaddrange - add all characters in the range [min,max] to a character set.
1662 CHaddrange(struct parse
*p
, cset
*cs
, wint_t min
, wint_t max
)
1666 for (; min
< NC
&& min
<= max
; min
++)
1668 newranges
= reallocarray(cs
->ranges
, cs
->nranges
+ 1,
1669 sizeof(*cs
->ranges
));
1670 if (newranges
== NULL
) {
1671 SETERROR(REG_ESPACE
);
1674 cs
->ranges
= newranges
;
1675 cs
->ranges
[cs
->nranges
].min
= min
;
1676 cs
->ranges
[cs
->nranges
].max
= max
;
1681 - CHaddtype - add all characters of a certain type to a character set.
1684 CHaddtype(struct parse
*p
, cset
*cs
, wctype_t wct
)
1689 for (i
= 0; i
< NC
; i
++)
1690 if (iswctype(i
, wct
))
1692 newtypes
= reallocarray(cs
->types
, cs
->ntypes
+ 1,
1693 sizeof(*cs
->types
));
1694 if (newtypes
== NULL
) {
1695 SETERROR(REG_ESPACE
);
1698 cs
->types
= newtypes
;
1699 cs
->types
[cs
->ntypes
++] = wct
;
1703 - dupl - emit a duplicate of a bunch of sops
1704 == static sopno dupl(struct parse *p, sopno start, sopno finish);
1706 static sopno
/* start of duplicate */
1707 dupl(struct parse
*p
,
1708 sopno start
, /* from here */
1709 sopno finish
) /* to this less one */
1712 sopno len
= finish
- start
;
1714 assert(finish
>= start
);
1717 if (!enlarge(p
, p
->ssize
+ len
)) /* this many unexpected additions */
1719 (void) memcpy((char *)(p
->strip
+ p
->slen
),
1720 (char *)(p
->strip
+ start
), (size_t)len
*sizeof(sop
));
1726 - doemit - emit a strip operator
1727 == static void doemit(struct parse *p, sop op, size_t opnd);
1729 * It might seem better to implement this as a macro with a function as
1730 * hard-case backup, but it's just too big and messy unless there are
1731 * some changes to the data structures. Maybe later.
1734 doemit(struct parse
*p
, sop op
, size_t opnd
)
1736 /* avoid making error situations worse */
1740 /* deal with oversize operands ("can't happen", more or less) */
1741 assert(opnd
< 1<<OPSHIFT
);
1743 /* deal with undersized strip */
1744 if (p
->slen
>= p
->ssize
)
1745 if (!enlarge(p
, (p
->ssize
+1) / 2 * 3)) /* +50% */
1748 /* finally, it's all reduced to the easy case */
1749 p
->strip
[p
->slen
++] = SOP(op
, opnd
);
1753 - doinsert - insert a sop into the strip
1754 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1757 doinsert(struct parse
*p
, sop op
, size_t opnd
, sopno pos
)
1763 /* avoid making error situations worse */
1768 EMIT(op
, opnd
); /* do checks, ensure space */
1769 assert(HERE() == sn
+1);
1772 /* adjust paren pointers */
1774 for (i
= 1; i
< NPAREN
; i
++) {
1775 if (p
->pbegin
[i
] >= pos
) {
1778 if (p
->pend
[i
] >= pos
) {
1783 memmove((char *)&p
->strip
[pos
+1], (char *)&p
->strip
[pos
],
1784 (HERE()-pos
-1)*sizeof(sop
));
1789 - dofwd - complete a forward reference
1790 == static void dofwd(struct parse *p, sopno pos, sop value);
1793 dofwd(struct parse
*p
, sopno pos
, sop value
)
1795 /* avoid making error situations worse */
1799 assert(value
< 1<<OPSHIFT
);
1800 p
->strip
[pos
] = OP(p
->strip
[pos
]) | value
;
1804 - enlarge - enlarge the strip
1805 == static int enlarge(struct parse *p, sopno size);
1808 enlarge(struct parse
*p
, sopno size
)
1812 if (p
->ssize
>= size
)
1815 sp
= reallocarray(p
->strip
, size
, sizeof(sop
));
1817 SETERROR(REG_ESPACE
);
1826 - stripsnug - compact the strip
1827 == static void stripsnug(struct parse *p, struct re_guts *g);
1830 stripsnug(struct parse
*p
, struct re_guts
*g
)
1832 g
->nstates
= p
->slen
;
1833 g
->strip
= reallocarray((char *)p
->strip
, p
->slen
, sizeof(sop
));
1834 if (g
->strip
== NULL
) {
1835 SETERROR(REG_ESPACE
);
1836 g
->strip
= p
->strip
;
1841 - findmust - fill in must and mlen with longest mandatory literal string
1842 == static void findmust(struct parse *p, struct re_guts *g);
1844 * This algorithm could do fancy things like analyzing the operands of |
1845 * for common subsequences. Someday. This code is simple and finds most
1846 * of the interesting cases.
1848 * Note that must and mlen got initialized during setup.
1851 findmust(struct parse
*p
, struct re_guts
*g
)
1855 sop
*newstart
= NULL
;
1860 char buf
[MB_LEN_MAX
];
1864 /* avoid making error situations worse */
1869 * It's not generally safe to do a ``char'' substring search on
1870 * multibyte character strings, but it's safe for at least
1871 * UTF-8 (see RFC 3629).
1873 if (MB_CUR_MAX
> 1 &&
1874 strcmp(__current_locale_charset (), "UTF-8") != 0)
1877 /* find the longest OCHAR sequence in strip */
1881 scan
= g
->strip
+ 1;
1885 case OCHAR
: /* sequence member */
1886 if (newlen
== 0) { /* new sequence */
1887 memset(&mbs
, 0, sizeof(mbs
));
1888 newstart
= scan
- 1;
1890 clen
= wirtomb(buf
, OPND(s
), &mbs
);
1891 if (clen
== (size_t)-1)
1895 case OPLUS_
: /* things that don't break one */
1899 case OQUEST_
: /* things that must be skipped */
1901 offset
= altoffset(scan
, offset
);
1906 /* assert() interferes w debug printouts */
1907 if (OP(s
) != (sop
)O_QUEST
&&
1908 OP(s
) != (sop
)O_CH
&& OP(s
) != (sop
)OOR2
) {
1912 } while (OP(s
) != (sop
)O_QUEST
&& OP(s
) != (sop
)O_CH
);
1914 case OBOW
: /* things that break a sequence */
1925 if (newlen
> (sopno
)g
->mlen
) { /* ends one */
1929 g
->moffset
+= offset
;
1932 g
->moffset
= offset
;
1940 if (newlen
> (sopno
)g
->mlen
) { /* ends one */
1944 g
->moffset
+= offset
;
1947 g
->moffset
= offset
;
1956 case OANYOF
: /* may or may not invalidate offset */
1957 /* First, everything as OANY */
1958 if (newlen
> (sopno
)g
->mlen
) { /* ends one */
1962 g
->moffset
+= offset
;
1965 g
->moffset
= offset
;
1976 /* Anything here makes it impossible or too hard
1977 * to calculate the offset -- so we give up;
1978 * save the last known good offset, in case the
1979 * must sequence doesn't occur later.
1981 if (newlen
> (sopno
)g
->mlen
) { /* ends one */
1985 g
->moffset
+= offset
;
1987 g
->moffset
= offset
;
1993 } while (OP(s
) != OEND
);
1995 if (g
->mlen
== 0) { /* there isn't one */
2000 /* turn it into a character string */
2001 g
->must
= malloc((size_t)g
->mlen
+ 1);
2002 if (g
->must
== NULL
) { /* argh; just forget it */
2009 memset(&mbs
, 0, sizeof(mbs
));
2010 while (cp
< g
->must
+ g
->mlen
) {
2011 while (OP(s
= *scan
++) != OCHAR
)
2013 clen
= wirtomb(cp
, OPND(s
), &mbs
);
2014 assert(clen
!= (size_t)-1);
2017 assert(cp
== g
->must
+ g
->mlen
);
2018 *cp
++ = '\0'; /* just on general principles */
2022 - altoffset - choose biggest offset among multiple choices
2023 == static int altoffset(sop *scan, int offset);
2025 * Compute, recursively if necessary, the largest offset among multiple
2029 altoffset(sop
*scan
, int offset
)
2035 /* If we gave up already on offsets, return */
2042 while (OP(s
) != (sop
)O_QUEST
&& OP(s
) != (sop
)O_CH
) {
2051 try = altoffset(scan
, try);
2058 if (OP(s
) != (sop
)O_QUEST
&&
2059 OP(s
) != (sop
)O_CH
&& OP(s
) != (sop
)OOR2
)
2061 } while (OP(s
) != (sop
)O_QUEST
&& OP(s
) != (sop
)O_CH
);
2062 /* We must skip to the next position, or we'll
2063 * leave altoffset() too early.
2091 return largest
+offset
;
2095 - computejumps - compute char jumps for BM scan
2096 == static void computejumps(struct parse *p, struct re_guts *g);
2098 * This algorithm assumes g->must exists and is has size greater than
2099 * zero. It's based on the algorithm found on Computer Algorithms by
2102 * A char jump is the number of characters one needs to jump based on
2103 * the value of the character from the text that was mismatched.
2106 computejumps(struct parse
*p
, struct re_guts
*g
)
2111 /* Avoid making errors worse */
2115 g
->charjump
= (int *)malloc((NC_MAX
+ 1) * sizeof(int));
2116 if (g
->charjump
== NULL
) /* Not a fatal error */
2118 /* Adjust for signed chars, if necessary */
2119 g
->charjump
= &g
->charjump
[-(CHAR_MIN
)];
2121 /* If the character does not exist in the pattern, the jump
2122 * is equal to the number of characters in the pattern.
2124 for (ch
= CHAR_MIN
; ch
< (CHAR_MAX
+ 1); ch
++)
2125 g
->charjump
[ch
] = g
->mlen
;
2127 /* If the character does exist, compute the jump that would
2128 * take us to the last character in the pattern equal to it
2129 * (notice that we match right to left, so that last character
2130 * is the first one that would be matched).
2132 for (mindex
= 0; mindex
< g
->mlen
; mindex
++)
2133 g
->charjump
[(int)g
->must
[mindex
]] = g
->mlen
- mindex
- 1;
2137 - computematchjumps - compute match jumps for BM scan
2138 == static void computematchjumps(struct parse *p, struct re_guts *g);
2140 * This algorithm assumes g->must exists and is has size greater than
2141 * zero. It's based on the algorithm found on Computer Algorithms by
2144 * A match jump is the number of characters one needs to advance based
2145 * on the already-matched suffix.
2146 * Notice that all values here are minus (g->mlen-1), because of the way
2147 * the search algorithm works.
2150 computematchjumps(struct parse
*p
, struct re_guts
*g
)
2152 int mindex
; /* General "must" iterator */
2153 int suffix
; /* Keeps track of matching suffix */
2154 int ssuffix
; /* Keeps track of suffixes' suffix */
2155 int* pmatches
; /* pmatches[k] points to the next i
2156 * such that i+1...mlen is a substring
2157 * of k+1...k+mlen-i-1
2160 /* Avoid making errors worse */
2164 pmatches
= (int*) malloc(g
->mlen
* sizeof(int));
2165 if (pmatches
== NULL
) {
2166 g
->matchjump
= NULL
;
2170 g
->matchjump
= (int*) malloc(g
->mlen
* sizeof(int));
2171 if (g
->matchjump
== NULL
) { /* Not a fatal error */
2176 /* Set maximum possible jump for each character in the pattern */
2177 for (mindex
= 0; mindex
< g
->mlen
; mindex
++)
2178 g
->matchjump
[mindex
] = 2*g
->mlen
- mindex
- 1;
2180 /* Compute pmatches[] */
2181 for (mindex
= g
->mlen
- 1, suffix
= g
->mlen
; mindex
>= 0;
2182 mindex
--, suffix
--) {
2183 pmatches
[mindex
] = suffix
;
2185 /* If a mismatch is found, interrupting the substring,
2186 * compute the matchjump for that position. If no
2187 * mismatch is found, then a text substring mismatched
2188 * against the suffix will also mismatch against the
2191 while (suffix
< g
->mlen
2192 && g
->must
[mindex
] != g
->must
[suffix
]) {
2193 g
->matchjump
[suffix
] = MIN(g
->matchjump
[suffix
],
2194 g
->mlen
- mindex
- 1);
2195 suffix
= pmatches
[suffix
];
2199 /* Compute the matchjump up to the last substring found to jump
2200 * to the beginning of the largest must pattern prefix matching
2203 for (mindex
= 0; mindex
<= suffix
; mindex
++)
2204 g
->matchjump
[mindex
] = MIN(g
->matchjump
[mindex
],
2205 g
->mlen
+ suffix
- mindex
);
2207 ssuffix
= pmatches
[suffix
];
2208 while (suffix
< g
->mlen
) {
2209 while (suffix
<= ssuffix
&& suffix
< g
->mlen
) {
2210 g
->matchjump
[suffix
] = MIN(g
->matchjump
[suffix
],
2211 g
->mlen
+ ssuffix
- suffix
);
2214 if (suffix
< g
->mlen
)
2215 ssuffix
= pmatches
[ssuffix
];
2222 - pluscount - count + nesting
2223 == static sopno pluscount(struct parse *p, struct re_guts *g);
2225 static sopno
/* nesting depth */
2226 pluscount(struct parse
*p
, struct re_guts
*g
)
2234 return(0); /* there may not be an OEND */
2236 scan
= g
->strip
+ 1;
2244 if (plusnest
> maxnest
)
2249 } while (OP(s
) != OEND
);