2 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
4 * Copyright 2012 Milan Jurik. All rights reserved.
5 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
6 * Copyright (c) 1992, 1993, 1994
7 * The Regents of the University of California. All rights reserved.
9 * This code is derived from software contributed to Berkeley by
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 #include <sys/types.h>
49 #include "../locale/runetype.h"
50 #include "../locale/collate.h"
56 #include "../locale/mblocal.h"
59 * parse structure, passed up and down to avoid global variables and
63 const char *next
; /* next character in RE */
64 const char *end
; /* end of string (-> NUL normally) */
65 int error
; /* has an error been seen? */
66 sop
*strip
; /* malloced strip */
67 sopno ssize
; /* malloced strip size (allocated) */
68 sopno slen
; /* malloced strip length (used) */
69 int ncsalloc
; /* number of csets allocated */
71 #define NPAREN 10 /* we need to remember () 1-9 for back refs */
72 sopno pbegin
[NPAREN
]; /* -> ( ([0] unused) */
73 sopno pend
[NPAREN
]; /* -> ) ([0] unused) */
76 /* ========= begin header generated by ./mkh ========= */
81 /* === regcomp.c === */
82 static void p_ere(struct parse
*p
, int stop
);
83 static void p_ere_exp(struct parse
*p
);
84 static void p_str(struct parse
*p
);
85 static void p_bre(struct parse
*p
, int end1
, int end2
);
86 static int p_simp_re(struct parse
*p
, int starordinary
);
87 static int p_count(struct parse
*p
);
88 static void p_bracket(struct parse
*p
);
89 static void p_b_term(struct parse
*p
, cset
*cs
);
90 static void p_b_cclass(struct parse
*p
, cset
*cs
);
91 static void p_b_eclass(struct parse
*p
, cset
*cs
);
92 static wint_t p_b_symbol(struct parse
*p
);
93 static wint_t p_b_coll_elem(struct parse
*p
, wint_t endc
);
94 static wint_t othercase(wint_t ch
);
95 static void bothcases(struct parse
*p
, wint_t ch
);
96 static void ordinary(struct parse
*p
, wint_t ch
);
97 static void nonnewline(struct parse
*p
);
98 static void repeat(struct parse
*p
, sopno start
, int from
, int to
);
99 static int seterr(struct parse
*p
, int e
);
100 static cset
*allocset(struct parse
*p
);
101 static void freeset(struct parse
*p
, cset
*cs
);
102 static void CHadd(struct parse
*p
, cset
*cs
, wint_t ch
);
103 static void CHaddrange(struct parse
*p
, cset
*cs
, wint_t min
, wint_t max
);
104 static void CHaddtype(struct parse
*p
, cset
*cs
, wctype_t wct
);
105 static wint_t singleton(cset
*cs
);
106 static sopno
dupl(struct parse
*p
, sopno start
, sopno finish
);
107 static void doemit(struct parse
*p
, sop op
, size_t opnd
);
108 static void doinsert(struct parse
*p
, sop op
, size_t opnd
, sopno pos
);
109 static void dofwd(struct parse
*p
, sopno pos
, sop value
);
110 static int enlarge(struct parse
*p
, sopno size
);
111 static void stripsnug(struct parse
*p
, struct re_guts
*g
);
112 static void findmust(struct parse
*p
, struct re_guts
*g
);
113 static int altoffset(sop
*scan
, int offset
);
114 static void computejumps(struct parse
*p
, struct re_guts
*g
);
115 static void computematchjumps(struct parse
*p
, struct re_guts
*g
);
116 static sopno
pluscount(struct parse
*p
, struct re_guts
*g
);
117 static wint_t wgetnext(struct parse
*p
);
122 /* ========= end header generated by ./mkh ========= */
124 static char nuls
[10]; /* place to point scanner in event of error */
127 * macros for use with parse structure
128 * BEWARE: these know that the parse structure is named `p' !!!
130 #define PEEK() (*p->next)
131 #define PEEK2() (*(p->next+1))
132 #define MORE() (p->next < p->end)
133 #define MORE2() (p->next+1 < p->end)
134 #define SEE(c) (MORE() && PEEK() == (c))
135 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
136 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
137 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
138 #define NEXT() (p->next++)
139 #define NEXT2() (p->next += 2)
140 #define NEXTn(n) (p->next += (n))
141 #define GETNEXT() (*p->next++)
142 #define WGETNEXT() wgetnext(p)
143 #define SETERROR(e) ((void)seterr(p, (e)))
144 #define REQUIRE(co, e) ((co) || seterr(p, e))
145 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
146 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
147 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
148 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
149 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
150 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
151 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
152 #define HERE() (p->slen)
153 #define THERE() (p->slen - 1)
154 #define THERETHERE() (p->slen - 2)
155 #define DROP(n) (p->slen -= (n))
158 static int never
= 0; /* for use in asserts; shuts lint up */
160 #define never 0 /* some <assert.h>s have bugs too */
164 * regcomp - interface for parser and compilation
166 int /* 0 success, otherwise REG_something */
167 regcomp(regex_t
*_RESTRICT_KYWD preg
, const char *_RESTRICT_KYWD pattern
,
172 struct parse
*p
= &pa
;
177 #define GOODFLAGS(f) (f)
179 #define GOODFLAGS(f) ((f)&~REG_DUMP)
182 /* We had REG_INVARG, but we don't have that on Solaris. */
183 cflags
= GOODFLAGS(cflags
);
184 if ((cflags
®_EXTENDED
) && (cflags
®_NOSPEC
))
187 if (cflags
®_PEND
) {
188 if (preg
->re_endp
< pattern
)
190 len
= preg
->re_endp
- pattern
;
192 len
= strlen(pattern
);
194 /* do the mallocs early so failure handling is easy */
195 g
= (struct re_guts
*)malloc(sizeof (struct re_guts
));
199 * Limit the pattern space to avoid a 32-bit overflow on buffer
200 * extension. Also avoid any signed overflow in case of conversion
201 * so make the real limit based on a 31-bit overflow.
203 * Likely not applicable on 64-bit systems but handle the case
204 * generically (who are we to stop people from using ~715MB+
207 maxlen
= ((size_t)-1 >> 1) / sizeof (sop
) * 2 / 3;
212 p
->ssize
= len
/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
213 assert(p
->ssize
>= len
);
215 p
->strip
= (sop
*)malloc(p
->ssize
* sizeof (sop
));
217 if (p
->strip
== NULL
) {
224 p
->next
= pattern
; /* convenience; we do not modify it */
225 p
->end
= p
->next
+ len
;
228 for (i
= 0; i
< NPAREN
; i
++) {
248 g
->firststate
= THERE();
249 if (cflags
®_EXTENDED
)
251 else if (cflags
®_NOSPEC
)
256 g
->laststate
= THERE();
258 /* tidy up loose ends and fill things in */
262 * only use Boyer-Moore algorithm if the pattern is bigger
263 * than three characters
267 computematchjumps(p
, g
);
268 if (g
->matchjump
== NULL
&& g
->charjump
!= NULL
) {
273 g
->nplus
= pluscount(p
, g
);
275 preg
->re_nsub
= g
->nsub
;
277 preg
->re_magic
= MAGIC1
;
279 /* not debugging, so can't rely on the assert() in regexec() */
281 SETERROR(REG_EFATAL
);
284 /* win or lose, we're done */
285 if (p
->error
!= 0) /* lose */
291 * p_ere - ERE parser top level, concatenation and alternation
294 p_ere(struct parse
*p
,
295 int stop
) /* character this ERE should end at */
301 int first
= 1; /* is this the first alternative? */
304 /* do a bunch of concatenated expressions */
306 while (MORE() && (c
= PEEK()) != '|' && c
!= stop
)
308 /* require nonempty */
309 (void) REQUIRE(HERE() != conc
, REG_BADPAT
);
312 break; /* NOTE BREAK OUT */
315 INSERT(OCH_
, conc
); /* offset is wrong */
320 ASTERN(OOR1
, prevback
);
322 AHEAD(prevfwd
); /* fix previous offset */
324 EMIT(OOR2
, 0); /* offset is very wrong */
327 if (!first
) { /* tail-end fixups */
329 ASTERN(O_CH
, prevback
);
332 assert(!MORE() || SEE(stop
));
336 * p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
339 p_ere_exp(struct parse
*p
)
349 assert(MORE()); /* caller should have ensured this */
355 (void) REQUIRE(MORE(), REG_EPAREN
);
359 p
->pbegin
[subno
] = HERE();
360 EMIT(OLPAREN
, subno
);
363 if (subno
< NPAREN
) {
364 p
->pend
[subno
] = HERE();
365 assert(p
->pend
[subno
] != 0);
367 EMIT(ORPAREN
, subno
);
368 (void) MUSTEAT(')', REG_EPAREN
);
370 #ifndef POSIX_MISTAKE
371 case ')': /* happens only if no current unmatched ( */
373 * You may ask, why the ifndef? Because I didn't notice
374 * this until slightly too late for 1003.2, and none of the
375 * other 1003.2 regular-expression reviewers noticed it at
376 * all. So an unmatched ) is legal POSIX, at least until
377 * we can get it fixed.
379 SETERROR(REG_EPAREN
);
384 p
->g
->iflags
|= USEBOL
;
390 p
->g
->iflags
|= USEEOL
;
394 SETERROR(REG_BADPAT
);
399 SETERROR(REG_BADRPT
);
402 if (p
->g
->cflags
®_NEWLINE
)
411 (void) REQUIRE(MORE(), REG_EESCAPE
);
425 case '{': /* okay as ordinary except if digit follows */
426 (void) REQUIRE(!MORE() || !isdigit((uch
)PEEK()), REG_BADRPT
);
440 /* we call { a repetition if followed by a digit */
441 if (!(c
== '*' || c
== '+' || c
== '?' ||
442 (c
== '{' && MORE2() && isdigit((uch
)PEEK2()))))
443 return; /* no repetition, we're done */
446 (void) REQUIRE(!wascaret
, REG_BADRPT
);
448 case '*': /* implemented as +? */
449 /* this case does not require the (y|) trick, noKLUDGE */
452 INSERT(OQUEST_
, pos
);
453 ASTERN(O_QUEST
, pos
);
460 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
461 INSERT(OCH_
, pos
); /* offset slightly wrong */
462 ASTERN(OOR1
, pos
); /* this one's right */
463 AHEAD(pos
); /* fix the OCH_ */
464 EMIT(OOR2
, 0); /* offset very wrong... */
465 AHEAD(THERE()); /* ...so fix it */
466 ASTERN(O_CH
, THERETHERE());
471 if (isdigit((uch
)PEEK())) {
473 (void) REQUIRE(count
<= count2
, REG_BADBR
);
474 } else /* single number with comma */
476 } else /* just a single number */
478 repeat(p
, pos
, count
, count2
);
479 if (!EAT('}')) { /* error heuristics */
480 while (MORE() && PEEK() != '}')
482 (void) REQUIRE(MORE(), REG_EBRACE
);
491 if (!(c
== '*' || c
== '+' || c
== '?' ||
492 (c
== '{' && MORE2() && isdigit((uch
)PEEK2()))))
494 SETERROR(REG_BADRPT
);
498 * p_str - string (no metacharacters) "parser"
501 p_str(struct parse
*p
)
503 (void) REQUIRE(MORE(), REG_BADPAT
);
505 ordinary(p
, WGETNEXT());
509 * p_bre - BRE parser top level, anchoring and concatenation
510 * Giving end1 as OUT essentially eliminates the end1/end2 check.
512 * This implementation is a bit of a kludge, in that a trailing $ is first
513 * taken as an ordinary character and then revised to be an anchor.
514 * The amount of lookahead needed to avoid this kludge is excessive.
517 p_bre(struct parse
*p
,
518 int end1
, /* first terminating character */
519 int end2
) /* second terminating character */
521 sopno start
= HERE();
522 int first
= 1; /* first subexpression? */
527 p
->g
->iflags
|= USEBOL
;
530 while (MORE() && !SEETWO(end1
, end2
)) {
531 wasdollar
= p_simp_re(p
, first
);
534 if (wasdollar
) { /* oops, that was a trailing anchor */
537 p
->g
->iflags
|= USEEOL
;
541 (void) REQUIRE(HERE() != start
, REG_BADPAT
); /* require nonempty */
545 * p_simp_re - parse a simple RE, an atom possibly followed by a repetition
547 static int /* was the simple RE an unbackslashed $? */
548 p_simp_re(struct parse
*p
,
549 int starordinary
) /* is a leading * an ordinary character? */
558 #define BACKSL (1<<CHAR_BIT)
560 pos
= HERE(); /* repetition op, if any, covers from here */
562 assert(MORE()); /* caller should have ensured this */
565 (void) REQUIRE(MORE(), REG_EESCAPE
);
566 c
= BACKSL
| GETNEXT();
570 if (p
->g
->cflags
®_NEWLINE
)
585 SETERROR(REG_BADRPT
);
591 p
->pbegin
[subno
] = HERE();
592 EMIT(OLPAREN
, subno
);
593 /* the MORE here is an error heuristic */
594 if (MORE() && !SEETWO('\\', ')'))
596 if (subno
< NPAREN
) {
597 p
->pend
[subno
] = HERE();
598 assert(p
->pend
[subno
] != 0);
600 EMIT(ORPAREN
, subno
);
601 (void) REQUIRE(EATTWO('\\', ')'), REG_EPAREN
);
603 case BACKSL
|')': /* should not get here -- must be user */
605 SETERROR(REG_EPAREN
);
616 i
= (c
&~BACKSL
) - '0';
618 if (p
->pend
[i
] != 0) {
619 assert(i
<= p
->g
->nsub
);
621 assert(p
->pbegin
[i
] != 0);
622 assert(OP(p
->strip
[p
->pbegin
[i
]]) == OLPAREN
);
623 assert(OP(p
->strip
[p
->pend
[i
]]) == ORPAREN
);
624 (void) dupl(p
, p
->pbegin
[i
]+1, p
->pend
[i
]);
627 SETERROR(REG_ESUBREG
);
631 (void) REQUIRE(starordinary
, REG_BADRPT
);
635 return (0); /* Definitely not $... */
642 if (EAT('*')) { /* implemented as +? */
643 /* this case does not require the (y|) trick, noKLUDGE */
646 INSERT(OQUEST_
, pos
);
647 ASTERN(O_QUEST
, pos
);
648 } else if (EATTWO('\\', '{')) {
651 if (MORE() && isdigit((uch
)PEEK())) {
653 (void) REQUIRE(count
<= count2
, REG_BADBR
);
654 } else /* single number with comma */
656 } else /* just a single number */
658 repeat(p
, pos
, count
, count2
);
659 if (!EATTWO('\\', '}')) { /* error heuristics */
660 while (MORE() && !SEETWO('\\', '}'))
662 (void) REQUIRE(MORE(), REG_EBRACE
);
665 } else if (c
== '$') /* $ (but not \$) ends it */
672 * p_count - parse a repetition count
674 static int /* the value */
675 p_count(struct parse
*p
)
680 while (MORE() && isdigit((uch
)PEEK()) && count
<= DUPMAX
) {
681 count
= count
*10 + (GETNEXT() - '0');
685 (void) REQUIRE(ndigits
> 0 && count
<= DUPMAX
, REG_BADBR
);
690 * p_bracket - parse a bracketed character list
693 p_bracket(struct parse
*p
)
698 /* Dept of Truly Sickening Special-Case Kludges */
699 if (p
->next
+ 5 < p
->end
&& strncmp(p
->next
, "[:<:]]", 6) == 0) {
704 if (p
->next
+ 5 < p
->end
&& strncmp(p
->next
, "[:>:]]", 6) == 0) {
710 if ((cs
= allocset(p
)) == NULL
)
713 if (p
->g
->cflags
®_ICASE
)
721 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
725 (void) MUSTEAT(']', REG_EBRACK
);
727 if (p
->error
!= 0) /* don't mess things up further */
730 if (cs
->invert
&& p
->g
->cflags
®_NEWLINE
)
731 cs
->bmp
['\n' >> 3] |= 1 << ('\n' & 7);
733 if ((ch
= singleton(cs
)) != OUT
) { /* optimize singleton sets */
737 EMIT(OANYOF
, (int)(cs
- p
->g
->sets
));
741 * p_b_term - parse one term of a bracketed character list
744 p_b_term(struct parse
*p
, cset
*cs
)
747 wint_t start
, finish
;
749 locale_t loc
= uselocale(NULL
);
751 /* classify what we've got */
752 switch ((MORE()) ? PEEK() : '\0') {
754 c
= (MORE2()) ? PEEK2() : '\0';
757 SETERROR(REG_ERANGE
);
758 return; /* NOTE RETURN */
765 case ':': /* character class */
767 (void) REQUIRE(MORE(), REG_EBRACK
);
769 (void) REQUIRE(c
!= '-' && c
!= ']', REG_ECTYPE
);
771 (void) REQUIRE(MORE(), REG_EBRACK
);
772 (void) REQUIRE(EATTWO(':', ']'), REG_ECTYPE
);
774 case '=': /* equivalence class */
776 (void) REQUIRE(MORE(), REG_EBRACK
);
778 (void) REQUIRE(c
!= '-' && c
!= ']', REG_ECOLLATE
);
780 (void) REQUIRE(MORE(), REG_EBRACK
);
781 (void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE
);
783 default: /* symbol, ordinary character, or range */
784 start
= p_b_symbol(p
);
785 if (SEE('-') && MORE2() && PEEK2() != ']') {
791 finish
= p_b_symbol(p
);
797 if (loc
->collate
->lc_is_posix
) {
798 (void) REQUIRE((uch
)start
<= (uch
)finish
,
800 CHaddrange(p
, cs
, start
, finish
);
802 (void) REQUIRE(_collate_range_cmp(start
,
803 finish
, loc
) <= 0, REG_ERANGE
);
804 for (i
= 0; i
<= UCHAR_MAX
; i
++) {
805 if (_collate_range_cmp(start
, i
, loc
)
807 _collate_range_cmp(i
, finish
, loc
)
818 * p_b_cclass - parse a character-class name and deal with it
821 p_b_cclass(struct parse
*p
, cset
*cs
)
823 const char *sp
= p
->next
;
828 while (MORE() && isalpha((uch
)PEEK()))
831 if (len
>= sizeof (clname
) - 1) {
832 SETERROR(REG_ECTYPE
);
835 (void) memcpy(clname
, sp
, len
);
837 if ((wct
= wctype(clname
)) == 0) {
838 SETERROR(REG_ECTYPE
);
841 CHaddtype(p
, cs
, wct
);
845 * p_b_eclass - parse an equivalence-class name and deal with it
847 * This implementation is incomplete. xxx
850 p_b_eclass(struct parse
*p
, cset
*cs
)
854 c
= p_b_coll_elem(p
, '=');
859 * p_b_symbol - parse a character or [..]ed multicharacter collating symbol
861 static wint_t /* value of symbol */
862 p_b_symbol(struct parse
*p
)
866 (void) REQUIRE(MORE(), REG_EBRACK
);
867 if (!EATTWO('[', '.'))
870 /* collating symbol */
871 value
= p_b_coll_elem(p
, '.');
872 (void) REQUIRE(EATTWO('.', ']'), REG_ECOLLATE
);
877 * p_b_coll_elem - parse a collating-element name and look it up
879 static wint_t /* value of collating element */
880 p_b_coll_elem(struct parse
*p
,
881 wint_t endc
) /* name ended by endc,']' */
883 const char *sp
= p
->next
;
889 while (MORE() && !SEETWO(endc
, ']'))
892 SETERROR(REG_EBRACK
);
896 for (cp
= cnames
; cp
->name
!= NULL
; cp
++)
897 if (strncmp(cp
->name
, sp
, len
) == 0 && cp
->name
[len
] == '\0')
898 return (cp
->code
); /* known name */
899 (void) memset(&mbs
, 0, sizeof (mbs
));
900 if ((clen
= mbrtowc(&wc
, sp
, len
, &mbs
)) == len
)
901 return (wc
); /* single character */
902 else if (clen
== (size_t)-1 || clen
== (size_t)-2)
905 SETERROR(REG_ECOLLATE
); /* neither */
910 * othercase - return the case counterpart of an alphabetic
912 static wint_t /* if no counterpart, return ch */
915 assert(iswalpha(ch
));
917 return (towlower(ch
));
918 else if (iswlower(ch
))
919 return (towupper(ch
));
920 else /* peculiar, but could happen */
925 * bothcases - emit a dualcase version of a two-case character
927 * Boy, is this implementation ever a kludge...
930 bothcases(struct parse
*p
, wint_t ch
)
932 const char *oldnext
= p
->next
;
933 const char *oldend
= p
->end
;
934 char bracket
[3 + MB_LEN_MAX
];
938 assert(othercase(ch
) != ch
); /* p_bracket() would recurse */
940 (void) memset(&mbs
, 0, sizeof (mbs
));
941 n
= wcrtomb(bracket
, ch
, &mbs
);
942 assert(n
!= (size_t)-1);
944 bracket
[n
+ 1] = '\0';
945 p
->end
= bracket
+n
+1;
947 assert(p
->next
== p
->end
);
953 * ordinary - emit an ordinary character
956 ordinary(struct parse
*p
, wint_t ch
)
960 if ((p
->g
->cflags
®_ICASE
) && iswalpha(ch
) && othercase(ch
) != ch
)
962 else if ((ch
& OPDMASK
) == ch
)
966 * Kludge: character is too big to fit into an OCHAR operand.
967 * Emit a singleton set.
969 if ((cs
= allocset(p
)) == NULL
)
972 EMIT(OANYOF
, (int)(cs
- p
->g
->sets
));
977 * nonnewline - emit REG_NEWLINE version of OANY
979 * Boy, is this implementation ever a kludge...
982 nonnewline(struct parse
*p
)
984 const char *oldnext
= p
->next
;
985 const char *oldend
= p
->end
;
995 assert(p
->next
== bracket
+3);
1001 * repeat - generate code for a bounded repetition, recursively if needed
1004 repeat(struct parse
*p
,
1005 sopno start
, /* operand from here to end of strip */
1006 int from
, /* repeated from this number */
1007 int to
) /* to this number of times (maybe INFINITY) */
1009 sopno finish
= HERE();
1012 #define REP(f, t) ((f)*8 + (t))
1013 #define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1016 if (p
->error
!= 0) /* head off possible runaway recursion */
1021 switch (REP(MAP(from
), MAP(to
))) {
1022 case REP(0, 0): /* must be user doing this */
1023 DROP(finish
-start
); /* drop the operand */
1025 case REP(0, 1): /* as x{1,1}? */
1026 case REP(0, N
): /* as x{1,n}? */
1027 case REP(0, INF
): /* as x{1,}? */
1028 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1029 INSERT(OCH_
, start
); /* offset is wrong... */
1030 repeat(p
, start
+1, 1, to
);
1031 ASTERN(OOR1
, start
);
1032 AHEAD(start
); /* ... fix it */
1035 ASTERN(O_CH
, THERETHERE());
1037 case REP(1, 1): /* trivial case */
1040 case REP(1, N
): /* as x?x{1,n-1} */
1041 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1042 INSERT(OCH_
, start
);
1043 ASTERN(OOR1
, start
);
1045 EMIT(OOR2
, 0); /* offset very wrong... */
1046 AHEAD(THERE()); /* ...so fix it */
1047 ASTERN(O_CH
, THERETHERE());
1048 copy
= dupl(p
, start
+1, finish
+1);
1049 assert(copy
== finish
+4);
1050 repeat(p
, copy
, 1, to
-1);
1052 case REP(1, INF
): /* as x+ */
1053 INSERT(OPLUS_
, start
);
1054 ASTERN(O_PLUS
, start
);
1056 case REP(N
, N
): /* as xx{m-1,n-1} */
1057 copy
= dupl(p
, start
, finish
);
1058 repeat(p
, copy
, from
-1, to
-1);
1060 case REP(N
, INF
): /* as xx{n-1,INF} */
1061 copy
= dupl(p
, start
, finish
);
1062 repeat(p
, copy
, from
-1, to
);
1064 default: /* "can't happen" */
1065 SETERROR(REG_EFATAL
); /* just in case */
1071 * wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1072 * character from the parse struct, signals a REG_ILLSEQ error if the
1073 * character can't be converted. Returns the number of bytes consumed.
1076 wgetnext(struct parse
*p
)
1082 (void) memset(&mbs
, 0, sizeof (mbs
));
1083 n
= mbrtowc(&wc
, p
->next
, p
->end
- p
->next
, &mbs
);
1084 if (n
== (size_t)-1 || n
== (size_t)-2) {
1085 SETERROR(REG_ECHAR
);
1095 * seterr - set an error condition
1097 static int /* useless but makes type checking happy */
1098 seterr(struct parse
*p
, int e
)
1100 if (p
->error
== 0) /* keep earliest error condition */
1102 p
->next
= nuls
; /* try to bring things to a halt */
1104 return (0); /* make the return value well-defined */
1108 * allocset - allocate a set of characters for []
1111 allocset(struct parse
*p
)
1115 ncs
= reallocarray(p
->g
->sets
, p
->g
->ncsets
+ 1, sizeof (*ncs
));
1117 SETERROR(REG_ESPACE
);
1121 cs
= &p
->g
->sets
[p
->g
->ncsets
++];
1122 (void) memset(cs
, 0, sizeof (*cs
));
1128 * freeset - free a now-unused set
1131 freeset(struct parse
*p
, cset
*cs
)
1133 cset
*top
= &p
->g
->sets
[p
->g
->ncsets
];
1138 (void) memset(cs
, 0, sizeof (*cs
));
1139 if (cs
== top
-1) /* recover only the easy case */
1144 * singleton - Determine whether a set contains only one character,
1145 * returning it if so, otherwise returning OUT.
1152 for (i
= n
= 0; i
< NC
; i
++)
1159 if (cs
->nwides
== 1 && cs
->nranges
== 0 && cs
->ntypes
== 0 &&
1161 return (cs
->wides
[0]);
1162 /* Don't bother handling the other cases. */
1167 * CHadd - add character to character set.
1170 CHadd(struct parse
*p
, cset
*cs
, wint_t ch
)
1172 wint_t nch
, *newwides
;
1175 cs
->bmp
[ch
>> 3] |= 1 << (ch
& 7);
1177 newwides
= reallocarray(cs
->wides
, cs
->nwides
+ 1,
1178 sizeof (*cs
->wides
));
1179 if (newwides
== NULL
) {
1180 SETERROR(REG_ESPACE
);
1183 cs
->wides
= newwides
;
1184 cs
->wides
[cs
->nwides
++] = ch
;
1187 if ((nch
= towlower(ch
)) < NC
)
1188 cs
->bmp
[nch
>> 3] |= 1 << (nch
& 7);
1189 if ((nch
= towupper(ch
)) < NC
)
1190 cs
->bmp
[nch
>> 3] |= 1 << (nch
& 7);
1195 * CHaddrange - add all characters in the range [min,max] to a character set.
1198 CHaddrange(struct parse
*p
, cset
*cs
, wint_t min
, wint_t max
)
1202 for (; min
< NC
&& min
<= max
; min
++)
1206 newranges
= reallocarray(cs
->ranges
, cs
->nranges
+ 1,
1207 sizeof (*cs
->ranges
));
1208 if (newranges
== NULL
) {
1209 SETERROR(REG_ESPACE
);
1212 cs
->ranges
= newranges
;
1213 cs
->ranges
[cs
->nranges
].min
= min
;
1214 cs
->ranges
[cs
->nranges
].max
= max
;
1219 * CHaddtype - add all characters of a certain type to a character set.
1222 CHaddtype(struct parse
*p
, cset
*cs
, wctype_t wct
)
1227 for (i
= 0; i
< NC
; i
++)
1228 if (iswctype(i
, wct
))
1230 newtypes
= reallocarray(cs
->types
, cs
->ntypes
+ 1,
1231 sizeof (*cs
->types
));
1232 if (newtypes
== NULL
) {
1233 SETERROR(REG_ESPACE
);
1236 cs
->types
= newtypes
;
1237 cs
->types
[cs
->ntypes
++] = wct
;
1241 * dupl - emit a duplicate of a bunch of sops
1243 static sopno
/* start of duplicate */
1244 dupl(struct parse
*p
,
1245 sopno start
, /* from here */
1246 sopno finish
) /* to this less one */
1249 sopno len
= finish
- start
;
1251 assert(finish
>= start
);
1254 if (!enlarge(p
, p
->ssize
+ len
)) /* this many unexpected additions */
1256 assert(p
->ssize
>= p
->slen
+ len
);
1257 (void) memcpy((char *)(p
->strip
+ p
->slen
),
1258 (char *)(p
->strip
+ start
), (size_t)len
*sizeof (sop
));
1264 * doemit - emit a strip operator
1266 * It might seem better to implement this as a macro with a function as
1267 * hard-case backup, but it's just too big and messy unless there are
1268 * some changes to the data structures. Maybe later.
1271 doemit(struct parse
*p
, sop op
, size_t opnd
)
1273 /* avoid making error situations worse */
1277 /* deal with oversize operands ("can't happen", more or less) */
1278 assert(opnd
< 1<<OPSHIFT
);
1280 /* deal with undersized strip */
1281 if (p
->slen
>= p
->ssize
)
1282 if (!enlarge(p
, (p
->ssize
+1) / 2 * 3)) /* +50% */
1285 /* finally, it's all reduced to the easy case */
1286 p
->strip
[p
->slen
++] = SOP(op
, opnd
);
1290 * doinsert - insert a sop into the strip
1293 doinsert(struct parse
*p
, sop op
, size_t opnd
, sopno pos
)
1299 /* avoid making error situations worse */
1304 EMIT(op
, opnd
); /* do checks, ensure space */
1305 assert(HERE() == sn
+1);
1308 /* adjust paren pointers */
1310 for (i
= 1; i
< NPAREN
; i
++) {
1311 if (p
->pbegin
[i
] >= pos
) {
1314 if (p
->pend
[i
] >= pos
) {
1319 (void) memmove((char *)&p
->strip
[pos
+1], (char *)&p
->strip
[pos
],
1320 (HERE()-pos
-1)*sizeof (sop
));
1325 * dofwd - complete a forward reference
1328 dofwd(struct parse
*p
, sopno pos
, sop value
)
1330 /* avoid making error situations worse */
1334 assert(value
< 1<<OPSHIFT
);
1335 p
->strip
[pos
] = OP(p
->strip
[pos
]) | value
;
1339 * enlarge - enlarge the strip
1342 enlarge(struct parse
*p
, sopno size
)
1346 if (p
->ssize
>= size
)
1349 sp
= reallocarray(p
->strip
, size
, sizeof (sop
));
1351 SETERROR(REG_ESPACE
);
1360 * stripsnug - compact the strip
1363 stripsnug(struct parse
*p
, struct re_guts
*g
)
1365 g
->nstates
= p
->slen
;
1366 g
->strip
= reallocarray(p
->strip
, p
->slen
, sizeof (sop
));
1367 if (g
->strip
== NULL
) {
1368 SETERROR(REG_ESPACE
);
1369 g
->strip
= p
->strip
;
1374 * findmust - fill in must and mlen with longest mandatory literal string
1376 * This algorithm could do fancy things like analyzing the operands of |
1377 * for common subsequences. Someday. This code is simple and finds most
1378 * of the interesting cases.
1380 * Note that must and mlen got initialized during setup.
1383 findmust(struct parse
*p
, struct re_guts
*g
)
1387 sop
*newstart
= NULL
;
1392 char buf
[MB_LEN_MAX
];
1395 locale_t loc
= uselocale(NULL
);
1397 /* avoid making error situations worse */
1402 * It's not generally safe to do a ``char'' substring search on
1403 * multibyte character strings, but it's safe for at least
1404 * UTF-8 (see RFC 3629).
1406 if (MB_CUR_MAX
> 1 &&
1407 strcmp(loc
->runelocale
->__encoding
, "UTF-8") != 0)
1410 /* find the longest OCHAR sequence in strip */
1414 scan
= g
->strip
+ 1;
1418 case OCHAR
: /* sequence member */
1419 if (newlen
== 0) { /* new sequence */
1420 (void) memset(&mbs
, 0, sizeof (mbs
));
1421 newstart
= scan
- 1;
1423 clen
= wcrtomb(buf
, OPND(s
), &mbs
);
1424 if (clen
== (size_t)-1)
1428 case OPLUS_
: /* things that don't break one */
1432 case OQUEST_
: /* things that must be skipped */
1434 offset
= altoffset(scan
, offset
);
1439 /* assert() interferes w debug printouts */
1440 if (OP(s
) != O_QUEST
&& OP(s
) != O_CH
&&
1445 } while (OP(s
) != O_QUEST
&& OP(s
) != O_CH
);
1447 case OBOW
: /* things that break a sequence */
1454 if (newlen
> g
->mlen
) { /* ends one */
1458 g
->moffset
+= offset
;
1461 g
->moffset
= offset
;
1469 if (newlen
> g
->mlen
) { /* ends one */
1473 g
->moffset
+= offset
;
1476 g
->moffset
= offset
;
1485 case OANYOF
: /* may or may not invalidate offset */
1486 /* First, everything as OANY */
1487 if (newlen
> g
->mlen
) { /* ends one */
1491 g
->moffset
+= offset
;
1494 g
->moffset
= offset
;
1506 * Anything here makes it impossible or too hard
1507 * to calculate the offset -- so we give up;
1508 * save the last known good offset, in case the
1509 * must sequence doesn't occur later.
1511 if (newlen
> g
->mlen
) { /* ends one */
1515 g
->moffset
+= offset
;
1517 g
->moffset
= offset
;
1523 } while (OP(s
) != OEND
);
1525 if (g
->mlen
== 0) { /* there isn't one */
1530 /* turn it into a character string */
1531 g
->must
= malloc((size_t)g
->mlen
+ 1);
1532 if (g
->must
== NULL
) { /* argh; just forget it */
1539 (void) memset(&mbs
, 0, sizeof (mbs
));
1540 while (cp
< g
->must
+ g
->mlen
) {
1541 while (OP(s
= *scan
++) != OCHAR
)
1543 clen
= wcrtomb(cp
, OPND(s
), &mbs
);
1544 assert(clen
!= (size_t)-1);
1547 assert(cp
== g
->must
+ g
->mlen
);
1548 *cp
++ = '\0'; /* just on general principles */
1552 * altoffset - choose biggest offset among multiple choices
1554 * Compute, recursively if necessary, the largest offset among multiple
1558 altoffset(sop
*scan
, int offset
)
1564 /* If we gave up already on offsets, return */
1571 while (OP(s
) != O_QUEST
&& OP(s
) != O_CH
) {
1580 try = altoffset(scan
, try);
1587 if (OP(s
) != O_QUEST
&& OP(s
) != O_CH
&&
1590 } while (OP(s
) != O_QUEST
&& OP(s
) != O_CH
);
1592 * We must skip to the next position, or we'll
1593 * leave altoffset() too early.
1620 return (largest
+offset
);
1624 * computejumps - compute char jumps for BM scan
1626 * This algorithm assumes g->must exists and is has size greater than
1627 * zero. It's based on the algorithm found on Computer Algorithms by
1630 * A char jump is the number of characters one needs to jump based on
1631 * the value of the character from the text that was mismatched.
1634 computejumps(struct parse
*p
, struct re_guts
*g
)
1639 /* Avoid making errors worse */
1643 g
->charjump
= (int *)malloc((NC
+ 1) * sizeof (int));
1644 if (g
->charjump
== NULL
) /* Not a fatal error */
1646 /* Adjust for signed chars, if necessary */
1647 g
->charjump
= &g
->charjump
[-(CHAR_MIN
)];
1650 * If the character does not exist in the pattern, the jump
1651 * is equal to the number of characters in the pattern.
1653 for (ch
= CHAR_MIN
; ch
< (CHAR_MAX
+ 1); ch
++)
1654 g
->charjump
[ch
] = g
->mlen
;
1657 * If the character does exist, compute the jump that would
1658 * take us to the last character in the pattern equal to it
1659 * (notice that we match right to left, so that last character
1660 * is the first one that would be matched).
1662 for (mindex
= 0; mindex
< g
->mlen
; mindex
++)
1663 g
->charjump
[(int)g
->must
[mindex
]] = g
->mlen
- mindex
- 1;
1667 * computematchjumps - compute match jumps for BM scan
1669 * This algorithm assumes g->must exists and is has size greater than
1670 * zero. It's based on the algorithm found on Computer Algorithms by
1673 * A match jump is the number of characters one needs to advance based
1674 * on the already-matched suffix.
1675 * Notice that all values here are minus (g->mlen-1), because of the way
1676 * the search algorithm works.
1679 computematchjumps(struct parse
*p
, struct re_guts
*g
)
1681 int mindex
; /* General "must" iterator */
1682 int suffix
; /* Keeps track of matching suffix */
1683 int ssuffix
; /* Keeps track of suffixes' suffix */
1686 * pmatches[k] points to the next i
1687 * such that i+1...mlen is a substring
1688 * of k+1...k+mlen-i-1
1691 /* Avoid making errors worse */
1695 pmatches
= (int *)malloc(g
->mlen
* sizeof (unsigned int));
1696 if (pmatches
== NULL
) {
1697 g
->matchjump
= NULL
;
1701 g
->matchjump
= (int *)malloc(g
->mlen
* sizeof (unsigned int));
1702 if (g
->matchjump
== NULL
) { /* Not a fatal error */
1707 /* Set maximum possible jump for each character in the pattern */
1708 for (mindex
= 0; mindex
< g
->mlen
; mindex
++)
1709 g
->matchjump
[mindex
] = 2*g
->mlen
- mindex
- 1;
1711 /* Compute pmatches[] */
1712 for (mindex
= g
->mlen
- 1, suffix
= g
->mlen
; mindex
>= 0;
1713 mindex
--, suffix
--) {
1714 pmatches
[mindex
] = suffix
;
1717 * If a mismatch is found, interrupting the substring,
1718 * compute the matchjump for that position. If no
1719 * mismatch is found, then a text substring mismatched
1720 * against the suffix will also mismatch against the
1723 while (suffix
< g
->mlen
&& g
->must
[mindex
] != g
->must
[suffix
]) {
1724 g
->matchjump
[suffix
] = MIN(g
->matchjump
[suffix
],
1725 g
->mlen
- mindex
- 1);
1726 suffix
= pmatches
[suffix
];
1731 * Compute the matchjump up to the last substring found to jump
1732 * to the beginning of the largest must pattern prefix matching
1735 for (mindex
= 0; mindex
<= suffix
; mindex
++)
1736 g
->matchjump
[mindex
] = MIN(g
->matchjump
[mindex
],
1737 g
->mlen
+ suffix
- mindex
);
1739 ssuffix
= pmatches
[suffix
];
1740 while (suffix
< g
->mlen
) {
1741 while (suffix
<= ssuffix
&& suffix
< g
->mlen
) {
1742 g
->matchjump
[suffix
] = MIN(g
->matchjump
[suffix
],
1743 g
->mlen
+ ssuffix
- suffix
);
1746 if (suffix
< g
->mlen
)
1747 ssuffix
= pmatches
[ssuffix
];
1754 * pluscount - count + nesting
1756 static sopno
/* nesting depth */
1757 pluscount(struct parse
*p
, struct re_guts
*g
)
1765 return (0); /* there may not be an OEND */
1767 scan
= g
->strip
+ 1;
1775 if (plusnest
> maxnest
)
1780 } while (OP(s
) != OEND
);