Patrick Welche <prlw1@cam.ac.uk>
[netbsd-mini2440.git] / regress / lib / libc / regex / tests
blob157852b7db908691710a905e6e197f1790614411
1 #       $NetBSD$
3 # regular expression test set
4 # Each line has at least three fields, separated by one or more tabs.  ""
5 # stands for an empty field.  The first field is an RE.  The second field is
6 # flags.  If the C flag is given, regcomp() is expected to fail, and the third
7 # field is the error name (minus the leading REG_).
9 # Otherwise it is expected to succeed, and the third field is the string to
10 # try matching it against.  If there is no fourth field, the match is
11 # expected to fail.  If there is a fourth field, it is the substring that
12 # the RE is expected to match.  If there is a fifth field, it is a comma-
13 # separated list of what the subexpressions should match, with - indicating
14 # no match for that one.  In both the fourth and fifth fields, a (sub)field
15 # starting with @ indicates that the (sub)expression is expected to match
16 # a null string followed by the stuff after the @; this provides a way to
17 # test where null strings match.  The character `N' in REs and strings
18 # is newline, `S' is space, `T' is tab, `Z' is NUL.
20 # The full list of flags:
21 #       -       placeholder, does nothing
22 #       b       RE is a BRE, not an ERE
23 #       &       try it as both an ERE and a BRE
24 #       C       regcomp() error expected, third field is error name
25 #       i       REG_ICASE
26 #       m       ("mundane") REG_NOSPEC
27 #       s       REG_NOSUB (not really testable)
28 #       n       REG_NEWLINE
29 #       ^       REG_NOTBOL
30 #       $       REG_NOTEOL
31 #       #       REG_STARTEND (see below)
32 #       p       REG_PEND
34 # For REG_STARTEND, the start/end offsets are those of the substring
35 # enclosed in () within the string to match (the third field).
37 # basics
38 a               &       a       a
39 abc             &       abc     abc
40 abc|de          -       abc     abc
41 a|b|c           -       abc     a
43 # parentheses and perversions thereof
44 a(b)c           -       abc     abc
45 a\(b\)c         b       abc     abc
46 a(              C       EPAREN
47 a(              b       a(      a(
48 a\(             -       a(      a(
49 a\(             bC      EPAREN
50 a\(b            bC      EPAREN
51 a(b             C       EPAREN
52 a(b             b       a(b     a(b
53 # gag me with a right parenthesis -- 1003.2 goofed here (my fault, partly)
54 a)              -       a)      a)
55 )               -       )       )
56 # end gagging (in a just world, those *should* give EPAREN)
57 a)              b       a)      a)
58 a\)             bC      EPAREN
59 \)              bC      EPAREN
60 a()b            -       ab      ab
61 a\(\)b          b       ab      ab
63 # anchoring and REG_NEWLINE
64 ^abc$           &       abc     abc
65 a^b             -       a^b
66 a^b             b       a^b     a^b
67 a$b             -       a$b
68 a$b             b       a$b     a$b
69 ^               &       abc     @abc
70 $               &       abc     @
71 ^$              &       ""      @
72 $^              -       ""      @
73 \($\)\(^\)      b       ""      @
74 # stop retching, those are legitimate (although disgusting)
75 ^^              -       ""      @
76 $$              -       ""      @
77 b$              &       abNc
78 b$              &n      abNc    b
79 ^b$             &       aNbNc
80 ^b$             &n      aNbNc   b
81 ^$              &n      aNNb    @Nb
82 ^$              n       abc
83 ^$              n       abcN    @
84 $^              n       aNNb    @Nb
85 \($\)\(^\)      bn      aNNb    @Nb
86 ^^              n^      aNNb    @Nb
87 $$              n       aNNb    @NN
88 ^a              ^       a
89 a$              $       a
90 ^a              ^n      aNb
91 ^b              ^n      aNb     b
92 a$              $n      bNa
93 b$              $n      bNa     b
94 a*(^b$)c*       -       b       b
95 a*\(^b$\)c*     b       b       b
97 # certain syntax errors and non-errors
98 |               C       EMPTY
99 |               b       |       |
100 *               C       BADRPT
101 *               b       *       *
102 +               C       BADRPT
103 ?               C       BADRPT
104 ""              &C      EMPTY
105 ()              -       abc     @abc
106 \(\)            b       abc     @abc
107 a||b            C       EMPTY
108 |ab             C       EMPTY
109 ab|             C       EMPTY
110 (|a)b           C       EMPTY
111 (a|)b           C       EMPTY
112 (*a)            C       BADRPT
113 (+a)            C       BADRPT
114 (?a)            C       BADRPT
115 ({1}a)          C       BADRPT
116 \(\{1\}a\)      bC      BADRPT
117 (a|*b)          C       BADRPT
118 (a|+b)          C       BADRPT
119 (a|?b)          C       BADRPT
120 (a|{1}b)        C       BADRPT
121 ^*              C       BADRPT
122 ^*              b       *       *
123 ^+              C       BADRPT
124 ^?              C       BADRPT
125 ^{1}            C       BADRPT
126 ^\{1\}          bC      BADRPT
128 # metacharacters, backslashes
129 a.c             &       abc     abc
130 a[bc]d          &       abd     abd
131 a\*c            &       a*c     a*c
132 a\\b            &       a\b     a\b
133 a\\\*b          &       a\*b    a\*b
134 a\bc            &       abc     abc
135 a\              &C      EESCAPE
136 a\\bc           &       a\bc    a\bc
137 \{              bC      BADRPT
138 a\[b            &       a[b     a[b
139 a[b             &C      EBRACK
140 # trailing $ is a peculiar special case for the BRE code
141 a$              &       a       a
142 a$              &       a$
143 a\$             &       a
144 a\$             &       a$      a$
145 a\\$            &       a
146 a\\$            &       a$
147 a\\$            &       a\$
148 a\\$            &       a\      a\
150 # back references, ugh
151 a\(b\)\2c       bC      ESUBREG
152 a\(b\1\)c       bC      ESUBREG
153 a\(b*\)c\1d     b       abbcbbd abbcbbd bb
154 a\(b*\)c\1d     b       abbcbd
155 a\(b*\)c\1d     b       abbcbbbd
156 ^\(.\)\1        b       abc
157 a\([bc]\)\1d    b       abcdabbd        abbd    b
158 a\(\([bc]\)\2\)*d       b       abbccd  abbccd
159 a\(\([bc]\)\2\)*d       b       abbcbd
160 # actually, this next one probably ought to fail, but the spec is unclear
161 a\(\(b\)*\2\)*d         b       abbbd   abbbd
162 # here is a case that no NFA implementation does right
163 \(ab*\)[ab]*\1  b       ababaaa ababaaa a
164 # check out normal matching in the presence of back refs
165 \(a\)\1bcd      b       aabcd   aabcd
166 \(a\)\1bc*d     b       aabcd   aabcd
167 \(a\)\1bc*d     b       aabd    aabd
168 \(a\)\1bc*d     b       aabcccd aabcccd
169 \(a\)\1bc*[ce]d b       aabcccd aabcccd
170 ^\(a\)\1b\(c\)*cd$      b       aabcccd aabcccd
172 # ordinary repetitions
173 ab*c            &       abc     abc
174 ab+c            -       abc     abc
175 ab?c            -       abc     abc
176 a\(*\)b         b       a*b     a*b
177 a\(**\)b        b       ab      ab
178 a\(***\)b       bC      BADRPT
179 *a              b       *a      *a
180 **a             b       a       a
181 ***a            bC      BADRPT
183 # the dreaded bounded repetitions
184 {               &       {       {
185 {abc            &       {abc    {abc
186 {1              C       BADRPT
187 {1}             C       BADRPT
188 a{b             &       a{b     a{b
189 a{1}b           -       ab      ab
190 a\{1\}b         b       ab      ab
191 a{1,}b          -       ab      ab
192 a\{1,\}b        b       ab      ab
193 a{1,2}b         -       aab     aab
194 a\{1,2\}b       b       aab     aab
195 a{1             C       EBRACE
196 a\{1            bC      EBRACE
197 a{1a            C       EBRACE
198 a\{1a           bC      EBRACE
199 a{1a}           C       BADBR
200 a\{1a\}         bC      BADBR
201 a{,2}           -       a{,2}   a{,2}
202 a\{,2\}         bC      BADBR
203 a{,}            -       a{,}    a{,}
204 a\{,\}          bC      BADBR
205 a{1,x}          C       BADBR
206 a\{1,x\}        bC      BADBR
207 a{1,x           C       EBRACE
208 a\{1,x          bC      EBRACE
209 a{300}          C       BADBR
210 a\{300\}        bC      BADBR
211 a{1,0}          C       BADBR
212 a\{1,0\}        bC      BADBR
213 ab{0,0}c        -       abcac   ac
214 ab\{0,0\}c      b       abcac   ac
215 ab{0,1}c        -       abcac   abc
216 ab\{0,1\}c      b       abcac   abc
217 ab{0,3}c        -       abbcac  abbc
218 ab\{0,3\}c      b       abbcac  abbc
219 ab{1,1}c        -       acabc   abc
220 ab\{1,1\}c      b       acabc   abc
221 ab{1,3}c        -       acabc   abc
222 ab\{1,3\}c      b       acabc   abc
223 ab{2,2}c        -       abcabbc abbc
224 ab\{2,2\}c      b       abcabbc abbc
225 ab{2,4}c        -       abcabbc abbc
226 ab\{2,4\}c      b       abcabbc abbc
227 ((a{1,10}){1,10}){1,10} -       a       a       a,a
229 # multiple repetitions
230 a**             &C      BADRPT
231 a++             C       BADRPT
232 a??             C       BADRPT
233 a*+             C       BADRPT
234 a*?             C       BADRPT
235 a+*             C       BADRPT
236 a+?             C       BADRPT
237 a?*             C       BADRPT
238 a?+             C       BADRPT
239 a{1}{1}         C       BADRPT
240 a*{1}           C       BADRPT
241 a+{1}           C       BADRPT
242 a?{1}           C       BADRPT
243 a{1}*           C       BADRPT
244 a{1}+           C       BADRPT
245 a{1}?           C       BADRPT
246 a*{b}           -       a{b}    a{b}
247 a\{1\}\{1\}     bC      BADRPT
248 a*\{1\}         bC      BADRPT
249 a\{1\}*         bC      BADRPT
251 # brackets, and numerous perversions thereof
252 a[b]c           &       abc     abc
253 a[ab]c          &       abc     abc
254 a[^ab]c         &       adc     adc
255 a[]b]c          &       a]c     a]c
256 a[[b]c          &       a[c     a[c
257 a[-b]c          &       a-c     a-c
258 a[^]b]c         &       adc     adc
259 a[^-b]c         &       adc     adc
260 a[b-]c          &       a-c     a-c
261 a[b             &C      EBRACK
262 a[]             &C      EBRACK
263 a[1-3]c         &       a2c     a2c
264 a[3-1]c         &C      ERANGE
265 a[1-3-5]c       &C      ERANGE
266 a[[.-.]--]c     &       a-c     a-c
267 a[1-            &C      ERANGE
268 a[[.            &C      EBRACK
269 a[[.x           &C      EBRACK
270 a[[.x.          &C      EBRACK
271 a[[.x.]         &C      EBRACK
272 a[[.x.]]        &       ax      ax
273 a[[.x,.]]       &C      ECOLLATE
274 a[[.one.]]b     &       a1b     a1b
275 a[[.notdef.]]b  &C      ECOLLATE
276 a[[.].]]b       &       a]b     a]b
277 a[[:alpha:]]c   &       abc     abc
278 a[[:notdef:]]c  &C      ECTYPE
279 a[[:            &C      EBRACK
280 a[[:alpha       &C      EBRACK
281 a[[:alpha:]     &C      EBRACK
282 a[[:alpha,:]    &C      ECTYPE
283 a[[:]:]]b       &C      ECTYPE
284 a[[:-:]]b       &C      ECTYPE
285 a[[:alph:]]     &C      ECTYPE
286 a[[:alphabet:]] &C      ECTYPE
287 [[:alnum:]]+    -       -%@a0X- a0X
288 [[:alpha:]]+    -       -%@aX0- aX
289 [[:blank:]]+    -       aSSTb   SST
290 [[:cntrl:]]+    -       aNTb    NT
291 [[:digit:]]+    -       a019b   019
292 [[:graph:]]+    -       Sa%bS   a%b
293 [[:lower:]]+    -       AabC    ab
294 [[:print:]]+    -       NaSbN   aSb
295 [[:punct:]]+    -       S%-&T   %-&
296 [[:space:]]+    -       aSNTb   SNT
297 [[:upper:]]+    -       aBCd    BC
298 [[:xdigit:]]+   -       p0f3Cq  0f3C
299 a[[=b=]]c       &       abc     abc
300 a[[=            &C      EBRACK
301 a[[=b           &C      EBRACK
302 a[[=b=          &C      EBRACK
303 a[[=b=]         &C      EBRACK
304 a[[=b,=]]       &C      ECOLLATE
305 a[[=one=]]b     &       a1b     a1b
307 # complexities
308 a(((b)))c       -       abc     abc
309 a(b|(c))d       -       abd     abd
310 a(b*|c)d        -       abbd    abbd
311 # just gotta have one DFA-buster, of course
312 a[ab]{20}       -       aaaaabaaaabaaaabaaaab   aaaaabaaaabaaaabaaaab
313 # and an inline expansion in case somebody gets tricky
314 a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab]       -       aaaaabaaaabaaaabaaaab   aaaaabaaaabaaaabaaaab
315 # and in case somebody just slips in an NFA...
316 a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab](wee|week)(knights|night)      -       aaaaabaaaabaaaabaaaabweeknights aaaaabaaaabaaaabaaaabweeknights
317 # fish for anomalies as the number of states passes 32
318 12345678901234567890123456789   -       a12345678901234567890123456789b 12345678901234567890123456789
319 123456789012345678901234567890  -       a123456789012345678901234567890b        123456789012345678901234567890
320 1234567890123456789012345678901 -       a1234567890123456789012345678901b       1234567890123456789012345678901
321 12345678901234567890123456789012        -       a12345678901234567890123456789012b      12345678901234567890123456789012
322 123456789012345678901234567890123       -       a123456789012345678901234567890123b     123456789012345678901234567890123
323 # and one really big one, beyond any plausible word width
324 1234567890123456789012345678901234567890123456789012345678901234567890  -       a1234567890123456789012345678901234567890123456789012345678901234567890b        1234567890123456789012345678901234567890123456789012345678901234567890
325 # fish for problems as brackets go past 8
326 [ab][cd][ef][gh][ij][kl][mn]    -       xacegikmoq      acegikm
327 [ab][cd][ef][gh][ij][kl][mn][op]        -       xacegikmoq      acegikmo
328 [ab][cd][ef][gh][ij][kl][mn][op][qr]    -       xacegikmoqy     acegikmoq
329 [ab][cd][ef][gh][ij][kl][mn][op][q]     -       xacegikmoqy     acegikmoq
331 # subtleties of matching
332 abc             &       xabcy   abc
333 a\(b\)?c\1d     b       acd
334 aBc             i       Abc     Abc
335 a[Bc]*d         i       abBCcd  abBCcd
336 0[[:upper:]]1   &i      0a1     0a1
337 0[[:lower:]]1   &i      0A1     0A1
338 a[^b]c          &i      abc
339 a[^b]c          &i      aBc
340 a[^b]c          &i      adc     adc
341 [a]b[c]         -       abc     abc
342 [a]b[a]         -       aba     aba
343 [abc]b[abc]     -       abc     abc
344 [abc]b[abd]     -       abd     abd
345 a(b?c)+d        -       accd    accd
346 (wee|week)(knights|night)       -       weeknights      weeknights
347 (we|wee|week|frob)(knights|night|day)   -       weeknights      weeknights
348 a[bc]d          -       xyzaaabcaababdacd       abd
349 a[ab]c          -       aaabc   abc
350 abc             s       abc     abc
351 a*              &       b       @b
353 # Let's have some fun -- try to match a C comment.
354 # first the obvious, which looks okay at first glance...
355 /\*.*\*/        -       /*x*/   /*x*/
356 # but...
357 /\*.*\*/        -       /*x*/y/*z*/     /*x*/y/*z*/
358 # okay, we must not match */ inside; try to do that...
359 /\*([^*]|\*[^/])*\*/    -       /*x*/   /*x*/
360 /\*([^*]|\*[^/])*\*/    -       /*x*/y/*z*/     /*x*/
361 # but...
362 /\*([^*]|\*[^/])*\*/    -       /*x**/y/*z*/    /*x**/y/*z*/
363 # and a still fancier version, which does it right (I think)...
364 /\*([^*]|\*+[^*/])*\*+/ -       /*x*/   /*x*/
365 /\*([^*]|\*+[^*/])*\*+/ -       /*x*/y/*z*/     /*x*/
366 /\*([^*]|\*+[^*/])*\*+/ -       /*x**/y/*z*/    /*x**/
367 /\*([^*]|\*+[^*/])*\*+/ -       /*x****/y/*z*/  /*x****/
368 /\*([^*]|\*+[^*/])*\*+/ -       /*x**x*/y/*z*/  /*x**x*/
369 /\*([^*]|\*+[^*/])*\*+/ -       /*x***x/y/*z*/  /*x***x/y/*z*/
371 # subexpressions
372 a(b)(c)d        -       abcd    abcd    b,c
373 a(((b)))c       -       abc     abc     b,b,b
374 a(b|(c))d       -       abd     abd     b,-
375 a(b*|c|e)d      -       abbd    abbd    bb
376 a(b*|c|e)d      -       acd     acd     c
377 a(b*|c|e)d      -       ad      ad      @d
378 a(b?)c          -       abc     abc     b
379 a(b?)c          -       ac      ac      @c
380 a(b+)c          -       abc     abc     b
381 a(b+)c          -       abbbc   abbbc   bbb
382 a(b*)c          -       ac      ac      @c
383 (a|ab)(bc([de]+)f|cde)  -       abcdef  abcdef  a,bcdef,de
384 # the regression tester only asks for 9 subexpressions
385 a(b)(c)(d)(e)(f)(g)(h)(i)(j)k   -       abcdefghijk     abcdefghijk     b,c,d,e,f,g,h,i,j
386 a(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)l        -       abcdefghijkl    abcdefghijkl    b,c,d,e,f,g,h,i,j,k
387 a([bc]?)c       -       abc     abc     b
388 a([bc]?)c       -       ac      ac      @c
389 a([bc]+)c       -       abc     abc     b
390 a([bc]+)c       -       abcc    abcc    bc
391 a([bc]+)bc      -       abcbc   abcbc   bc
392 a(bb+|b)b       -       abb     abb     b
393 a(bbb+|bb+|b)b  -       abb     abb     b
394 a(bbb+|bb+|b)b  -       abbb    abbb    bb
395 a(bbb+|bb+|b)bb -       abbb    abbb    b
396 (.*).*          -       abcdef  abcdef  abcdef
397 (a*)*           -       bc      @b      @b
399 # do we get the right subexpression when it is used more than once?
400 a(b|c)*d        -       ad      ad      -
401 a(b|c)*d        -       abcd    abcd    c
402 a(b|c)+d        -       abd     abd     b
403 a(b|c)+d        -       abcd    abcd    c
404 a(b|c?)+d       -       ad      ad      @d
405 a(b|c?)+d       -       abcd    abcd    @d
406 a(b|c){0,0}d    -       ad      ad      -
407 a(b|c){0,1}d    -       ad      ad      -
408 a(b|c){0,1}d    -       abd     abd     b
409 a(b|c){0,2}d    -       ad      ad      -
410 a(b|c){0,2}d    -       abcd    abcd    c
411 a(b|c){0,}d     -       ad      ad      -
412 a(b|c){0,}d     -       abcd    abcd    c
413 a(b|c){1,1}d    -       abd     abd     b
414 a(b|c){1,1}d    -       acd     acd     c
415 a(b|c){1,2}d    -       abd     abd     b
416 a(b|c){1,2}d    -       abcd    abcd    c
417 a(b|c){1,}d     -       abd     abd     b
418 a(b|c){1,}d     -       abcd    abcd    c
419 a(b|c){2,2}d    -       acbd    acbd    b
420 a(b|c){2,2}d    -       abcd    abcd    c
421 a(b|c){2,4}d    -       abcd    abcd    c
422 a(b|c){2,4}d    -       abcbd   abcbd   b
423 a(b|c){2,4}d    -       abcbcd  abcbcd  c
424 a(b|c){2,}d     -       abcd    abcd    c
425 a(b|c){2,}d     -       abcbd   abcbd   b
426 a(b+|((c)*))+d  -       abd     abd     @d,@d,-
427 a(b+|((c)*))+d  -       abcd    abcd    @d,@d,-
429 # check out the STARTEND option
430 [abc]           &#      a(b)c   b
431 [abc]           &#      a(d)c
432 [abc]           &#      a(bc)d  b
433 [abc]           &#      a(dc)d  c
434 .               &#      a()c
435 b.*c            &#      b(bc)c  bc
436 b.*             &#      b(bc)c  bc
437 .*c             &#      b(bc)c  bc
439 # plain strings, with the NOSPEC flag
440 abc             m       abc     abc
441 abc             m       xabcy   abc
442 abc             m       xyz
443 a*b             m       aba*b   a*b
444 a*b             m       ab
445 ""              mC      EMPTY
447 # cases involving NULs
448 aZb             &       a       a
449 aZb             &p      a
450 aZb             &p#     (aZb)   aZb
451 aZ*b            &p#     (ab)    ab
452 a.b             &#      (aZb)   aZb
453 a.*             &#      (aZb)c  aZb
455 # word boundaries (ick)
456 [[:<:]]a        &       a       a
457 [[:<:]]a        &       ba
458 [[:<:]]a        &       -a      a
459 a[[:>:]]        &       a       a
460 a[[:>:]]        &       ab
461 a[[:>:]]        &       a-      a
462 [[:<:]]a.c[[:>:]]       &       axcd-dayc-dazce-abc     abc
463 [[:<:]]a.c[[:>:]]       &       axcd-dayc-dazce-abc-q   abc
464 [[:<:]]a.c[[:>:]]       &       axc-dayc-dazce-abc      axc
465 [[:<:]]b.c[[:>:]]       &       a_bxc-byc_d-bzc-q       bzc
466 [[:<:]].x..[[:>:]]      &       y_xa_-_xb_y-_xc_-axdc   _xc_
467 [[:<:]]a_b[[:>:]]       &       x_a_b
469 # past problems, and suspected problems
470 (A[1])|(A[2])|(A[3])|(A[4])|(A[5])|(A[6])|(A[7])|(A[8])|(A[9])|(A[A])   -       A1      A1
471 abcdefghijklmnop        i       abcdefghijklmnop        abcdefghijklmnop
472 abcdefghijklmnopqrstuv  i       abcdefghijklmnopqrstuv  abcdefghijklmnopqrstuv
473 (ALAK)|(ALT[AB])|(CC[123]1)|(CM[123]1)|(GAMC)|(LC[23][EO ])|(SEM[1234])|(SL[ES][12])|(SLWW)|(SLF )|(SLDT)|(VWH[12])|(WH[34][EW])|(WP1[ESN])     -       CC11    CC11
474 CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a    -       CC11    CC11
475 Char \([a-z0-9_]*\)\[.* b       Char xyz[k      Char xyz[k      xyz
476 a?b     -       ab      ab
477 -\{0,1\}[0-9]*$ b       -5      -5