26763: fix problem on failed cd -s to relative path
[zsh.git] / Src / pattern.c
blobd4941253cbcebbe90dd56ff4d122fc9573a1700b
1 /*
2 * pattern.c - pattern matching
4 * This file is part of zsh, the Z shell.
6 * Copyright (c) 1999 Peter Stephenson
7 * All rights reserved.
9 * Permission is hereby granted, without written agreement and without
10 * license or royalty fees, to use, copy, modify, and distribute this
11 * software and to distribute modified versions of this software for any
12 * purpose, provided that the above copyright notice and the following
13 * two paragraphs appear in all copies of this software.
15 * In no event shall Peter Stephenson or the Zsh Development Group be liable
16 * to any party for direct, indirect, special, incidental, or consequential
17 * damages arising out of the use of this software and its documentation,
18 * even if Peter Stephenson and the Zsh Development Group have been advised of
19 * the possibility of such damage.
21 * Peter Stephenson and the Zsh Development Group specifically disclaim any
22 * warranties, including, but not limited to, the implied warranties of
23 * merchantability and fitness for a particular purpose. The software
24 * provided hereunder is on an "as is" basis, and Peter Stephenson and the
25 * Zsh Development Group have no obligation to provide maintenance,
26 * support, updates, enhancements, or modifications.
28 * Pattern matching code derived from the regexp library by Henry
29 * Spencer, which has the following copyright.
31 * Copyright (c) 1986 by University of Toronto.
32 * Written by Henry Spencer. Not derived from licensed software.
34 * Permission is granted to anyone to use this software for any
35 * purpose on any computer system, and to redistribute it freely,
36 * subject to the following restrictions:
38 * 1. The author is not responsible for the consequences of use of
39 * this software, no matter how awful, even if they arise
40 * from defects in it.
42 * 2. The origin of this software must not be misrepresented, either
43 * by explicit claim or by omission.
45 * 3. Altered versions must be plainly marked as such, and must not
46 * be misrepresented as being the original software.
48 * Eagle-eyed readers will notice this is an altered version. Incredibly
49 * sharp-eyed readers might even find bits that weren't altered.
52 * And I experienced a sense that, like certain regular
53 * expressions, seemed to match the day from beginning to end, so
54 * that I did not need to identify the parenthesised subexpression
55 * that told of dawn, nor the group of characters that indicated
56 * the moment when my grandfather returned home with news of
57 * Swann's departure for Paris; and the whole length of the month
58 * of May, as if matched by a closure, fitted into the buffer of my
59 * life with no sign of overflowing, turning the days, like a
60 * procession of insects that could consist of this or that
61 * species, into a random and unstructured repetition of different
62 * sequences, anchored from the first day of the month to the last
63 * in the same fashion as the weeks when I knew I would not see
64 * Gilberte and would search in vain for any occurrences of the
65 * string in the avenue of hawthorns by Tansonville, without my
66 * having to delimit explicitly the start or finish of the pattern.
68 * M. Proust, "In Search of Lost Files",
69 * bk I, "The Walk by Bourne's Place".
72 #include "zsh.mdh"
75 * The following union is used mostly for alignment purposes.
76 * Normal nodes are longs, while certain nodes take a char * as an argument;
77 * here we make sure that they both work out to the same length.
78 * The compiled regexp we construct consists of upats stuck together;
79 * anything else to be added (strings, numbers) is stuck after and
80 * then aligned to a whole number of upat units.
82 * Note also that offsets are in terms of the sizes of these things.
84 union upat {
85 long l;
86 unsigned char *p;
89 typedef union upat *Upat;
91 #include "pattern.pro"
93 /* Number of active parenthesized expressions allowed in backreferencing */
94 #define NSUBEXP 9
96 /* definition number opnd? meaning */
97 #define P_END 0x00 /* no End of program. */
98 #define P_EXCSYNC 0x01 /* no Test if following exclude already failed */
99 #define P_EXCEND 0x02 /* no Test if exclude matched orig branch */
100 #define P_BACK 0x03 /* no Match "", "next" ptr points backward. */
101 #define P_EXACTLY 0x04 /* lstr Match this string. */
102 #define P_NOTHING 0x05 /* no Match empty string. */
103 #define P_ONEHASH 0x06 /* node Match this (simple) thing 0 or more times. */
104 #define P_TWOHASH 0x07 /* node Match this (simple) thing 1 or more times. */
105 #define P_GFLAGS 0x08 /* long Match nothing and set globbing flags */
106 #define P_ISSTART 0x09 /* no Match start of string. */
107 #define P_ISEND 0x0a /* no Match end of string. */
108 #define P_COUNTSTART 0x0b /* no Initialise P_COUNT */
109 #define P_COUNT 0x0c /* 3*long uc* node Match a number of repetitions */
110 /* numbered so we can test bit 5 for a branch */
111 #define P_BRANCH 0x20 /* node Match this alternative, or the next... */
112 #define P_WBRANCH 0x21 /* uc* node P_BRANCH, but match at least 1 char */
113 /* excludes are also branches, but have bit 4 set, too */
114 #define P_EXCLUDE 0x30 /* uc* node Exclude this from previous branch */
115 #define P_EXCLUDP 0x31 /* uc* node Exclude, using full file path so far */
116 /* numbered so we can test bit 6 so as not to match initial '.' */
117 #define P_ANY 0x40 /* no Match any one character. */
118 #define P_ANYOF 0x41 /* str Match any character in this string. */
119 #define P_ANYBUT 0x42 /* str Match any character not in this string. */
120 #define P_STAR 0x43 /* no Match any set of characters. */
121 #define P_NUMRNG 0x44 /* zr, zr Match a numeric range. */
122 #define P_NUMFROM 0x45 /* zr Match a number >= X */
123 #define P_NUMTO 0x46 /* zr Match a number <= X */
124 #define P_NUMANY 0x47 /* no Match any set of decimal digits */
125 /* spaces left for P_OPEN+n,... for backreferences */
126 #define P_OPEN 0x80 /* no Mark this point in input as start of n. */
127 #define P_CLOSE 0x90 /* no Analogous to OPEN. */
129 * no no argument
130 * zr the range type zrange_t: may be zlong or unsigned long
131 * char a single char
132 * uc* a pointer to unsigned char, used at run time and initialised
133 * to NULL.
134 * str null-terminated, metafied string
135 * lstr length as long then string, not null-terminated, unmetafied.
139 * Notes on usage:
140 * P_WBRANCH: This works like a branch and is used in complex closures,
141 * to ensure we don't succeed on a zero-length match of the pattern,
142 * since that would cause an infinite loop. We do this by recording
143 * the positions where we have already tried to match. See the
144 * P_WBRANCH test in patmatch().
146 * P_ANY, P_ANYOF: the operand is a null terminated
147 * string. Normal characters match as expected. Characters
148 * in the range Meta+PP_ALPHA..Meta+PP_UNKNWN do the appropriate
149 * Posix range tests. This relies on imeta returning true for these
150 * characters. We treat unknown POSIX ranges as never matching.
151 * PP_RANGE means the next two (possibly metafied) characters form
152 * the limits of a range to test; it's too much like hard work to
153 * expand the range.
155 * P_EXCLUDE, P_EXCSYNC, PEXCEND: P_EXCLUDE appears in the pattern like
156 * P_BRANCH, but applies to the immediately preceding branch. The code in
157 * the corresponding branch is followed by a P_EXCSYNC, which simply
158 * acts as a marker that a P_EXCLUDE comes next. The P_EXCLUDE
159 * has a pointer to char embeded in it, which works
160 * like P_WBRANCH: if we get to the P_EXCSYNC, and we already matched
161 * up to the same position, fail. Thus we are forced to backtrack
162 * on closures in the P_BRANCH if the first attempt was excluded.
163 * Corresponding to P_EXCSYNC in the original branch, there is a
164 * P_EXCEND in the exclusion. If we get to this point, and we did
165 * *not* match in the original branch, the exclusion itself fails,
166 * otherwise it succeeds since we know the tail already matches,
167 * so P_EXCEND is the end of the exclusion test.
168 * The whole sorry mess looks like this, where the upper lines
169 * show the linkage of the branches, and the lower shows the linkage
170 * of their pattern arguments.
172 * --------------------- ----------------------
173 * ^ v ^ v
174 * ( <BRANCH>:apat-><EXCSYNC> <EXCLUDE>:excpat-><EXCEND> ) tail
176 * | |
177 * --------------------------------------
179 * P_EXCLUDP: this behaves exactly like P_EXCLUDE, with the sole exception
180 * that we prepend the path so far to the exclude pattern. This is
181 * for top level file globs, e.g. ** / *.c~*foo.c
182 * ^ I had to leave this space
183 * P_NUM*: zl is a zlong if that is 64-bit, else an unsigned long.
185 * P_COUNTSTART, P_COUNT: a P_COUNTSTART flags the start of a quantified
186 * closure (#cN,M) and is used to initialise the count. Executing
187 * the pattern leads back to the P_COUNT, while the next links of the
188 * P_COUNTSTART and P_COUNT lead to the tail of the pattern:
190 * ----------------
191 * v ^
192 * <COUNTSTART><COUNT>pattern<BACK> tail
193 * v v ^
194 * ------------------------
197 #define P_OP(p) ((p)->l & 0xff)
198 #define P_NEXT(p) ((p)->l >> 8)
199 #define P_OPERAND(p) ((p) + 1)
200 #define P_ISBRANCH(p) ((p)->l & 0x20)
201 #define P_ISEXCLUDE(p) (((p)->l & 0x30) == 0x30)
202 #define P_NOTDOT(p) ((p)->l & 0x40)
204 /* Specific to lstr type, i.e. P_EXACTLY. */
205 #define P_LS_LEN(p) ((p)[1].l) /* can be used as lvalue */
206 #define P_LS_STR(p) ((char *)((p) + 2))
208 /* Specific to P_COUNT: arguments as offset in nodes from operator */
209 #define P_CT_CURRENT (1) /* Current count */
210 #define P_CT_MIN (2) /* Minimum count */
211 #define P_CT_MAX (3) /* Maximum count, -1 for none */
212 #define P_CT_PTR (4) /* Pointer to last match start */
213 #define P_CT_OPERAND (5) /* Operand of P_COUNT */
215 /* Flags needed when pattern is executed */
216 #define P_SIMPLE 0x01 /* Simple enough to be #/## operand. */
217 #define P_HSTART 0x02 /* Starts with # or ##'d pattern. */
218 #define P_PURESTR 0x04 /* Can be matched with a strcmp */
220 #if defined(ZSH_64_BIT_TYPE) || defined(LONG_IS_64_BIT)
221 typedef zlong zrange_t;
222 #define ZRANGE_T_IS_SIGNED (1)
223 #else
224 typedef unsigned long zrange_t;
225 #endif
228 * Characters which terminate a pattern segment. We actually use
229 * a pointer patendseg which skips the first character if we are not
230 * parsing a file pattern.
231 * Note that the size of this and the next array are hard-wired
232 * via the definitions.
235 static char endseg[] = {
236 '/', /* file only */
237 '\0', Bar, Outpar, /* all patterns */
238 Tilde /* extended glob only */
241 #define PATENDSEGLEN_NORM 4
242 #define PATENDSEGLEN_EXT 5
244 /* Characters which terminate a simple string */
246 static char endstr[] = {
247 '/', /* file only */
248 '\0', Bar, Outpar, Quest, Star, Inbrack, Inpar, Inang, Bnullkeep,
249 /* all patterns */
250 Tilde, Hat, Pound /* extended glob only */
253 #define PATENDSTRLEN_NORM 10
254 #define PATENDSTRLEN_EXT 13
257 /* Default size for pattern buffer */
258 #define P_DEF_ALLOC 256
260 /* Flags used in compilation */
261 static char *patstart, *patparse; /* input pointers */
262 static int patnpar; /* () count */
263 static char *patcode; /* point of code emission */
264 static long patsize; /* size of code */
265 static char *patout; /* start of code emission string */
266 static long patalloc; /* size allocated for same */
267 static char *patendseg; /* characters ending segment */
268 static int patendseglen; /* length of same */
269 static char *patendstr; /* characters ending plain string */
270 static int patendstrlen; /* length of sameo */
272 /* Flags used in both compilation and execution */
273 static int patflags; /* flags passed down to patcompile */
274 static int patglobflags; /* globbing flags & approx */
277 * Increment pointer to metafied multibyte string.
279 #ifdef MULTIBYTE_SUPPORT
280 typedef wint_t patint_t;
282 #define PEOF WEOF
284 #define METACHARINC(x) ((void)metacharinc(&x))
287 * TODO: the shiftstate isn't well handled; we don't guarantee
288 * to maintain it properly between characters. If we don't
289 * need it we should use mbtowc() instead.
291 static mbstate_t shiftstate;
294 * Multibyte version: it's (almost) as easy to return the
295 * value as not, so do so since we sometimes need it..
297 static wchar_t
298 metacharinc(char **x)
300 char *inptr = *x;
301 char inchar;
302 size_t ret = MB_INVALID;
303 wchar_t wc;
306 * Cheat if the top bit isn't set. This is second-guessing
307 * the library, but we know for sure that if the character
308 * set doesn't have the property that all bytes with the 8th
309 * bit clear are single characters then we are stuffed.
311 if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(*inptr) & 0x80))
313 if (itok(*inptr))
314 inchar = ztokens[*inptr++ - Pound];
315 else if (*inptr == Meta) {
316 inptr++;
317 inchar = *inptr++ ^ 32;
318 } else {
319 inchar = *inptr++;
321 *x = inptr;
322 return (wchar_t)STOUC(inchar);
325 while (*inptr) {
326 if (itok(*inptr))
327 inchar = ztokens[*inptr++ - Pound];
328 else if (*inptr == Meta) {
329 inptr++;
330 inchar = *inptr++ ^ 32;
331 } else {
332 inchar = *inptr++;
334 ret = mbrtowc(&wc, &inchar, 1, &shiftstate);
336 if (ret == MB_INVALID)
337 break;
338 if (ret == MB_INCOMPLETE)
339 continue;
340 *x = inptr;
341 return wc;
344 /* Error. Treat as single byte. */
345 /* Reset the shift state for next time. */
346 memset(&shiftstate, 0, sizeof(shiftstate));
347 return (wchar_t) STOUC(*(*x)++);
350 #else
351 typedef int patint_t;
353 #define PEOF EOF
355 #define METACHARINC(x) ((void)((x) += (*(x) == Meta) ? 2 : 1))
356 #endif
359 * Return unmetafied char from string (x is any char *).
360 * Used with MULTIBYTE_SUPPORT if the GF_MULTIBYTE is not
361 * in effect.
363 #define UNMETA(x) (*(x) == Meta ? (x)[1] ^ 32 : *(x))
365 /* Add n more characters, ensuring there is enough space. */
367 enum {
368 PA_NOALIGN = 1,
369 PA_UNMETA = 2
372 /**/
373 static void
374 patadd(char *add, int ch, long n, int paflags)
376 /* Make sure everything gets aligned unless we get PA_NOALIGN. */
377 long newpatsize = patsize + n;
378 if (!(paflags & PA_NOALIGN))
379 newpatsize = (newpatsize + sizeof(union upat) - 1) &
380 ~(sizeof(union upat) - 1);
381 if (patalloc < newpatsize) {
382 long newpatalloc =
383 2*(newpatsize > patalloc ? newpatsize : patalloc);
384 patout = (char *)zrealloc((char *)patout, newpatalloc);
385 patcode = patout + patsize;
386 patalloc = newpatalloc;
388 patsize = newpatsize;
389 if (add) {
390 if (paflags & PA_UNMETA) {
392 * Unmetafy and untokenize the string as we go.
393 * The Meta characters in add aren't counted in n.
395 while (n--) {
396 if (itok(*add))
397 *patcode++ = ztokens[*add++ - Pound];
398 else if (*add == Meta) {
399 add++;
400 *patcode++ = *add++ ^ 32;
401 } else {
402 *patcode++ = *add++;
405 } else {
406 while (n--)
407 *patcode++ = *add++;
409 } else
410 *patcode++ = ch;
411 patcode = patout + patsize;
414 static long rn_offs;
415 /* operates on pointers to union upat, returns a pointer */
416 #define PATNEXT(p) ((rn_offs = P_NEXT(p)) ? \
417 (P_OP(p) == P_BACK) ? \
418 ((p)-rn_offs) : ((p)+rn_offs) : NULL)
420 /* Called before parsing a set of file matchs to initialize flags */
422 /**/
423 void
424 patcompstart(void)
426 if (isset(CASEGLOB))
427 patglobflags = 0;
428 else
429 patglobflags = GF_IGNCASE;
430 if (isset(MULTIBYTE))
431 patglobflags |= GF_MULTIBYTE;
435 * Top level pattern compilation subroutine
436 * exp is a null-terminated, metafied string.
437 * inflags is an or of some PAT_* flags.
438 * endexp, if non-null, is set to a pointer to the end of the
439 * part of exp which was compiled. This is used when
440 * compiling patterns for directories which must be
441 * matched recursively.
444 /**/
445 mod_export Patprog
446 patcompile(char *exp, int inflags, char **endexp)
448 int flags = 0;
449 long len = 0;
450 long startoff;
451 Upat pscan;
452 char *lng, *strp = NULL;
453 Patprog p;
455 startoff = sizeof(struct patprog);
456 /* Ensure alignment of start of program string */
457 startoff = (startoff + sizeof(union upat) - 1) & ~(sizeof(union upat) - 1);
459 /* Allocate reasonable sized chunk if none, reduce size if too big */
460 if (patalloc != P_DEF_ALLOC)
461 patout = (char *)zrealloc(patout, patalloc = P_DEF_ALLOC);
462 patcode = patout + startoff;
463 patsize = patcode - patout;
464 patstart = patparse = exp;
466 * Note global patnpar numbers parentheses 1..9, while patnpar
467 * in struct is actual count of parentheses.
469 patnpar = 1;
470 patflags = inflags & ~(PAT_PURES|PAT_HAS_EXCLUDP);
472 patendseg = endseg;
473 patendseglen = isset(EXTENDEDGLOB) ? PATENDSEGLEN_EXT : PATENDSEGLEN_NORM;
474 patendstr = endstr;
475 patendstrlen = isset(EXTENDEDGLOB) ? PATENDSTRLEN_EXT : PATENDSTRLEN_NORM;
477 if (!(patflags & PAT_FILE)) {
478 patendseg++;
479 patendstr++;
480 patendseglen--;
481 patendstrlen--;
482 remnulargs(patparse);
483 if (isset(MULTIBYTE))
484 patglobflags = GF_MULTIBYTE;
485 else
486 patglobflags = 0;
488 if (patflags & PAT_LCMATCHUC)
489 patglobflags |= GF_LCMATCHUC;
491 * Have to be set now, since they get updated during compilation.
493 ((Patprog)patout)->globflags = patglobflags;
495 if (!(patflags & PAT_ANY)) {
496 /* Look for a really pure string, with no tokens at all. */
497 if (!(patglobflags & ~GF_MULTIBYTE)
498 #ifdef __CYGWIN__
500 * If the OS treats files case-insensitively and we
501 * are looking at files, we don't need to use pattern
502 * matching to find the file.
504 || (!(patglobflags & ~GF_IGNCASE) && (patflags & PAT_FILE))
505 #endif
509 * Waah! I wish I understood this.
510 * Empty metafied strings have an initial Nularg.
511 * This never corresponds to a real character in
512 * a glob pattern or string, so skip it.
514 if (*exp == Nularg)
515 exp++;
516 for (strp = exp; *strp &&
517 (!(patflags & PAT_FILE) || *strp != '/') && !itok(*strp);
518 strp++)
521 if (!strp || (*strp && *strp != '/')) {
522 /* No, do normal compilation. */
523 strp = NULL;
524 if (patcompswitch(0, &flags) == 0)
525 return NULL;
526 } else {
528 * Yes, copy the string, and skip compilation altogether.
529 * Null terminate for the benefit of globbing.
530 * Leave metafied both for globbing and for our own
531 * efficiency.
533 patparse = strp;
534 len = strp - exp;
535 patadd(exp, 0, len + 1, 0);
536 patout[startoff + len] = '\0';
537 patflags |= PAT_PURES;
541 /* end of compilation: safe to use pointers */
542 p = (Patprog)patout;
543 p->startoff = startoff;
544 p->patstartch = '\0';
545 p->globend = patglobflags;
546 p->flags = patflags;
547 p->mustoff = 0;
548 p->size = patsize;
549 p->patmlen = len;
550 p->patnpar = patnpar-1;
552 if (!strp) {
553 pscan = (Upat)(patout + startoff);
555 if (!(patflags & PAT_ANY) && P_OP(PATNEXT(pscan)) == P_END) {
556 /* only one top level choice */
557 pscan = P_OPERAND(pscan);
559 if (flags & P_PURESTR) {
561 * The pattern can be matched with a simple strncmp/strcmp.
562 * Careful in case we've overwritten the node for the next ptr.
564 char *dst = patout + startoff;
565 Upat next;
566 p->flags |= PAT_PURES;
567 for (; pscan; pscan = next) {
568 next = PATNEXT(pscan);
569 if (P_OP(pscan) == P_EXACTLY) {
570 char *opnd = P_LS_STR(pscan), *mtest;
571 long oplen = P_LS_LEN(pscan), ilen;
572 int nmeta = 0;
574 * Unfortunately we unmetafied the string
575 * and we need to put any metacharacters
576 * back now we know it's a pure string.
577 * This shouldn't happen too often, it's
578 * just that there are some cases such
579 * as . and .. in files where we really
580 * need a pure string even if there are
581 * pattern characters flying around.
583 for (mtest = opnd, ilen = oplen; ilen;
584 mtest++, ilen--)
585 if (imeta(*mtest))
586 nmeta++;
587 if (nmeta) {
588 char *oldpatout = patout;
589 patadd(NULL, 0, nmeta, 0);
591 * Yuk.
593 p = (Patprog)patout;
594 opnd = patout + (opnd - oldpatout);
595 dst = patout + startoff;
598 while (oplen--) {
599 if (imeta(*opnd)) {
600 *dst++ = Meta;
601 *dst++ = *opnd++ ^ 32;
602 } else {
603 *dst++ = *opnd++;
608 p->size = dst - patout;
609 /* patmlen is really strlen. We don't need a null. */
610 p->patmlen = p->size - startoff;
611 } else {
612 /* starting point info */
613 if (P_OP(pscan) == P_EXACTLY && !p->globflags &&
614 P_LS_LEN(pscan))
615 p->patstartch = *P_LS_STR(pscan);
617 * Find the longest literal string in something expensive.
618 * This is itself not all that cheap if we have
619 * case-insensitive matching or approximation, so don't.
621 if ((flags & P_HSTART) && !p->globflags) {
622 lng = NULL;
623 len = 0;
624 for (; pscan; pscan = PATNEXT(pscan))
625 if (P_OP(pscan) == P_EXACTLY &&
626 P_LS_LEN(pscan) >= len) {
627 lng = P_LS_STR(pscan);
628 len = P_LS_LEN(pscan);
630 if (lng) {
631 p->mustoff = lng - patout;
632 p->patmlen = len;
640 * The pattern was compiled in a fixed buffer: unless told otherwise,
641 * we stick the compiled pattern on the heap. This is necessary
642 * for files where we will often be compiling multiple segments at once.
643 * But if we get the ZDUP flag we always put it in zalloc()ed memory.
645 if (patflags & PAT_ZDUP) {
646 Patprog newp = (Patprog)zalloc(patsize);
647 memcpy((char *)newp, (char *)p, patsize);
648 p = newp;
649 } else if (!(patflags & PAT_STATIC)) {
650 Patprog newp = (Patprog)zhalloc(patsize);
651 memcpy((char *)newp, (char *)p, patsize);
652 p = newp;
655 if (endexp)
656 *endexp = patparse;
657 return p;
661 * Main body or parenthesized subexpression in pattern
662 * Parenthesis (and any ksh_glob gubbins) will have been removed.
665 /**/
666 static long
667 patcompswitch(int paren, int *flagp)
669 long starter, br, ender, excsync = 0;
670 int parno = 0;
671 int flags, gfchanged = 0, savglobflags = patglobflags;
672 Upat ptr;
674 *flagp = 0;
676 if (paren && (patglobflags & GF_BACKREF) && patnpar <= NSUBEXP) {
678 * parenthesized: make an open node.
679 * We can only refer to the first nine parentheses.
680 * For any others, we just use P_OPEN on its own; there's
681 * no gain in arbitrarily limiting the number of parentheses.
683 parno = patnpar++;
684 starter = patnode(P_OPEN + parno);
685 } else
686 starter = 0;
688 br = patnode(P_BRANCH);
689 if (!patcompbranch(&flags))
690 return 0;
691 if (patglobflags != savglobflags)
692 gfchanged++;
693 if (starter)
694 pattail(starter, br);
695 else
696 starter = br;
698 *flagp |= flags & (P_HSTART|P_PURESTR);
700 while (*patparse == Bar ||
701 (isset(EXTENDEDGLOB) && *patparse == Tilde &&
702 (patparse[1] == '/' ||
703 !memchr(patendseg, patparse[1], patendseglen)))) {
704 int tilde = *patparse++ == Tilde;
705 long gfnode = 0, newbr;
707 *flagp &= ~P_PURESTR;
709 if (tilde) {
710 union upat up;
711 /* excsync remembers the P_EXCSYNC node before a chain of
712 * exclusions: all point back to this. only the
713 * original (non-excluded) branch gets a trailing P_EXCSYNC.
715 if (!excsync) {
716 excsync = patnode(P_EXCSYNC);
717 patoptail(br, excsync);
720 * By default, approximations are turned off in exclusions:
721 * we need to do this here as otherwise the code compiling
722 * the exclusion doesn't know if the flags have really
723 * changed if the error count gets restored.
725 patglobflags &= ~0xff;
726 if (!(patflags & PAT_FILET) || paren) {
727 br = patnode(P_EXCLUDE);
728 } else {
730 * At top level (paren == 0) in a file glob !(patflags
731 * &PAT_FILET) do the exclusion prepending the file path
732 * so far. We need to flag this to avoid unnecessarily
733 * copying the path.
735 br = patnode(P_EXCLUDP);
736 patflags |= PAT_HAS_EXCLUDP;
738 up.p = NULL;
739 patadd((char *)&up, 0, sizeof(up), 0);
740 /* / is not treated as special if we are at top level */
741 if (!paren && *patendseg == '/') {
742 tilde++;
743 patendseg++;
744 patendseglen--;
745 patendstr++;
746 patendstrlen--;
748 } else {
749 excsync = 0;
750 br = patnode(P_BRANCH);
752 * The position of the following statements means globflags
753 * set in the main branch carry over to the exclusion.
755 if (!paren) {
756 patglobflags = 0;
757 if (((Patprog)patout)->globflags) {
759 * If at top level, we need to reinitialize flags to zero,
760 * since (#i)foo|bar only applies to foo and we stuck
761 * the #i into the global flags.
762 * We could have done it so that they only got set in the
763 * first branch, but it's quite convenient having any
764 * global flags set in the header and not buried in the
765 * pattern. (Or maybe it isn't and we should
766 * forget this bit and always stick in an explicit GFLAGS
767 * statement instead of using the header.)
768 * Also, this can't happen for file globs where there are
769 * no top-level |'s.
771 * No gfchanged, as nothing to follow branch at top
772 * level.
774 union upat up;
775 gfnode = patnode(P_GFLAGS);
776 up.l = patglobflags;
777 patadd((char *)&up, 0, sizeof(union upat), 0);
779 } else {
780 patglobflags = savglobflags;
783 newbr = patcompbranch(&flags);
784 if (tilde == 2) {
785 /* restore special treatment of / */
786 patendseg--;
787 patendseglen++;
788 patendstr--;
789 patendstrlen++;
791 if (!newbr)
792 return 0;
793 if (gfnode)
794 pattail(gfnode, newbr);
795 if (!tilde && patglobflags != savglobflags)
796 gfchanged++;
797 pattail(starter, br);
798 if (excsync)
799 patoptail(br, patnode(P_EXCEND));
800 *flagp |= flags & P_HSTART;
804 * Make a closing node, hooking it to the end.
805 * Note that we can't optimize P_NOTHING out here, since another
806 * branch at that point would indicate the current choices continue,
807 * which they don't.
809 ender = patnode(paren ? parno ? P_CLOSE+parno : P_NOTHING : P_END);
810 pattail(starter, ender);
813 * Hook the tails of the branches to the closing node,
814 * except for exclusions which terminate where they are.
816 for (ptr = (Upat)patout + starter; ptr; ptr = PATNEXT(ptr))
817 if (!P_ISEXCLUDE(ptr))
818 patoptail(ptr-(Upat)patout, ender);
820 /* check for proper termination */
821 if ((paren && *patparse++ != Outpar) ||
822 (!paren && *patparse &&
823 !((patflags & PAT_FILE) && *patparse == '/')))
824 return 0;
826 if (paren && gfchanged) {
828 * Restore old values of flags when leaving parentheses.
829 * gfchanged detects a change in any branch (except exclusions
830 * which are separate), since we need to emit this even if
831 * a later branch happened to put the flags back.
833 pattail(ender, patnode(P_GFLAGS));
834 patglobflags = savglobflags;
835 patadd((char *)&savglobflags, 0, sizeof(long), 0);
838 return starter;
842 * Compile something ended by Bar, Outpar, Tilde, or end of string.
843 * Note the BRANCH or EXCLUDE tag must already have been omitted:
844 * this returns the position of the operand of that.
847 /**/
848 static long
849 patcompbranch(int *flagp)
851 long chain, latest = 0, starter;
852 int flags = 0;
854 *flagp = P_PURESTR;
856 starter = chain = 0;
857 while (!memchr(patendseg, *patparse, patendseglen) ||
858 (*patparse == Tilde && patparse[1] != '/' &&
859 memchr(patendseg, patparse[1], patendseglen))) {
860 if (isset(EXTENDEDGLOB) &&
861 ((!isset(SHGLOB) &&
862 (*patparse == Inpar && patparse[1] == Pound)) ||
863 (isset(KSHGLOB) && *patparse == '@' && patparse[1] == Inpar &&
864 patparse[2] == Pound))) {
865 /* Globbing flags. */
866 char *pp1 = patparse;
867 int oldglobflags = patglobflags, ignore;
868 long assert;
869 patparse += (*patparse == '@') ? 3 : 2;
870 if (!patgetglobflags(&patparse, &assert, &ignore))
871 return 0;
872 if (!ignore) {
873 if (assert) {
875 * Start/end assertion looking like flags, but
876 * actually handled as a normal node
878 latest = patnode(assert);
879 flags = 0;
880 } else {
881 if (pp1 == patstart) {
882 /* Right at start of pattern, the simplest case.
883 * Put them into the flags and don't emit anything.
885 ((Patprog)patout)->globflags = patglobflags;
886 continue;
887 } else if (!*patparse) {
888 /* Right at the end, so just leave the flags for
889 * the next Patprog in the chain to pick up.
891 break;
894 * Otherwise, we have to stick them in as a pattern
895 * matching nothing.
897 if (oldglobflags != patglobflags) {
898 /* Flags changed */
899 union upat up;
900 latest = patnode(P_GFLAGS);
901 up.l = patglobflags;
902 patadd((char *)&up, 0, sizeof(union upat), 0);
903 } else {
904 /* No effect. */
905 continue;
908 } else if (!*patparse)
909 break;
910 else
911 continue;
912 } else if (isset(EXTENDEDGLOB) && *patparse == Hat) {
914 * ^pat: anything but pat. For proper backtracking,
915 * etc., we turn this into (*~pat), except without the
916 * parentheses.
918 patparse++;
919 latest = patcompnot(0, &flags);
920 } else
921 latest = patcomppiece(&flags);
922 if (!latest)
923 return 0;
924 if (!starter)
925 starter = latest;
926 if (!(flags & P_PURESTR))
927 *flagp &= ~P_PURESTR;
928 if (!chain)
929 *flagp |= flags & P_HSTART;
930 else
931 pattail(chain, latest);
932 chain = latest;
934 /* check if there was nothing in the loop, i.e. () */
935 if (!chain)
936 starter = patnode(P_NOTHING);
938 return starter;
941 /* get glob flags, return 1 for success, 0 for failure */
943 /**/
945 patgetglobflags(char **strp, long *assertp, int *ignore)
947 char *nptr, *ptr = *strp;
948 zlong ret;
950 *assertp = 0;
951 *ignore = 1;
952 /* (#X): assumes we are still positioned on the first X */
953 for (; *ptr && *ptr != Outpar; ptr++) {
954 if (*ptr == 'q') {
955 /* Glob qualifiers, ignored in pattern code */
956 while (*ptr && *ptr != Outpar)
957 ptr++;
958 break;
959 } else {
960 *ignore = 0;
961 switch (*ptr) {
962 case 'a':
963 /* Approximate matching, max no. of errors follows */
964 ret = zstrtol(++ptr, &nptr, 10);
966 * We can't have more than 254, because we need 255 to
967 * mark 254 errors in wbranch and exclude sync strings
968 * (hypothetically --- hope no-one tries it).
970 if (ret < 0 || ret > 254 || ptr == nptr)
971 return 0;
972 patglobflags = (patglobflags & ~0xff) | (ret & 0xff);
973 ptr = nptr-1;
974 break;
976 case 'l':
977 /* Lowercase in pattern matches lower or upper in target */
978 patglobflags = (patglobflags & ~GF_IGNCASE) | GF_LCMATCHUC;
979 break;
981 case 'i':
982 /* Fully case insensitive */
983 patglobflags = (patglobflags & ~GF_LCMATCHUC) | GF_IGNCASE;
984 break;
986 case 'I':
987 /* Restore case sensitivity */
988 patglobflags &= ~(GF_LCMATCHUC|GF_IGNCASE);
989 break;
991 case 'b':
992 /* Make backreferences */
993 patglobflags |= GF_BACKREF;
994 break;
996 case 'B':
997 /* Don't make backreferences */
998 patglobflags &= ~GF_BACKREF;
999 break;
1001 case 'm':
1002 /* Make references to complete match */
1003 patglobflags |= GF_MATCHREF;
1004 break;
1006 case 'M':
1007 /* Don't */
1008 patglobflags &= ~GF_MATCHREF;
1009 break;
1011 case 's':
1012 *assertp = P_ISSTART;
1013 break;
1015 case 'e':
1016 *assertp = P_ISEND;
1017 break;
1019 case 'u':
1020 patglobflags |= GF_MULTIBYTE;
1021 break;
1023 case 'U':
1024 patglobflags &= ~GF_MULTIBYTE;
1025 break;
1027 default:
1028 return 0;
1032 if (*ptr != Outpar)
1033 return 0;
1034 /* Start/end assertions must appear on their own. */
1035 if (*assertp && (*strp)[1] != Outpar)
1036 return 0;
1037 *strp = ptr + 1;
1038 return 1;
1042 static const char *colon_stuffs[] = {
1043 "alpha", "alnum", "ascii", "blank", "cntrl", "digit", "graph",
1044 "lower", "print", "punct", "space", "upper", "xdigit", "IDENT",
1045 "IFS", "IFSSPACE", "WORD", NULL
1049 * Handle the guts of a [:stuff:] character class element.
1050 * start is the beginning of "stuff" and len is its length.
1051 * This code is exported for the benefit of completion matching.
1054 /**/
1055 mod_export int
1056 range_type(char *start, int len)
1058 const char **csp;
1060 for (csp = colon_stuffs; *csp; csp++) {
1061 if (!strncmp(start, *csp, len))
1062 return (csp - colon_stuffs) + PP_FIRST;
1065 return PP_UNKWN;
1070 * Convert the contents of a [...] or [^...] expression (just the
1071 * ... part) back into a string. This is used by compfiles -p/-P
1072 * for some reason. The compiled form (a metafied string) is
1073 * passed in rangestr.
1075 * If outstr is non-NULL the compiled form is placed there. It
1076 * must be sufficiently long. A terminating NULL is appended.
1078 * Return the length required, not including the terminating NULL.
1080 * TODO: this is non-multibyte for now. It will need to be defined
1081 * appropriately with MULTIBYTE_SUPPORT when the completion matching
1082 * code catches up.
1085 /**/
1086 mod_export int
1087 pattern_range_to_string(char *rangestr, char *outstr)
1089 int len = 0;
1091 while (*rangestr) {
1092 if (imeta(STOUC(*rangestr))) {
1093 int swtype = STOUC(*rangestr) - STOUC(Meta);
1095 if (swtype == 0) {
1096 /* Ordindary metafied character */
1097 if (outstr)
1099 *outstr++ = Meta;
1100 *outstr++ = rangestr[1] ^ 32;
1102 len += 2;
1103 rangestr += 2;
1104 } else if (swtype == PP_RANGE) {
1105 /* X-Y range */
1106 int i;
1108 for (i = 0; i < 2; i++) {
1109 if (*rangestr == Meta) {
1110 if (outstr) {
1111 *outstr++ = Meta;
1112 *outstr++ = rangestr[1];
1114 len += 2;
1115 rangestr += 2;
1116 } else {
1117 if (outstr)
1118 *outstr++ = *rangestr;
1119 len++;
1120 rangestr++;
1123 if (i == 0) {
1124 if (outstr)
1125 *outstr++ = '-';
1126 len++;
1129 } else if (swtype >= PP_FIRST && swtype <= PP_LAST) {
1130 /* [:stuff:]; we need to output [: and :] */
1131 const char *found = colon_stuffs[swtype - PP_FIRST];
1132 int newlen = strlen(found);
1133 if (outstr) {
1134 strcpy(outstr, "[:");
1135 outstr += 2;
1136 memcpy(outstr, found, newlen);
1137 outstr += newlen;
1138 strcpy(outstr, ":]");
1139 outstr += 2;
1141 len += newlen + 4;
1142 rangestr++;
1143 } else {
1144 /* shouldn't happen */
1145 DPUTS(1, "BUG: unknown PP_ code in pattern range");
1146 rangestr++;
1148 } else {
1149 /* ordinary character, guaranteed no Meta handling needed */
1150 if (outstr)
1151 *outstr++ = *rangestr;
1152 len++;
1153 rangestr++;
1157 if (outstr)
1158 *outstr = '\0';
1159 return len;
1163 * compile a chunk such as a literal string or a [...] followed
1164 * by a possible hash operator
1167 /**/
1168 static long
1169 patcomppiece(int *flagp)
1171 long starter = 0, next, op, opnd;
1172 int flags, flags2, kshchar, len, ch, patch, nmeta;
1173 int pound, count;
1174 union upat up;
1175 char *nptr, *str0, *ptr, *patprev;
1176 zrange_t from, to;
1177 char *charstart;
1179 flags = 0;
1180 str0 = patprev = patparse;
1181 for (;;) {
1183 * Check if we have a string. First, we need to make sure
1184 * the string doesn't introduce a ksh-like parenthesized expression.
1186 kshchar = '\0';
1187 if (isset(KSHGLOB) && *patparse && patparse[1] == Inpar) {
1188 if (strchr("?*+!@", *patparse))
1189 kshchar = STOUC(*patparse);
1190 else if (*patparse == Star || *patparse == Quest)
1191 kshchar = STOUC(ztokens[*patparse - Pound]);
1195 * End of string (or no string at all) if ksh-type parentheses,
1196 * or special character, unless that character is a tilde and
1197 * the character following is an end-of-segment character. Thus
1198 * tildes are not special if there is nothing following to
1199 * be excluded.
1201 if (kshchar || (memchr(patendstr, *patparse, patendstrlen) &&
1202 (*patparse != Tilde ||
1203 patparse[1] == '/' ||
1204 !memchr(patendseg, patparse[1], patendseglen))))
1205 break;
1207 /* Remember the previous character for backtracking */
1208 patprev = patparse;
1209 METACHARINC(patparse);
1212 if (patparse > str0) {
1213 long slen = patparse - str0;
1214 int morelen;
1216 /* Ordinary string: cancel kshchar lookahead */
1217 kshchar = '\0';
1219 * Assume it matches a simple string until we find otherwise.
1221 flags |= P_PURESTR;
1222 DPUTS(patparse == str0, "BUG: matched nothing in patcomppiece.");
1223 /* more than one character matched? */
1224 morelen = (patprev > str0);
1226 * If we have more than one character, a following hash
1227 * or (#c...) only applies to the last, so backtrack one character.
1229 if (isset(EXTENDEDGLOB) &&
1230 (*patparse == Pound ||
1231 (*patparse == Inpar && patparse[1] == Pound &&
1232 patparse[2] == 'c')) && morelen)
1233 patparse = patprev;
1235 * If len is 1, we can't have an active # following, so doesn't
1236 * matter that we don't make X in `XX#' simple.
1238 if (!morelen)
1239 flags |= P_SIMPLE;
1240 starter = patnode(P_EXACTLY);
1242 /* Get length of string without metafication. */
1243 nmeta = 0;
1244 /* inherited from domatch, but why, exactly? */
1245 if (*str0 == Nularg)
1246 str0++;
1247 for (ptr = str0; ptr < patparse; ptr++) {
1248 if (*ptr == Meta) {
1249 nmeta++;
1250 ptr++;
1253 slen = (patparse - str0) - nmeta;
1254 /* First add length, which is a long */
1255 patadd((char *)&slen, 0, sizeof(long), 0);
1257 * Then the string, not null terminated.
1258 * Unmetafy and untokenize; pass the final length,
1259 * which is what we need to allocate, i.e. not including
1260 * a count for each Meta in the string.
1262 patadd(str0, 0, slen, PA_UNMETA);
1263 nptr = P_LS_STR((Upat)patout + starter);
1265 * It's much simpler to turn off pure string mode for
1266 * any case-insensitive or approximate matching; usually,
1267 * that is correct, or they wouldn't have been turned on.
1268 * However, we need to make sure we match a "." or ".."
1269 * in a file name as a pure string. There's a minor bug
1270 * that this will also apply to something like
1271 * ..(#a1).. (i.e. the (#a1) has no effect), but if you're
1272 * going to write funny patterns, you get no sympathy from me.
1274 if (patglobflags &
1275 #ifdef __CYGWIN__
1277 * As above: don't use pattern matching for files
1278 * just because of case insensitivity if file system
1279 * is known to be case insensitive.
1281 * This is known to be necessary in at least one case:
1282 * if "mount -c /" is in effect, so that drives appear
1283 * directly under / instead of the usual /cygdrive, they
1284 * aren't shown by readdir(). So it's vital we don't use
1285 * globbing to find "/c", since that'll fail.
1287 ((patflags & PAT_FILE) ?
1288 (0xFF|GF_LCMATCHUC) :
1289 (0xFF|GF_LCMATCHUC|GF_IGNCASE))
1290 #else
1291 (0xFF|GF_LCMATCHUC|GF_IGNCASE)
1292 #endif
1294 if (!(patflags & PAT_FILE))
1295 flags &= ~P_PURESTR;
1296 else if (!(nptr[0] == '.' &&
1297 (slen == 1 || (nptr[1] == '.' && slen == 2))))
1298 flags &= ~P_PURESTR;
1300 } else {
1301 if (kshchar)
1302 patparse++;
1304 patch = *patparse;
1305 METACHARINC(patparse);
1306 switch(patch) {
1307 case Quest:
1308 flags |= P_SIMPLE;
1309 starter = patnode(P_ANY);
1310 break;
1311 case Star:
1312 /* kshchar is used as a sign that we can't have #'s. */
1313 kshchar = -1;
1314 starter = patnode(P_STAR);
1315 break;
1316 case Inbrack:
1317 flags |= P_SIMPLE;
1318 if (*patparse == Hat || *patparse == '^' || *patparse == '!') {
1319 patparse++;
1320 starter = patnode(P_ANYBUT);
1321 } else
1322 starter = patnode(P_ANYOF);
1323 if (*patparse == Outbrack) {
1324 patparse++;
1325 patadd(NULL, ']', 1, PA_NOALIGN);
1327 while (*patparse && *patparse != Outbrack) {
1328 /* Meta is not a token */
1329 if (*patparse == Inbrack && patparse[1] == ':' &&
1330 (nptr = strchr(patparse+2, ':')) &&
1331 nptr[1] == Outbrack) {
1332 /* Posix range. */
1333 patparse += 2;
1334 len = nptr - patparse;
1335 ch = range_type(patparse, len);
1336 patparse = nptr + 2;
1337 if (ch != PP_UNKWN)
1338 patadd(NULL, STOUC(Meta) + ch, 1, PA_NOALIGN);
1339 continue;
1341 charstart = patparse;
1342 METACHARINC(patparse);
1344 if (*patparse == '-' && patparse[1] &&
1345 patparse[1] != Outbrack) {
1346 patadd(NULL, STOUC(Meta)+PP_RANGE, 1, PA_NOALIGN);
1347 if (itok(*charstart)) {
1348 patadd(0, STOUC(ztokens[*charstart - Pound]), 1,
1349 PA_NOALIGN);
1350 } else {
1351 patadd(charstart, 0, patparse-charstart, PA_NOALIGN);
1353 charstart = ++patparse; /* skip ASCII '-' */
1354 METACHARINC(patparse);
1356 if (itok(*charstart)) {
1357 patadd(0, STOUC(ztokens[*charstart - Pound]), 1,
1358 PA_NOALIGN);
1359 } else {
1360 patadd(charstart, 0, patparse-charstart, PA_NOALIGN);
1363 if (*patparse != Outbrack)
1364 return 0;
1365 patparse++;
1366 /* terminate null string and fix alignment */
1367 patadd(NULL, 0, 1, 0);
1368 break;
1369 case Inpar:
1370 /* is this how to treat parentheses in SHGLOB? */
1371 if (isset(SHGLOB) && !kshchar)
1372 return 0;
1373 if (kshchar == '!') {
1374 /* This is nasty, we should really either handle all
1375 * kshglobbing below or here. But most of the
1376 * others look like non-ksh patterns, while this one
1377 * doesn't, so we handle it here and leave the rest.
1378 * We treat it like an extendedglob ^, except that
1379 * it goes into parentheses.
1381 * If we did do kshglob here, we could support
1382 * the old behaviour that things like !(foo)##
1383 * work, but it makes the code more complicated at
1384 * the expense of allowing the user to do things
1385 * they shouldn't.
1387 if (!(starter = patcompnot(1, &flags2)))
1388 return 0;
1389 } else if (!(starter = patcompswitch(1, &flags2)))
1390 return 0;
1391 flags |= flags2 & P_HSTART;
1392 break;
1393 case Inang:
1394 /* Numeric glob */
1395 len = 0; /* beginning present 1, end present 2 */
1396 if (idigit(*patparse)) {
1397 from = (zrange_t) zstrtol((char *)patparse,
1398 (char **)&nptr, 10);
1399 patparse = nptr;
1400 len |= 1;
1402 DPUTS(*patparse != '-', "BUG: - missing from numeric glob");
1403 patparse++;
1404 if (idigit(*patparse)) {
1405 to = (zrange_t) zstrtol((char *)patparse,
1406 (char **)&nptr, 10);
1407 patparse = nptr;
1408 len |= 2;
1410 if (*patparse != Outang)
1411 return 0;
1412 patparse++;
1413 switch(len) {
1414 case 3:
1415 starter = patnode(P_NUMRNG);
1416 patadd((char *)&from, 0, sizeof(from), 0);
1417 patadd((char *)&to, 0, sizeof(to), 0);
1418 break;
1419 case 2:
1420 starter = patnode(P_NUMTO);
1421 patadd((char *)&to, 0, sizeof(to), 0);
1422 break;
1423 case 1:
1424 starter = patnode(P_NUMFROM);
1425 patadd((char *)&from, 0, sizeof(from), 0);
1426 break;
1427 case 0:
1428 starter = patnode(P_NUMANY);
1429 break;
1431 /* This can't be simple, because it isn't.
1432 * Mention in manual that matching digits with [...]
1433 * is more efficient.
1435 break;
1436 case Pound:
1437 DPUTS(!isset(EXTENDEDGLOB), "BUG: # not treated as string");
1439 * A hash here is an error; it should follow something
1440 * repeatable.
1442 return 0;
1443 break;
1444 case Bnullkeep:
1446 * Marker for restoring a backslash in output:
1447 * does not match a character.
1449 next = patcomppiece(flagp);
1451 * Can't match a pure string since we need to do this
1452 * as multiple chunks.
1454 *flagp &= ~P_PURESTR;
1455 return next;
1456 break;
1457 #ifdef DEBUG
1458 default:
1459 dputs("BUG: character not handled in patcomppiece");
1460 return 0;
1461 break;
1462 #endif
1466 count = 0;
1467 if (!(pound = (*patparse == Pound && isset(EXTENDEDGLOB))) &&
1468 !(count = (isset(EXTENDEDGLOB) && *patparse == Inpar &&
1469 patparse[1] == Pound && patparse[2] == 'c')) &&
1470 (kshchar <= 0 || kshchar == '@' || kshchar == '!')) {
1471 *flagp = flags;
1472 return starter;
1475 /* too much at once doesn't currently work */
1476 if (kshchar && pound)
1477 return 0;
1479 if (kshchar == '*') {
1480 op = P_ONEHASH;
1481 *flagp = P_HSTART;
1482 } else if (kshchar == '+') {
1483 op = P_TWOHASH;
1484 *flagp = P_HSTART;
1485 } else if (kshchar == '?') {
1486 op = 0;
1487 *flagp = 0;
1488 } else if (count) {
1489 op = P_COUNT;
1490 patparse += 3;
1491 *flagp = P_HSTART;
1492 } else if (*++patparse == Pound) {
1493 op = P_TWOHASH;
1494 patparse++;
1495 *flagp = P_HSTART;
1496 } else {
1497 op = P_ONEHASH;
1498 *flagp = P_HSTART;
1502 * Note optimizations with pointers into P_NOTHING branches: some
1503 * should logically point to next node after current piece.
1505 * Backtracking is also encoded in a slightly obscure way: the
1506 * code emitted ensures we test the non-empty branch of complex
1507 * patterns before the empty branch on each repetition. Hence
1508 * each time we fail on a non-empty branch, we try the empty branch,
1509 * which is equivalent to backtracking.
1511 if (op == P_COUNT) {
1512 /* (#cN,M) */
1513 union upat countargs[P_CT_OPERAND];
1514 char *opp = patparse;
1516 countargs[0].l = P_COUNT;
1517 countargs[P_CT_CURRENT].l = 0L;
1518 countargs[P_CT_MIN].l = (long)zstrtol(patparse, &patparse, 10);
1519 if (patparse == opp) {
1520 /* missing number treated as zero */
1521 countargs[P_CT_MIN].l = 0L;
1523 if (*patparse != ',' && *patparse != Comma) {
1524 /* either max = min or error */
1525 if (*patparse != Outpar)
1526 return 0;
1527 countargs[P_CT_MAX].l = countargs[P_CT_MIN].l;
1528 } else {
1529 opp = ++patparse;
1530 countargs[P_CT_MAX].l = (long)zstrtol(patparse, &patparse, 10);
1531 if (*patparse != Outpar)
1532 return 0;
1533 if (patparse == opp) {
1534 /* missing number treated as infinity: record as -1 */
1535 countargs[P_CT_MAX].l = -1L;
1538 patparse++;
1539 countargs[P_CT_PTR].p = NULL;
1540 /* Mark this chain as a min/max count... */
1541 patinsert(P_COUNTSTART, starter, (char *)countargs, sizeof(countargs));
1543 * The next of the operand is a loop back to the P_COUNT. This is
1544 * how we get recursion for the count. We don't loop back to
1545 * the P_COUNTSTART; that's used for initialising the count
1546 * and saving and restoring the count for any enclosing use
1547 * of the match.
1549 opnd = P_OPERAND(starter) + P_CT_OPERAND;
1550 pattail(opnd, patnode(P_BACK));
1551 pattail(opnd, P_OPERAND(starter));
1553 * The next of the counter operators is what follows the
1554 * closure.
1555 * This handles matching of the tail.
1557 next = patnode(P_NOTHING);
1558 pattail(starter, next);
1559 pattail(P_OPERAND(starter), next);
1560 } else if ((flags & P_SIMPLE) && (op == P_ONEHASH || op == P_TWOHASH) &&
1561 P_OP((Upat)patout+starter) == P_ANY) {
1562 /* Optimize ?# to *. Silly thing to do, since who would use
1563 * use ?# ? But it makes the later code shorter.
1565 Upat uptr = (Upat)patout + starter;
1566 if (op == P_TWOHASH) {
1567 /* ?## becomes ?* */
1568 uptr->l = (uptr->l & ~0xff) | P_ANY;
1569 pattail(starter, patnode(P_STAR));
1570 } else {
1571 uptr->l = (uptr->l & ~0xff) | P_STAR;
1573 } else if ((flags & P_SIMPLE) && op && !(patglobflags & 0xff)) {
1574 /* Simplify, but not if we need to look for approximations. */
1575 patinsert(op, starter, NULL, 0);
1576 } else if (op == P_ONEHASH) {
1577 /* Emit x# as (x&|), where & means "self". */
1578 up.p = NULL;
1579 patinsert(P_WBRANCH, starter, (char *)&up, sizeof(up));
1580 /* Either x */
1581 patoptail(starter, patnode(P_BACK)); /* and loop */
1582 patoptail(starter, starter); /* back */
1583 pattail(starter, patnode(P_BRANCH)); /* or */
1584 pattail(starter, patnode(P_NOTHING)); /* null. */
1585 } else if (op == P_TWOHASH) {
1586 /* Emit x## as x(&|) where & means "self". */
1587 next = patnode(P_WBRANCH); /* Either */
1588 up.p = NULL;
1589 patadd((char *)&up, 0, sizeof(up), 0);
1590 pattail(starter, next);
1591 pattail(patnode(P_BACK), starter); /* loop back */
1592 pattail(next, patnode(P_BRANCH)); /* or */
1593 pattail(starter, patnode(P_NOTHING)); /* null. */
1594 } else if (kshchar == '?') {
1595 /* Emit ?(x) as (x|) */
1596 patinsert(P_BRANCH, starter, NULL, 0); /* Either x */
1597 pattail(starter, patnode(P_BRANCH)); /* or */
1598 next = patnode(P_NOTHING); /* null */
1599 pattail(starter, next);
1600 patoptail(starter, next);
1602 if (*patparse == Pound)
1603 return 0;
1605 return starter;
1609 * Turn a ^foo (paren = 0) or !(foo) (paren = 1) into *~foo with
1610 * parentheses if necessary. As you see, that's really quite easy.
1613 /**/
1614 static long
1615 patcompnot(int paren, int *flagsp)
1617 union upat up;
1618 long excsync, br, excl, n, starter;
1619 int dummy;
1621 /* Here, we're matching a star at the start. */
1622 *flagsp = P_HSTART;
1624 starter = patnode(P_BRANCH);
1625 br = patnode(P_STAR);
1626 excsync = patnode(P_EXCSYNC);
1627 pattail(br, excsync);
1628 pattail(starter, excl = patnode(P_EXCLUDE));
1629 up.p = NULL;
1630 patadd((char *)&up, 0, sizeof(up), 0);
1631 if (!(br = (paren ? patcompswitch(1, &dummy) : patcompbranch(&dummy))))
1632 return 0;
1633 pattail(br, patnode(P_EXCEND));
1634 n = patnode(P_NOTHING); /* just so much easier */
1635 pattail(excsync, n);
1636 pattail(excl, n);
1638 return starter;
1641 /* Emit a node */
1643 /**/
1644 static long
1645 patnode(long op)
1647 long starter = (Upat)patcode - (Upat)patout;
1648 union upat up;
1650 up.l = op;
1651 patadd((char *)&up, 0, sizeof(union upat), 0);
1652 return starter;
1656 * insert an operator in front of an already emitted operand:
1657 * we relocate the operand. there had better be nothing else after.
1660 /**/
1661 static void
1662 patinsert(long op, int opnd, char *xtra, int sz)
1664 char *src, *dst, *opdst;
1665 union upat buf, *lptr;
1667 buf.l = 0;
1668 patadd((char *)&buf, 0, sizeof(buf), 0);
1669 if (sz)
1670 patadd(xtra, 0, sz, 0);
1671 src = patcode - sizeof(union upat) - sz;
1672 dst = patcode;
1673 opdst = patout + opnd * sizeof(union upat);
1674 while (src > opdst)
1675 *--dst = *--src;
1677 /* A cast can't be an lvalue */
1678 lptr = (Upat)opdst;
1679 lptr->l = op;
1680 opdst += sizeof(union upat);
1681 while (sz--)
1682 *opdst++ = *xtra++;
1685 /* set the 'next' pointer at the end of a node chain */
1687 /**/
1688 static void
1689 pattail(long p, long val)
1691 Upat scan, temp;
1692 long offset;
1694 scan = (Upat)patout + p;
1695 for (;;) {
1696 if (!(temp = PATNEXT(scan)))
1697 break;
1698 scan = temp;
1701 offset = (P_OP(scan) == P_BACK)
1702 ? (scan - (Upat)patout) - val : val - (scan - (Upat)patout);
1704 scan->l |= offset << 8;
1707 /* do pattail, but on operand of first argument; nop if operandless */
1709 /**/
1710 static void patoptail(long p, long val)
1712 Upat ptr = (Upat)patout + p;
1713 int op = P_OP(ptr);
1714 if (!p || !P_ISBRANCH(ptr))
1715 return;
1716 if (op == P_BRANCH)
1717 pattail(P_OPERAND(p), val);
1718 else
1719 pattail(P_OPERAND(p) + 1, val);
1724 * Run a pattern.
1726 static char *patinstart; /* Start of input string */
1727 static char *patinend; /* End of input string */
1728 static char *patinput; /* String input pointer */
1729 static char *patinpath; /* Full path for use with ~ exclusions */
1730 static int patinlen; /* Length of last successful match.
1731 * Includes count of Meta characters.
1734 static char *patbeginp[NSUBEXP]; /* Pointer to backref beginnings */
1735 static char *patendp[NSUBEXP]; /* Pointer to backref ends */
1736 static int parsfound; /* parentheses (with backrefs) found */
1738 static int globdots; /* Glob initial dots? */
1741 * Character functions operating on unmetafied strings.
1743 #ifdef MULTIBYTE_SUPPORT
1745 /* Get a character from the start point in a string */
1746 #define CHARREF(x, y) charref((x), (y))
1747 static wchar_t
1748 charref(char *x, char *y)
1750 wchar_t wc;
1751 size_t ret;
1753 if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(*x) & 0x80))
1754 return (wchar_t) STOUC(*x);
1756 ret = mbrtowc(&wc, x, y-x, &shiftstate);
1758 if (ret == MB_INVALID || ret == MB_INCOMPLETE) {
1759 /* Error. Treat as single byte. */
1760 /* Reset the shift state for next time. */
1761 memset(&shiftstate, 0, sizeof(shiftstate));
1762 return (wchar_t) STOUC(*x);
1765 return wc;
1768 /* Get a pointer to the next character */
1769 #define CHARNEXT(x, y) charnext((x), (y))
1770 static char *
1771 charnext(char *x, char *y)
1773 wchar_t wc;
1774 size_t ret;
1776 if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(*x) & 0x80))
1777 return x + 1;
1779 ret = mbrtowc(&wc, x, y-x, &shiftstate);
1781 if (ret == MB_INVALID || ret == MB_INCOMPLETE) {
1782 /* Error. Treat as single byte. */
1783 /* Reset the shift state for next time. */
1784 memset(&shiftstate, 0, sizeof(shiftstate));
1785 return x + 1;
1788 /* Nulls here are normal characters */
1789 return x + (ret ? ret : 1);
1792 /* Increment a pointer past the current character. */
1793 #define CHARINC(x, y) ((x) = charnext((x), (y)))
1796 /* Get a character and increment */
1797 #define CHARREFINC(x, y) charrefinc(&(x), (y))
1798 static wchar_t
1799 charrefinc(char **x, char *y)
1801 wchar_t wc;
1802 size_t ret;
1804 if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(**x) & 0x80))
1805 return (wchar_t) STOUC(*(*x)++);
1807 ret = mbrtowc(&wc, *x, y-*x, &shiftstate);
1809 if (ret == MB_INVALID || ret == MB_INCOMPLETE) {
1810 /* Error. Treat as single byte. */
1811 /* Reset the shift state for next time. */
1812 memset(&shiftstate, 0, sizeof(shiftstate));
1813 return (wchar_t) STOUC(*(*x)++);
1816 /* Nulls here are normal characters */
1817 *x += ret ? ret : 1;
1819 return wc;
1824 * Counter the number of characters between two pointers, smaller first
1826 * This is used when setting values in parameters, so we obey
1827 * the MULTIBYTE option (even if it's been overridden locally).
1829 #define CHARSUB(x,y) charsub(x, y)
1830 static ptrdiff_t
1831 charsub(char *x, char *y)
1833 ptrdiff_t res = 0;
1834 size_t ret;
1835 wchar_t wc;
1837 if (!isset(MULTIBYTE))
1838 return y - x;
1840 while (x < y) {
1841 ret = mbrtowc(&wc, x, y-x, &shiftstate);
1843 if (ret == MB_INVALID || ret == MB_INCOMPLETE) {
1844 /* Error. Treat remainder as single characters */
1845 return res + (y - x);
1848 /* Treat nulls as normal characters */
1849 if (!ret)
1850 ret = 1;
1851 res++;
1852 x += ret;
1855 return res;
1858 #else /* no MULTIBYTE_SUPPORT */
1860 /* Get a character from the start point in a string */
1861 #define CHARREF(x, y) (STOUC(*(x)))
1862 /* Get a pointer to the next character */
1863 #define CHARNEXT(x, y) ((x)+1)
1864 /* Increment a pointer past the current character. */
1865 #define CHARINC(x, y) ((x)++)
1866 /* Get a character and increment */
1867 #define CHARREFINC(x, y) (STOUC(*(x)++))
1868 /* Counter the number of characters between two pointers, smaller first */
1869 #define CHARSUB(x,y) ((y) - (x))
1871 #endif /* MULTIBYTE_SUPPORT */
1874 * The following need to be accessed in the globbing scanner for
1875 * a multi-component file path. See horror story in glob.c.
1877 /**/
1878 int errsfound; /* Total error count so far */
1880 /**/
1881 int forceerrs; /* Forced maximum error count */
1883 /**/
1884 void
1885 pattrystart(void)
1887 forceerrs = -1;
1888 errsfound = 0;
1892 * Test prog against null-terminated, metafied string.
1895 /**/
1896 mod_export int
1897 pattry(Patprog prog, char *string)
1899 return pattryrefs(prog, string, -1, -1, 0, NULL, NULL, NULL);
1903 * Test prog against string of given length, no null termination
1904 * but still metafied at this point. offset gives an offset
1905 * to include in reported match indices
1908 /**/
1909 mod_export int
1910 pattrylen(Patprog prog, char *string, int len, int unmetalen, int offset)
1912 return pattryrefs(prog, string, len, unmetalen, offset, NULL, NULL, NULL);
1916 * Test prog against string with given lengths. The input
1917 * string is metafied; stringlen is the raw string length, and
1918 * unmetalen the number of characters in the original string (some
1919 * of which may now be metafied). Either value may be -1
1920 * to indicate a null-terminated string which will be counted. Note
1921 * there may be a severe penalty for this if a lot of matching is done
1922 * on one string.
1924 * offset is the position in the original string (not seen by
1925 * the pattern module) at which we are trying to match.
1926 * This is added in to the positions recorded in patbeginp and patendp
1927 * when we are looking for substrings. Currently this only happens
1928 * in the parameter substitution code.
1930 * Note this is a character offset, i.e. a metafied character
1931 * counts as 1.
1933 * The last three arguments are used to report the positions for the
1934 * backreferences. On entry, *nump should contain the maximum number
1935 * of positions to report. In this case the match, mbegin, mend
1936 * arrays are not altered.
1938 * If nump is NULL but endp is not NULL, then *endp is set to the
1939 * end position of the match, taking into account patinstart.
1942 /**/
1943 mod_export int
1944 pattryrefs(Patprog prog, char *string, int stringlen, int unmetalen,
1945 int patoffset,
1946 int *nump, int *begp, int *endp)
1948 int i, maxnpos = 0, ret, needfullpath, unmetalenp;
1949 int origlen;
1950 char **sp, **ep, *tryalloced, *ptr;
1951 char *progstr = (char *)prog + prog->startoff;
1953 if (nump) {
1954 maxnpos = *nump;
1955 *nump = 0;
1957 /* inherited from domatch, but why, exactly? */
1958 if (*string == Nularg) {
1959 string++;
1960 unmetalen--;
1963 if (stringlen < 0)
1964 stringlen = strlen(string);
1965 origlen = stringlen;
1967 patflags = prog->flags;
1969 * For a top-level ~-exclusion, we will need the full
1970 * path to exclude, so copy the path so far and append the
1971 * current test string.
1973 needfullpath = (patflags & PAT_HAS_EXCLUDP) && pathpos;
1975 /* Get the length of the full string when unmetafied. */
1976 if (unmetalen < 0)
1977 unmetalen = ztrsub(string + stringlen, string);
1978 if (needfullpath)
1979 unmetalenp = ztrsub(pathbuf + pathpos, pathbuf);
1980 else
1981 unmetalenp = 0;
1983 DPUTS(needfullpath && (patflags & (PAT_PURES|PAT_ANY)),
1984 "rum sort of file exclusion");
1986 * Partly for efficiency, and partly for the convenience of
1987 * globbing, we don't unmetafy pure string patterns, and
1988 * there's no reason to if the pattern is just a *.
1990 if (!(patflags & (PAT_PURES|PAT_ANY))
1991 && (needfullpath || unmetalen != stringlen)) {
1993 * We need to copy if we need to prepend the path so far
1994 * (in which case we copy both chunks), or if we have
1995 * Meta characters.
1997 char *dst;
1998 int icopy, ncopy;
2000 dst = tryalloced = zalloc(unmetalen + unmetalenp);
2002 if (needfullpath) {
2003 /* loop twice, copy path buffer first time */
2004 ptr = pathbuf;
2005 ncopy = unmetalenp;
2006 } else {
2007 /* just loop once, copy string with unmetafication */
2008 ptr = string;
2009 ncopy = unmetalen;
2011 for (icopy = 0; icopy < 2; icopy++) {
2012 for (i = 0; i < ncopy; i++) {
2013 if (*ptr == Meta) {
2014 ptr++;
2015 *dst++ = *ptr++ ^ 32;
2016 } else {
2017 *dst++ = *ptr++;
2020 if (!needfullpath)
2021 break;
2022 /* next time append test string to path so far */
2023 ptr = string;
2024 ncopy = unmetalen;
2027 if (needfullpath) {
2028 patinstart = tryalloced + unmetalenp;
2029 patinpath = tryalloced;
2030 } else {
2031 patinstart = tryalloced;
2032 patinpath = NULL;
2034 stringlen = unmetalen;
2035 } else {
2036 patinstart = string;
2037 tryalloced = patinpath = NULL;
2040 patinend = patinstart + stringlen;
2042 * From now on we do not require NULL termination of
2043 * the test string. There should also be no more references
2044 * to the variable string.
2047 if (prog->flags & (PAT_PURES|PAT_ANY)) {
2049 * Either we are testing against a pure string,
2050 * or we can match anything at all.
2052 int ret;
2053 if (prog->flags & PAT_ANY) {
2055 * Optimisation for a single "*": always matches
2056 * (except for no_glob_dots, see below).
2058 ret = 1;
2059 } else {
2061 * Testing a pure string. See if initial
2062 * components match.
2064 int lendiff = stringlen - prog->patmlen;
2065 if (lendiff < 0) {
2066 /* No, the pattern string is too long. */
2067 ret = 0;
2068 } else if (!memcmp(progstr, patinstart, prog->patmlen)) {
2070 * Initial component matches. Matches either
2071 * if lengths are the same or we are not anchored
2072 * to the end of the string.
2074 ret = !lendiff || (prog->flags & PAT_NOANCH);
2075 } else {
2076 /* No match. */
2077 ret = 0;
2080 if (ret) {
2082 * For files, we won't match initial "."s unless
2083 * glob_dots is set.
2085 if ((prog->flags & PAT_NOGLD) && *patinstart == '.') {
2086 ret = 0;
2087 } else {
2089 * Remember the length in case used for ${..#..} etc.
2090 * In this case, we didn't unmetafy the string.
2092 patinlen = (int)prog->patmlen;
2093 /* if matching files, must update globbing flags */
2094 patglobflags = prog->globend;
2096 if ((patglobflags & GF_MATCHREF) &&
2097 !(patflags & PAT_FILE)) {
2098 char *str = ztrduppfx(patinstart, patinlen);
2099 char *ptr = patinstart;
2100 int mlen = 0;
2103 * Count the characters. We're not using CHARSUB()
2104 * because the string is still metafied. We're
2105 * not using mb_metastrlen() because that expects
2106 * the string to be null terminated.
2108 MB_METACHARINIT();
2109 while (ptr < patinstart + patinlen) {
2110 mlen++;
2111 ptr += MB_METACHARLEN(ptr);
2114 setsparam("MATCH", str);
2115 setiparam("MBEGIN",
2116 (zlong)(patoffset + !isset(KSHARRAYS)));
2117 setiparam("MEND",
2118 (zlong)(mlen + patoffset +
2119 !isset(KSHARRAYS) - 1));
2124 if (tryalloced)
2125 zfree(tryalloced, unmetalen + unmetalenp);
2127 return ret;
2128 } else {
2130 * Test for a `must match' string, unless we're scanning for a match
2131 * in which case we don't need to do this each time.
2133 ret = 1;
2134 if (!(prog->flags & PAT_SCAN) && prog->mustoff)
2136 char *testptr; /* start pointer into test string */
2137 char *teststop; /* last point from which we can match */
2138 char *patptr = (char *)prog + prog->mustoff;
2139 int patlen = prog->patmlen;
2140 int found = 0;
2142 if (patlen > stringlen) {
2143 /* Too long, can't match. */
2144 ret = 0;
2145 } else {
2146 teststop = patinend - patlen;
2148 for (testptr = patinstart; testptr <= teststop; testptr++)
2150 if (!memcmp(testptr, patptr, patlen)) {
2151 found = 1;
2152 break;
2156 if (!found)
2157 ret = 0;
2160 if (!ret) {
2161 if (tryalloced)
2162 zfree(tryalloced, unmetalen + unmetalenp);
2163 return 0;
2166 patglobflags = prog->globflags;
2167 if (!(patflags & PAT_FILE)) {
2168 forceerrs = -1;
2169 errsfound = 0;
2171 globdots = !(patflags & PAT_NOGLD);
2172 parsfound = 0;
2174 patinput = patinstart;
2176 if (patmatch((Upat)progstr)) {
2178 * we were lazy and didn't save the globflags if an exclusion
2179 * failed, so set it now
2181 patglobflags = prog->globend;
2184 * Record length of successful match, including Meta
2185 * characters. Do it here so that patmatchlen() can return
2186 * it even if we delete the pattern strings.
2188 patinlen = patinput - patinstart;
2190 * Optimization: if we didn't find any Meta characters
2191 * to begin with, we don't need to look for them now.
2193 if (unmetalen != origlen) {
2194 for (ptr = patinstart; ptr < patinput; ptr++)
2195 if (imeta(*ptr))
2196 patinlen++;
2200 * Should we clear backreferences and matches on a failed
2201 * match?
2203 if ((patglobflags & GF_MATCHREF) && !(patflags & PAT_FILE)) {
2205 * m flag: for global match. This carries no overhead
2206 * in the pattern matching part.
2208 * Remember the test pattern is already unmetafied.
2210 char *str;
2211 int mlen = CHARSUB(patinstart, patinput);
2213 str = metafy(patinstart, patinput - patinstart, META_DUP);
2214 setsparam("MATCH", str);
2215 setiparam("MBEGIN", (zlong)(patoffset + !isset(KSHARRAYS)));
2216 setiparam("MEND",
2217 (zlong)(mlen + patoffset +
2218 !isset(KSHARRAYS) - 1));
2220 if (prog->patnpar && nump) {
2222 * b flag: for backreferences using parentheses. Reported
2223 * directly.
2225 *nump = prog->patnpar;
2227 sp = patbeginp;
2228 ep = patendp;
2230 for (i = 0; i < prog->patnpar && i < maxnpos; i++) {
2231 if (parsfound & (1 << i)) {
2232 if (begp)
2233 *begp++ = CHARSUB(patinstart, *sp) + patoffset;
2234 if (endp)
2235 *endp++ = CHARSUB(patinstart, *ep) + patoffset
2236 - 1;
2237 } else {
2238 if (begp)
2239 *begp++ = -1;
2240 if (endp)
2241 *endp++ = -1;
2244 sp++;
2245 ep++;
2247 } else if (prog->patnpar && !(patflags & PAT_FILE)) {
2249 * b flag: for backreferences using parentheses.
2251 int palen = prog->patnpar+1;
2252 char **matcharr, **mbeginarr, **mendarr;
2253 char numbuf[DIGBUFSIZE];
2255 matcharr = zshcalloc(palen*sizeof(char *));
2256 mbeginarr = zshcalloc(palen*sizeof(char *));
2257 mendarr = zshcalloc(palen*sizeof(char *));
2259 sp = patbeginp;
2260 ep = patendp;
2262 for (i = 0; i < prog->patnpar; i++) {
2263 if (parsfound & (1 << i)) {
2264 matcharr[i] = metafy(*sp, *ep - *sp, META_DUP);
2266 * mbegin and mend give indexes into the string
2267 * in the standard notation, i.e. respecting
2268 * KSHARRAYS, and with the end index giving
2269 * the last character, not one beyond.
2270 * For example, foo=foo; [[ $foo = (f)oo ]] gives
2271 * (without KSHARRAYS) indexes 1 and 1, which
2272 * corresponds to indexing as ${foo[1,1]}.
2274 sprintf(numbuf, "%ld",
2275 (long)(CHARSUB(patinstart, *sp) +
2276 patoffset +
2277 !isset(KSHARRAYS)));
2278 mbeginarr[i] = ztrdup(numbuf);
2279 sprintf(numbuf, "%ld",
2280 (long)(CHARSUB(patinstart, *ep) +
2281 patoffset +
2282 !isset(KSHARRAYS) - 1));
2283 mendarr[i] = ztrdup(numbuf);
2284 } else {
2285 /* Pattern wasn't set: either it was in an
2286 * unmatched branch, or a hashed parenthesis
2287 * that didn't match at all.
2289 matcharr[i] = ztrdup("");
2290 mbeginarr[i] = ztrdup("-1");
2291 mendarr[i] = ztrdup("-1");
2293 sp++;
2294 ep++;
2296 setaparam("match", matcharr);
2297 setaparam("mbegin", mbeginarr);
2298 setaparam("mend", mendarr);
2301 if (!nump && endp) {
2303 * We just need the overall end position.
2305 *endp = CHARSUB(patinstart, patinput) + patoffset;
2308 ret = 1;
2309 } else
2310 ret = 0;
2312 if (tryalloced)
2313 zfree(tryalloced, unmetalen + unmetalenp);
2315 return ret;
2320 * Return length of previous succesful match. This is
2321 * in metafied bytes, i.e. includes a count of Meta characters.
2322 * Unusual and futile attempt at modular encapsulation.
2325 /**/
2327 patmatchlen(void)
2329 return patinlen;
2333 * Match literal characters with case insensitivity test: the first
2334 * comes from the input string, the second the current pattern.
2336 #ifdef MULTIBYTE_SUPPORT
2337 #define ISUPPER(x) iswupper(x)
2338 #define ISLOWER(x) iswlower(x)
2339 #define TOUPPER(x) towupper(x)
2340 #define TOLOWER(x) towlower(x)
2341 #define ISDIGIT(x) iswdigit(x)
2342 #else
2343 #define ISUPPER(x) isupper(x)
2344 #define ISLOWER(x) islower(x)
2345 #define TOUPPER(x) toupper(x)
2346 #define TOLOWER(x) tolower(x)
2347 #define ISDIGIT(x) idigit(x)
2348 #endif
2349 #define CHARMATCH(chin, chpa) (chin == chpa || \
2350 ((patglobflags & GF_IGNCASE) ? \
2351 ((ISUPPER(chin) ? TOLOWER(chin) : chin) == \
2352 (ISUPPER(chpa) ? TOLOWER(chpa) : chpa)) : \
2353 (patglobflags & GF_LCMATCHUC) ? \
2354 (ISLOWER(chpa) && TOUPPER(chpa) == chin) : 0))
2357 * The same but caching an expression from the first argument,
2358 * Requires local charmatch_cache definition.
2360 #define CHARMATCH_EXPR(expr, chpa) \
2361 (charmatch_cache = (expr), CHARMATCH(charmatch_cache, chpa))
2364 * exactpos is used to remember how far down an exact string we have
2365 * matched, if we are doing approximation and can therefore redo from
2366 * the same point; we never need to otherwise.
2368 * exactend is a pointer to the end of the string, which isn't
2369 * null-terminated.
2371 static char *exactpos, *exactend;
2374 * Main matching routine.
2376 * Testing the tail end of a match is usually done by recursion, but
2377 * we try to eliminate that in favour of looping for simple cases.
2380 /**/
2381 static int
2382 patmatch(Upat prog)
2384 /* Current and next nodes */
2385 Upat scan = prog, next, opnd;
2386 char *start, *save, *chrop, *chrend, *compend;
2387 int savglobflags, op, no, min, fail = 0, saverrsfound;
2388 zrange_t from, to, comp;
2389 patint_t nextch;
2391 while (scan) {
2392 next = PATNEXT(scan);
2394 if (!globdots && P_NOTDOT(scan) && patinput == patinstart &&
2395 patinput < patinend && *patinput == '.')
2396 return 0;
2398 switch (P_OP(scan)) {
2399 case P_ANY:
2400 if (patinput == patinend)
2401 fail = 1;
2402 else
2403 CHARINC(patinput, patinend);
2404 break;
2405 case P_EXACTLY:
2407 * acts as nothing if *chrop is null: this is used by
2408 * approx code.
2410 if (exactpos) {
2411 chrop = exactpos;
2412 chrend = exactend;
2413 } else {
2414 chrop = P_LS_STR(scan);
2415 chrend = chrop + P_LS_LEN(scan);
2417 exactpos = NULL;
2418 while (chrop < chrend && patinput < patinend) {
2419 char *savpatinput = patinput;
2420 char *savchrop = chrop;
2421 patint_t chin = CHARREFINC(patinput, patinend);
2422 patint_t chpa = CHARREFINC(chrop, chrend);
2423 if (!CHARMATCH(chin, chpa)) {
2424 fail = 1;
2425 patinput = savpatinput;
2426 chrop = savchrop;
2427 break;
2430 if (chrop < chrend) {
2431 exactpos = chrop;
2432 exactend = chrend;
2433 fail = 1;
2435 break;
2436 case P_ANYOF:
2437 case P_ANYBUT:
2438 if (patinput == patinend)
2439 fail = 1;
2440 else {
2441 #ifdef MULTIBYTE_SUPPORT
2442 wchar_t cr = CHARREF(patinput, patinend);
2443 char *scanop = (char *)P_OPERAND(scan);
2444 if (patglobflags & GF_MULTIBYTE) {
2445 if (mb_patmatchrange(scanop, cr, NULL, NULL) ^
2446 (P_OP(scan) == P_ANYOF))
2447 fail = 1;
2448 else
2449 CHARINC(patinput, patinend);
2450 } else if (patmatchrange(scanop, (int)cr, NULL, NULL) ^
2451 (P_OP(scan) == P_ANYOF))
2452 fail = 1;
2453 else
2454 CHARINC(patinput, patinend);
2455 #else
2456 if (patmatchrange((char *)P_OPERAND(scan),
2457 CHARREF(patinput, patinend), NULL, NULL) ^
2458 (P_OP(scan) == P_ANYOF))
2459 fail = 1;
2460 else
2461 CHARINC(patinput, patinend);
2462 #endif
2464 break;
2465 case P_NUMRNG:
2466 case P_NUMFROM:
2467 case P_NUMTO:
2469 * To do this properly, we really have to treat numbers as
2470 * closures: that's so things like <1-1000>33 will
2471 * match 633 (they didn't up to 3.1.6). To avoid making this
2472 * too inefficient, we see if there's an exact match next:
2473 * if there is, and it's not a digit, we return 1 after
2474 * the first attempt.
2476 op = P_OP(scan);
2477 start = (char *)P_OPERAND(scan);
2478 from = to = 0;
2479 if (op != P_NUMTO) {
2480 #ifdef ZSH_64_BIT_TYPE
2481 /* We can't rely on pointer alignment being good enough. */
2482 memcpy((char *)&from, start, sizeof(zrange_t));
2483 #else
2484 from = *((zrange_t *) start);
2485 #endif
2486 start += sizeof(zrange_t);
2488 if (op != P_NUMFROM) {
2489 #ifdef ZSH_64_BIT_TYPE
2490 memcpy((char *)&to, start, sizeof(zrange_t));
2491 #else
2492 to = *((zrange_t *) start);
2493 #endif
2495 start = compend = patinput;
2496 comp = 0;
2497 while (patinput < patinend && idigit(*patinput)) {
2498 if (comp)
2499 comp *= 10;
2500 comp += *patinput - '0';
2501 patinput++;
2502 compend++;
2504 if (comp & ((zrange_t)1 << (sizeof(comp)*8 -
2505 #ifdef ZRANGE_T_IS_SIGNED
2507 #else
2509 #endif
2510 ))) {
2512 * Out of range (allowing for signedness, which
2513 * we need if we are using zlongs).
2514 * This is as far as we can go.
2515 * If we're doing a range "from", skip all the
2516 * remaining numbers. Otherwise, we can't
2517 * match beyond the previous point anyway.
2518 * Leave the pointer to the last calculated
2519 * position (compend) where it was before.
2521 if (op == P_NUMFROM) {
2522 while (patinput < patinend && idigit(*patinput))
2523 patinput++;
2527 save = patinput;
2528 no = 0;
2529 while (patinput > start) {
2530 /* if already too small, no power on earth can save it */
2531 if (comp < from && patinput <= compend)
2532 break;
2533 if ((op == P_NUMFROM || comp <= to) && patmatch(next)) {
2534 return 1;
2536 if (!no && P_OP(next) == P_EXACTLY &&
2537 (!P_LS_LEN(next) ||
2538 !idigit(STOUC(*P_LS_STR(next)))) &&
2539 !(patglobflags & 0xff))
2540 return 0;
2541 patinput = --save;
2542 no++;
2544 * With a range start and an unrepresentable test
2545 * number, we just back down the test string without
2546 * changing the number until we get to a representable
2547 * one.
2549 if (patinput < compend)
2550 comp /= 10;
2552 patinput = start;
2553 fail = 1;
2554 break;
2555 case P_NUMANY:
2556 /* This is <->: any old set of digits, don't bother comparing */
2557 start = patinput;
2558 while (patinput < patinend && idigit(*patinput))
2559 patinput++;
2560 save = patinput;
2561 no = 0;
2562 while (patinput > start) {
2563 if (patmatch(next))
2564 return 1;
2565 if (!no && P_OP(next) == P_EXACTLY &&
2566 (!P_LS_LEN(next) ||
2567 !idigit(*P_LS_STR(next))) &&
2568 !(patglobflags & 0xff))
2569 return 0;
2570 patinput = --save;
2571 no++;
2573 patinput = start;
2574 fail = 1;
2575 break;
2576 case P_NOTHING:
2577 break;
2578 case P_BACK:
2579 break;
2580 case P_GFLAGS:
2581 patglobflags = P_OPERAND(scan)->l;
2582 break;
2583 case P_OPEN:
2584 case P_OPEN+1:
2585 case P_OPEN+2:
2586 case P_OPEN+3:
2587 case P_OPEN+4:
2588 case P_OPEN+5:
2589 case P_OPEN+6:
2590 case P_OPEN+7:
2591 case P_OPEN+8:
2592 case P_OPEN+9:
2593 no = P_OP(scan) - P_OPEN;
2594 save = patinput;
2596 if (patmatch(next)) {
2598 * Don't set patbeginp if some later invocation of
2599 * the same parentheses already has.
2601 if (no && !(parsfound & (1 << (no - 1)))) {
2602 patbeginp[no-1] = save;
2603 parsfound |= 1 << (no - 1);
2605 return 1;
2606 } else
2607 return 0;
2608 break;
2609 case P_CLOSE:
2610 case P_CLOSE+1:
2611 case P_CLOSE+2:
2612 case P_CLOSE+3:
2613 case P_CLOSE+4:
2614 case P_CLOSE+5:
2615 case P_CLOSE+6:
2616 case P_CLOSE+7:
2617 case P_CLOSE+8:
2618 case P_CLOSE+9:
2619 no = P_OP(scan) - P_CLOSE;
2620 save = patinput;
2622 if (patmatch(next)) {
2623 if (no && !(parsfound & (1 << (no + 15)))) {
2624 patendp[no-1] = save;
2625 parsfound |= 1 << (no + 15);
2627 return 1;
2628 } else
2629 return 0;
2630 break;
2631 case P_EXCSYNC:
2632 /* See the P_EXCLUDE code below for where syncptr comes from */
2634 unsigned char *syncptr;
2635 Upat after;
2636 after = P_OPERAND(scan);
2637 DPUTS(!P_ISEXCLUDE(after),
2638 "BUG: EXCSYNC not followed by EXCLUDE.");
2639 DPUTS(!P_OPERAND(after)->p,
2640 "BUG: EXCSYNC not handled by EXCLUDE");
2641 syncptr = P_OPERAND(after)->p + (patinput - patinstart);
2643 * If we already matched from here, this time we fail.
2644 * See WBRANCH code for story about error count.
2646 if (*syncptr && errsfound + 1 >= *syncptr)
2647 return 0;
2649 * Else record that we (possibly) matched this time.
2650 * No harm if we don't: then the previous test will just
2651 * short cut the attempted match that is bound to fail.
2652 * We never try to exclude something that has already
2653 * failed anyway.
2655 *syncptr = errsfound + 1;
2657 break;
2658 case P_EXCEND:
2660 * This is followed by a P_EXCSYNC, but only in the P_EXCLUDE
2661 * branch. Actually, we don't bother following it: all we
2662 * need to know is that we successfully matched so far up
2663 * to the end of the asserted pattern; the endpoint
2664 * in the target string is nulled out.
2666 if (!(fail = (patinput < patinend)))
2667 return 1;
2668 break;
2669 case P_BRANCH:
2670 case P_WBRANCH:
2671 /* P_EXCLUDE shouldn't occur without a P_BRANCH */
2672 if (!P_ISBRANCH(next)) {
2673 /* no choice, avoid recursion */
2674 DPUTS(P_OP(scan) == P_WBRANCH,
2675 "BUG: WBRANCH with no alternative.");
2676 next = P_OPERAND(scan);
2677 } else {
2678 do {
2679 save = patinput;
2680 savglobflags = patglobflags;
2681 saverrsfound = errsfound;
2682 if (P_ISEXCLUDE(next)) {
2684 * The strategy is to test the asserted pattern,
2685 * recording via P_EXCSYNC how far the part to
2686 * be excluded matched. We then set the
2687 * length of the test string to that
2688 * point and see if the exclusion as far as
2689 * P_EXCEND also matches that string.
2690 * We need to keep testing the asserted pattern
2691 * by backtracking, since the first attempt
2692 * may be excluded while a later attempt may not.
2693 * For this we keep a pointer just after
2694 * the P_EXCLUDE which is tested by the P_EXCSYNC
2695 * to see if we matched there last time, in which
2696 * case we fail. If there is nothing to backtrack
2697 * over, that doesn't matter: we should fail anyway.
2698 * The pointer also tells us where the asserted
2699 * pattern matched for use by the exclusion.
2701 * It's hard to allocate space for this
2702 * beforehand since we may need to do it
2703 * recursively.
2705 * P.S. in case you were wondering, this code
2706 * is horrible.
2708 Upat syncstrp;
2709 char *origpatinend;
2710 unsigned char *oldsyncstr;
2711 char *matchpt = NULL;
2712 int ret, savglobdots, matchederrs = 0;
2713 int savparsfound = parsfound;
2714 DPUTS(P_OP(scan) == P_WBRANCH,
2715 "BUG: excluded WBRANCH");
2716 syncstrp = P_OPERAND(next);
2718 * Unlike WBRANCH, each test at the same exclude
2719 * sync point (due to an external loop) is separate,
2720 * i.e testing (foo~bar)# is no different from
2721 * (foo~bar)(foo~bar)... from the exclusion point
2722 * of view, so we use a different sync string.
2724 oldsyncstr = syncstrp->p;
2725 syncstrp->p = (unsigned char *)
2726 zshcalloc((patinend - patinstart) + 1);
2727 origpatinend = patinend;
2728 while ((ret = patmatch(P_OPERAND(scan)))) {
2729 unsigned char *syncpt;
2730 char *savpatinstart;
2731 int savforce = forceerrs;
2732 int savpatflags = patflags, synclen;
2733 forceerrs = -1;
2734 savglobdots = globdots;
2735 matchederrs = errsfound;
2736 matchpt = patinput; /* may not be end */
2737 globdots = 1; /* OK to match . first */
2738 /* Find the point where the scan
2739 * matched the part to be excluded: because
2740 * of backtracking, the one
2741 * most recently matched will be the first.
2742 * (Luckily, backtracking is done after all
2743 * possibilities for approximation have been
2744 * checked.)
2746 for (syncpt = syncstrp->p; !*syncpt; syncpt++)
2748 synclen = syncpt - syncstrp->p;
2749 if (patinstart + synclen != patinend) {
2751 * Temporarily mark the string as
2752 * ending at this point.
2754 DPUTS(patinstart + synclen > matchpt,
2755 "BUG: EXCSYNC failed");
2757 patinend = patinstart + synclen;
2759 * If this isn't really the end of the string,
2760 * remember this for the (#e) assertion.
2762 patflags |= PAT_NOTEND;
2764 savpatinstart = patinstart;
2765 next = PATNEXT(scan);
2766 while (next && P_ISEXCLUDE(next)) {
2767 patinput = save;
2769 * turn off approximations in exclusions:
2770 * note we keep remaining patglobflags
2771 * set by asserted branch (or previous
2772 * excluded branches, for consistency).
2774 patglobflags &= ~0xff;
2775 errsfound = 0;
2776 opnd = P_OPERAND(next) + 1;
2777 if (P_OP(next) == P_EXCLUDP && patinpath) {
2779 * Top level exclusion with a file,
2780 * applies to whole path so add the
2781 * segments already matched.
2782 * We copied these in front of the
2783 * test pattern, so patinend doesn't
2784 * need moving.
2786 DPUTS(patinput != patinstart,
2787 "BUG: not at start excluding path");
2788 patinput = patinstart = patinpath;
2790 if (patmatch(opnd)) {
2791 ret = 0;
2793 * Another subtlety: if we exclude the
2794 * match, any parentheses just found
2795 * become invalidated.
2797 parsfound = savparsfound;
2799 if (patinpath) {
2800 patinput = savpatinstart +
2801 (patinput - patinstart);
2802 patinstart = savpatinstart;
2804 if (!ret)
2805 break;
2806 next = PATNEXT(next);
2809 * Restore original end position.
2811 patinend = origpatinend;
2812 patflags = savpatflags;
2813 globdots = savglobdots;
2814 forceerrs = savforce;
2815 if (ret)
2816 break;
2817 patinput = save;
2818 patglobflags = savglobflags;
2819 errsfound = saverrsfound;
2821 zfree((char *)syncstrp->p,
2822 (patinend - patinstart) + 1);
2823 syncstrp->p = oldsyncstr;
2824 if (ret) {
2825 patinput = matchpt;
2826 errsfound = matchederrs;
2827 return 1;
2829 while ((scan = PATNEXT(scan)) &&
2830 P_ISEXCLUDE(scan))
2832 } else {
2833 int ret = 1, pfree = 0;
2834 Upat ptrp = NULL;
2835 unsigned char *ptr;
2836 if (P_OP(scan) == P_WBRANCH) {
2838 * This is where we make sure that we are not
2839 * repeatedly matching zero-length strings in
2840 * a closure, which would cause an infinite loop,
2841 * and also remove exponential behaviour in
2842 * backtracking nested closures.
2843 * The P_WBRANCH operator leaves a space for a
2844 * uchar *, initialized to NULL, which is
2845 * turned into a string the same length as the
2846 * target string. Every time we match from a
2847 * particular point in the target string, we
2848 * stick a 1 at the corresponding point here.
2849 * If we come round to the same branch again, and
2850 * there is already a 1, then the test fails.
2852 opnd = P_OPERAND(scan);
2853 ptrp = opnd++;
2854 if (!ptrp->p) {
2855 ptrp->p = (unsigned char *)
2856 zshcalloc((patinend - patinstart) + 1);
2857 pfree = 1;
2859 ptr = ptrp->p + (patinput - patinstart);
2862 * Without approximation, this is just a
2863 * single bit test. With approximation, we
2864 * need to know how many errors there were
2865 * last time we made the test. If errsfound
2866 * is now smaller than it was, hence we can
2867 * make more approximations in the remaining
2868 * code, we continue with the test.
2869 * (This is why the max number of errors is
2870 * 254, not 255.)
2872 if (*ptr && errsfound + 1 >= *ptr)
2873 ret = 0;
2874 *ptr = errsfound + 1;
2875 } else
2876 opnd = P_OPERAND(scan);
2877 if (ret)
2878 ret = patmatch(opnd);
2879 if (pfree) {
2880 zfree((char *)ptrp->p,
2881 (patinend - patinstart) + 1);
2882 ptrp->p = NULL;
2884 if (ret)
2885 return 1;
2886 scan = PATNEXT(scan);
2888 patinput = save;
2889 patglobflags = savglobflags;
2890 errsfound = saverrsfound;
2891 DPUTS(P_OP(scan) == P_WBRANCH,
2892 "BUG: WBRANCH not first choice.");
2893 next = PATNEXT(scan);
2894 } while (scan && P_ISBRANCH(scan));
2895 return 0;
2897 break;
2898 case P_STAR:
2899 /* Handle specially for speed, although really P_ONEHASH+P_ANY */
2900 case P_ONEHASH:
2901 case P_TWOHASH:
2903 * This is just simple cases, matching one character.
2904 * With approximations, we still handle * this way, since
2905 * no approximation is ever necessary, but other closures
2906 * are handled by the more complicated branching method
2908 op = P_OP(scan);
2909 /* Note that no counts possibly metafied characters */
2910 start = patinput;
2912 char *lastcharstart;
2914 * Array to record the start of characters for
2915 * backtracking.
2917 VARARR(char, charstart, patinend-patinput);
2918 memset(charstart, 0, patinend-patinput);
2920 if (op == P_STAR) {
2921 for (no = 0; patinput < patinend;
2922 CHARINC(patinput, patinend))
2924 charstart[patinput-start] = 1;
2925 no++;
2927 /* simple optimization for reasonably common case */
2928 if (P_OP(next) == P_END)
2929 return 1;
2930 } else {
2931 DPUTS(patglobflags & 0xff,
2932 "BUG: wrong backtracking with approximation.");
2933 if (!globdots && P_NOTDOT(P_OPERAND(scan)) &&
2934 patinput == patinstart && patinput < patinend &&
2935 CHARREF(patinput, patinend) == ZWC('.'))
2936 return 0;
2937 no = patrepeat(P_OPERAND(scan), charstart);
2939 min = (op == P_TWOHASH) ? 1 : 0;
2941 * Lookahead to avoid useless matches. This is not possible
2942 * with approximation.
2944 if (P_OP(next) == P_EXACTLY && P_LS_LEN(next) &&
2945 !(patglobflags & 0xff)) {
2946 char *nextop = P_LS_STR(next);
2947 #ifdef MULTIBYTE_SUPPORT
2948 /* else second argument of CHARREF isn't used */
2949 int nextlen = P_LS_LEN(next);
2950 #endif
2952 * If that P_EXACTLY is last (common in simple patterns,
2953 * such as *.c), then it can be only be matched at one
2954 * point in the test string, so record that.
2956 if (P_OP(PATNEXT(next)) == P_END &&
2957 !(patflags & PAT_NOANCH)) {
2958 int ptlen = patinend - patinput;
2959 int lenmatch = patinend -
2960 (min ? CHARNEXT(start, patinend) : start);
2961 /* Are we in the right range? */
2962 if (P_LS_LEN(next) > lenmatch ||
2963 P_LS_LEN(next) < ptlen)
2964 return 0;
2965 /* Yes, just position appropriately and test. */
2966 patinput += ptlen - P_LS_LEN(next);
2968 * Here we will need to be careful that patinput is not
2969 * in the middle of a multibyte character.
2971 /* Continue loop with P_EXACTLY test. */
2972 break;
2974 nextch = CHARREF(nextop, nextop + nextlen);
2975 } else
2976 nextch = PEOF;
2977 savglobflags = patglobflags;
2978 saverrsfound = errsfound;
2979 lastcharstart = charstart + (patinput - start);
2980 if (no >= min) {
2981 for (;;) {
2982 patint_t charmatch_cache;
2983 if (nextch == PEOF ||
2984 (patinput < patinend &&
2985 CHARMATCH_EXPR(CHARREF(patinput, patinend),
2986 nextch))) {
2987 if (patmatch(next))
2988 return 1;
2990 if (--no < min)
2991 break;
2992 /* find start of previous full character */
2993 while (!*--lastcharstart)
2994 DPUTS(lastcharstart < charstart,
2995 "lastcharstart invalid");
2996 patinput = start + (lastcharstart-charstart);
2997 patglobflags = savglobflags;
2998 errsfound = saverrsfound;
3003 * As with branches, the patmatch(next) stuff for *
3004 * handles approximation, so we don't need to try
3005 * anything here.
3007 return 0;
3008 case P_ISSTART:
3009 if (patinput != patinstart || (patflags & PAT_NOTSTART))
3010 fail = 1;
3011 break;
3012 case P_ISEND:
3013 if (patinput < patinend || (patflags & PAT_NOTEND))
3014 fail = 1;
3015 break;
3016 case P_COUNTSTART:
3019 * Save and restore the current count and the
3020 * start pointer in case the pattern has been
3021 * executed by a previous repetition of a
3022 * closure.
3024 long *curptr = &P_OPERAND(scan)[P_CT_CURRENT].l;
3025 long savecount = *curptr;
3026 unsigned char *saveptr = scan[P_CT_PTR].p;
3027 int ret;
3029 *curptr = 0L;
3030 ret = patmatch(P_OPERAND(scan));
3031 *curptr = savecount;
3032 scan[P_CT_PTR].p = saveptr;
3033 return ret;
3035 case P_COUNT:
3037 /* (#cN,M): execution is relatively straightforward */
3038 long cur = scan[P_CT_CURRENT].l;
3039 long min = scan[P_CT_MIN].l;
3040 long max = scan[P_CT_MAX].l;
3042 if (cur && cur >= min &&
3043 (unsigned char *)patinput == scan[P_CT_PTR].p) {
3045 * Not at the first attempt to match so
3046 * the previous attempt managed zero length.
3047 * We can do this indefinitely so there's
3048 * no point in going on. Simply try to
3049 * match the remainder of the pattern.
3051 return patmatch(next);
3053 scan[P_CT_PTR].p = (unsigned char *)patinput;
3055 if (max < 0 || cur < max) {
3056 char *patinput_thistime = patinput;
3057 scan[P_CT_CURRENT].l = cur + 1;
3058 if (patmatch(scan + P_CT_OPERAND))
3059 return 1;
3060 patinput = patinput_thistime;
3062 if (cur < min)
3063 return 0;
3064 return patmatch(next);
3066 case P_END:
3067 if (!(fail = (patinput < patinend && !(patflags & PAT_NOANCH))))
3068 return 1;
3069 break;
3070 #ifdef DEBUG
3071 default:
3072 dputs("BUG: bad operand in patmatch.");
3073 return 0;
3074 break;
3075 #endif
3078 if (fail) {
3079 if (errsfound < (patglobflags & 0xff) &&
3080 (forceerrs == -1 || errsfound < forceerrs)) {
3082 * Approximation code. There are four possibilities
3084 * 1. omit character from input string
3085 * 2. transpose characters in input and pattern strings
3086 * 3. omit character in both input and pattern strings
3087 * 4. omit character from pattern string.
3089 * which we try in that order.
3091 * Of these, 2, 3 and 4 require an exact match string
3092 * (P_EXACTLY) while 1, 2 and 3 require that we not
3093 * have reached the end of the input string.
3095 * Note in each case after making the approximation we
3096 * need to retry the *same* pattern; this is what
3097 * requires exactpos, a slightly doleful way of
3098 * communicating with the exact character matcher.
3100 char *savexact = exactpos;
3101 save = patinput;
3102 savglobflags = patglobflags;
3103 saverrsfound = ++errsfound;
3104 fail = 0;
3106 DPUTS(P_OP(scan) != P_EXACTLY && exactpos,
3107 "BUG: non-exact match has set exactpos");
3109 /* Try omitting a character from the input string */
3110 if (patinput < patinend) {
3111 CHARINC(patinput, patinend);
3112 /* If we are not on an exact match, then this is
3113 * our last gasp effort, so we can optimize out
3114 * the recursive call.
3116 if (P_OP(scan) != P_EXACTLY)
3117 continue;
3118 if (patmatch(scan))
3119 return 1;
3122 if (P_OP(scan) == P_EXACTLY) {
3123 char *nextexact = savexact;
3124 DPUTS(!savexact,
3125 "BUG: exact match has not set exactpos");
3126 CHARINC(nextexact, exactend);
3128 if (save < patinend) {
3129 char *nextin = save;
3130 CHARINC(nextin, patinend);
3131 patglobflags = savglobflags;
3132 errsfound = saverrsfound;
3133 exactpos = savexact;
3136 * Try swapping two characters in patinput and
3137 * exactpos
3139 if (save < patinend && nextin < patinend &&
3140 nextexact < exactend) {
3141 patint_t cin0 = CHARREF(save, patinend);
3142 patint_t cpa0 = CHARREF(exactpos, exactend);
3143 patint_t cin1 = CHARREF(nextin, patinend);
3144 patint_t cpa1 = CHARREF(nextexact, exactend);
3146 if (CHARMATCH(cin0, cpa1) &&
3147 CHARMATCH(cin1, cpa0)) {
3148 patinput = nextin;
3149 CHARINC(patinput, patinend);
3150 exactpos = nextexact;
3151 CHARINC(exactpos, exactend);
3152 if (patmatch(scan))
3153 return 1;
3155 patglobflags = savglobflags;
3156 errsfound = saverrsfound;
3161 * Try moving up both strings.
3163 patinput = nextin;
3164 exactpos = nextexact;
3165 if (patmatch(scan))
3166 return 1;
3168 patinput = save;
3169 patglobflags = savglobflags;
3170 errsfound = saverrsfound;
3171 exactpos = savexact;
3174 DPUTS(exactpos == exactend, "approximating too far");
3176 * Try moving up the exact match pattern.
3177 * This must be the last attempt, so just loop
3178 * instead of calling recursively.
3180 CHARINC(exactpos, exactend);
3181 continue;
3184 exactpos = NULL;
3185 return 0;
3188 scan = next;
3191 return 0;
3195 /**/
3196 #ifdef MULTIBYTE_SUPPORT
3199 * See if character ch matches a pattern range specification.
3200 * The null-terminated specification is in range; the test
3201 * character is in ch.
3203 * indptr is used by completion matching, which is why this
3204 * function is exported. If indptr is not NULL we set *indptr
3205 * to the index of the character in the range string, adjusted
3206 * in the case of "A-B" ranges such that A would count as its
3207 * normal index (say IA), B would count as IA + (B-A), and any
3208 * character within the range as appropriate. We're not strictly
3209 * guaranteed this fits within a wint_t, but if this is Unicode
3210 * in 32 bits we have a fair amount of distance left over.
3212 * mtp is used in the same circumstances. *mtp returns the match type:
3213 * 0 for a standard character, else the PP_ index. It's not
3214 * useful if the match failed.
3217 /**/
3218 mod_export int
3219 mb_patmatchrange(char *range, wchar_t ch, wint_t *indptr, int *mtp)
3221 wchar_t r1, r2;
3223 if (indptr)
3224 *indptr = 0;
3226 * Careful here: unlike other strings, range is a NULL-terminated,
3227 * metafied string, because we need to treat the Posix and hyphenated
3228 * ranges specially.
3230 while (*range) {
3231 if (imeta(STOUC(*range))) {
3232 int swtype = STOUC(*range++) - STOUC(Meta);
3233 if (mtp)
3234 *mtp = swtype;
3235 switch (swtype) {
3236 case 0:
3237 /* ordinary metafied character */
3238 range--;
3239 if (metacharinc(&range) == ch)
3240 return 1;
3241 break;
3242 case PP_ALPHA:
3243 if (iswalpha(ch))
3244 return 1;
3245 break;
3246 case PP_ALNUM:
3247 if (iswalnum(ch))
3248 return 1;
3249 break;
3250 case PP_ASCII:
3251 if ((ch & ~0x7f) == 0)
3252 return 1;
3253 break;
3254 case PP_BLANK:
3255 if (ch == L' ' || ch == L'\t')
3256 return 1;
3257 break;
3258 case PP_CNTRL:
3259 if (iswcntrl(ch))
3260 return 1;
3261 break;
3262 case PP_DIGIT:
3263 if (iswdigit(ch))
3264 return 1;
3265 break;
3266 case PP_GRAPH:
3267 if (iswgraph(ch))
3268 return 1;
3269 break;
3270 case PP_LOWER:
3271 if (iswlower(ch))
3272 return 1;
3273 break;
3274 case PP_PRINT:
3275 if (iswprint(ch))
3276 return 1;
3277 break;
3278 case PP_PUNCT:
3279 if (iswpunct(ch))
3280 return 1;
3281 break;
3282 case PP_SPACE:
3283 if (iswspace(ch))
3284 return 1;
3285 break;
3286 case PP_UPPER:
3287 if (iswupper(ch))
3288 return 1;
3289 break;
3290 case PP_XDIGIT:
3291 if (iswxdigit(ch))
3292 return 1;
3293 break;
3294 case PP_IDENT:
3295 if (wcsitype(ch, IIDENT))
3296 return 1;
3297 break;
3298 case PP_IFS:
3299 if (wcsitype(ch, ISEP))
3300 return 1;
3301 break;
3302 case PP_IFSSPACE:
3303 /* must be ASCII space character */
3304 if (ch < 128 && iwsep((int)ch))
3305 return 1;
3306 break;
3307 case PP_WORD:
3308 if (wcsitype(ch, IWORD))
3309 return 1;
3310 break;
3311 case PP_RANGE:
3312 r1 = metacharinc(&range);
3313 r2 = metacharinc(&range);
3314 if (r1 <= ch && ch <= r2) {
3315 if (indptr)
3316 *indptr += ch - r1;
3317 return 1;
3319 /* Careful not to screw up counting with bogus range */
3320 if (indptr && r1 < r2) {
3322 * This gets incremented again below to get
3323 * us past the range end. This is correct.
3325 *indptr += r2 - r1;
3327 break;
3328 case PP_UNKWN:
3329 DPUTS(1, "BUG: unknown posix range passed through.\n");
3330 break;
3331 default:
3332 DPUTS(1, "BUG: unknown metacharacter in range.");
3333 break;
3335 } else if (metacharinc(&range) == ch) {
3336 if (mtp)
3337 *mtp = 0;
3338 return 1;
3340 if (indptr)
3341 (*indptr)++;
3343 return 0;
3348 * This is effectively the reverse of mb_patmatchrange().
3349 * Given a range descriptor of the same form, and an index into it,
3350 * try to determine the character that is matched. If the index
3351 * points to a [:...:] generic style match, set chr to WEOF and
3352 * return the type in mtp instead. Return 1 if successful, 0 if
3353 * there was no corresponding index. Note all pointer arguments
3354 * must be non-null.
3357 /**/
3358 mod_export int
3359 mb_patmatchindex(char *range, wint_t ind, wint_t *chr, int *mtp)
3361 wchar_t r1, r2, rchr;
3362 wint_t rdiff;
3364 *chr = WEOF;
3365 *mtp = 0;
3367 while (*range) {
3368 if (imeta(STOUC(*range))) {
3369 int swtype = STOUC(*range++) - STOUC(Meta);
3370 switch (swtype) {
3371 case 0:
3372 range--;
3373 rchr = metacharinc(&range);
3374 if (!ind) {
3375 *chr = (wint_t) rchr;
3376 return 1;
3378 break;
3380 case PP_ALPHA:
3381 case PP_ALNUM:
3382 case PP_ASCII:
3383 case PP_BLANK:
3384 case PP_CNTRL:
3385 case PP_DIGIT:
3386 case PP_GRAPH:
3387 case PP_LOWER:
3388 case PP_PRINT:
3389 case PP_PUNCT:
3390 case PP_SPACE:
3391 case PP_UPPER:
3392 case PP_XDIGIT:
3393 case PP_IDENT:
3394 case PP_IFS:
3395 case PP_IFSSPACE:
3396 case PP_WORD:
3397 if (!ind) {
3398 *mtp = swtype;
3399 return 1;
3401 break;
3403 case PP_RANGE:
3404 r1 = metacharinc(&range);
3405 r2 = metacharinc(&range);
3406 rdiff = (wint_t)r2 - (wint_t)r1;
3407 if (rdiff >= ind) {
3408 *chr = (wint_t)r1 + ind;
3409 return 1;
3411 /* note the extra decrement to ind below */
3412 ind -= rdiff;
3413 break;
3414 case PP_UNKWN:
3415 DPUTS(1, "BUG: unknown posix range passed through.\n");
3416 break;
3417 default:
3418 DPUTS(1, "BUG: unknown metacharacter in range.");
3419 break;
3421 } else {
3422 rchr = metacharinc(&range);
3423 if (!ind) {
3424 *chr = (wint_t)rchr;
3425 return 1;
3428 if (!ind--)
3429 break;
3432 /* No corresponding index. */
3433 return 0;
3436 /**/
3437 #endif /* MULTIBYTE_SUPPORT */
3440 * Identical function to mb_patmatchrange() above for single-byte
3441 * characters.
3444 /**/
3445 mod_export int
3446 patmatchrange(char *range, int ch, int *indptr, int *mtp)
3448 int r1, r2;
3450 if (indptr)
3451 *indptr = 0;
3453 * Careful here: unlike other strings, range is a NULL-terminated,
3454 * metafied string, because we need to treat the Posix and hyphenated
3455 * ranges specially.
3457 for (; *range; range++) {
3458 if (imeta(STOUC(*range))) {
3459 int swtype = STOUC(*range) - STOUC(Meta);
3460 if (mtp)
3461 *mtp = swtype;
3462 switch (swtype) {
3463 case 0:
3464 if (STOUC(*++range ^ 32) == ch)
3465 return 1;
3466 break;
3467 case PP_ALPHA:
3468 if (isalpha(ch))
3469 return 1;
3470 break;
3471 case PP_ALNUM:
3472 if (isalnum(ch))
3473 return 1;
3474 break;
3475 case PP_ASCII:
3476 if ((ch & ~0x7f) == 0)
3477 return 1;
3478 break;
3479 case PP_BLANK:
3480 if (ch == ' ' || ch == '\t')
3481 return 1;
3482 break;
3483 case PP_CNTRL:
3484 if (iscntrl(ch))
3485 return 1;
3486 break;
3487 case PP_DIGIT:
3488 if (isdigit(ch))
3489 return 1;
3490 break;
3491 case PP_GRAPH:
3492 if (isgraph(ch))
3493 return 1;
3494 break;
3495 case PP_LOWER:
3496 if (islower(ch))
3497 return 1;
3498 break;
3499 case PP_PRINT:
3500 if (isprint(ch))
3501 return 1;
3502 break;
3503 case PP_PUNCT:
3504 if (ispunct(ch))
3505 return 1;
3506 break;
3507 case PP_SPACE:
3508 if (isspace(ch))
3509 return 1;
3510 break;
3511 case PP_UPPER:
3512 if (isupper(ch))
3513 return 1;
3514 break;
3515 case PP_XDIGIT:
3516 if (isxdigit(ch))
3517 return 1;
3518 break;
3519 case PP_IDENT:
3520 if (iident(ch))
3521 return 1;
3522 break;
3523 case PP_IFS:
3524 if (isep(ch))
3525 return 1;
3526 break;
3527 case PP_IFSSPACE:
3528 if (iwsep(ch))
3529 return 1;
3530 break;
3531 case PP_WORD:
3532 if (iword(ch))
3533 return 1;
3534 break;
3535 case PP_RANGE:
3536 range++;
3537 r1 = STOUC(UNMETA(range));
3538 METACHARINC(range);
3539 r2 = STOUC(UNMETA(range));
3540 if (*range == Meta)
3541 range++;
3542 if (r1 <= ch && ch <= r2) {
3543 if (indptr)
3544 *indptr += ch - r1;
3545 return 1;
3547 if (indptr && r1 < r2)
3548 *indptr += r2 - r1;
3549 break;
3550 case PP_UNKWN:
3551 DPUTS(1, "BUG: unknown posix range passed through.\n");
3552 break;
3553 default:
3554 DPUTS(1, "BUG: unknown metacharacter in range.");
3555 break;
3557 } else if (STOUC(*range) == ch) {
3558 if (mtp)
3559 *mtp = 0;
3560 return 1;
3562 if (indptr)
3563 (*indptr)++;
3565 return 0;
3569 /**/
3570 #ifndef MULTIBYTE_SUPPORT
3573 * Identical function to mb_patmatchindex() above for single-byte
3574 * characters. Here -1 represents a character that needs a special type.
3576 * Unlike patmatchrange, we only need this in ZLE, which always
3577 * uses MULTIBYTE_SUPPORT if compiled in; hence we don't use
3578 * this function in that case.
3581 /**/
3582 mod_export int
3583 patmatchindex(char *range, int ind, int *chr, int *mtp)
3585 int r1, r2, rdiff, rchr;
3587 *chr = -1;
3588 *mtp = 0;
3590 for (; *range; range++) {
3591 if (imeta(STOUC(*range))) {
3592 int swtype = STOUC(*range) - STOUC(Meta);
3593 switch (swtype) {
3594 case 0:
3595 /* ordinary metafied character */
3596 rchr = STOUC(*++range) ^ 32;
3597 if (!ind) {
3598 *chr = rchr;
3599 return 1;
3601 break;
3603 case PP_ALPHA:
3604 case PP_ALNUM:
3605 case PP_ASCII:
3606 case PP_BLANK:
3607 case PP_CNTRL:
3608 case PP_DIGIT:
3609 case PP_GRAPH:
3610 case PP_LOWER:
3611 case PP_PRINT:
3612 case PP_PUNCT:
3613 case PP_SPACE:
3614 case PP_UPPER:
3615 case PP_XDIGIT:
3616 case PP_IDENT:
3617 case PP_IFS:
3618 case PP_IFSSPACE:
3619 case PP_WORD:
3620 if (!ind) {
3621 *mtp = swtype;
3622 return 1;
3624 break;
3626 case PP_RANGE:
3627 range++;
3628 r1 = STOUC(UNMETA(range));
3629 METACHARINC(range);
3630 r2 = STOUC(UNMETA(range));
3631 if (*range == Meta)
3632 range++;
3633 rdiff = r2 - r1;
3634 if (rdiff >= ind) {
3635 *chr = r1 + ind;
3636 return 1;
3638 /* note the extra decrement to ind below */
3639 ind -= rdiff;
3640 break;
3641 case PP_UNKWN:
3642 DPUTS(1, "BUG: unknown posix range passed through.\n");
3643 break;
3644 default:
3645 DPUTS(1, "BUG: unknown metacharacter in range.");
3646 break;
3648 } else {
3649 if (!ind) {
3650 *chr = STOUC(*range);
3651 return 1;
3654 if (!ind--)
3655 break;
3658 /* No corresponding index. */
3659 return 0;
3662 /**/
3663 #endif /* MULTIBYTE_SUPPORT */
3666 * Repeatedly match something simple and say how many times.
3667 * charstart is an array parallel to that starting at patinput
3668 * and records the start of (possibly multibyte) characters
3669 * to aid in later backtracking.
3672 /**/
3673 static int patrepeat(Upat p, char *charstart)
3675 int count = 0;
3676 patint_t tch, charmatch_cache;
3677 char *scan, *opnd;
3679 scan = patinput;
3680 opnd = (char *)P_OPERAND(p);
3682 switch(P_OP(p)) {
3683 #ifdef DEBUG
3684 case P_ANY:
3685 dputs("BUG: ?# did not get optimized to *");
3686 return 0;
3687 break;
3688 #endif
3689 case P_EXACTLY:
3690 DPUTS(P_LS_LEN(p) != 1, "closure following more than one character");
3691 tch = CHARREF(P_LS_STR(p), P_LS_STR(p) + P_LS_LEN(p));
3692 while (scan < patinend &&
3693 CHARMATCH_EXPR(CHARREF(scan, patinend), tch)) {
3694 charstart[scan-patinput] = 1;
3695 count++;
3696 CHARINC(scan, patinend);
3698 break;
3699 case P_ANYOF:
3700 case P_ANYBUT:
3701 while (scan < patinend) {
3702 #ifdef MULTIBYTE_SUPPORT
3703 wchar_t cr = CHARREF(scan, patinend);
3704 if (patglobflags & GF_MULTIBYTE) {
3705 if (mb_patmatchrange(opnd, cr, NULL, NULL) ^
3706 (P_OP(p) == P_ANYOF))
3707 break;
3708 } else if (patmatchrange(opnd, (int)cr, NULL, NULL) ^
3709 (P_OP(p) == P_ANYOF))
3710 break;
3711 #else
3712 if (patmatchrange(opnd, CHARREF(scan, patinend), NULL, NULL) ^
3713 (P_OP(p) == P_ANYOF))
3714 break;
3715 #endif
3716 charstart[scan-patinput] = 1;
3717 count++;
3718 CHARINC(scan, patinend);
3720 break;
3721 #ifdef DEBUG
3722 default:
3723 dputs("BUG: something very strange is happening in patrepeat");
3724 return 0;
3725 break;
3726 #endif
3729 patinput = scan;
3730 return count;
3733 /* Free a patprog. */
3735 /**/
3736 mod_export void
3737 freepatprog(Patprog prog)
3739 if (prog && prog != dummy_patprog1 && prog != dummy_patprog2)
3740 zfree(prog, prog->size);