2 * pattern.c - pattern matching
4 * This file is part of zsh, the Z shell.
6 * Copyright (c) 1999 Peter Stephenson
9 * Permission is hereby granted, without written agreement and without
10 * license or royalty fees, to use, copy, modify, and distribute this
11 * software and to distribute modified versions of this software for any
12 * purpose, provided that the above copyright notice and the following
13 * two paragraphs appear in all copies of this software.
15 * In no event shall Peter Stephenson or the Zsh Development Group be liable
16 * to any party for direct, indirect, special, incidental, or consequential
17 * damages arising out of the use of this software and its documentation,
18 * even if Peter Stephenson and the Zsh Development Group have been advised of
19 * the possibility of such damage.
21 * Peter Stephenson and the Zsh Development Group specifically disclaim any
22 * warranties, including, but not limited to, the implied warranties of
23 * merchantability and fitness for a particular purpose. The software
24 * provided hereunder is on an "as is" basis, and Peter Stephenson and the
25 * Zsh Development Group have no obligation to provide maintenance,
26 * support, updates, enhancements, or modifications.
28 * Pattern matching code derived from the regexp library by Henry
29 * Spencer, which has the following copyright.
31 * Copyright (c) 1986 by University of Toronto.
32 * Written by Henry Spencer. Not derived from licensed software.
34 * Permission is granted to anyone to use this software for any
35 * purpose on any computer system, and to redistribute it freely,
36 * subject to the following restrictions:
38 * 1. The author is not responsible for the consequences of use of
39 * this software, no matter how awful, even if they arise
42 * 2. The origin of this software must not be misrepresented, either
43 * by explicit claim or by omission.
45 * 3. Altered versions must be plainly marked as such, and must not
46 * be misrepresented as being the original software.
48 * Eagle-eyed readers will notice this is an altered version. Incredibly
49 * sharp-eyed readers might even find bits that weren't altered.
52 * And I experienced a sense that, like certain regular
53 * expressions, seemed to match the day from beginning to end, so
54 * that I did not need to identify the parenthesised subexpression
55 * that told of dawn, nor the group of characters that indicated
56 * the moment when my grandfather returned home with news of
57 * Swann's departure for Paris; and the whole length of the month
58 * of May, as if matched by a closure, fitted into the buffer of my
59 * life with no sign of overflowing, turning the days, like a
60 * procession of insects that could consist of this or that
61 * species, into a random and unstructured repetition of different
62 * sequences, anchored from the first day of the month to the last
63 * in the same fashion as the weeks when I knew I would not see
64 * Gilberte and would search in vain for any occurrences of the
65 * string in the avenue of hawthorns by Tansonville, without my
66 * having to delimit explicitly the start or finish of the pattern.
68 * M. Proust, "In Search of Lost Files",
69 * bk I, "The Walk by Bourne's Place".
75 * The following union is used mostly for alignment purposes.
76 * Normal nodes are longs, while certain nodes take a char * as an argument;
77 * here we make sure that they both work out to the same length.
78 * The compiled regexp we construct consists of upats stuck together;
79 * anything else to be added (strings, numbers) is stuck after and
80 * then aligned to a whole number of upat units.
82 * Note also that offsets are in terms of the sizes of these things.
89 typedef union upat
*Upat
;
91 #include "pattern.pro"
93 /* Number of active parenthesized expressions allowed in backreferencing */
96 /* definition number opnd? meaning */
97 #define P_END 0x00 /* no End of program. */
98 #define P_EXCSYNC 0x01 /* no Test if following exclude already failed */
99 #define P_EXCEND 0x02 /* no Test if exclude matched orig branch */
100 #define P_BACK 0x03 /* no Match "", "next" ptr points backward. */
101 #define P_EXACTLY 0x04 /* lstr Match this string. */
102 #define P_NOTHING 0x05 /* no Match empty string. */
103 #define P_ONEHASH 0x06 /* node Match this (simple) thing 0 or more times. */
104 #define P_TWOHASH 0x07 /* node Match this (simple) thing 1 or more times. */
105 #define P_GFLAGS 0x08 /* long Match nothing and set globbing flags */
106 #define P_ISSTART 0x09 /* no Match start of string. */
107 #define P_ISEND 0x0a /* no Match end of string. */
108 #define P_COUNTSTART 0x0b /* no Initialise P_COUNT */
109 #define P_COUNT 0x0c /* 3*long uc* node Match a number of repetitions */
110 /* numbered so we can test bit 5 for a branch */
111 #define P_BRANCH 0x20 /* node Match this alternative, or the next... */
112 #define P_WBRANCH 0x21 /* uc* node P_BRANCH, but match at least 1 char */
113 /* excludes are also branches, but have bit 4 set, too */
114 #define P_EXCLUDE 0x30 /* uc* node Exclude this from previous branch */
115 #define P_EXCLUDP 0x31 /* uc* node Exclude, using full file path so far */
116 /* numbered so we can test bit 6 so as not to match initial '.' */
117 #define P_ANY 0x40 /* no Match any one character. */
118 #define P_ANYOF 0x41 /* str Match any character in this string. */
119 #define P_ANYBUT 0x42 /* str Match any character not in this string. */
120 #define P_STAR 0x43 /* no Match any set of characters. */
121 #define P_NUMRNG 0x44 /* zr, zr Match a numeric range. */
122 #define P_NUMFROM 0x45 /* zr Match a number >= X */
123 #define P_NUMTO 0x46 /* zr Match a number <= X */
124 #define P_NUMANY 0x47 /* no Match any set of decimal digits */
125 /* spaces left for P_OPEN+n,... for backreferences */
126 #define P_OPEN 0x80 /* no Mark this point in input as start of n. */
127 #define P_CLOSE 0x90 /* no Analogous to OPEN. */
130 * zr the range type zrange_t: may be zlong or unsigned long
132 * uc* a pointer to unsigned char, used at run time and initialised
134 * str null-terminated, metafied string
135 * lstr length as long then string, not null-terminated, unmetafied.
140 * P_WBRANCH: This works like a branch and is used in complex closures,
141 * to ensure we don't succeed on a zero-length match of the pattern,
142 * since that would cause an infinite loop. We do this by recording
143 * the positions where we have already tried to match. See the
144 * P_WBRANCH test in patmatch().
146 * P_ANY, P_ANYOF: the operand is a null terminated
147 * string. Normal characters match as expected. Characters
148 * in the range Meta+PP_ALPHA..Meta+PP_UNKNWN do the appropriate
149 * Posix range tests. This relies on imeta returning true for these
150 * characters. We treat unknown POSIX ranges as never matching.
151 * PP_RANGE means the next two (possibly metafied) characters form
152 * the limits of a range to test; it's too much like hard work to
155 * P_EXCLUDE, P_EXCSYNC, PEXCEND: P_EXCLUDE appears in the pattern like
156 * P_BRANCH, but applies to the immediately preceding branch. The code in
157 * the corresponding branch is followed by a P_EXCSYNC, which simply
158 * acts as a marker that a P_EXCLUDE comes next. The P_EXCLUDE
159 * has a pointer to char embeded in it, which works
160 * like P_WBRANCH: if we get to the P_EXCSYNC, and we already matched
161 * up to the same position, fail. Thus we are forced to backtrack
162 * on closures in the P_BRANCH if the first attempt was excluded.
163 * Corresponding to P_EXCSYNC in the original branch, there is a
164 * P_EXCEND in the exclusion. If we get to this point, and we did
165 * *not* match in the original branch, the exclusion itself fails,
166 * otherwise it succeeds since we know the tail already matches,
167 * so P_EXCEND is the end of the exclusion test.
168 * The whole sorry mess looks like this, where the upper lines
169 * show the linkage of the branches, and the lower shows the linkage
170 * of their pattern arguments.
172 * --------------------- ----------------------
174 * ( <BRANCH>:apat-><EXCSYNC> <EXCLUDE>:excpat-><EXCEND> ) tail
177 * --------------------------------------
179 * P_EXCLUDP: this behaves exactly like P_EXCLUDE, with the sole exception
180 * that we prepend the path so far to the exclude pattern. This is
181 * for top level file globs, e.g. ** / *.c~*foo.c
182 * ^ I had to leave this space
183 * P_NUM*: zl is a zlong if that is 64-bit, else an unsigned long.
185 * P_COUNTSTART, P_COUNT: a P_COUNTSTART flags the start of a quantified
186 * closure (#cN,M) and is used to initialise the count. Executing
187 * the pattern leads back to the P_COUNT, while the next links of the
188 * P_COUNTSTART and P_COUNT lead to the tail of the pattern:
192 * <COUNTSTART><COUNT>pattern<BACK> tail
194 * ------------------------
197 #define P_OP(p) ((p)->l & 0xff)
198 #define P_NEXT(p) ((p)->l >> 8)
199 #define P_OPERAND(p) ((p) + 1)
200 #define P_ISBRANCH(p) ((p)->l & 0x20)
201 #define P_ISEXCLUDE(p) (((p)->l & 0x30) == 0x30)
202 #define P_NOTDOT(p) ((p)->l & 0x40)
204 /* Specific to lstr type, i.e. P_EXACTLY. */
205 #define P_LS_LEN(p) ((p)[1].l) /* can be used as lvalue */
206 #define P_LS_STR(p) ((char *)((p) + 2))
208 /* Specific to P_COUNT: arguments as offset in nodes from operator */
209 #define P_CT_CURRENT (1) /* Current count */
210 #define P_CT_MIN (2) /* Minimum count */
211 #define P_CT_MAX (3) /* Maximum count, -1 for none */
212 #define P_CT_PTR (4) /* Pointer to last match start */
213 #define P_CT_OPERAND (5) /* Operand of P_COUNT */
215 /* Flags needed when pattern is executed */
216 #define P_SIMPLE 0x01 /* Simple enough to be #/## operand. */
217 #define P_HSTART 0x02 /* Starts with # or ##'d pattern. */
218 #define P_PURESTR 0x04 /* Can be matched with a strcmp */
220 #if defined(ZSH_64_BIT_TYPE) || defined(LONG_IS_64_BIT)
221 typedef zlong zrange_t
;
222 #define ZRANGE_T_IS_SIGNED (1)
224 typedef unsigned long zrange_t
;
228 * Characters which terminate a pattern segment. We actually use
229 * a pointer patendseg which skips the first character if we are not
230 * parsing a file pattern.
231 * Note that the size of this and the next array are hard-wired
232 * via the definitions.
235 static char endseg
[] = {
237 '\0', Bar
, Outpar
, /* all patterns */
238 Tilde
/* extended glob only */
241 #define PATENDSEGLEN_NORM 4
242 #define PATENDSEGLEN_EXT 5
244 /* Characters which terminate a simple string */
246 static char endstr
[] = {
248 '\0', Bar
, Outpar
, Quest
, Star
, Inbrack
, Inpar
, Inang
, Bnullkeep
,
250 Tilde
, Hat
, Pound
/* extended glob only */
253 #define PATENDSTRLEN_NORM 10
254 #define PATENDSTRLEN_EXT 13
257 /* Default size for pattern buffer */
258 #define P_DEF_ALLOC 256
260 /* Flags used in compilation */
261 static char *patstart
, *patparse
; /* input pointers */
262 static int patnpar
; /* () count */
263 static char *patcode
; /* point of code emission */
264 static long patsize
; /* size of code */
265 static char *patout
; /* start of code emission string */
266 static long patalloc
; /* size allocated for same */
267 static char *patendseg
; /* characters ending segment */
268 static int patendseglen
; /* length of same */
269 static char *patendstr
; /* characters ending plain string */
270 static int patendstrlen
; /* length of sameo */
272 /* Flags used in both compilation and execution */
273 static int patflags
; /* flags passed down to patcompile */
274 static int patglobflags
; /* globbing flags & approx */
277 * Increment pointer to metafied multibyte string.
279 #ifdef MULTIBYTE_SUPPORT
280 typedef wint_t patint_t
;
284 #define METACHARINC(x) ((void)metacharinc(&x))
287 * TODO: the shiftstate isn't well handled; we don't guarantee
288 * to maintain it properly between characters. If we don't
289 * need it we should use mbtowc() instead.
291 static mbstate_t shiftstate
;
294 * Multibyte version: it's (almost) as easy to return the
295 * value as not, so do so since we sometimes need it..
298 metacharinc(char **x
)
302 size_t ret
= MB_INVALID
;
306 * Cheat if the top bit isn't set. This is second-guessing
307 * the library, but we know for sure that if the character
308 * set doesn't have the property that all bytes with the 8th
309 * bit clear are single characters then we are stuffed.
311 if (!(patglobflags
& GF_MULTIBYTE
) || !(STOUC(*inptr
) & 0x80))
314 inchar
= ztokens
[*inptr
++ - Pound
];
315 else if (*inptr
== Meta
) {
317 inchar
= *inptr
++ ^ 32;
322 return (wchar_t)STOUC(inchar
);
327 inchar
= ztokens
[*inptr
++ - Pound
];
328 else if (*inptr
== Meta
) {
330 inchar
= *inptr
++ ^ 32;
334 ret
= mbrtowc(&wc
, &inchar
, 1, &shiftstate
);
336 if (ret
== MB_INVALID
)
338 if (ret
== MB_INCOMPLETE
)
344 /* Error. Treat as single byte. */
345 /* Reset the shift state for next time. */
346 memset(&shiftstate
, 0, sizeof(shiftstate
));
347 return (wchar_t) STOUC(*(*x
)++);
351 typedef int patint_t
;
355 #define METACHARINC(x) ((void)((x) += (*(x) == Meta) ? 2 : 1))
359 * Return unmetafied char from string (x is any char *).
360 * Used with MULTIBYTE_SUPPORT if the GF_MULTIBYTE is not
363 #define UNMETA(x) (*(x) == Meta ? (x)[1] ^ 32 : *(x))
365 /* Add n more characters, ensuring there is enough space. */
374 patadd(char *add
, int ch
, long n
, int paflags
)
376 /* Make sure everything gets aligned unless we get PA_NOALIGN. */
377 long newpatsize
= patsize
+ n
;
378 if (!(paflags
& PA_NOALIGN
))
379 newpatsize
= (newpatsize
+ sizeof(union upat
) - 1) &
380 ~(sizeof(union upat
) - 1);
381 if (patalloc
< newpatsize
) {
383 2*(newpatsize
> patalloc
? newpatsize
: patalloc
);
384 patout
= (char *)zrealloc((char *)patout
, newpatalloc
);
385 patcode
= patout
+ patsize
;
386 patalloc
= newpatalloc
;
388 patsize
= newpatsize
;
390 if (paflags
& PA_UNMETA
) {
392 * Unmetafy and untokenize the string as we go.
393 * The Meta characters in add aren't counted in n.
397 *patcode
++ = ztokens
[*add
++ - Pound
];
398 else if (*add
== Meta
) {
400 *patcode
++ = *add
++ ^ 32;
411 patcode
= patout
+ patsize
;
415 /* operates on pointers to union upat, returns a pointer */
416 #define PATNEXT(p) ((rn_offs = P_NEXT(p)) ? \
417 (P_OP(p) == P_BACK) ? \
418 ((p)-rn_offs) : ((p)+rn_offs) : NULL)
420 /* Called before parsing a set of file matchs to initialize flags */
429 patglobflags
= GF_IGNCASE
;
430 if (isset(MULTIBYTE
))
431 patglobflags
|= GF_MULTIBYTE
;
435 * Top level pattern compilation subroutine
436 * exp is a null-terminated, metafied string.
437 * inflags is an or of some PAT_* flags.
438 * endexp, if non-null, is set to a pointer to the end of the
439 * part of exp which was compiled. This is used when
440 * compiling patterns for directories which must be
441 * matched recursively.
446 patcompile(char *exp
, int inflags
, char **endexp
)
452 char *lng
, *strp
= NULL
;
455 startoff
= sizeof(struct patprog
);
456 /* Ensure alignment of start of program string */
457 startoff
= (startoff
+ sizeof(union upat
) - 1) & ~(sizeof(union upat
) - 1);
459 /* Allocate reasonable sized chunk if none, reduce size if too big */
460 if (patalloc
!= P_DEF_ALLOC
)
461 patout
= (char *)zrealloc(patout
, patalloc
= P_DEF_ALLOC
);
462 patcode
= patout
+ startoff
;
463 patsize
= patcode
- patout
;
464 patstart
= patparse
= exp
;
466 * Note global patnpar numbers parentheses 1..9, while patnpar
467 * in struct is actual count of parentheses.
470 patflags
= inflags
& ~(PAT_PURES
|PAT_HAS_EXCLUDP
);
473 patendseglen
= isset(EXTENDEDGLOB
) ? PATENDSEGLEN_EXT
: PATENDSEGLEN_NORM
;
475 patendstrlen
= isset(EXTENDEDGLOB
) ? PATENDSTRLEN_EXT
: PATENDSTRLEN_NORM
;
477 if (!(patflags
& PAT_FILE
)) {
482 remnulargs(patparse
);
483 if (isset(MULTIBYTE
))
484 patglobflags
= GF_MULTIBYTE
;
488 if (patflags
& PAT_LCMATCHUC
)
489 patglobflags
|= GF_LCMATCHUC
;
491 * Have to be set now, since they get updated during compilation.
493 ((Patprog
)patout
)->globflags
= patglobflags
;
495 if (!(patflags
& PAT_ANY
)) {
496 /* Look for a really pure string, with no tokens at all. */
497 if (!(patglobflags
& ~GF_MULTIBYTE
)
500 * If the OS treats files case-insensitively and we
501 * are looking at files, we don't need to use pattern
502 * matching to find the file.
504 || (!(patglobflags
& ~GF_IGNCASE
) && (patflags
& PAT_FILE
))
509 * Waah! I wish I understood this.
510 * Empty metafied strings have an initial Nularg.
511 * This never corresponds to a real character in
512 * a glob pattern or string, so skip it.
516 for (strp
= exp
; *strp
&&
517 (!(patflags
& PAT_FILE
) || *strp
!= '/') && !itok(*strp
);
521 if (!strp
|| (*strp
&& *strp
!= '/')) {
522 /* No, do normal compilation. */
524 if (patcompswitch(0, &flags
) == 0)
528 * Yes, copy the string, and skip compilation altogether.
529 * Null terminate for the benefit of globbing.
530 * Leave metafied both for globbing and for our own
535 patadd(exp
, 0, len
+ 1, 0);
536 patout
[startoff
+ len
] = '\0';
537 patflags
|= PAT_PURES
;
541 /* end of compilation: safe to use pointers */
543 p
->startoff
= startoff
;
544 p
->patstartch
= '\0';
545 p
->globend
= patglobflags
;
550 p
->patnpar
= patnpar
-1;
553 pscan
= (Upat
)(patout
+ startoff
);
555 if (!(patflags
& PAT_ANY
) && P_OP(PATNEXT(pscan
)) == P_END
) {
556 /* only one top level choice */
557 pscan
= P_OPERAND(pscan
);
559 if (flags
& P_PURESTR
) {
561 * The pattern can be matched with a simple strncmp/strcmp.
562 * Careful in case we've overwritten the node for the next ptr.
564 char *dst
= patout
+ startoff
;
566 p
->flags
|= PAT_PURES
;
567 for (; pscan
; pscan
= next
) {
568 next
= PATNEXT(pscan
);
569 if (P_OP(pscan
) == P_EXACTLY
) {
570 char *opnd
= P_LS_STR(pscan
), *mtest
;
571 long oplen
= P_LS_LEN(pscan
), ilen
;
574 * Unfortunately we unmetafied the string
575 * and we need to put any metacharacters
576 * back now we know it's a pure string.
577 * This shouldn't happen too often, it's
578 * just that there are some cases such
579 * as . and .. in files where we really
580 * need a pure string even if there are
581 * pattern characters flying around.
583 for (mtest
= opnd
, ilen
= oplen
; ilen
;
588 char *oldpatout
= patout
;
589 patadd(NULL
, 0, nmeta
, 0);
594 opnd
= patout
+ (opnd
- oldpatout
);
595 dst
= patout
+ startoff
;
601 *dst
++ = *opnd
++ ^ 32;
608 p
->size
= dst
- patout
;
609 /* patmlen is really strlen. We don't need a null. */
610 p
->patmlen
= p
->size
- startoff
;
612 /* starting point info */
613 if (P_OP(pscan
) == P_EXACTLY
&& !p
->globflags
&&
615 p
->patstartch
= *P_LS_STR(pscan
);
617 * Find the longest literal string in something expensive.
618 * This is itself not all that cheap if we have
619 * case-insensitive matching or approximation, so don't.
621 if ((flags
& P_HSTART
) && !p
->globflags
) {
624 for (; pscan
; pscan
= PATNEXT(pscan
))
625 if (P_OP(pscan
) == P_EXACTLY
&&
626 P_LS_LEN(pscan
) >= len
) {
627 lng
= P_LS_STR(pscan
);
628 len
= P_LS_LEN(pscan
);
631 p
->mustoff
= lng
- patout
;
640 * The pattern was compiled in a fixed buffer: unless told otherwise,
641 * we stick the compiled pattern on the heap. This is necessary
642 * for files where we will often be compiling multiple segments at once.
643 * But if we get the ZDUP flag we always put it in zalloc()ed memory.
645 if (patflags
& PAT_ZDUP
) {
646 Patprog newp
= (Patprog
)zalloc(patsize
);
647 memcpy((char *)newp
, (char *)p
, patsize
);
649 } else if (!(patflags
& PAT_STATIC
)) {
650 Patprog newp
= (Patprog
)zhalloc(patsize
);
651 memcpy((char *)newp
, (char *)p
, patsize
);
661 * Main body or parenthesized subexpression in pattern
662 * Parenthesis (and any ksh_glob gubbins) will have been removed.
667 patcompswitch(int paren
, int *flagp
)
669 long starter
, br
, ender
, excsync
= 0;
671 int flags
, gfchanged
= 0, savglobflags
= patglobflags
;
676 if (paren
&& (patglobflags
& GF_BACKREF
) && patnpar
<= NSUBEXP
) {
678 * parenthesized: make an open node.
679 * We can only refer to the first nine parentheses.
680 * For any others, we just use P_OPEN on its own; there's
681 * no gain in arbitrarily limiting the number of parentheses.
684 starter
= patnode(P_OPEN
+ parno
);
688 br
= patnode(P_BRANCH
);
689 if (!patcompbranch(&flags
))
691 if (patglobflags
!= savglobflags
)
694 pattail(starter
, br
);
698 *flagp
|= flags
& (P_HSTART
|P_PURESTR
);
700 while (*patparse
== Bar
||
701 (isset(EXTENDEDGLOB
) && *patparse
== Tilde
&&
702 (patparse
[1] == '/' ||
703 !memchr(patendseg
, patparse
[1], patendseglen
)))) {
704 int tilde
= *patparse
++ == Tilde
;
705 long gfnode
= 0, newbr
;
707 *flagp
&= ~P_PURESTR
;
711 /* excsync remembers the P_EXCSYNC node before a chain of
712 * exclusions: all point back to this. only the
713 * original (non-excluded) branch gets a trailing P_EXCSYNC.
716 excsync
= patnode(P_EXCSYNC
);
717 patoptail(br
, excsync
);
720 * By default, approximations are turned off in exclusions:
721 * we need to do this here as otherwise the code compiling
722 * the exclusion doesn't know if the flags have really
723 * changed if the error count gets restored.
725 patglobflags
&= ~0xff;
726 if (!(patflags
& PAT_FILET
) || paren
) {
727 br
= patnode(P_EXCLUDE
);
730 * At top level (paren == 0) in a file glob !(patflags
731 * &PAT_FILET) do the exclusion prepending the file path
732 * so far. We need to flag this to avoid unnecessarily
735 br
= patnode(P_EXCLUDP
);
736 patflags
|= PAT_HAS_EXCLUDP
;
739 patadd((char *)&up
, 0, sizeof(up
), 0);
740 /* / is not treated as special if we are at top level */
741 if (!paren
&& *patendseg
== '/') {
750 br
= patnode(P_BRANCH
);
752 * The position of the following statements means globflags
753 * set in the main branch carry over to the exclusion.
757 if (((Patprog
)patout
)->globflags
) {
759 * If at top level, we need to reinitialize flags to zero,
760 * since (#i)foo|bar only applies to foo and we stuck
761 * the #i into the global flags.
762 * We could have done it so that they only got set in the
763 * first branch, but it's quite convenient having any
764 * global flags set in the header and not buried in the
765 * pattern. (Or maybe it isn't and we should
766 * forget this bit and always stick in an explicit GFLAGS
767 * statement instead of using the header.)
768 * Also, this can't happen for file globs where there are
771 * No gfchanged, as nothing to follow branch at top
775 gfnode
= patnode(P_GFLAGS
);
777 patadd((char *)&up
, 0, sizeof(union upat
), 0);
780 patglobflags
= savglobflags
;
783 newbr
= patcompbranch(&flags
);
785 /* restore special treatment of / */
794 pattail(gfnode
, newbr
);
795 if (!tilde
&& patglobflags
!= savglobflags
)
797 pattail(starter
, br
);
799 patoptail(br
, patnode(P_EXCEND
));
800 *flagp
|= flags
& P_HSTART
;
804 * Make a closing node, hooking it to the end.
805 * Note that we can't optimize P_NOTHING out here, since another
806 * branch at that point would indicate the current choices continue,
809 ender
= patnode(paren
? parno
? P_CLOSE
+parno
: P_NOTHING
: P_END
);
810 pattail(starter
, ender
);
813 * Hook the tails of the branches to the closing node,
814 * except for exclusions which terminate where they are.
816 for (ptr
= (Upat
)patout
+ starter
; ptr
; ptr
= PATNEXT(ptr
))
817 if (!P_ISEXCLUDE(ptr
))
818 patoptail(ptr
-(Upat
)patout
, ender
);
820 /* check for proper termination */
821 if ((paren
&& *patparse
++ != Outpar
) ||
822 (!paren
&& *patparse
&&
823 !((patflags
& PAT_FILE
) && *patparse
== '/')))
826 if (paren
&& gfchanged
) {
828 * Restore old values of flags when leaving parentheses.
829 * gfchanged detects a change in any branch (except exclusions
830 * which are separate), since we need to emit this even if
831 * a later branch happened to put the flags back.
833 pattail(ender
, patnode(P_GFLAGS
));
834 patglobflags
= savglobflags
;
835 patadd((char *)&savglobflags
, 0, sizeof(long), 0);
842 * Compile something ended by Bar, Outpar, Tilde, or end of string.
843 * Note the BRANCH or EXCLUDE tag must already have been omitted:
844 * this returns the position of the operand of that.
849 patcompbranch(int *flagp
)
851 long chain
, latest
= 0, starter
;
857 while (!memchr(patendseg
, *patparse
, patendseglen
) ||
858 (*patparse
== Tilde
&& patparse
[1] != '/' &&
859 memchr(patendseg
, patparse
[1], patendseglen
))) {
860 if (isset(EXTENDEDGLOB
) &&
862 (*patparse
== Inpar
&& patparse
[1] == Pound
)) ||
863 (isset(KSHGLOB
) && *patparse
== '@' && patparse
[1] == Inpar
&&
864 patparse
[2] == Pound
))) {
865 /* Globbing flags. */
866 char *pp1
= patparse
;
867 int oldglobflags
= patglobflags
, ignore
;
869 patparse
+= (*patparse
== '@') ? 3 : 2;
870 if (!patgetglobflags(&patparse
, &assert, &ignore
))
875 * Start/end assertion looking like flags, but
876 * actually handled as a normal node
878 latest
= patnode(assert);
881 if (pp1
== patstart
) {
882 /* Right at start of pattern, the simplest case.
883 * Put them into the flags and don't emit anything.
885 ((Patprog
)patout
)->globflags
= patglobflags
;
887 } else if (!*patparse
) {
888 /* Right at the end, so just leave the flags for
889 * the next Patprog in the chain to pick up.
894 * Otherwise, we have to stick them in as a pattern
897 if (oldglobflags
!= patglobflags
) {
900 latest
= patnode(P_GFLAGS
);
902 patadd((char *)&up
, 0, sizeof(union upat
), 0);
908 } else if (!*patparse
)
912 } else if (isset(EXTENDEDGLOB
) && *patparse
== Hat
) {
914 * ^pat: anything but pat. For proper backtracking,
915 * etc., we turn this into (*~pat), except without the
919 latest
= patcompnot(0, &flags
);
921 latest
= patcomppiece(&flags
);
926 if (!(flags
& P_PURESTR
))
927 *flagp
&= ~P_PURESTR
;
929 *flagp
|= flags
& P_HSTART
;
931 pattail(chain
, latest
);
934 /* check if there was nothing in the loop, i.e. () */
936 starter
= patnode(P_NOTHING
);
941 /* get glob flags, return 1 for success, 0 for failure */
945 patgetglobflags(char **strp
, long *assertp
, int *ignore
)
947 char *nptr
, *ptr
= *strp
;
952 /* (#X): assumes we are still positioned on the first X */
953 for (; *ptr
&& *ptr
!= Outpar
; ptr
++) {
955 /* Glob qualifiers, ignored in pattern code */
956 while (*ptr
&& *ptr
!= Outpar
)
963 /* Approximate matching, max no. of errors follows */
964 ret
= zstrtol(++ptr
, &nptr
, 10);
966 * We can't have more than 254, because we need 255 to
967 * mark 254 errors in wbranch and exclude sync strings
968 * (hypothetically --- hope no-one tries it).
970 if (ret
< 0 || ret
> 254 || ptr
== nptr
)
972 patglobflags
= (patglobflags
& ~0xff) | (ret
& 0xff);
977 /* Lowercase in pattern matches lower or upper in target */
978 patglobflags
= (patglobflags
& ~GF_IGNCASE
) | GF_LCMATCHUC
;
982 /* Fully case insensitive */
983 patglobflags
= (patglobflags
& ~GF_LCMATCHUC
) | GF_IGNCASE
;
987 /* Restore case sensitivity */
988 patglobflags
&= ~(GF_LCMATCHUC
|GF_IGNCASE
);
992 /* Make backreferences */
993 patglobflags
|= GF_BACKREF
;
997 /* Don't make backreferences */
998 patglobflags
&= ~GF_BACKREF
;
1002 /* Make references to complete match */
1003 patglobflags
|= GF_MATCHREF
;
1008 patglobflags
&= ~GF_MATCHREF
;
1012 *assertp
= P_ISSTART
;
1020 patglobflags
|= GF_MULTIBYTE
;
1024 patglobflags
&= ~GF_MULTIBYTE
;
1034 /* Start/end assertions must appear on their own. */
1035 if (*assertp
&& (*strp
)[1] != Outpar
)
1042 static const char *colon_stuffs
[] = {
1043 "alpha", "alnum", "ascii", "blank", "cntrl", "digit", "graph",
1044 "lower", "print", "punct", "space", "upper", "xdigit", "IDENT",
1045 "IFS", "IFSSPACE", "WORD", NULL
1049 * Handle the guts of a [:stuff:] character class element.
1050 * start is the beginning of "stuff" and len is its length.
1051 * This code is exported for the benefit of completion matching.
1056 range_type(char *start
, int len
)
1060 for (csp
= colon_stuffs
; *csp
; csp
++) {
1061 if (!strncmp(start
, *csp
, len
))
1062 return (csp
- colon_stuffs
) + PP_FIRST
;
1070 * Convert the contents of a [...] or [^...] expression (just the
1071 * ... part) back into a string. This is used by compfiles -p/-P
1072 * for some reason. The compiled form (a metafied string) is
1073 * passed in rangestr.
1075 * If outstr is non-NULL the compiled form is placed there. It
1076 * must be sufficiently long. A terminating NULL is appended.
1078 * Return the length required, not including the terminating NULL.
1080 * TODO: this is non-multibyte for now. It will need to be defined
1081 * appropriately with MULTIBYTE_SUPPORT when the completion matching
1087 pattern_range_to_string(char *rangestr
, char *outstr
)
1092 if (imeta(STOUC(*rangestr
))) {
1093 int swtype
= STOUC(*rangestr
) - STOUC(Meta
);
1096 /* Ordindary metafied character */
1100 *outstr
++ = rangestr
[1] ^ 32;
1104 } else if (swtype
== PP_RANGE
) {
1108 for (i
= 0; i
< 2; i
++) {
1109 if (*rangestr
== Meta
) {
1112 *outstr
++ = rangestr
[1];
1118 *outstr
++ = *rangestr
;
1129 } else if (swtype
>= PP_FIRST
&& swtype
<= PP_LAST
) {
1130 /* [:stuff:]; we need to output [: and :] */
1131 const char *found
= colon_stuffs
[swtype
- PP_FIRST
];
1132 int newlen
= strlen(found
);
1134 strcpy(outstr
, "[:");
1136 memcpy(outstr
, found
, newlen
);
1138 strcpy(outstr
, ":]");
1144 /* shouldn't happen */
1145 DPUTS(1, "BUG: unknown PP_ code in pattern range");
1149 /* ordinary character, guaranteed no Meta handling needed */
1151 *outstr
++ = *rangestr
;
1163 * compile a chunk such as a literal string or a [...] followed
1164 * by a possible hash operator
1169 patcomppiece(int *flagp
)
1171 long starter
= 0, next
, op
, opnd
;
1172 int flags
, flags2
, kshchar
, len
, ch
, patch
, nmeta
;
1175 char *nptr
, *str0
, *ptr
, *patprev
;
1180 str0
= patprev
= patparse
;
1183 * Check if we have a string. First, we need to make sure
1184 * the string doesn't introduce a ksh-like parenthesized expression.
1187 if (isset(KSHGLOB
) && *patparse
&& patparse
[1] == Inpar
) {
1188 if (strchr("?*+!@", *patparse
))
1189 kshchar
= STOUC(*patparse
);
1190 else if (*patparse
== Star
|| *patparse
== Quest
)
1191 kshchar
= STOUC(ztokens
[*patparse
- Pound
]);
1195 * End of string (or no string at all) if ksh-type parentheses,
1196 * or special character, unless that character is a tilde and
1197 * the character following is an end-of-segment character. Thus
1198 * tildes are not special if there is nothing following to
1201 if (kshchar
|| (memchr(patendstr
, *patparse
, patendstrlen
) &&
1202 (*patparse
!= Tilde
||
1203 patparse
[1] == '/' ||
1204 !memchr(patendseg
, patparse
[1], patendseglen
))))
1207 /* Remember the previous character for backtracking */
1209 METACHARINC(patparse
);
1212 if (patparse
> str0
) {
1213 long slen
= patparse
- str0
;
1216 /* Ordinary string: cancel kshchar lookahead */
1219 * Assume it matches a simple string until we find otherwise.
1222 DPUTS(patparse
== str0
, "BUG: matched nothing in patcomppiece.");
1223 /* more than one character matched? */
1224 morelen
= (patprev
> str0
);
1226 * If we have more than one character, a following hash
1227 * or (#c...) only applies to the last, so backtrack one character.
1229 if (isset(EXTENDEDGLOB
) &&
1230 (*patparse
== Pound
||
1231 (*patparse
== Inpar
&& patparse
[1] == Pound
&&
1232 patparse
[2] == 'c')) && morelen
)
1235 * If len is 1, we can't have an active # following, so doesn't
1236 * matter that we don't make X in `XX#' simple.
1240 starter
= patnode(P_EXACTLY
);
1242 /* Get length of string without metafication. */
1244 /* inherited from domatch, but why, exactly? */
1245 if (*str0
== Nularg
)
1247 for (ptr
= str0
; ptr
< patparse
; ptr
++) {
1253 slen
= (patparse
- str0
) - nmeta
;
1254 /* First add length, which is a long */
1255 patadd((char *)&slen
, 0, sizeof(long), 0);
1257 * Then the string, not null terminated.
1258 * Unmetafy and untokenize; pass the final length,
1259 * which is what we need to allocate, i.e. not including
1260 * a count for each Meta in the string.
1262 patadd(str0
, 0, slen
, PA_UNMETA
);
1263 nptr
= P_LS_STR((Upat
)patout
+ starter
);
1265 * It's much simpler to turn off pure string mode for
1266 * any case-insensitive or approximate matching; usually,
1267 * that is correct, or they wouldn't have been turned on.
1268 * However, we need to make sure we match a "." or ".."
1269 * in a file name as a pure string. There's a minor bug
1270 * that this will also apply to something like
1271 * ..(#a1).. (i.e. the (#a1) has no effect), but if you're
1272 * going to write funny patterns, you get no sympathy from me.
1277 * As above: don't use pattern matching for files
1278 * just because of case insensitivity if file system
1279 * is known to be case insensitive.
1281 * This is known to be necessary in at least one case:
1282 * if "mount -c /" is in effect, so that drives appear
1283 * directly under / instead of the usual /cygdrive, they
1284 * aren't shown by readdir(). So it's vital we don't use
1285 * globbing to find "/c", since that'll fail.
1287 ((patflags
& PAT_FILE
) ?
1288 (0xFF|GF_LCMATCHUC
) :
1289 (0xFF|GF_LCMATCHUC
|GF_IGNCASE
))
1291 (0xFF|GF_LCMATCHUC
|GF_IGNCASE
)
1294 if (!(patflags
& PAT_FILE
))
1295 flags
&= ~P_PURESTR
;
1296 else if (!(nptr
[0] == '.' &&
1297 (slen
== 1 || (nptr
[1] == '.' && slen
== 2))))
1298 flags
&= ~P_PURESTR
;
1305 METACHARINC(patparse
);
1309 starter
= patnode(P_ANY
);
1312 /* kshchar is used as a sign that we can't have #'s. */
1314 starter
= patnode(P_STAR
);
1318 if (*patparse
== Hat
|| *patparse
== '^' || *patparse
== '!') {
1320 starter
= patnode(P_ANYBUT
);
1322 starter
= patnode(P_ANYOF
);
1323 if (*patparse
== Outbrack
) {
1325 patadd(NULL
, ']', 1, PA_NOALIGN
);
1327 while (*patparse
&& *patparse
!= Outbrack
) {
1328 /* Meta is not a token */
1329 if (*patparse
== Inbrack
&& patparse
[1] == ':' &&
1330 (nptr
= strchr(patparse
+2, ':')) &&
1331 nptr
[1] == Outbrack
) {
1334 len
= nptr
- patparse
;
1335 ch
= range_type(patparse
, len
);
1336 patparse
= nptr
+ 2;
1338 patadd(NULL
, STOUC(Meta
) + ch
, 1, PA_NOALIGN
);
1341 charstart
= patparse
;
1342 METACHARINC(patparse
);
1344 if (*patparse
== '-' && patparse
[1] &&
1345 patparse
[1] != Outbrack
) {
1346 patadd(NULL
, STOUC(Meta
)+PP_RANGE
, 1, PA_NOALIGN
);
1347 if (itok(*charstart
)) {
1348 patadd(0, STOUC(ztokens
[*charstart
- Pound
]), 1,
1351 patadd(charstart
, 0, patparse
-charstart
, PA_NOALIGN
);
1353 charstart
= ++patparse
; /* skip ASCII '-' */
1354 METACHARINC(patparse
);
1356 if (itok(*charstart
)) {
1357 patadd(0, STOUC(ztokens
[*charstart
- Pound
]), 1,
1360 patadd(charstart
, 0, patparse
-charstart
, PA_NOALIGN
);
1363 if (*patparse
!= Outbrack
)
1366 /* terminate null string and fix alignment */
1367 patadd(NULL
, 0, 1, 0);
1370 /* is this how to treat parentheses in SHGLOB? */
1371 if (isset(SHGLOB
) && !kshchar
)
1373 if (kshchar
== '!') {
1374 /* This is nasty, we should really either handle all
1375 * kshglobbing below or here. But most of the
1376 * others look like non-ksh patterns, while this one
1377 * doesn't, so we handle it here and leave the rest.
1378 * We treat it like an extendedglob ^, except that
1379 * it goes into parentheses.
1381 * If we did do kshglob here, we could support
1382 * the old behaviour that things like !(foo)##
1383 * work, but it makes the code more complicated at
1384 * the expense of allowing the user to do things
1387 if (!(starter
= patcompnot(1, &flags2
)))
1389 } else if (!(starter
= patcompswitch(1, &flags2
)))
1391 flags
|= flags2
& P_HSTART
;
1395 len
= 0; /* beginning present 1, end present 2 */
1396 if (idigit(*patparse
)) {
1397 from
= (zrange_t
) zstrtol((char *)patparse
,
1398 (char **)&nptr
, 10);
1402 DPUTS(*patparse
!= '-', "BUG: - missing from numeric glob");
1404 if (idigit(*patparse
)) {
1405 to
= (zrange_t
) zstrtol((char *)patparse
,
1406 (char **)&nptr
, 10);
1410 if (*patparse
!= Outang
)
1415 starter
= patnode(P_NUMRNG
);
1416 patadd((char *)&from
, 0, sizeof(from
), 0);
1417 patadd((char *)&to
, 0, sizeof(to
), 0);
1420 starter
= patnode(P_NUMTO
);
1421 patadd((char *)&to
, 0, sizeof(to
), 0);
1424 starter
= patnode(P_NUMFROM
);
1425 patadd((char *)&from
, 0, sizeof(from
), 0);
1428 starter
= patnode(P_NUMANY
);
1431 /* This can't be simple, because it isn't.
1432 * Mention in manual that matching digits with [...]
1433 * is more efficient.
1437 DPUTS(!isset(EXTENDEDGLOB
), "BUG: # not treated as string");
1439 * A hash here is an error; it should follow something
1446 * Marker for restoring a backslash in output:
1447 * does not match a character.
1449 next
= patcomppiece(flagp
);
1451 * Can't match a pure string since we need to do this
1452 * as multiple chunks.
1454 *flagp
&= ~P_PURESTR
;
1459 dputs("BUG: character not handled in patcomppiece");
1467 if (!(pound
= (*patparse
== Pound
&& isset(EXTENDEDGLOB
))) &&
1468 !(count
= (isset(EXTENDEDGLOB
) && *patparse
== Inpar
&&
1469 patparse
[1] == Pound
&& patparse
[2] == 'c')) &&
1470 (kshchar
<= 0 || kshchar
== '@' || kshchar
== '!')) {
1475 /* too much at once doesn't currently work */
1476 if (kshchar
&& pound
)
1479 if (kshchar
== '*') {
1482 } else if (kshchar
== '+') {
1485 } else if (kshchar
== '?') {
1492 } else if (*++patparse
== Pound
) {
1502 * Note optimizations with pointers into P_NOTHING branches: some
1503 * should logically point to next node after current piece.
1505 * Backtracking is also encoded in a slightly obscure way: the
1506 * code emitted ensures we test the non-empty branch of complex
1507 * patterns before the empty branch on each repetition. Hence
1508 * each time we fail on a non-empty branch, we try the empty branch,
1509 * which is equivalent to backtracking.
1511 if (op
== P_COUNT
) {
1513 union upat countargs
[P_CT_OPERAND
];
1514 char *opp
= patparse
;
1516 countargs
[0].l
= P_COUNT
;
1517 countargs
[P_CT_CURRENT
].l
= 0L;
1518 countargs
[P_CT_MIN
].l
= (long)zstrtol(patparse
, &patparse
, 10);
1519 if (patparse
== opp
) {
1520 /* missing number treated as zero */
1521 countargs
[P_CT_MIN
].l
= 0L;
1523 if (*patparse
!= ',' && *patparse
!= Comma
) {
1524 /* either max = min or error */
1525 if (*patparse
!= Outpar
)
1527 countargs
[P_CT_MAX
].l
= countargs
[P_CT_MIN
].l
;
1530 countargs
[P_CT_MAX
].l
= (long)zstrtol(patparse
, &patparse
, 10);
1531 if (*patparse
!= Outpar
)
1533 if (patparse
== opp
) {
1534 /* missing number treated as infinity: record as -1 */
1535 countargs
[P_CT_MAX
].l
= -1L;
1539 countargs
[P_CT_PTR
].p
= NULL
;
1540 /* Mark this chain as a min/max count... */
1541 patinsert(P_COUNTSTART
, starter
, (char *)countargs
, sizeof(countargs
));
1543 * The next of the operand is a loop back to the P_COUNT. This is
1544 * how we get recursion for the count. We don't loop back to
1545 * the P_COUNTSTART; that's used for initialising the count
1546 * and saving and restoring the count for any enclosing use
1549 opnd
= P_OPERAND(starter
) + P_CT_OPERAND
;
1550 pattail(opnd
, patnode(P_BACK
));
1551 pattail(opnd
, P_OPERAND(starter
));
1553 * The next of the counter operators is what follows the
1555 * This handles matching of the tail.
1557 next
= patnode(P_NOTHING
);
1558 pattail(starter
, next
);
1559 pattail(P_OPERAND(starter
), next
);
1560 } else if ((flags
& P_SIMPLE
) && (op
== P_ONEHASH
|| op
== P_TWOHASH
) &&
1561 P_OP((Upat
)patout
+starter
) == P_ANY
) {
1562 /* Optimize ?# to *. Silly thing to do, since who would use
1563 * use ?# ? But it makes the later code shorter.
1565 Upat uptr
= (Upat
)patout
+ starter
;
1566 if (op
== P_TWOHASH
) {
1567 /* ?## becomes ?* */
1568 uptr
->l
= (uptr
->l
& ~0xff) | P_ANY
;
1569 pattail(starter
, patnode(P_STAR
));
1571 uptr
->l
= (uptr
->l
& ~0xff) | P_STAR
;
1573 } else if ((flags
& P_SIMPLE
) && op
&& !(patglobflags
& 0xff)) {
1574 /* Simplify, but not if we need to look for approximations. */
1575 patinsert(op
, starter
, NULL
, 0);
1576 } else if (op
== P_ONEHASH
) {
1577 /* Emit x# as (x&|), where & means "self". */
1579 patinsert(P_WBRANCH
, starter
, (char *)&up
, sizeof(up
));
1581 patoptail(starter
, patnode(P_BACK
)); /* and loop */
1582 patoptail(starter
, starter
); /* back */
1583 pattail(starter
, patnode(P_BRANCH
)); /* or */
1584 pattail(starter
, patnode(P_NOTHING
)); /* null. */
1585 } else if (op
== P_TWOHASH
) {
1586 /* Emit x## as x(&|) where & means "self". */
1587 next
= patnode(P_WBRANCH
); /* Either */
1589 patadd((char *)&up
, 0, sizeof(up
), 0);
1590 pattail(starter
, next
);
1591 pattail(patnode(P_BACK
), starter
); /* loop back */
1592 pattail(next
, patnode(P_BRANCH
)); /* or */
1593 pattail(starter
, patnode(P_NOTHING
)); /* null. */
1594 } else if (kshchar
== '?') {
1595 /* Emit ?(x) as (x|) */
1596 patinsert(P_BRANCH
, starter
, NULL
, 0); /* Either x */
1597 pattail(starter
, patnode(P_BRANCH
)); /* or */
1598 next
= patnode(P_NOTHING
); /* null */
1599 pattail(starter
, next
);
1600 patoptail(starter
, next
);
1602 if (*patparse
== Pound
)
1609 * Turn a ^foo (paren = 0) or !(foo) (paren = 1) into *~foo with
1610 * parentheses if necessary. As you see, that's really quite easy.
1615 patcompnot(int paren
, int *flagsp
)
1618 long excsync
, br
, excl
, n
, starter
;
1621 /* Here, we're matching a star at the start. */
1624 starter
= patnode(P_BRANCH
);
1625 br
= patnode(P_STAR
);
1626 excsync
= patnode(P_EXCSYNC
);
1627 pattail(br
, excsync
);
1628 pattail(starter
, excl
= patnode(P_EXCLUDE
));
1630 patadd((char *)&up
, 0, sizeof(up
), 0);
1631 if (!(br
= (paren
? patcompswitch(1, &dummy
) : patcompbranch(&dummy
))))
1633 pattail(br
, patnode(P_EXCEND
));
1634 n
= patnode(P_NOTHING
); /* just so much easier */
1635 pattail(excsync
, n
);
1647 long starter
= (Upat
)patcode
- (Upat
)patout
;
1651 patadd((char *)&up
, 0, sizeof(union upat
), 0);
1656 * insert an operator in front of an already emitted operand:
1657 * we relocate the operand. there had better be nothing else after.
1662 patinsert(long op
, int opnd
, char *xtra
, int sz
)
1664 char *src
, *dst
, *opdst
;
1665 union upat buf
, *lptr
;
1668 patadd((char *)&buf
, 0, sizeof(buf
), 0);
1670 patadd(xtra
, 0, sz
, 0);
1671 src
= patcode
- sizeof(union upat
) - sz
;
1673 opdst
= patout
+ opnd
* sizeof(union upat
);
1677 /* A cast can't be an lvalue */
1680 opdst
+= sizeof(union upat
);
1685 /* set the 'next' pointer at the end of a node chain */
1689 pattail(long p
, long val
)
1694 scan
= (Upat
)patout
+ p
;
1696 if (!(temp
= PATNEXT(scan
)))
1701 offset
= (P_OP(scan
) == P_BACK
)
1702 ? (scan
- (Upat
)patout
) - val
: val
- (scan
- (Upat
)patout
);
1704 scan
->l
|= offset
<< 8;
1707 /* do pattail, but on operand of first argument; nop if operandless */
1710 static void patoptail(long p
, long val
)
1712 Upat ptr
= (Upat
)patout
+ p
;
1714 if (!p
|| !P_ISBRANCH(ptr
))
1717 pattail(P_OPERAND(p
), val
);
1719 pattail(P_OPERAND(p
) + 1, val
);
1726 static char *patinstart
; /* Start of input string */
1727 static char *patinend
; /* End of input string */
1728 static char *patinput
; /* String input pointer */
1729 static char *patinpath
; /* Full path for use with ~ exclusions */
1730 static int patinlen
; /* Length of last successful match.
1731 * Includes count of Meta characters.
1734 static char *patbeginp
[NSUBEXP
]; /* Pointer to backref beginnings */
1735 static char *patendp
[NSUBEXP
]; /* Pointer to backref ends */
1736 static int parsfound
; /* parentheses (with backrefs) found */
1738 static int globdots
; /* Glob initial dots? */
1741 * Character functions operating on unmetafied strings.
1743 #ifdef MULTIBYTE_SUPPORT
1745 /* Get a character from the start point in a string */
1746 #define CHARREF(x, y) charref((x), (y))
1748 charref(char *x
, char *y
)
1753 if (!(patglobflags
& GF_MULTIBYTE
) || !(STOUC(*x
) & 0x80))
1754 return (wchar_t) STOUC(*x
);
1756 ret
= mbrtowc(&wc
, x
, y
-x
, &shiftstate
);
1758 if (ret
== MB_INVALID
|| ret
== MB_INCOMPLETE
) {
1759 /* Error. Treat as single byte. */
1760 /* Reset the shift state for next time. */
1761 memset(&shiftstate
, 0, sizeof(shiftstate
));
1762 return (wchar_t) STOUC(*x
);
1768 /* Get a pointer to the next character */
1769 #define CHARNEXT(x, y) charnext((x), (y))
1771 charnext(char *x
, char *y
)
1776 if (!(patglobflags
& GF_MULTIBYTE
) || !(STOUC(*x
) & 0x80))
1779 ret
= mbrtowc(&wc
, x
, y
-x
, &shiftstate
);
1781 if (ret
== MB_INVALID
|| ret
== MB_INCOMPLETE
) {
1782 /* Error. Treat as single byte. */
1783 /* Reset the shift state for next time. */
1784 memset(&shiftstate
, 0, sizeof(shiftstate
));
1788 /* Nulls here are normal characters */
1789 return x
+ (ret
? ret
: 1);
1792 /* Increment a pointer past the current character. */
1793 #define CHARINC(x, y) ((x) = charnext((x), (y)))
1796 /* Get a character and increment */
1797 #define CHARREFINC(x, y) charrefinc(&(x), (y))
1799 charrefinc(char **x
, char *y
)
1804 if (!(patglobflags
& GF_MULTIBYTE
) || !(STOUC(**x
) & 0x80))
1805 return (wchar_t) STOUC(*(*x
)++);
1807 ret
= mbrtowc(&wc
, *x
, y
-*x
, &shiftstate
);
1809 if (ret
== MB_INVALID
|| ret
== MB_INCOMPLETE
) {
1810 /* Error. Treat as single byte. */
1811 /* Reset the shift state for next time. */
1812 memset(&shiftstate
, 0, sizeof(shiftstate
));
1813 return (wchar_t) STOUC(*(*x
)++);
1816 /* Nulls here are normal characters */
1817 *x
+= ret
? ret
: 1;
1824 * Counter the number of characters between two pointers, smaller first
1826 * This is used when setting values in parameters, so we obey
1827 * the MULTIBYTE option (even if it's been overridden locally).
1829 #define CHARSUB(x,y) charsub(x, y)
1831 charsub(char *x
, char *y
)
1837 if (!isset(MULTIBYTE
))
1841 ret
= mbrtowc(&wc
, x
, y
-x
, &shiftstate
);
1843 if (ret
== MB_INVALID
|| ret
== MB_INCOMPLETE
) {
1844 /* Error. Treat remainder as single characters */
1845 return res
+ (y
- x
);
1848 /* Treat nulls as normal characters */
1858 #else /* no MULTIBYTE_SUPPORT */
1860 /* Get a character from the start point in a string */
1861 #define CHARREF(x, y) (STOUC(*(x)))
1862 /* Get a pointer to the next character */
1863 #define CHARNEXT(x, y) ((x)+1)
1864 /* Increment a pointer past the current character. */
1865 #define CHARINC(x, y) ((x)++)
1866 /* Get a character and increment */
1867 #define CHARREFINC(x, y) (STOUC(*(x)++))
1868 /* Counter the number of characters between two pointers, smaller first */
1869 #define CHARSUB(x,y) ((y) - (x))
1871 #endif /* MULTIBYTE_SUPPORT */
1874 * The following need to be accessed in the globbing scanner for
1875 * a multi-component file path. See horror story in glob.c.
1878 int errsfound
; /* Total error count so far */
1881 int forceerrs
; /* Forced maximum error count */
1892 * Test prog against null-terminated, metafied string.
1897 pattry(Patprog prog
, char *string
)
1899 return pattryrefs(prog
, string
, -1, -1, 0, NULL
, NULL
, NULL
);
1903 * Test prog against string of given length, no null termination
1904 * but still metafied at this point. offset gives an offset
1905 * to include in reported match indices
1910 pattrylen(Patprog prog
, char *string
, int len
, int unmetalen
, int offset
)
1912 return pattryrefs(prog
, string
, len
, unmetalen
, offset
, NULL
, NULL
, NULL
);
1916 * Test prog against string with given lengths. The input
1917 * string is metafied; stringlen is the raw string length, and
1918 * unmetalen the number of characters in the original string (some
1919 * of which may now be metafied). Either value may be -1
1920 * to indicate a null-terminated string which will be counted. Note
1921 * there may be a severe penalty for this if a lot of matching is done
1924 * offset is the position in the original string (not seen by
1925 * the pattern module) at which we are trying to match.
1926 * This is added in to the positions recorded in patbeginp and patendp
1927 * when we are looking for substrings. Currently this only happens
1928 * in the parameter substitution code.
1930 * Note this is a character offset, i.e. a metafied character
1933 * The last three arguments are used to report the positions for the
1934 * backreferences. On entry, *nump should contain the maximum number
1935 * of positions to report. In this case the match, mbegin, mend
1936 * arrays are not altered.
1938 * If nump is NULL but endp is not NULL, then *endp is set to the
1939 * end position of the match, taking into account patinstart.
1944 pattryrefs(Patprog prog
, char *string
, int stringlen
, int unmetalen
,
1946 int *nump
, int *begp
, int *endp
)
1948 int i
, maxnpos
= 0, ret
, needfullpath
, unmetalenp
;
1950 char **sp
, **ep
, *tryalloced
, *ptr
;
1951 char *progstr
= (char *)prog
+ prog
->startoff
;
1957 /* inherited from domatch, but why, exactly? */
1958 if (*string
== Nularg
) {
1964 stringlen
= strlen(string
);
1965 origlen
= stringlen
;
1967 patflags
= prog
->flags
;
1969 * For a top-level ~-exclusion, we will need the full
1970 * path to exclude, so copy the path so far and append the
1971 * current test string.
1973 needfullpath
= (patflags
& PAT_HAS_EXCLUDP
) && pathpos
;
1975 /* Get the length of the full string when unmetafied. */
1977 unmetalen
= ztrsub(string
+ stringlen
, string
);
1979 unmetalenp
= ztrsub(pathbuf
+ pathpos
, pathbuf
);
1983 DPUTS(needfullpath
&& (patflags
& (PAT_PURES
|PAT_ANY
)),
1984 "rum sort of file exclusion");
1986 * Partly for efficiency, and partly for the convenience of
1987 * globbing, we don't unmetafy pure string patterns, and
1988 * there's no reason to if the pattern is just a *.
1990 if (!(patflags
& (PAT_PURES
|PAT_ANY
))
1991 && (needfullpath
|| unmetalen
!= stringlen
)) {
1993 * We need to copy if we need to prepend the path so far
1994 * (in which case we copy both chunks), or if we have
2000 dst
= tryalloced
= zalloc(unmetalen
+ unmetalenp
);
2003 /* loop twice, copy path buffer first time */
2007 /* just loop once, copy string with unmetafication */
2011 for (icopy
= 0; icopy
< 2; icopy
++) {
2012 for (i
= 0; i
< ncopy
; i
++) {
2015 *dst
++ = *ptr
++ ^ 32;
2022 /* next time append test string to path so far */
2028 patinstart
= tryalloced
+ unmetalenp
;
2029 patinpath
= tryalloced
;
2031 patinstart
= tryalloced
;
2034 stringlen
= unmetalen
;
2036 patinstart
= string
;
2037 tryalloced
= patinpath
= NULL
;
2040 patinend
= patinstart
+ stringlen
;
2042 * From now on we do not require NULL termination of
2043 * the test string. There should also be no more references
2044 * to the variable string.
2047 if (prog
->flags
& (PAT_PURES
|PAT_ANY
)) {
2049 * Either we are testing against a pure string,
2050 * or we can match anything at all.
2053 if (prog
->flags
& PAT_ANY
) {
2055 * Optimisation for a single "*": always matches
2056 * (except for no_glob_dots, see below).
2061 * Testing a pure string. See if initial
2064 int lendiff
= stringlen
- prog
->patmlen
;
2066 /* No, the pattern string is too long. */
2068 } else if (!memcmp(progstr
, patinstart
, prog
->patmlen
)) {
2070 * Initial component matches. Matches either
2071 * if lengths are the same or we are not anchored
2072 * to the end of the string.
2074 ret
= !lendiff
|| (prog
->flags
& PAT_NOANCH
);
2082 * For files, we won't match initial "."s unless
2085 if ((prog
->flags
& PAT_NOGLD
) && *patinstart
== '.') {
2089 * Remember the length in case used for ${..#..} etc.
2090 * In this case, we didn't unmetafy the string.
2092 patinlen
= (int)prog
->patmlen
;
2093 /* if matching files, must update globbing flags */
2094 patglobflags
= prog
->globend
;
2096 if ((patglobflags
& GF_MATCHREF
) &&
2097 !(patflags
& PAT_FILE
)) {
2098 char *str
= ztrduppfx(patinstart
, patinlen
);
2099 char *ptr
= patinstart
;
2103 * Count the characters. We're not using CHARSUB()
2104 * because the string is still metafied. We're
2105 * not using mb_metastrlen() because that expects
2106 * the string to be null terminated.
2109 while (ptr
< patinstart
+ patinlen
) {
2111 ptr
+= MB_METACHARLEN(ptr
);
2114 setsparam("MATCH", str
);
2116 (zlong
)(patoffset
+ !isset(KSHARRAYS
)));
2118 (zlong
)(mlen
+ patoffset
+
2119 !isset(KSHARRAYS
) - 1));
2125 zfree(tryalloced
, unmetalen
+ unmetalenp
);
2130 * Test for a `must match' string, unless we're scanning for a match
2131 * in which case we don't need to do this each time.
2134 if (!(prog
->flags
& PAT_SCAN
) && prog
->mustoff
)
2136 char *testptr
; /* start pointer into test string */
2137 char *teststop
; /* last point from which we can match */
2138 char *patptr
= (char *)prog
+ prog
->mustoff
;
2139 int patlen
= prog
->patmlen
;
2142 if (patlen
> stringlen
) {
2143 /* Too long, can't match. */
2146 teststop
= patinend
- patlen
;
2148 for (testptr
= patinstart
; testptr
<= teststop
; testptr
++)
2150 if (!memcmp(testptr
, patptr
, patlen
)) {
2162 zfree(tryalloced
, unmetalen
+ unmetalenp
);
2166 patglobflags
= prog
->globflags
;
2167 if (!(patflags
& PAT_FILE
)) {
2171 globdots
= !(patflags
& PAT_NOGLD
);
2174 patinput
= patinstart
;
2176 if (patmatch((Upat
)progstr
)) {
2178 * we were lazy and didn't save the globflags if an exclusion
2179 * failed, so set it now
2181 patglobflags
= prog
->globend
;
2184 * Record length of successful match, including Meta
2185 * characters. Do it here so that patmatchlen() can return
2186 * it even if we delete the pattern strings.
2188 patinlen
= patinput
- patinstart
;
2190 * Optimization: if we didn't find any Meta characters
2191 * to begin with, we don't need to look for them now.
2193 if (unmetalen
!= origlen
) {
2194 for (ptr
= patinstart
; ptr
< patinput
; ptr
++)
2200 * Should we clear backreferences and matches on a failed
2203 if ((patglobflags
& GF_MATCHREF
) && !(patflags
& PAT_FILE
)) {
2205 * m flag: for global match. This carries no overhead
2206 * in the pattern matching part.
2208 * Remember the test pattern is already unmetafied.
2211 int mlen
= CHARSUB(patinstart
, patinput
);
2213 str
= metafy(patinstart
, patinput
- patinstart
, META_DUP
);
2214 setsparam("MATCH", str
);
2215 setiparam("MBEGIN", (zlong
)(patoffset
+ !isset(KSHARRAYS
)));
2217 (zlong
)(mlen
+ patoffset
+
2218 !isset(KSHARRAYS
) - 1));
2220 if (prog
->patnpar
&& nump
) {
2222 * b flag: for backreferences using parentheses. Reported
2225 *nump
= prog
->patnpar
;
2230 for (i
= 0; i
< prog
->patnpar
&& i
< maxnpos
; i
++) {
2231 if (parsfound
& (1 << i
)) {
2233 *begp
++ = CHARSUB(patinstart
, *sp
) + patoffset
;
2235 *endp
++ = CHARSUB(patinstart
, *ep
) + patoffset
2247 } else if (prog
->patnpar
&& !(patflags
& PAT_FILE
)) {
2249 * b flag: for backreferences using parentheses.
2251 int palen
= prog
->patnpar
+1;
2252 char **matcharr
, **mbeginarr
, **mendarr
;
2253 char numbuf
[DIGBUFSIZE
];
2255 matcharr
= zshcalloc(palen
*sizeof(char *));
2256 mbeginarr
= zshcalloc(palen
*sizeof(char *));
2257 mendarr
= zshcalloc(palen
*sizeof(char *));
2262 for (i
= 0; i
< prog
->patnpar
; i
++) {
2263 if (parsfound
& (1 << i
)) {
2264 matcharr
[i
] = metafy(*sp
, *ep
- *sp
, META_DUP
);
2266 * mbegin and mend give indexes into the string
2267 * in the standard notation, i.e. respecting
2268 * KSHARRAYS, and with the end index giving
2269 * the last character, not one beyond.
2270 * For example, foo=foo; [[ $foo = (f)oo ]] gives
2271 * (without KSHARRAYS) indexes 1 and 1, which
2272 * corresponds to indexing as ${foo[1,1]}.
2274 sprintf(numbuf
, "%ld",
2275 (long)(CHARSUB(patinstart
, *sp
) +
2277 !isset(KSHARRAYS
)));
2278 mbeginarr
[i
] = ztrdup(numbuf
);
2279 sprintf(numbuf
, "%ld",
2280 (long)(CHARSUB(patinstart
, *ep
) +
2282 !isset(KSHARRAYS
) - 1));
2283 mendarr
[i
] = ztrdup(numbuf
);
2285 /* Pattern wasn't set: either it was in an
2286 * unmatched branch, or a hashed parenthesis
2287 * that didn't match at all.
2289 matcharr
[i
] = ztrdup("");
2290 mbeginarr
[i
] = ztrdup("-1");
2291 mendarr
[i
] = ztrdup("-1");
2296 setaparam("match", matcharr
);
2297 setaparam("mbegin", mbeginarr
);
2298 setaparam("mend", mendarr
);
2301 if (!nump
&& endp
) {
2303 * We just need the overall end position.
2305 *endp
= CHARSUB(patinstart
, patinput
) + patoffset
;
2313 zfree(tryalloced
, unmetalen
+ unmetalenp
);
2320 * Return length of previous succesful match. This is
2321 * in metafied bytes, i.e. includes a count of Meta characters.
2322 * Unusual and futile attempt at modular encapsulation.
2333 * Match literal characters with case insensitivity test: the first
2334 * comes from the input string, the second the current pattern.
2336 #ifdef MULTIBYTE_SUPPORT
2337 #define ISUPPER(x) iswupper(x)
2338 #define ISLOWER(x) iswlower(x)
2339 #define TOUPPER(x) towupper(x)
2340 #define TOLOWER(x) towlower(x)
2341 #define ISDIGIT(x) iswdigit(x)
2343 #define ISUPPER(x) isupper(x)
2344 #define ISLOWER(x) islower(x)
2345 #define TOUPPER(x) toupper(x)
2346 #define TOLOWER(x) tolower(x)
2347 #define ISDIGIT(x) idigit(x)
2349 #define CHARMATCH(chin, chpa) (chin == chpa || \
2350 ((patglobflags & GF_IGNCASE) ? \
2351 ((ISUPPER(chin) ? TOLOWER(chin) : chin) == \
2352 (ISUPPER(chpa) ? TOLOWER(chpa) : chpa)) : \
2353 (patglobflags & GF_LCMATCHUC) ? \
2354 (ISLOWER(chpa) && TOUPPER(chpa) == chin) : 0))
2357 * The same but caching an expression from the first argument,
2358 * Requires local charmatch_cache definition.
2360 #define CHARMATCH_EXPR(expr, chpa) \
2361 (charmatch_cache = (expr), CHARMATCH(charmatch_cache, chpa))
2364 * exactpos is used to remember how far down an exact string we have
2365 * matched, if we are doing approximation and can therefore redo from
2366 * the same point; we never need to otherwise.
2368 * exactend is a pointer to the end of the string, which isn't
2371 static char *exactpos
, *exactend
;
2374 * Main matching routine.
2376 * Testing the tail end of a match is usually done by recursion, but
2377 * we try to eliminate that in favour of looping for simple cases.
2384 /* Current and next nodes */
2385 Upat scan
= prog
, next
, opnd
;
2386 char *start
, *save
, *chrop
, *chrend
, *compend
;
2387 int savglobflags
, op
, no
, min
, fail
= 0, saverrsfound
;
2388 zrange_t from
, to
, comp
;
2392 next
= PATNEXT(scan
);
2394 if (!globdots
&& P_NOTDOT(scan
) && patinput
== patinstart
&&
2395 patinput
< patinend
&& *patinput
== '.')
2398 switch (P_OP(scan
)) {
2400 if (patinput
== patinend
)
2403 CHARINC(patinput
, patinend
);
2407 * acts as nothing if *chrop is null: this is used by
2414 chrop
= P_LS_STR(scan
);
2415 chrend
= chrop
+ P_LS_LEN(scan
);
2418 while (chrop
< chrend
&& patinput
< patinend
) {
2419 char *savpatinput
= patinput
;
2420 char *savchrop
= chrop
;
2421 patint_t chin
= CHARREFINC(patinput
, patinend
);
2422 patint_t chpa
= CHARREFINC(chrop
, chrend
);
2423 if (!CHARMATCH(chin
, chpa
)) {
2425 patinput
= savpatinput
;
2430 if (chrop
< chrend
) {
2438 if (patinput
== patinend
)
2441 #ifdef MULTIBYTE_SUPPORT
2442 wchar_t cr
= CHARREF(patinput
, patinend
);
2443 char *scanop
= (char *)P_OPERAND(scan
);
2444 if (patglobflags
& GF_MULTIBYTE
) {
2445 if (mb_patmatchrange(scanop
, cr
, NULL
, NULL
) ^
2446 (P_OP(scan
) == P_ANYOF
))
2449 CHARINC(patinput
, patinend
);
2450 } else if (patmatchrange(scanop
, (int)cr
, NULL
, NULL
) ^
2451 (P_OP(scan
) == P_ANYOF
))
2454 CHARINC(patinput
, patinend
);
2456 if (patmatchrange((char *)P_OPERAND(scan
),
2457 CHARREF(patinput
, patinend
), NULL
, NULL
) ^
2458 (P_OP(scan
) == P_ANYOF
))
2461 CHARINC(patinput
, patinend
);
2469 * To do this properly, we really have to treat numbers as
2470 * closures: that's so things like <1-1000>33 will
2471 * match 633 (they didn't up to 3.1.6). To avoid making this
2472 * too inefficient, we see if there's an exact match next:
2473 * if there is, and it's not a digit, we return 1 after
2474 * the first attempt.
2477 start
= (char *)P_OPERAND(scan
);
2479 if (op
!= P_NUMTO
) {
2480 #ifdef ZSH_64_BIT_TYPE
2481 /* We can't rely on pointer alignment being good enough. */
2482 memcpy((char *)&from
, start
, sizeof(zrange_t
));
2484 from
= *((zrange_t
*) start
);
2486 start
+= sizeof(zrange_t
);
2488 if (op
!= P_NUMFROM
) {
2489 #ifdef ZSH_64_BIT_TYPE
2490 memcpy((char *)&to
, start
, sizeof(zrange_t
));
2492 to
= *((zrange_t
*) start
);
2495 start
= compend
= patinput
;
2497 while (patinput
< patinend
&& idigit(*patinput
)) {
2500 comp
+= *patinput
- '0';
2504 if (comp
& ((zrange_t
)1 << (sizeof(comp
)*8 -
2505 #ifdef ZRANGE_T_IS_SIGNED
2512 * Out of range (allowing for signedness, which
2513 * we need if we are using zlongs).
2514 * This is as far as we can go.
2515 * If we're doing a range "from", skip all the
2516 * remaining numbers. Otherwise, we can't
2517 * match beyond the previous point anyway.
2518 * Leave the pointer to the last calculated
2519 * position (compend) where it was before.
2521 if (op
== P_NUMFROM
) {
2522 while (patinput
< patinend
&& idigit(*patinput
))
2529 while (patinput
> start
) {
2530 /* if already too small, no power on earth can save it */
2531 if (comp
< from
&& patinput
<= compend
)
2533 if ((op
== P_NUMFROM
|| comp
<= to
) && patmatch(next
)) {
2536 if (!no
&& P_OP(next
) == P_EXACTLY
&&
2538 !idigit(STOUC(*P_LS_STR(next
)))) &&
2539 !(patglobflags
& 0xff))
2544 * With a range start and an unrepresentable test
2545 * number, we just back down the test string without
2546 * changing the number until we get to a representable
2549 if (patinput
< compend
)
2556 /* This is <->: any old set of digits, don't bother comparing */
2558 while (patinput
< patinend
&& idigit(*patinput
))
2562 while (patinput
> start
) {
2565 if (!no
&& P_OP(next
) == P_EXACTLY
&&
2567 !idigit(*P_LS_STR(next
))) &&
2568 !(patglobflags
& 0xff))
2581 patglobflags
= P_OPERAND(scan
)->l
;
2593 no
= P_OP(scan
) - P_OPEN
;
2596 if (patmatch(next
)) {
2598 * Don't set patbeginp if some later invocation of
2599 * the same parentheses already has.
2601 if (no
&& !(parsfound
& (1 << (no
- 1)))) {
2602 patbeginp
[no
-1] = save
;
2603 parsfound
|= 1 << (no
- 1);
2619 no
= P_OP(scan
) - P_CLOSE
;
2622 if (patmatch(next
)) {
2623 if (no
&& !(parsfound
& (1 << (no
+ 15)))) {
2624 patendp
[no
-1] = save
;
2625 parsfound
|= 1 << (no
+ 15);
2632 /* See the P_EXCLUDE code below for where syncptr comes from */
2634 unsigned char *syncptr
;
2636 after
= P_OPERAND(scan
);
2637 DPUTS(!P_ISEXCLUDE(after
),
2638 "BUG: EXCSYNC not followed by EXCLUDE.");
2639 DPUTS(!P_OPERAND(after
)->p
,
2640 "BUG: EXCSYNC not handled by EXCLUDE");
2641 syncptr
= P_OPERAND(after
)->p
+ (patinput
- patinstart
);
2643 * If we already matched from here, this time we fail.
2644 * See WBRANCH code for story about error count.
2646 if (*syncptr
&& errsfound
+ 1 >= *syncptr
)
2649 * Else record that we (possibly) matched this time.
2650 * No harm if we don't: then the previous test will just
2651 * short cut the attempted match that is bound to fail.
2652 * We never try to exclude something that has already
2655 *syncptr
= errsfound
+ 1;
2660 * This is followed by a P_EXCSYNC, but only in the P_EXCLUDE
2661 * branch. Actually, we don't bother following it: all we
2662 * need to know is that we successfully matched so far up
2663 * to the end of the asserted pattern; the endpoint
2664 * in the target string is nulled out.
2666 if (!(fail
= (patinput
< patinend
)))
2671 /* P_EXCLUDE shouldn't occur without a P_BRANCH */
2672 if (!P_ISBRANCH(next
)) {
2673 /* no choice, avoid recursion */
2674 DPUTS(P_OP(scan
) == P_WBRANCH
,
2675 "BUG: WBRANCH with no alternative.");
2676 next
= P_OPERAND(scan
);
2680 savglobflags
= patglobflags
;
2681 saverrsfound
= errsfound
;
2682 if (P_ISEXCLUDE(next
)) {
2684 * The strategy is to test the asserted pattern,
2685 * recording via P_EXCSYNC how far the part to
2686 * be excluded matched. We then set the
2687 * length of the test string to that
2688 * point and see if the exclusion as far as
2689 * P_EXCEND also matches that string.
2690 * We need to keep testing the asserted pattern
2691 * by backtracking, since the first attempt
2692 * may be excluded while a later attempt may not.
2693 * For this we keep a pointer just after
2694 * the P_EXCLUDE which is tested by the P_EXCSYNC
2695 * to see if we matched there last time, in which
2696 * case we fail. If there is nothing to backtrack
2697 * over, that doesn't matter: we should fail anyway.
2698 * The pointer also tells us where the asserted
2699 * pattern matched for use by the exclusion.
2701 * It's hard to allocate space for this
2702 * beforehand since we may need to do it
2705 * P.S. in case you were wondering, this code
2710 unsigned char *oldsyncstr
;
2711 char *matchpt
= NULL
;
2712 int ret
, savglobdots
, matchederrs
= 0;
2713 int savparsfound
= parsfound
;
2714 DPUTS(P_OP(scan
) == P_WBRANCH
,
2715 "BUG: excluded WBRANCH");
2716 syncstrp
= P_OPERAND(next
);
2718 * Unlike WBRANCH, each test at the same exclude
2719 * sync point (due to an external loop) is separate,
2720 * i.e testing (foo~bar)# is no different from
2721 * (foo~bar)(foo~bar)... from the exclusion point
2722 * of view, so we use a different sync string.
2724 oldsyncstr
= syncstrp
->p
;
2725 syncstrp
->p
= (unsigned char *)
2726 zshcalloc((patinend
- patinstart
) + 1);
2727 origpatinend
= patinend
;
2728 while ((ret
= patmatch(P_OPERAND(scan
)))) {
2729 unsigned char *syncpt
;
2730 char *savpatinstart
;
2731 int savforce
= forceerrs
;
2732 int savpatflags
= patflags
, synclen
;
2734 savglobdots
= globdots
;
2735 matchederrs
= errsfound
;
2736 matchpt
= patinput
; /* may not be end */
2737 globdots
= 1; /* OK to match . first */
2738 /* Find the point where the scan
2739 * matched the part to be excluded: because
2740 * of backtracking, the one
2741 * most recently matched will be the first.
2742 * (Luckily, backtracking is done after all
2743 * possibilities for approximation have been
2746 for (syncpt
= syncstrp
->p
; !*syncpt
; syncpt
++)
2748 synclen
= syncpt
- syncstrp
->p
;
2749 if (patinstart
+ synclen
!= patinend
) {
2751 * Temporarily mark the string as
2752 * ending at this point.
2754 DPUTS(patinstart
+ synclen
> matchpt
,
2755 "BUG: EXCSYNC failed");
2757 patinend
= patinstart
+ synclen
;
2759 * If this isn't really the end of the string,
2760 * remember this for the (#e) assertion.
2762 patflags
|= PAT_NOTEND
;
2764 savpatinstart
= patinstart
;
2765 next
= PATNEXT(scan
);
2766 while (next
&& P_ISEXCLUDE(next
)) {
2769 * turn off approximations in exclusions:
2770 * note we keep remaining patglobflags
2771 * set by asserted branch (or previous
2772 * excluded branches, for consistency).
2774 patglobflags
&= ~0xff;
2776 opnd
= P_OPERAND(next
) + 1;
2777 if (P_OP(next
) == P_EXCLUDP
&& patinpath
) {
2779 * Top level exclusion with a file,
2780 * applies to whole path so add the
2781 * segments already matched.
2782 * We copied these in front of the
2783 * test pattern, so patinend doesn't
2786 DPUTS(patinput
!= patinstart
,
2787 "BUG: not at start excluding path");
2788 patinput
= patinstart
= patinpath
;
2790 if (patmatch(opnd
)) {
2793 * Another subtlety: if we exclude the
2794 * match, any parentheses just found
2795 * become invalidated.
2797 parsfound
= savparsfound
;
2800 patinput
= savpatinstart
+
2801 (patinput
- patinstart
);
2802 patinstart
= savpatinstart
;
2806 next
= PATNEXT(next
);
2809 * Restore original end position.
2811 patinend
= origpatinend
;
2812 patflags
= savpatflags
;
2813 globdots
= savglobdots
;
2814 forceerrs
= savforce
;
2818 patglobflags
= savglobflags
;
2819 errsfound
= saverrsfound
;
2821 zfree((char *)syncstrp
->p
,
2822 (patinend
- patinstart
) + 1);
2823 syncstrp
->p
= oldsyncstr
;
2826 errsfound
= matchederrs
;
2829 while ((scan
= PATNEXT(scan
)) &&
2833 int ret
= 1, pfree
= 0;
2836 if (P_OP(scan
) == P_WBRANCH
) {
2838 * This is where we make sure that we are not
2839 * repeatedly matching zero-length strings in
2840 * a closure, which would cause an infinite loop,
2841 * and also remove exponential behaviour in
2842 * backtracking nested closures.
2843 * The P_WBRANCH operator leaves a space for a
2844 * uchar *, initialized to NULL, which is
2845 * turned into a string the same length as the
2846 * target string. Every time we match from a
2847 * particular point in the target string, we
2848 * stick a 1 at the corresponding point here.
2849 * If we come round to the same branch again, and
2850 * there is already a 1, then the test fails.
2852 opnd
= P_OPERAND(scan
);
2855 ptrp
->p
= (unsigned char *)
2856 zshcalloc((patinend
- patinstart
) + 1);
2859 ptr
= ptrp
->p
+ (patinput
- patinstart
);
2862 * Without approximation, this is just a
2863 * single bit test. With approximation, we
2864 * need to know how many errors there were
2865 * last time we made the test. If errsfound
2866 * is now smaller than it was, hence we can
2867 * make more approximations in the remaining
2868 * code, we continue with the test.
2869 * (This is why the max number of errors is
2872 if (*ptr
&& errsfound
+ 1 >= *ptr
)
2874 *ptr
= errsfound
+ 1;
2876 opnd
= P_OPERAND(scan
);
2878 ret
= patmatch(opnd
);
2880 zfree((char *)ptrp
->p
,
2881 (patinend
- patinstart
) + 1);
2886 scan
= PATNEXT(scan
);
2889 patglobflags
= savglobflags
;
2890 errsfound
= saverrsfound
;
2891 DPUTS(P_OP(scan
) == P_WBRANCH
,
2892 "BUG: WBRANCH not first choice.");
2893 next
= PATNEXT(scan
);
2894 } while (scan
&& P_ISBRANCH(scan
));
2899 /* Handle specially for speed, although really P_ONEHASH+P_ANY */
2903 * This is just simple cases, matching one character.
2904 * With approximations, we still handle * this way, since
2905 * no approximation is ever necessary, but other closures
2906 * are handled by the more complicated branching method
2909 /* Note that no counts possibly metafied characters */
2912 char *lastcharstart
;
2914 * Array to record the start of characters for
2917 VARARR(char, charstart
, patinend
-patinput
);
2918 memset(charstart
, 0, patinend
-patinput
);
2921 for (no
= 0; patinput
< patinend
;
2922 CHARINC(patinput
, patinend
))
2924 charstart
[patinput
-start
] = 1;
2927 /* simple optimization for reasonably common case */
2928 if (P_OP(next
) == P_END
)
2931 DPUTS(patglobflags
& 0xff,
2932 "BUG: wrong backtracking with approximation.");
2933 if (!globdots
&& P_NOTDOT(P_OPERAND(scan
)) &&
2934 patinput
== patinstart
&& patinput
< patinend
&&
2935 CHARREF(patinput
, patinend
) == ZWC('.'))
2937 no
= patrepeat(P_OPERAND(scan
), charstart
);
2939 min
= (op
== P_TWOHASH
) ? 1 : 0;
2941 * Lookahead to avoid useless matches. This is not possible
2942 * with approximation.
2944 if (P_OP(next
) == P_EXACTLY
&& P_LS_LEN(next
) &&
2945 !(patglobflags
& 0xff)) {
2946 char *nextop
= P_LS_STR(next
);
2947 #ifdef MULTIBYTE_SUPPORT
2948 /* else second argument of CHARREF isn't used */
2949 int nextlen
= P_LS_LEN(next
);
2952 * If that P_EXACTLY is last (common in simple patterns,
2953 * such as *.c), then it can be only be matched at one
2954 * point in the test string, so record that.
2956 if (P_OP(PATNEXT(next
)) == P_END
&&
2957 !(patflags
& PAT_NOANCH
)) {
2958 int ptlen
= patinend
- patinput
;
2959 int lenmatch
= patinend
-
2960 (min
? CHARNEXT(start
, patinend
) : start
);
2961 /* Are we in the right range? */
2962 if (P_LS_LEN(next
) > lenmatch
||
2963 P_LS_LEN(next
) < ptlen
)
2965 /* Yes, just position appropriately and test. */
2966 patinput
+= ptlen
- P_LS_LEN(next
);
2968 * Here we will need to be careful that patinput is not
2969 * in the middle of a multibyte character.
2971 /* Continue loop with P_EXACTLY test. */
2974 nextch
= CHARREF(nextop
, nextop
+ nextlen
);
2977 savglobflags
= patglobflags
;
2978 saverrsfound
= errsfound
;
2979 lastcharstart
= charstart
+ (patinput
- start
);
2982 patint_t charmatch_cache
;
2983 if (nextch
== PEOF
||
2984 (patinput
< patinend
&&
2985 CHARMATCH_EXPR(CHARREF(patinput
, patinend
),
2992 /* find start of previous full character */
2993 while (!*--lastcharstart
)
2994 DPUTS(lastcharstart
< charstart
,
2995 "lastcharstart invalid");
2996 patinput
= start
+ (lastcharstart
-charstart
);
2997 patglobflags
= savglobflags
;
2998 errsfound
= saverrsfound
;
3003 * As with branches, the patmatch(next) stuff for *
3004 * handles approximation, so we don't need to try
3009 if (patinput
!= patinstart
|| (patflags
& PAT_NOTSTART
))
3013 if (patinput
< patinend
|| (patflags
& PAT_NOTEND
))
3019 * Save and restore the current count and the
3020 * start pointer in case the pattern has been
3021 * executed by a previous repetition of a
3024 long *curptr
= &P_OPERAND(scan
)[P_CT_CURRENT
].l
;
3025 long savecount
= *curptr
;
3026 unsigned char *saveptr
= scan
[P_CT_PTR
].p
;
3030 ret
= patmatch(P_OPERAND(scan
));
3031 *curptr
= savecount
;
3032 scan
[P_CT_PTR
].p
= saveptr
;
3037 /* (#cN,M): execution is relatively straightforward */
3038 long cur
= scan
[P_CT_CURRENT
].l
;
3039 long min
= scan
[P_CT_MIN
].l
;
3040 long max
= scan
[P_CT_MAX
].l
;
3042 if (cur
&& cur
>= min
&&
3043 (unsigned char *)patinput
== scan
[P_CT_PTR
].p
) {
3045 * Not at the first attempt to match so
3046 * the previous attempt managed zero length.
3047 * We can do this indefinitely so there's
3048 * no point in going on. Simply try to
3049 * match the remainder of the pattern.
3051 return patmatch(next
);
3053 scan
[P_CT_PTR
].p
= (unsigned char *)patinput
;
3055 if (max
< 0 || cur
< max
) {
3056 char *patinput_thistime
= patinput
;
3057 scan
[P_CT_CURRENT
].l
= cur
+ 1;
3058 if (patmatch(scan
+ P_CT_OPERAND
))
3060 patinput
= patinput_thistime
;
3064 return patmatch(next
);
3067 if (!(fail
= (patinput
< patinend
&& !(patflags
& PAT_NOANCH
))))
3072 dputs("BUG: bad operand in patmatch.");
3079 if (errsfound
< (patglobflags
& 0xff) &&
3080 (forceerrs
== -1 || errsfound
< forceerrs
)) {
3082 * Approximation code. There are four possibilities
3084 * 1. omit character from input string
3085 * 2. transpose characters in input and pattern strings
3086 * 3. omit character in both input and pattern strings
3087 * 4. omit character from pattern string.
3089 * which we try in that order.
3091 * Of these, 2, 3 and 4 require an exact match string
3092 * (P_EXACTLY) while 1, 2 and 3 require that we not
3093 * have reached the end of the input string.
3095 * Note in each case after making the approximation we
3096 * need to retry the *same* pattern; this is what
3097 * requires exactpos, a slightly doleful way of
3098 * communicating with the exact character matcher.
3100 char *savexact
= exactpos
;
3102 savglobflags
= patglobflags
;
3103 saverrsfound
= ++errsfound
;
3106 DPUTS(P_OP(scan
) != P_EXACTLY
&& exactpos
,
3107 "BUG: non-exact match has set exactpos");
3109 /* Try omitting a character from the input string */
3110 if (patinput
< patinend
) {
3111 CHARINC(patinput
, patinend
);
3112 /* If we are not on an exact match, then this is
3113 * our last gasp effort, so we can optimize out
3114 * the recursive call.
3116 if (P_OP(scan
) != P_EXACTLY
)
3122 if (P_OP(scan
) == P_EXACTLY
) {
3123 char *nextexact
= savexact
;
3125 "BUG: exact match has not set exactpos");
3126 CHARINC(nextexact
, exactend
);
3128 if (save
< patinend
) {
3129 char *nextin
= save
;
3130 CHARINC(nextin
, patinend
);
3131 patglobflags
= savglobflags
;
3132 errsfound
= saverrsfound
;
3133 exactpos
= savexact
;
3136 * Try swapping two characters in patinput and
3139 if (save
< patinend
&& nextin
< patinend
&&
3140 nextexact
< exactend
) {
3141 patint_t cin0
= CHARREF(save
, patinend
);
3142 patint_t cpa0
= CHARREF(exactpos
, exactend
);
3143 patint_t cin1
= CHARREF(nextin
, patinend
);
3144 patint_t cpa1
= CHARREF(nextexact
, exactend
);
3146 if (CHARMATCH(cin0
, cpa1
) &&
3147 CHARMATCH(cin1
, cpa0
)) {
3149 CHARINC(patinput
, patinend
);
3150 exactpos
= nextexact
;
3151 CHARINC(exactpos
, exactend
);
3155 patglobflags
= savglobflags
;
3156 errsfound
= saverrsfound
;
3161 * Try moving up both strings.
3164 exactpos
= nextexact
;
3169 patglobflags
= savglobflags
;
3170 errsfound
= saverrsfound
;
3171 exactpos
= savexact
;
3174 DPUTS(exactpos
== exactend
, "approximating too far");
3176 * Try moving up the exact match pattern.
3177 * This must be the last attempt, so just loop
3178 * instead of calling recursively.
3180 CHARINC(exactpos
, exactend
);
3196 #ifdef MULTIBYTE_SUPPORT
3199 * See if character ch matches a pattern range specification.
3200 * The null-terminated specification is in range; the test
3201 * character is in ch.
3203 * indptr is used by completion matching, which is why this
3204 * function is exported. If indptr is not NULL we set *indptr
3205 * to the index of the character in the range string, adjusted
3206 * in the case of "A-B" ranges such that A would count as its
3207 * normal index (say IA), B would count as IA + (B-A), and any
3208 * character within the range as appropriate. We're not strictly
3209 * guaranteed this fits within a wint_t, but if this is Unicode
3210 * in 32 bits we have a fair amount of distance left over.
3212 * mtp is used in the same circumstances. *mtp returns the match type:
3213 * 0 for a standard character, else the PP_ index. It's not
3214 * useful if the match failed.
3219 mb_patmatchrange(char *range
, wchar_t ch
, wint_t *indptr
, int *mtp
)
3226 * Careful here: unlike other strings, range is a NULL-terminated,
3227 * metafied string, because we need to treat the Posix and hyphenated
3231 if (imeta(STOUC(*range
))) {
3232 int swtype
= STOUC(*range
++) - STOUC(Meta
);
3237 /* ordinary metafied character */
3239 if (metacharinc(&range
) == ch
)
3251 if ((ch
& ~0x7f) == 0)
3255 if (ch
== L
' ' || ch
== L
'\t')
3295 if (wcsitype(ch
, IIDENT
))
3299 if (wcsitype(ch
, ISEP
))
3303 /* must be ASCII space character */
3304 if (ch
< 128 && iwsep((int)ch
))
3308 if (wcsitype(ch
, IWORD
))
3312 r1
= metacharinc(&range
);
3313 r2
= metacharinc(&range
);
3314 if (r1
<= ch
&& ch
<= r2
) {
3319 /* Careful not to screw up counting with bogus range */
3320 if (indptr
&& r1
< r2
) {
3322 * This gets incremented again below to get
3323 * us past the range end. This is correct.
3329 DPUTS(1, "BUG: unknown posix range passed through.\n");
3332 DPUTS(1, "BUG: unknown metacharacter in range.");
3335 } else if (metacharinc(&range
) == ch
) {
3348 * This is effectively the reverse of mb_patmatchrange().
3349 * Given a range descriptor of the same form, and an index into it,
3350 * try to determine the character that is matched. If the index
3351 * points to a [:...:] generic style match, set chr to WEOF and
3352 * return the type in mtp instead. Return 1 if successful, 0 if
3353 * there was no corresponding index. Note all pointer arguments
3359 mb_patmatchindex(char *range
, wint_t ind
, wint_t *chr
, int *mtp
)
3361 wchar_t r1
, r2
, rchr
;
3368 if (imeta(STOUC(*range
))) {
3369 int swtype
= STOUC(*range
++) - STOUC(Meta
);
3373 rchr
= metacharinc(&range
);
3375 *chr
= (wint_t) rchr
;
3404 r1
= metacharinc(&range
);
3405 r2
= metacharinc(&range
);
3406 rdiff
= (wint_t)r2
- (wint_t)r1
;
3408 *chr
= (wint_t)r1
+ ind
;
3411 /* note the extra decrement to ind below */
3415 DPUTS(1, "BUG: unknown posix range passed through.\n");
3418 DPUTS(1, "BUG: unknown metacharacter in range.");
3422 rchr
= metacharinc(&range
);
3424 *chr
= (wint_t)rchr
;
3432 /* No corresponding index. */
3437 #endif /* MULTIBYTE_SUPPORT */
3440 * Identical function to mb_patmatchrange() above for single-byte
3446 patmatchrange(char *range
, int ch
, int *indptr
, int *mtp
)
3453 * Careful here: unlike other strings, range is a NULL-terminated,
3454 * metafied string, because we need to treat the Posix and hyphenated
3457 for (; *range
; range
++) {
3458 if (imeta(STOUC(*range
))) {
3459 int swtype
= STOUC(*range
) - STOUC(Meta
);
3464 if (STOUC(*++range
^ 32) == ch
)
3476 if ((ch
& ~0x7f) == 0)
3480 if (ch
== ' ' || ch
== '\t')
3537 r1
= STOUC(UNMETA(range
));
3539 r2
= STOUC(UNMETA(range
));
3542 if (r1
<= ch
&& ch
<= r2
) {
3547 if (indptr
&& r1
< r2
)
3551 DPUTS(1, "BUG: unknown posix range passed through.\n");
3554 DPUTS(1, "BUG: unknown metacharacter in range.");
3557 } else if (STOUC(*range
) == ch
) {
3570 #ifndef MULTIBYTE_SUPPORT
3573 * Identical function to mb_patmatchindex() above for single-byte
3574 * characters. Here -1 represents a character that needs a special type.
3576 * Unlike patmatchrange, we only need this in ZLE, which always
3577 * uses MULTIBYTE_SUPPORT if compiled in; hence we don't use
3578 * this function in that case.
3583 patmatchindex(char *range
, int ind
, int *chr
, int *mtp
)
3585 int r1
, r2
, rdiff
, rchr
;
3590 for (; *range
; range
++) {
3591 if (imeta(STOUC(*range
))) {
3592 int swtype
= STOUC(*range
) - STOUC(Meta
);
3595 /* ordinary metafied character */
3596 rchr
= STOUC(*++range
) ^ 32;
3628 r1
= STOUC(UNMETA(range
));
3630 r2
= STOUC(UNMETA(range
));
3638 /* note the extra decrement to ind below */
3642 DPUTS(1, "BUG: unknown posix range passed through.\n");
3645 DPUTS(1, "BUG: unknown metacharacter in range.");
3650 *chr
= STOUC(*range
);
3658 /* No corresponding index. */
3663 #endif /* MULTIBYTE_SUPPORT */
3666 * Repeatedly match something simple and say how many times.
3667 * charstart is an array parallel to that starting at patinput
3668 * and records the start of (possibly multibyte) characters
3669 * to aid in later backtracking.
3673 static int patrepeat(Upat p
, char *charstart
)
3676 patint_t tch
, charmatch_cache
;
3680 opnd
= (char *)P_OPERAND(p
);
3685 dputs("BUG: ?# did not get optimized to *");
3690 DPUTS(P_LS_LEN(p
) != 1, "closure following more than one character");
3691 tch
= CHARREF(P_LS_STR(p
), P_LS_STR(p
) + P_LS_LEN(p
));
3692 while (scan
< patinend
&&
3693 CHARMATCH_EXPR(CHARREF(scan
, patinend
), tch
)) {
3694 charstart
[scan
-patinput
] = 1;
3696 CHARINC(scan
, patinend
);
3701 while (scan
< patinend
) {
3702 #ifdef MULTIBYTE_SUPPORT
3703 wchar_t cr
= CHARREF(scan
, patinend
);
3704 if (patglobflags
& GF_MULTIBYTE
) {
3705 if (mb_patmatchrange(opnd
, cr
, NULL
, NULL
) ^
3706 (P_OP(p
) == P_ANYOF
))
3708 } else if (patmatchrange(opnd
, (int)cr
, NULL
, NULL
) ^
3709 (P_OP(p
) == P_ANYOF
))
3712 if (patmatchrange(opnd
, CHARREF(scan
, patinend
), NULL
, NULL
) ^
3713 (P_OP(p
) == P_ANYOF
))
3716 charstart
[scan
-patinput
] = 1;
3718 CHARINC(scan
, patinend
);
3723 dputs("BUG: something very strange is happening in patrepeat");
3733 /* Free a patprog. */
3737 freepatprog(Patprog prog
)
3739 if (prog
&& prog
!= dummy_patprog1
&& prog
!= dummy_patprog2
)
3740 zfree(prog
, prog
->size
);