Add some more cases to the app-id unit tests
[glib.git] / glib / pcre / pcre_exec.c
blobcecbbcf9d0435151abb5aade0aab94c177601a3d
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
52 #include "pcre_internal.h"
54 /* Undefine some potentially clashing cpp symbols */
56 #undef min
57 #undef max
59 /* Values for setting in md->match_function_type to indicate two special types
60 of call to match(). We do it this way to save on using another stack variable,
61 as stack usage is to be discouraged. */
63 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
64 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66 /* Non-error returns from the match() function. Error returns are externally
67 defined PCRE_ERROR_xxx codes, which are all negative. */
69 #define MATCH_MATCH 1
70 #define MATCH_NOMATCH 0
72 /* Special internal returns from the match() function. Make them sufficiently
73 negative to avoid the external error codes. */
75 #define MATCH_ACCEPT (-999)
76 #define MATCH_COMMIT (-998)
77 #define MATCH_KETRPOS (-997)
78 #define MATCH_ONCE (-996)
79 #define MATCH_PRUNE (-995)
80 #define MATCH_SKIP (-994)
81 #define MATCH_SKIP_ARG (-993)
82 #define MATCH_THEN (-992)
84 /* Maximum number of ints of offset to save on the stack for recursive calls.
85 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
86 because the offset vector is always a multiple of 3 long. */
88 #define REC_STACK_SAVE_MAX 30
90 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
92 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
93 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
97 #ifdef PCRE_DEBUG
98 /*************************************************
99 * Debugging function to print chars *
100 *************************************************/
102 /* Print a sequence of chars in printable format, stopping at the end of the
103 subject if the requested.
105 Arguments:
106 p points to characters
107 length number to print
108 is_subject TRUE if printing from within md->start_subject
109 md pointer to matching data block, if is_subject is TRUE
111 Returns: nothing
114 static void
115 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
117 unsigned int c;
118 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
119 while (length-- > 0)
120 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
122 #endif
126 /*************************************************
127 * Match a back-reference *
128 *************************************************/
130 /* Normally, if a back reference hasn't been set, the length that is passed is
131 negative, so the match always fails. However, in JavaScript compatibility mode,
132 the length passed is zero. Note that in caseless UTF-8 mode, the number of
133 subject bytes matched may be different to the number of reference bytes.
135 Arguments:
136 offset index into the offset vector
137 eptr pointer into the subject
138 length length of reference to be matched (number of bytes)
139 md points to match data block
140 caseless TRUE if caseless
142 Returns: >= 0 the number of subject bytes matched
143 -1 no match
144 -2 partial match; always given if at end subject
147 static int
148 match_ref(int offset, PCRE_PUCHAR eptr, int length, match_data *md,
149 BOOL caseless)
151 PCRE_PUCHAR eptr_start = eptr;
152 PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
154 #ifdef PCRE_DEBUG
155 if (eptr >= md->end_subject)
156 printf("matching subject <null>");
157 else
159 printf("matching subject ");
160 pchars(eptr, length, TRUE, md);
162 printf(" against backref ");
163 pchars(p, length, FALSE, md);
164 printf("\n");
165 #endif
167 /* Always fail if reference not set (and not JavaScript compatible - in that
168 case the length is passed as zero). */
170 if (length < 0) return -1;
172 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
173 properly if Unicode properties are supported. Otherwise, we can check only
174 ASCII characters. */
176 if (caseless)
178 #ifdef SUPPORT_UTF
179 #ifdef SUPPORT_UCP
180 if (md->utf)
182 /* Match characters up to the end of the reference. NOTE: the number of
183 bytes matched may differ, because there are some characters whose upper and
184 lower case versions code as different numbers of bytes. For example, U+023A
185 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
186 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
187 the latter. It is important, therefore, to check the length along the
188 reference, not along the subject (earlier code did this wrong). */
190 PCRE_PUCHAR endptr = p + length;
191 while (p < endptr)
193 int c, d;
194 if (eptr >= md->end_subject) return -2; /* Partial match */
195 GETCHARINC(c, eptr);
196 GETCHARINC(d, p);
197 if (c != d && c != UCD_OTHERCASE(d)) return -1;
200 else
201 #endif
202 #endif
204 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
205 is no UCP support. */
207 while (length-- > 0)
209 if (eptr >= md->end_subject) return -2; /* Partial match */
210 if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
211 p++;
212 eptr++;
217 /* In the caseful case, we can just compare the bytes, whether or not we
218 are in UTF-8 mode. */
220 else
222 while (length-- > 0)
224 if (eptr >= md->end_subject) return -2; /* Partial match */
225 if (*p++ != *eptr++) return -1;
229 return (int)(eptr - eptr_start);
234 /***************************************************************************
235 ****************************************************************************
236 RECURSION IN THE match() FUNCTION
238 The match() function is highly recursive, though not every recursive call
239 increases the recursive depth. Nevertheless, some regular expressions can cause
240 it to recurse to a great depth. I was writing for Unix, so I just let it call
241 itself recursively. This uses the stack for saving everything that has to be
242 saved for a recursive call. On Unix, the stack can be large, and this works
243 fine.
245 It turns out that on some non-Unix-like systems there are problems with
246 programs that use a lot of stack. (This despite the fact that every last chip
247 has oodles of memory these days, and techniques for extending the stack have
248 been known for decades.) So....
250 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
251 calls by keeping local variables that need to be preserved in blocks of memory
252 obtained from malloc() instead of on the stack. Macros are used to
253 achieve this so that the actual code doesn't look very different to what it
254 always used to.
256 The original heap-recursive code used longjmp(). However, it seems that this
257 can be very slow on some operating systems. Following a suggestion from Stan
258 Switzer, the use of longjmp() has been abolished, at the cost of having to
259 provide a unique number for each call to RMATCH. There is no way of generating
260 a sequence of numbers at compile time in C. I have given them names, to make
261 them stand out more clearly.
263 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
264 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
265 tests. Furthermore, not using longjmp() means that local dynamic variables
266 don't have indeterminate values; this has meant that the frame size can be
267 reduced because the result can be "passed back" by straight setting of the
268 variable instead of being passed in the frame.
269 ****************************************************************************
270 ***************************************************************************/
272 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
273 below must be updated in sync. */
275 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
276 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
277 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
278 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
279 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
280 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
281 RM61, RM62, RM63, RM64, RM65, RM66 };
283 /* These versions of the macros use the stack, as normal. There are debugging
284 versions and production versions. Note that the "rw" argument of RMATCH isn't
285 actually used in this definition. */
287 #ifndef NO_RECURSE
289 #ifdef PCRE_DEBUG
290 #define RMATCH(ra,rb,rc,rd,re,rw) \
292 printf("match() called in line %d\n", __LINE__); \
293 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
294 printf("to line %d\n", __LINE__); \
296 #define RRETURN(ra) \
298 printf("match() returned %d from line %d ", ra, __LINE__); \
299 return ra; \
301 #else
302 #define RMATCH(ra,rb,rc,rd,re,rw) \
303 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
304 #define RRETURN(ra) return ra
305 #endif
307 #else
310 /* These versions of the macros manage a private stack on the heap. Note that
311 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312 argument of match(), which never changes. */
314 #define RMATCH(ra,rb,rc,rd,re,rw)\
316 heapframe *newframe = frame->Xnextframe;\
317 if (newframe == NULL)\
319 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
320 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
321 newframe->Xnextframe = NULL;\
322 frame->Xnextframe = newframe;\
324 frame->Xwhere = rw;\
325 newframe->Xeptr = ra;\
326 newframe->Xecode = rb;\
327 newframe->Xmstart = mstart;\
328 newframe->Xoffset_top = rc;\
329 newframe->Xeptrb = re;\
330 newframe->Xrdepth = frame->Xrdepth + 1;\
331 newframe->Xprevframe = frame;\
332 frame = newframe;\
333 DPRINTF(("restarting from line %d\n", __LINE__));\
334 goto HEAP_RECURSE;\
335 L_##rw:\
336 DPRINTF(("jumped back to line %d\n", __LINE__));\
339 #define RRETURN(ra)\
341 heapframe *oldframe = frame;\
342 frame = oldframe->Xprevframe;\
343 if (frame != NULL)\
345 rrc = ra;\
346 goto HEAP_RETURN;\
348 return ra;\
352 /* Structure for remembering the local variables in a private frame */
354 typedef struct heapframe {
355 struct heapframe *Xprevframe;
356 struct heapframe *Xnextframe;
358 /* Function arguments that may change */
360 PCRE_PUCHAR Xeptr;
361 const pcre_uchar *Xecode;
362 PCRE_PUCHAR Xmstart;
363 int Xoffset_top;
364 eptrblock *Xeptrb;
365 unsigned int Xrdepth;
367 /* Function local variables */
369 PCRE_PUCHAR Xcallpat;
370 #ifdef SUPPORT_UTF
371 PCRE_PUCHAR Xcharptr;
372 #endif
373 PCRE_PUCHAR Xdata;
374 PCRE_PUCHAR Xnext;
375 PCRE_PUCHAR Xpp;
376 PCRE_PUCHAR Xprev;
377 PCRE_PUCHAR Xsaved_eptr;
379 recursion_info Xnew_recursive;
381 BOOL Xcur_is_word;
382 BOOL Xcondition;
383 BOOL Xprev_is_word;
385 #ifdef SUPPORT_UCP
386 int Xprop_type;
387 int Xprop_value;
388 int Xprop_fail_result;
389 int Xoclength;
390 pcre_uchar Xocchars[6];
391 #endif
393 int Xcodelink;
394 int Xctype;
395 unsigned int Xfc;
396 int Xfi;
397 int Xlength;
398 int Xmax;
399 int Xmin;
400 int Xnumber;
401 int Xoffset;
402 int Xop;
403 int Xsave_capture_last;
404 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
405 int Xstacksave[REC_STACK_SAVE_MAX];
407 eptrblock Xnewptrb;
409 /* Where to jump back to */
411 int Xwhere;
413 } heapframe;
415 #endif
418 /***************************************************************************
419 ***************************************************************************/
423 /*************************************************
424 * Match from current position *
425 *************************************************/
427 /* This function is called recursively in many circumstances. Whenever it
428 returns a negative (error) response, the outer incarnation must also return the
429 same response. */
431 /* These macros pack up tests that are used for partial matching, and which
432 appear several times in the code. We set the "hit end" flag if the pointer is
433 at the end of the subject and also past the start of the subject (i.e.
434 something has been matched). For hard partial matching, we then return
435 immediately. The second one is used when we already know we are past the end of
436 the subject. */
438 #define CHECK_PARTIAL()\
439 if (md->partial != 0 && eptr >= md->end_subject && \
440 eptr > md->start_used_ptr) \
442 md->hitend = TRUE; \
443 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
446 #define SCHECK_PARTIAL()\
447 if (md->partial != 0 && eptr > md->start_used_ptr) \
449 md->hitend = TRUE; \
450 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
454 /* Performance note: It might be tempting to extract commonly used fields from
455 the md structure (e.g. utf, end_subject) into individual variables to improve
456 performance. Tests using gcc on a SPARC disproved this; in the first case, it
457 made performance worse.
459 Arguments:
460 eptr pointer to current character in subject
461 ecode pointer to current position in compiled code
462 mstart pointer to the current match start position (can be modified
463 by encountering \K)
464 offset_top current top pointer
465 md pointer to "static" info for the match
466 eptrb pointer to chain of blocks containing eptr at start of
467 brackets - for testing for empty matches
468 rdepth the recursion depth
470 Returns: MATCH_MATCH if matched ) these values are >= 0
471 MATCH_NOMATCH if failed to match )
472 a negative MATCH_xxx value for PRUNE, SKIP, etc
473 a negative PCRE_ERROR_xxx value if aborted by an error condition
474 (e.g. stopped by repeated call or recursion limit)
477 static int
478 match(PCRE_PUCHAR eptr, const pcre_uchar *ecode,
479 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
480 unsigned int rdepth)
482 /* These variables do not need to be preserved over recursion in this function,
483 so they can be ordinary variables in all cases. Mark some of them with
484 "register" because they are used a lot in loops. */
486 int rrc; /* Returns from recursive calls */
487 int i; /* Used for loops not involving calls to RMATCH() */
488 unsigned int c; /* Character values not kept over RMATCH() calls */
489 BOOL utf; /* Local copy of UTF flag for speed */
491 BOOL minimize, possessive; /* Quantifier options */
492 BOOL caseless;
493 int condcode;
495 /* When recursion is not being used, all "local" variables that have to be
496 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
497 frame on the stack here; subsequent instantiations are obtained from the heap
498 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
499 the top-level on the stack rather than malloc-ing them all gives a performance
500 boost in many cases where there is not much "recursion". */
502 #ifdef NO_RECURSE
503 heapframe *frame = (heapframe *)md->match_frames_base;
505 /* Copy in the original argument variables */
507 frame->Xeptr = eptr;
508 frame->Xecode = ecode;
509 frame->Xmstart = mstart;
510 frame->Xoffset_top = offset_top;
511 frame->Xeptrb = eptrb;
512 frame->Xrdepth = rdepth;
514 /* This is where control jumps back to to effect "recursion" */
516 HEAP_RECURSE:
518 /* Macros make the argument variables come from the current frame */
520 #define eptr frame->Xeptr
521 #define ecode frame->Xecode
522 #define mstart frame->Xmstart
523 #define offset_top frame->Xoffset_top
524 #define eptrb frame->Xeptrb
525 #define rdepth frame->Xrdepth
527 /* Ditto for the local variables */
529 #ifdef SUPPORT_UTF
530 #define charptr frame->Xcharptr
531 #endif
532 #define callpat frame->Xcallpat
533 #define codelink frame->Xcodelink
534 #define data frame->Xdata
535 #define next frame->Xnext
536 #define pp frame->Xpp
537 #define prev frame->Xprev
538 #define saved_eptr frame->Xsaved_eptr
540 #define new_recursive frame->Xnew_recursive
542 #define cur_is_word frame->Xcur_is_word
543 #define condition frame->Xcondition
544 #define prev_is_word frame->Xprev_is_word
546 #ifdef SUPPORT_UCP
547 #define prop_type frame->Xprop_type
548 #define prop_value frame->Xprop_value
549 #define prop_fail_result frame->Xprop_fail_result
550 #define oclength frame->Xoclength
551 #define occhars frame->Xocchars
552 #endif
554 #define ctype frame->Xctype
555 #define fc frame->Xfc
556 #define fi frame->Xfi
557 #define length frame->Xlength
558 #define max frame->Xmax
559 #define min frame->Xmin
560 #define number frame->Xnumber
561 #define offset frame->Xoffset
562 #define op frame->Xop
563 #define save_capture_last frame->Xsave_capture_last
564 #define save_offset1 frame->Xsave_offset1
565 #define save_offset2 frame->Xsave_offset2
566 #define save_offset3 frame->Xsave_offset3
567 #define stacksave frame->Xstacksave
569 #define newptrb frame->Xnewptrb
571 /* When recursion is being used, local variables are allocated on the stack and
572 get preserved during recursion in the normal way. In this environment, fi and
573 i, and fc and c, can be the same variables. */
575 #else /* NO_RECURSE not defined */
576 #define fi i
577 #define fc c
579 /* Many of the following variables are used only in small blocks of the code.
580 My normal style of coding would have declared them within each of those blocks.
581 However, in order to accommodate the version of this code that uses an external
582 "stack" implemented on the heap, it is easier to declare them all here, so the
583 declarations can be cut out in a block. The only declarations within blocks
584 below are for variables that do not have to be preserved over a recursive call
585 to RMATCH(). */
587 #ifdef SUPPORT_UTF
588 const pcre_uchar *charptr;
589 #endif
590 const pcre_uchar *callpat;
591 const pcre_uchar *data;
592 const pcre_uchar *next;
593 PCRE_PUCHAR pp;
594 const pcre_uchar *prev;
595 PCRE_PUCHAR saved_eptr;
597 recursion_info new_recursive;
599 BOOL cur_is_word;
600 BOOL condition;
601 BOOL prev_is_word;
603 #ifdef SUPPORT_UCP
604 int prop_type;
605 int prop_value;
606 int prop_fail_result;
607 int oclength;
608 pcre_uchar occhars[6];
609 #endif
611 int codelink;
612 int ctype;
613 int length;
614 int max;
615 int min;
616 int number;
617 int offset;
618 int op;
619 int save_capture_last;
620 int save_offset1, save_offset2, save_offset3;
621 int stacksave[REC_STACK_SAVE_MAX];
623 eptrblock newptrb;
625 /* There is a special fudge for calling match() in a way that causes it to
626 measure the size of its basic stack frame when the stack is being used for
627 recursion. The second argument (ecode) being NULL triggers this behaviour. It
628 cannot normally ever be NULL. The return is the negated value of the frame
629 size. */
631 if (ecode == NULL)
633 if (rdepth == 0)
634 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
635 else
637 int len = (char *)&rdepth - (char *)eptr;
638 return (len > 0)? -len : len;
641 #endif /* NO_RECURSE */
643 /* To save space on the stack and in the heap frame, I have doubled up on some
644 of the local variables that are used only in localised parts of the code, but
645 still need to be preserved over recursive calls of match(). These macros define
646 the alternative names that are used. */
648 #define allow_zero cur_is_word
649 #define cbegroup condition
650 #define code_offset codelink
651 #define condassert condition
652 #define matched_once prev_is_word
653 #define foc number
654 #define save_mark data
656 /* These statements are here to stop the compiler complaining about uninitialized
657 variables. */
659 #ifdef SUPPORT_UCP
660 prop_value = 0;
661 prop_fail_result = 0;
662 #endif
665 /* This label is used for tail recursion, which is used in a few cases even
666 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
667 used. Thanks to Ian Taylor for noticing this possibility and sending the
668 original patch. */
670 TAIL_RECURSE:
672 /* OK, now we can get on with the real code of the function. Recursive calls
673 are specified by the macro RMATCH and RRETURN is used to return. When
674 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
675 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
676 defined). However, RMATCH isn't like a function call because it's quite a
677 complicated macro. It has to be used in one particular way. This shouldn't,
678 however, impact performance when true recursion is being used. */
680 #ifdef SUPPORT_UTF
681 utf = md->utf; /* Local copy of the flag */
682 #else
683 utf = FALSE;
684 #endif
686 /* First check that we haven't called match() too many times, or that we
687 haven't exceeded the recursive call limit. */
689 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
690 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
692 /* At the start of a group with an unlimited repeat that may match an empty
693 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
694 done this way to save having to use another function argument, which would take
695 up space on the stack. See also MATCH_CONDASSERT below.
697 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
698 such remembered pointers, to be checked when we hit the closing ket, in order
699 to break infinite loops that match no characters. When match() is called in
700 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
701 NOT be used with tail recursion, because the memory block that is used is on
702 the stack, so a new one may be required for each match(). */
704 if (md->match_function_type == MATCH_CBEGROUP)
706 newptrb.epb_saved_eptr = eptr;
707 newptrb.epb_prev = eptrb;
708 eptrb = &newptrb;
709 md->match_function_type = 0;
712 /* Now start processing the opcodes. */
714 for (;;)
716 minimize = possessive = FALSE;
717 op = *ecode;
719 switch(op)
721 case OP_MARK:
722 md->nomatch_mark = ecode + 2;
723 md->mark = NULL; /* In case previously set by assertion */
724 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
725 eptrb, RM55);
726 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
727 md->mark == NULL) md->mark = ecode + 2;
729 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
730 argument, and we must check whether that argument matches this MARK's
731 argument. It is passed back in md->start_match_ptr (an overloading of that
732 variable). If it does match, we reset that variable to the current subject
733 position and return MATCH_SKIP. Otherwise, pass back the return code
734 unaltered. */
736 else if (rrc == MATCH_SKIP_ARG &&
737 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
739 md->start_match_ptr = eptr;
740 RRETURN(MATCH_SKIP);
742 RRETURN(rrc);
744 case OP_FAIL:
745 RRETURN(MATCH_NOMATCH);
747 /* COMMIT overrides PRUNE, SKIP, and THEN */
749 case OP_COMMIT:
750 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
751 eptrb, RM52);
752 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
753 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
754 rrc != MATCH_THEN)
755 RRETURN(rrc);
756 RRETURN(MATCH_COMMIT);
758 /* PRUNE overrides THEN */
760 case OP_PRUNE:
761 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
762 eptrb, RM51);
763 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
764 RRETURN(MATCH_PRUNE);
766 case OP_PRUNE_ARG:
767 md->nomatch_mark = ecode + 2;
768 md->mark = NULL; /* In case previously set by assertion */
769 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
770 eptrb, RM56);
771 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
772 md->mark == NULL) md->mark = ecode + 2;
773 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
774 RRETURN(MATCH_PRUNE);
776 /* SKIP overrides PRUNE and THEN */
778 case OP_SKIP:
779 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
780 eptrb, RM53);
781 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
782 RRETURN(rrc);
783 md->start_match_ptr = eptr; /* Pass back current position */
784 RRETURN(MATCH_SKIP);
786 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
787 nomatch_mark. There is a flag that disables this opcode when re-matching a
788 pattern that ended with a SKIP for which there was not a matching MARK. */
790 case OP_SKIP_ARG:
791 if (md->ignore_skip_arg)
793 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
794 break;
796 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
797 eptrb, RM57);
798 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
799 RRETURN(rrc);
801 /* Pass back the current skip name by overloading md->start_match_ptr and
802 returning the special MATCH_SKIP_ARG return code. This will either be
803 caught by a matching MARK, or get to the top, where it causes a rematch
804 with the md->ignore_skip_arg flag set. */
806 md->start_match_ptr = ecode + 2;
807 RRETURN(MATCH_SKIP_ARG);
809 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
810 the branch in which it occurs can be determined. Overload the start of
811 match pointer to do this. */
813 case OP_THEN:
814 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
815 eptrb, RM54);
816 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
817 md->start_match_ptr = ecode;
818 RRETURN(MATCH_THEN);
820 case OP_THEN_ARG:
821 md->nomatch_mark = ecode + 2;
822 md->mark = NULL; /* In case previously set by assertion */
823 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
824 md, eptrb, RM58);
825 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
826 md->mark == NULL) md->mark = ecode + 2;
827 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
828 md->start_match_ptr = ecode;
829 RRETURN(MATCH_THEN);
831 /* Handle an atomic group that does not contain any capturing parentheses.
832 This can be handled like an assertion. Prior to 8.13, all atomic groups
833 were handled this way. In 8.13, the code was changed as below for ONCE, so
834 that backups pass through the group and thereby reset captured values.
835 However, this uses a lot more stack, so in 8.20, atomic groups that do not
836 contain any captures generate OP_ONCE_NC, which can be handled in the old,
837 less stack intensive way.
839 Check the alternative branches in turn - the matching won't pass the KET
840 for this kind of subpattern. If any one branch matches, we carry on as at
841 the end of a normal bracket, leaving the subject pointer, but resetting
842 the start-of-match value in case it was changed by \K. */
844 case OP_ONCE_NC:
845 prev = ecode;
846 saved_eptr = eptr;
847 save_mark = md->mark;
850 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
851 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
853 mstart = md->start_match_ptr;
854 break;
856 if (rrc == MATCH_THEN)
858 next = ecode + GET(ecode,1);
859 if (md->start_match_ptr < next &&
860 (*ecode == OP_ALT || *next == OP_ALT))
861 rrc = MATCH_NOMATCH;
864 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
865 ecode += GET(ecode,1);
866 md->mark = save_mark;
868 while (*ecode == OP_ALT);
870 /* If hit the end of the group (which could be repeated), fail */
872 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
874 /* Continue as from after the group, updating the offsets high water
875 mark, since extracts may have been taken. */
877 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
879 offset_top = md->end_offset_top;
880 eptr = md->end_match_ptr;
882 /* For a non-repeating ket, just continue at this level. This also
883 happens for a repeating ket if no characters were matched in the group.
884 This is the forcible breaking of infinite loops as implemented in Perl
885 5.005. */
887 if (*ecode == OP_KET || eptr == saved_eptr)
889 ecode += 1+LINK_SIZE;
890 break;
893 /* The repeating kets try the rest of the pattern or restart from the
894 preceding bracket, in the appropriate order. The second "call" of match()
895 uses tail recursion, to avoid using another stack frame. */
897 if (*ecode == OP_KETRMIN)
899 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
900 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
901 ecode = prev;
902 goto TAIL_RECURSE;
904 else /* OP_KETRMAX */
906 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
907 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
908 ecode += 1 + LINK_SIZE;
909 goto TAIL_RECURSE;
911 /* Control never gets here */
913 /* Handle a capturing bracket, other than those that are possessive with an
914 unlimited repeat. If there is space in the offset vector, save the current
915 subject position in the working slot at the top of the vector. We mustn't
916 change the current values of the data slot, because they may be set from a
917 previous iteration of this group, and be referred to by a reference inside
918 the group. A failure to match might occur after the group has succeeded,
919 if something later on doesn't match. For this reason, we need to restore
920 the working value and also the values of the final offsets, in case they
921 were set by a previous iteration of the same bracket.
923 If there isn't enough space in the offset vector, treat this as if it were
924 a non-capturing bracket. Don't worry about setting the flag for the error
925 case here; that is handled in the code for KET. */
927 case OP_CBRA:
928 case OP_SCBRA:
929 number = GET2(ecode, 1+LINK_SIZE);
930 offset = number << 1;
932 #ifdef PCRE_DEBUG
933 printf("start bracket %d\n", number);
934 printf("subject=");
935 pchars(eptr, 16, TRUE, md);
936 printf("\n");
937 #endif
939 if (offset < md->offset_max)
941 save_offset1 = md->offset_vector[offset];
942 save_offset2 = md->offset_vector[offset+1];
943 save_offset3 = md->offset_vector[md->offset_end - number];
944 save_capture_last = md->capture_last;
945 save_mark = md->mark;
947 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
948 md->offset_vector[md->offset_end - number] =
949 (int)(eptr - md->start_subject);
951 for (;;)
953 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
954 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
955 eptrb, RM1);
956 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
958 /* If we backed up to a THEN, check whether it is within the current
959 branch by comparing the address of the THEN that is passed back with
960 the end of the branch. If it is within the current branch, and the
961 branch is one of two or more alternatives (it either starts or ends
962 with OP_ALT), we have reached the limit of THEN's action, so convert
963 the return code to NOMATCH, which will cause normal backtracking to
964 happen from now on. Otherwise, THEN is passed back to an outer
965 alternative. This implements Perl's treatment of parenthesized groups,
966 where a group not containing | does not affect the current alternative,
967 that is, (X) is NOT the same as (X|(*F)). */
969 if (rrc == MATCH_THEN)
971 next = ecode + GET(ecode,1);
972 if (md->start_match_ptr < next &&
973 (*ecode == OP_ALT || *next == OP_ALT))
974 rrc = MATCH_NOMATCH;
977 /* Anything other than NOMATCH is passed back. */
979 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
980 md->capture_last = save_capture_last;
981 ecode += GET(ecode, 1);
982 md->mark = save_mark;
983 if (*ecode != OP_ALT) break;
986 DPRINTF(("bracket %d failed\n", number));
987 md->offset_vector[offset] = save_offset1;
988 md->offset_vector[offset+1] = save_offset2;
989 md->offset_vector[md->offset_end - number] = save_offset3;
991 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
993 RRETURN(rrc);
996 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
997 as a non-capturing bracket. */
999 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1000 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1002 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1004 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1005 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1007 /* Non-capturing or atomic group, except for possessive with unlimited
1008 repeat and ONCE group with no captures. Loop for all the alternatives.
1010 When we get to the final alternative within the brackets, we used to return
1011 the result of a recursive call to match() whatever happened so it was
1012 possible to reduce stack usage by turning this into a tail recursion,
1013 except in the case of a possibly empty group. However, now that there is
1014 the possibility of (*THEN) occurring in the final alternative, this
1015 optimization is no longer always possible.
1017 We can optimize if we know there are no (*THEN)s in the pattern; at present
1018 this is the best that can be done.
1020 MATCH_ONCE is returned when the end of an atomic group is successfully
1021 reached, but subsequent matching fails. It passes back up the tree (causing
1022 captured values to be reset) until the original atomic group level is
1023 reached. This is tested by comparing md->once_target with the start of the
1024 group. At this point, the return is converted into MATCH_NOMATCH so that
1025 previous backup points can be taken. */
1027 case OP_ONCE:
1028 case OP_BRA:
1029 case OP_SBRA:
1030 DPRINTF(("start non-capturing bracket\n"));
1032 for (;;)
1034 if (op >= OP_SBRA || op == OP_ONCE)
1035 md->match_function_type = MATCH_CBEGROUP;
1037 /* If this is not a possibly empty group, and there are no (*THEN)s in
1038 the pattern, and this is the final alternative, optimize as described
1039 above. */
1041 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1043 ecode += PRIV(OP_lengths)[*ecode];
1044 goto TAIL_RECURSE;
1047 /* In all other cases, we have to make another call to match(). */
1049 save_mark = md->mark;
1050 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1051 RM2);
1053 /* See comment in the code for capturing groups above about handling
1054 THEN. */
1056 if (rrc == MATCH_THEN)
1058 next = ecode + GET(ecode,1);
1059 if (md->start_match_ptr < next &&
1060 (*ecode == OP_ALT || *next == OP_ALT))
1061 rrc = MATCH_NOMATCH;
1064 if (rrc != MATCH_NOMATCH)
1066 if (rrc == MATCH_ONCE)
1068 const pcre_uchar *scode = ecode;
1069 if (*scode != OP_ONCE) /* If not at start, find it */
1071 while (*scode == OP_ALT) scode += GET(scode, 1);
1072 scode -= GET(scode, 1);
1074 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1076 RRETURN(rrc);
1078 ecode += GET(ecode, 1);
1079 md->mark = save_mark;
1080 if (*ecode != OP_ALT) break;
1083 RRETURN(MATCH_NOMATCH);
1085 /* Handle possessive capturing brackets with an unlimited repeat. We come
1086 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1087 handled similarly to the normal case above. However, the matching is
1088 different. The end of these brackets will always be OP_KETRPOS, which
1089 returns MATCH_KETRPOS without going further in the pattern. By this means
1090 we can handle the group by iteration rather than recursion, thereby
1091 reducing the amount of stack needed. */
1093 case OP_CBRAPOS:
1094 case OP_SCBRAPOS:
1095 allow_zero = FALSE;
1097 POSSESSIVE_CAPTURE:
1098 number = GET2(ecode, 1+LINK_SIZE);
1099 offset = number << 1;
1101 #ifdef PCRE_DEBUG
1102 printf("start possessive bracket %d\n", number);
1103 printf("subject=");
1104 pchars(eptr, 16, TRUE, md);
1105 printf("\n");
1106 #endif
1108 if (offset < md->offset_max)
1110 matched_once = FALSE;
1111 code_offset = (int)(ecode - md->start_code);
1113 save_offset1 = md->offset_vector[offset];
1114 save_offset2 = md->offset_vector[offset+1];
1115 save_offset3 = md->offset_vector[md->offset_end - number];
1116 save_capture_last = md->capture_last;
1118 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1120 /* Each time round the loop, save the current subject position for use
1121 when the group matches. For MATCH_MATCH, the group has matched, so we
1122 restart it with a new subject starting position, remembering that we had
1123 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1124 usual. If we haven't matched any alternatives in any iteration, check to
1125 see if a previous iteration matched. If so, the group has matched;
1126 continue from afterwards. Otherwise it has failed; restore the previous
1127 capture values before returning NOMATCH. */
1129 for (;;)
1131 md->offset_vector[md->offset_end - number] =
1132 (int)(eptr - md->start_subject);
1133 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1134 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1135 eptrb, RM63);
1136 if (rrc == MATCH_KETRPOS)
1138 offset_top = md->end_offset_top;
1139 eptr = md->end_match_ptr;
1140 ecode = md->start_code + code_offset;
1141 save_capture_last = md->capture_last;
1142 matched_once = TRUE;
1143 continue;
1146 /* See comment in the code for capturing groups above about handling
1147 THEN. */
1149 if (rrc == MATCH_THEN)
1151 next = ecode + GET(ecode,1);
1152 if (md->start_match_ptr < next &&
1153 (*ecode == OP_ALT || *next == OP_ALT))
1154 rrc = MATCH_NOMATCH;
1157 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1158 md->capture_last = save_capture_last;
1159 ecode += GET(ecode, 1);
1160 if (*ecode != OP_ALT) break;
1163 if (!matched_once)
1165 md->offset_vector[offset] = save_offset1;
1166 md->offset_vector[offset+1] = save_offset2;
1167 md->offset_vector[md->offset_end - number] = save_offset3;
1170 if (allow_zero || matched_once)
1172 ecode += 1 + LINK_SIZE;
1173 break;
1176 RRETURN(MATCH_NOMATCH);
1179 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1180 as a non-capturing bracket. */
1182 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1183 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1185 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1187 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1188 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1190 /* Non-capturing possessive bracket with unlimited repeat. We come here
1191 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1192 without the capturing complication. It is written out separately for speed
1193 and cleanliness. */
1195 case OP_BRAPOS:
1196 case OP_SBRAPOS:
1197 allow_zero = FALSE;
1199 POSSESSIVE_NON_CAPTURE:
1200 matched_once = FALSE;
1201 code_offset = (int)(ecode - md->start_code);
1203 for (;;)
1205 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1206 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1207 eptrb, RM48);
1208 if (rrc == MATCH_KETRPOS)
1210 offset_top = md->end_offset_top;
1211 eptr = md->end_match_ptr;
1212 ecode = md->start_code + code_offset;
1213 matched_once = TRUE;
1214 continue;
1217 /* See comment in the code for capturing groups above about handling
1218 THEN. */
1220 if (rrc == MATCH_THEN)
1222 next = ecode + GET(ecode,1);
1223 if (md->start_match_ptr < next &&
1224 (*ecode == OP_ALT || *next == OP_ALT))
1225 rrc = MATCH_NOMATCH;
1228 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1229 ecode += GET(ecode, 1);
1230 if (*ecode != OP_ALT) break;
1233 if (matched_once || allow_zero)
1235 ecode += 1 + LINK_SIZE;
1236 break;
1238 RRETURN(MATCH_NOMATCH);
1240 /* Control never reaches here. */
1242 /* Conditional group: compilation checked that there are no more than
1243 two branches. If the condition is false, skipping the first branch takes us
1244 past the end if there is only one branch, but that's OK because that is
1245 exactly what going to the ket would do. */
1247 case OP_COND:
1248 case OP_SCOND:
1249 codelink = GET(ecode, 1);
1251 /* Because of the way auto-callout works during compile, a callout item is
1252 inserted between OP_COND and an assertion condition. */
1254 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1256 if (PUBL(callout) != NULL)
1258 PUBL(callout_block) cb;
1259 cb.version = 2; /* Version 2 of the callout block */
1260 cb.callout_number = ecode[LINK_SIZE+2];
1261 cb.offset_vector = md->offset_vector;
1262 #ifdef COMPILE_PCRE8
1263 cb.subject = (PCRE_SPTR)md->start_subject;
1264 #else
1265 cb.subject = (PCRE_SPTR16)md->start_subject;
1266 #endif
1267 cb.subject_length = (int)(md->end_subject - md->start_subject);
1268 cb.start_match = (int)(mstart - md->start_subject);
1269 cb.current_position = (int)(eptr - md->start_subject);
1270 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1271 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1272 cb.capture_top = offset_top/2;
1273 cb.capture_last = md->capture_last;
1274 cb.callout_data = md->callout_data;
1275 cb.mark = md->nomatch_mark;
1276 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1277 if (rrc < 0) RRETURN(rrc);
1279 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1282 condcode = ecode[LINK_SIZE+1];
1284 /* Now see what the actual condition is */
1286 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1288 if (md->recursive == NULL) /* Not recursing => FALSE */
1290 condition = FALSE;
1291 ecode += GET(ecode, 1);
1293 else
1295 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1296 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1298 /* If the test is for recursion into a specific subpattern, and it is
1299 false, but the test was set up by name, scan the table to see if the
1300 name refers to any other numbers, and test them. The condition is true
1301 if any one is set. */
1303 if (!condition && condcode == OP_NRREF)
1305 pcre_uchar *slotA = md->name_table;
1306 for (i = 0; i < md->name_count; i++)
1308 if (GET2(slotA, 0) == recno) break;
1309 slotA += md->name_entry_size;
1312 /* Found a name for the number - there can be only one; duplicate
1313 names for different numbers are allowed, but not vice versa. First
1314 scan down for duplicates. */
1316 if (i < md->name_count)
1318 pcre_uchar *slotB = slotA;
1319 while (slotB > md->name_table)
1321 slotB -= md->name_entry_size;
1322 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1324 condition = GET2(slotB, 0) == md->recursive->group_num;
1325 if (condition) break;
1327 else break;
1330 /* Scan up for duplicates */
1332 if (!condition)
1334 slotB = slotA;
1335 for (i++; i < md->name_count; i++)
1337 slotB += md->name_entry_size;
1338 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1340 condition = GET2(slotB, 0) == md->recursive->group_num;
1341 if (condition) break;
1343 else break;
1349 /* Choose branch according to the condition */
1351 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1355 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1357 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1358 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1360 /* If the numbered capture is unset, but the reference was by name,
1361 scan the table to see if the name refers to any other numbers, and test
1362 them. The condition is true if any one is set. This is tediously similar
1363 to the code above, but not close enough to try to amalgamate. */
1365 if (!condition && condcode == OP_NCREF)
1367 int refno = offset >> 1;
1368 pcre_uchar *slotA = md->name_table;
1370 for (i = 0; i < md->name_count; i++)
1372 if (GET2(slotA, 0) == refno) break;
1373 slotA += md->name_entry_size;
1376 /* Found a name for the number - there can be only one; duplicate names
1377 for different numbers are allowed, but not vice versa. First scan down
1378 for duplicates. */
1380 if (i < md->name_count)
1382 pcre_uchar *slotB = slotA;
1383 while (slotB > md->name_table)
1385 slotB -= md->name_entry_size;
1386 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1388 offset = GET2(slotB, 0) << 1;
1389 condition = offset < offset_top &&
1390 md->offset_vector[offset] >= 0;
1391 if (condition) break;
1393 else break;
1396 /* Scan up for duplicates */
1398 if (!condition)
1400 slotB = slotA;
1401 for (i++; i < md->name_count; i++)
1403 slotB += md->name_entry_size;
1404 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1406 offset = GET2(slotB, 0) << 1;
1407 condition = offset < offset_top &&
1408 md->offset_vector[offset] >= 0;
1409 if (condition) break;
1411 else break;
1417 /* Choose branch according to the condition */
1419 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1422 else if (condcode == OP_DEF) /* DEFINE - always false */
1424 condition = FALSE;
1425 ecode += GET(ecode, 1);
1428 /* The condition is an assertion. Call match() to evaluate it - setting
1429 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1430 an assertion. */
1432 else
1434 md->match_function_type = MATCH_CONDASSERT;
1435 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1436 if (rrc == MATCH_MATCH)
1438 if (md->end_offset_top > offset_top)
1439 offset_top = md->end_offset_top; /* Captures may have happened */
1440 condition = TRUE;
1441 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1442 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1445 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1446 assertion; it is therefore treated as NOMATCH. */
1448 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1450 RRETURN(rrc); /* Need braces because of following else */
1452 else
1454 condition = FALSE;
1455 ecode += codelink;
1459 /* We are now at the branch that is to be obeyed. As there is only one, can
1460 use tail recursion to avoid using another stack frame, except when there is
1461 unlimited repeat of a possibly empty group. In the latter case, a recursive
1462 call to match() is always required, unless the second alternative doesn't
1463 exist, in which case we can just plough on. Note that, for compatibility
1464 with Perl, the | in a conditional group is NOT treated as creating two
1465 alternatives. If a THEN is encountered in the branch, it propagates out to
1466 the enclosing alternative (unless nested in a deeper set of alternatives,
1467 of course). */
1469 if (condition || *ecode == OP_ALT)
1471 if (op != OP_SCOND)
1473 ecode += 1 + LINK_SIZE;
1474 goto TAIL_RECURSE;
1477 md->match_function_type = MATCH_CBEGROUP;
1478 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1479 RRETURN(rrc);
1482 /* Condition false & no alternative; continue after the group. */
1484 else
1486 ecode += 1 + LINK_SIZE;
1488 break;
1491 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1492 to close any currently open capturing brackets. */
1494 case OP_CLOSE:
1495 number = GET2(ecode, 1);
1496 offset = number << 1;
1498 #ifdef PCRE_DEBUG
1499 printf("end bracket %d at *ACCEPT", number);
1500 printf("\n");
1501 #endif
1503 md->capture_last = number;
1504 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1506 md->offset_vector[offset] =
1507 md->offset_vector[md->offset_end - number];
1508 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1509 if (offset_top <= offset) offset_top = offset + 2;
1511 ecode += 1 + IMM2_SIZE;
1512 break;
1515 /* End of the pattern, either real or forced. */
1517 case OP_END:
1518 case OP_ACCEPT:
1519 case OP_ASSERT_ACCEPT:
1521 /* If we have matched an empty string, fail if not in an assertion and not
1522 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1523 is set and we have matched at the start of the subject. In both cases,
1524 backtracking will then try other alternatives, if any. */
1526 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1527 md->recursive == NULL &&
1528 (md->notempty ||
1529 (md->notempty_atstart &&
1530 mstart == md->start_subject + md->start_offset)))
1531 RRETURN(MATCH_NOMATCH);
1533 /* Otherwise, we have a match. */
1535 md->end_match_ptr = eptr; /* Record where we ended */
1536 md->end_offset_top = offset_top; /* and how many extracts were taken */
1537 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1539 /* For some reason, the macros don't work properly if an expression is
1540 given as the argument to RRETURN when the heap is in use. */
1542 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1543 RRETURN(rrc);
1545 /* Assertion brackets. Check the alternative branches in turn - the
1546 matching won't pass the KET for an assertion. If any one branch matches,
1547 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1548 start of each branch to move the current point backwards, so the code at
1549 this level is identical to the lookahead case. When the assertion is part
1550 of a condition, we want to return immediately afterwards. The caller of
1551 this incarnation of the match() function will have set MATCH_CONDASSERT in
1552 md->match_function_type, and one of these opcodes will be the first opcode
1553 that is processed. We use a local variable that is preserved over calls to
1554 match() to remember this case. */
1556 case OP_ASSERT:
1557 case OP_ASSERTBACK:
1558 save_mark = md->mark;
1559 if (md->match_function_type == MATCH_CONDASSERT)
1561 condassert = TRUE;
1562 md->match_function_type = 0;
1564 else condassert = FALSE;
1568 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1569 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1571 mstart = md->start_match_ptr; /* In case \K reset it */
1572 break;
1574 md->mark = save_mark;
1576 /* A COMMIT failure must fail the entire assertion, without trying any
1577 subsequent branches. */
1579 if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
1581 /* PCRE does not allow THEN to escape beyond an assertion; it
1582 is treated as NOMATCH. */
1584 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1585 ecode += GET(ecode, 1);
1587 while (*ecode == OP_ALT);
1589 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1591 /* If checking an assertion for a condition, return MATCH_MATCH. */
1593 if (condassert) RRETURN(MATCH_MATCH);
1595 /* Continue from after the assertion, updating the offsets high water
1596 mark, since extracts may have been taken during the assertion. */
1598 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1599 ecode += 1 + LINK_SIZE;
1600 offset_top = md->end_offset_top;
1601 continue;
1603 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1604 PRUNE, or COMMIT means we must assume failure without checking subsequent
1605 branches. */
1607 case OP_ASSERT_NOT:
1608 case OP_ASSERTBACK_NOT:
1609 save_mark = md->mark;
1610 if (md->match_function_type == MATCH_CONDASSERT)
1612 condassert = TRUE;
1613 md->match_function_type = 0;
1615 else condassert = FALSE;
1619 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1620 md->mark = save_mark;
1621 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1622 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1624 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1625 break;
1628 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1629 as NOMATCH. */
1631 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1632 ecode += GET(ecode,1);
1634 while (*ecode == OP_ALT);
1636 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1638 ecode += 1 + LINK_SIZE;
1639 continue;
1641 /* Move the subject pointer back. This occurs only at the start of
1642 each branch of a lookbehind assertion. If we are too close to the start to
1643 move back, this match function fails. When working with UTF-8 we move
1644 back a number of characters, not bytes. */
1646 case OP_REVERSE:
1647 #ifdef SUPPORT_UTF
1648 if (utf)
1650 i = GET(ecode, 1);
1651 while (i-- > 0)
1653 eptr--;
1654 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1655 BACKCHAR(eptr);
1658 else
1659 #endif
1661 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1664 eptr -= GET(ecode, 1);
1665 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1668 /* Save the earliest consulted character, then skip to next op code */
1670 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1671 ecode += 1 + LINK_SIZE;
1672 break;
1674 /* The callout item calls an external function, if one is provided, passing
1675 details of the match so far. This is mainly for debugging, though the
1676 function is able to force a failure. */
1678 case OP_CALLOUT:
1679 if (PUBL(callout) != NULL)
1681 PUBL(callout_block) cb;
1682 cb.version = 2; /* Version 2 of the callout block */
1683 cb.callout_number = ecode[1];
1684 cb.offset_vector = md->offset_vector;
1685 #ifdef COMPILE_PCRE8
1686 cb.subject = (PCRE_SPTR)md->start_subject;
1687 #else
1688 cb.subject = (PCRE_SPTR16)md->start_subject;
1689 #endif
1690 cb.subject_length = (int)(md->end_subject - md->start_subject);
1691 cb.start_match = (int)(mstart - md->start_subject);
1692 cb.current_position = (int)(eptr - md->start_subject);
1693 cb.pattern_position = GET(ecode, 2);
1694 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1695 cb.capture_top = offset_top/2;
1696 cb.capture_last = md->capture_last;
1697 cb.callout_data = md->callout_data;
1698 cb.mark = md->nomatch_mark;
1699 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1700 if (rrc < 0) RRETURN(rrc);
1702 ecode += 2 + 2*LINK_SIZE;
1703 break;
1705 /* Recursion either matches the current regex, or some subexpression. The
1706 offset data is the offset to the starting bracket from the start of the
1707 whole pattern. (This is so that it works from duplicated subpatterns.)
1709 The state of the capturing groups is preserved over recursion, and
1710 re-instated afterwards. We don't know how many are started and not yet
1711 finished (offset_top records the completed total) so we just have to save
1712 all the potential data. There may be up to 65535 such values, which is too
1713 large to put on the stack, but using malloc for small numbers seems
1714 expensive. As a compromise, the stack is used when there are no more than
1715 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1717 There are also other values that have to be saved. We use a chained
1718 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1719 for the original version of this logic. It has, however, been hacked around
1720 a lot, so he is not to blame for the current way it works. */
1722 case OP_RECURSE:
1724 recursion_info *ri;
1725 int recno;
1727 callpat = md->start_code + GET(ecode, 1);
1728 recno = (callpat == md->start_code)? 0 :
1729 GET2(callpat, 1 + LINK_SIZE);
1731 /* Check for repeating a recursion without advancing the subject pointer.
1732 This should catch convoluted mutual recursions. (Some simple cases are
1733 caught at compile time.) */
1735 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1736 if (recno == ri->group_num && eptr == ri->subject_position)
1737 RRETURN(PCRE_ERROR_RECURSELOOP);
1739 /* Add to "recursing stack" */
1741 new_recursive.group_num = recno;
1742 new_recursive.subject_position = eptr;
1743 new_recursive.prevrec = md->recursive;
1744 md->recursive = &new_recursive;
1746 /* Where to continue from afterwards */
1748 ecode += 1 + LINK_SIZE;
1750 /* Now save the offset data */
1752 new_recursive.saved_max = md->offset_end;
1753 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1754 new_recursive.offset_save = stacksave;
1755 else
1757 new_recursive.offset_save =
1758 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1759 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1761 memcpy(new_recursive.offset_save, md->offset_vector,
1762 new_recursive.saved_max * sizeof(int));
1764 /* OK, now we can do the recursion. After processing each alternative,
1765 restore the offset data. If there were nested recursions, md->recursive
1766 might be changed, so reset it before looping. */
1768 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1769 cbegroup = (*callpat >= OP_SBRA);
1772 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1773 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1774 md, eptrb, RM6);
1775 memcpy(md->offset_vector, new_recursive.offset_save,
1776 new_recursive.saved_max * sizeof(int));
1777 md->recursive = new_recursive.prevrec;
1778 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1780 DPRINTF(("Recursion matched\n"));
1781 if (new_recursive.offset_save != stacksave)
1782 (PUBL(free))(new_recursive.offset_save);
1784 /* Set where we got to in the subject, and reset the start in case
1785 it was changed by \K. This *is* propagated back out of a recursion,
1786 for Perl compatibility. */
1788 eptr = md->end_match_ptr;
1789 mstart = md->start_match_ptr;
1790 goto RECURSION_MATCHED; /* Exit loop; end processing */
1793 /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it
1794 is treated as NOMATCH. */
1796 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
1797 rrc != MATCH_COMMIT)
1799 DPRINTF(("Recursion gave error %d\n", rrc));
1800 if (new_recursive.offset_save != stacksave)
1801 (PUBL(free))(new_recursive.offset_save);
1802 RRETURN(rrc);
1805 md->recursive = &new_recursive;
1806 callpat += GET(callpat, 1);
1808 while (*callpat == OP_ALT);
1810 DPRINTF(("Recursion didn't match\n"));
1811 md->recursive = new_recursive.prevrec;
1812 if (new_recursive.offset_save != stacksave)
1813 (PUBL(free))(new_recursive.offset_save);
1814 RRETURN(MATCH_NOMATCH);
1817 RECURSION_MATCHED:
1818 break;
1820 /* An alternation is the end of a branch; scan along to find the end of the
1821 bracketed group and go to there. */
1823 case OP_ALT:
1824 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1825 break;
1827 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1828 indicating that it may occur zero times. It may repeat infinitely, or not
1829 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1830 with fixed upper repeat limits are compiled as a number of copies, with the
1831 optional ones preceded by BRAZERO or BRAMINZERO. */
1833 case OP_BRAZERO:
1834 next = ecode + 1;
1835 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1836 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1837 do next += GET(next, 1); while (*next == OP_ALT);
1838 ecode = next + 1 + LINK_SIZE;
1839 break;
1841 case OP_BRAMINZERO:
1842 next = ecode + 1;
1843 do next += GET(next, 1); while (*next == OP_ALT);
1844 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1845 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1846 ecode++;
1847 break;
1849 case OP_SKIPZERO:
1850 next = ecode+1;
1851 do next += GET(next,1); while (*next == OP_ALT);
1852 ecode = next + 1 + LINK_SIZE;
1853 break;
1855 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1856 here; just jump to the group, with allow_zero set TRUE. */
1858 case OP_BRAPOSZERO:
1859 op = *(++ecode);
1860 allow_zero = TRUE;
1861 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1862 goto POSSESSIVE_NON_CAPTURE;
1864 /* End of a group, repeated or non-repeating. */
1866 case OP_KET:
1867 case OP_KETRMIN:
1868 case OP_KETRMAX:
1869 case OP_KETRPOS:
1870 prev = ecode - GET(ecode, 1);
1872 /* If this was a group that remembered the subject start, in order to break
1873 infinite repeats of empty string matches, retrieve the subject start from
1874 the chain. Otherwise, set it NULL. */
1876 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1878 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1879 eptrb = eptrb->epb_prev; /* Backup to previous group */
1881 else saved_eptr = NULL;
1883 /* If we are at the end of an assertion group or a non-capturing atomic
1884 group, stop matching and return MATCH_MATCH, but record the current high
1885 water mark for use by positive assertions. We also need to record the match
1886 start in case it was changed by \K. */
1888 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1889 *prev == OP_ONCE_NC)
1891 md->end_match_ptr = eptr; /* For ONCE_NC */
1892 md->end_offset_top = offset_top;
1893 md->start_match_ptr = mstart;
1894 RRETURN(MATCH_MATCH); /* Sets md->mark */
1897 /* For capturing groups we have to check the group number back at the start
1898 and if necessary complete handling an extraction by setting the offsets and
1899 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1900 into group 0, so it won't be picked up here. Instead, we catch it when the
1901 OP_END is reached. Other recursion is handled here. We just have to record
1902 the current subject position and start match pointer and give a MATCH
1903 return. */
1905 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1906 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1908 number = GET2(prev, 1+LINK_SIZE);
1909 offset = number << 1;
1911 #ifdef PCRE_DEBUG
1912 printf("end bracket %d", number);
1913 printf("\n");
1914 #endif
1916 /* Handle a recursively called group. */
1918 if (md->recursive != NULL && md->recursive->group_num == number)
1920 md->end_match_ptr = eptr;
1921 md->start_match_ptr = mstart;
1922 RRETURN(MATCH_MATCH);
1925 /* Deal with capturing */
1927 md->capture_last = number;
1928 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1930 /* If offset is greater than offset_top, it means that we are
1931 "skipping" a capturing group, and that group's offsets must be marked
1932 unset. In earlier versions of PCRE, all the offsets were unset at the
1933 start of matching, but this doesn't work because atomic groups and
1934 assertions can cause a value to be set that should later be unset.
1935 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1936 part of the atomic group, but this is not on the final matching path,
1937 so must be unset when 2 is set. (If there is no group 2, there is no
1938 problem, because offset_top will then be 2, indicating no capture.) */
1940 if (offset > offset_top)
1942 int *iptr = md->offset_vector + offset_top;
1943 int *iend = md->offset_vector + offset;
1944 while (iptr < iend) *iptr++ = -1;
1947 /* Now make the extraction */
1949 md->offset_vector[offset] =
1950 md->offset_vector[md->offset_end - number];
1951 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1952 if (offset_top <= offset) offset_top = offset + 2;
1956 /* For an ordinary non-repeating ket, just continue at this level. This
1957 also happens for a repeating ket if no characters were matched in the
1958 group. This is the forcible breaking of infinite loops as implemented in
1959 Perl 5.005. For a non-repeating atomic group that includes captures,
1960 establish a backup point by processing the rest of the pattern at a lower
1961 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1962 original OP_ONCE level, thereby bypassing intermediate backup points, but
1963 resetting any captures that happened along the way. */
1965 if (*ecode == OP_KET || eptr == saved_eptr)
1967 if (*prev == OP_ONCE)
1969 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1970 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1971 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1972 RRETURN(MATCH_ONCE);
1974 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1975 break;
1978 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1979 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1980 at a time from the outer level, thus saving stack. */
1982 if (*ecode == OP_KETRPOS)
1984 md->end_match_ptr = eptr;
1985 md->end_offset_top = offset_top;
1986 RRETURN(MATCH_KETRPOS);
1989 /* The normal repeating kets try the rest of the pattern or restart from
1990 the preceding bracket, in the appropriate order. In the second case, we can
1991 use tail recursion to avoid using another stack frame, unless we have an
1992 atomic group or an unlimited repeat of a group that can match an empty
1993 string. */
1995 if (*ecode == OP_KETRMIN)
1997 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1998 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1999 if (*prev == OP_ONCE)
2001 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2002 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2003 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2004 RRETURN(MATCH_ONCE);
2006 if (*prev >= OP_SBRA) /* Could match an empty string */
2008 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2009 RRETURN(rrc);
2011 ecode = prev;
2012 goto TAIL_RECURSE;
2014 else /* OP_KETRMAX */
2016 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2017 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2018 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2019 if (*prev == OP_ONCE)
2021 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2022 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2023 md->once_target = prev;
2024 RRETURN(MATCH_ONCE);
2026 ecode += 1 + LINK_SIZE;
2027 goto TAIL_RECURSE;
2029 /* Control never gets here */
2031 /* Not multiline mode: start of subject assertion, unless notbol. */
2033 case OP_CIRC:
2034 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2036 /* Start of subject assertion */
2038 case OP_SOD:
2039 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2040 ecode++;
2041 break;
2043 /* Multiline mode: start of subject unless notbol, or after any newline. */
2045 case OP_CIRCM:
2046 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2047 if (eptr != md->start_subject &&
2048 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2049 RRETURN(MATCH_NOMATCH);
2050 ecode++;
2051 break;
2053 /* Start of match assertion */
2055 case OP_SOM:
2056 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2057 ecode++;
2058 break;
2060 /* Reset the start of match point */
2062 case OP_SET_SOM:
2063 mstart = eptr;
2064 ecode++;
2065 break;
2067 /* Multiline mode: assert before any newline, or before end of subject
2068 unless noteol is set. */
2070 case OP_DOLLM:
2071 if (eptr < md->end_subject)
2073 if (!IS_NEWLINE(eptr))
2075 if (md->partial != 0 &&
2076 eptr + 1 >= md->end_subject &&
2077 NLBLOCK->nltype == NLTYPE_FIXED &&
2078 NLBLOCK->nllen == 2 &&
2079 *eptr == NLBLOCK->nl[0])
2081 md->hitend = TRUE;
2082 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2084 RRETURN(MATCH_NOMATCH);
2087 else
2089 if (md->noteol) RRETURN(MATCH_NOMATCH);
2090 SCHECK_PARTIAL();
2092 ecode++;
2093 break;
2095 /* Not multiline mode: assert before a terminating newline or before end of
2096 subject unless noteol is set. */
2098 case OP_DOLL:
2099 if (md->noteol) RRETURN(MATCH_NOMATCH);
2100 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2102 /* ... else fall through for endonly */
2104 /* End of subject assertion (\z) */
2106 case OP_EOD:
2107 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2108 SCHECK_PARTIAL();
2109 ecode++;
2110 break;
2112 /* End of subject or ending \n assertion (\Z) */
2114 case OP_EODN:
2115 ASSERT_NL_OR_EOS:
2116 if (eptr < md->end_subject &&
2117 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2119 if (md->partial != 0 &&
2120 eptr + 1 >= md->end_subject &&
2121 NLBLOCK->nltype == NLTYPE_FIXED &&
2122 NLBLOCK->nllen == 2 &&
2123 *eptr == NLBLOCK->nl[0])
2125 md->hitend = TRUE;
2126 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2128 RRETURN(MATCH_NOMATCH);
2131 /* Either at end of string or \n before end. */
2133 SCHECK_PARTIAL();
2134 ecode++;
2135 break;
2137 /* Word boundary assertions */
2139 case OP_NOT_WORD_BOUNDARY:
2140 case OP_WORD_BOUNDARY:
2143 /* Find out if the previous and current characters are "word" characters.
2144 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2145 be "non-word" characters. Remember the earliest consulted character for
2146 partial matching. */
2148 #ifdef SUPPORT_UTF
2149 if (utf)
2151 /* Get status of previous character */
2153 if (eptr == md->start_subject) prev_is_word = FALSE; else
2155 PCRE_PUCHAR lastptr = eptr - 1;
2156 BACKCHAR(lastptr);
2157 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2158 GETCHAR(c, lastptr);
2159 #ifdef SUPPORT_UCP
2160 if (md->use_ucp)
2162 if (c == '_') prev_is_word = TRUE; else
2164 int cat = UCD_CATEGORY(c);
2165 prev_is_word = (cat == ucp_L || cat == ucp_N);
2168 else
2169 #endif
2170 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2173 /* Get status of next character */
2175 if (eptr >= md->end_subject)
2177 SCHECK_PARTIAL();
2178 cur_is_word = FALSE;
2180 else
2182 GETCHAR(c, eptr);
2183 #ifdef SUPPORT_UCP
2184 if (md->use_ucp)
2186 if (c == '_') cur_is_word = TRUE; else
2188 int cat = UCD_CATEGORY(c);
2189 cur_is_word = (cat == ucp_L || cat == ucp_N);
2192 else
2193 #endif
2194 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2197 else
2198 #endif
2200 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2201 consistency with the behaviour of \w we do use it in this case. */
2204 /* Get status of previous character */
2206 if (eptr == md->start_subject) prev_is_word = FALSE; else
2208 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2209 #ifdef SUPPORT_UCP
2210 if (md->use_ucp)
2212 c = eptr[-1];
2213 if (c == '_') prev_is_word = TRUE; else
2215 int cat = UCD_CATEGORY(c);
2216 prev_is_word = (cat == ucp_L || cat == ucp_N);
2219 else
2220 #endif
2221 prev_is_word = MAX_255(eptr[-1])
2222 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2225 /* Get status of next character */
2227 if (eptr >= md->end_subject)
2229 SCHECK_PARTIAL();
2230 cur_is_word = FALSE;
2232 else
2233 #ifdef SUPPORT_UCP
2234 if (md->use_ucp)
2236 c = *eptr;
2237 if (c == '_') cur_is_word = TRUE; else
2239 int cat = UCD_CATEGORY(c);
2240 cur_is_word = (cat == ucp_L || cat == ucp_N);
2243 else
2244 #endif
2245 cur_is_word = MAX_255(*eptr)
2246 && ((md->ctypes[*eptr] & ctype_word) != 0);
2249 /* Now see if the situation is what we want */
2251 if ((*ecode++ == OP_WORD_BOUNDARY)?
2252 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2253 RRETURN(MATCH_NOMATCH);
2255 break;
2257 /* Match any single character type except newline; have to take care with
2258 CRLF newlines and partial matching. */
2260 case OP_ANY:
2261 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2262 if (md->partial != 0 &&
2263 eptr + 1 >= md->end_subject &&
2264 NLBLOCK->nltype == NLTYPE_FIXED &&
2265 NLBLOCK->nllen == 2 &&
2266 *eptr == NLBLOCK->nl[0])
2268 md->hitend = TRUE;
2269 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2272 /* Fall through */
2274 /* Match any single character whatsoever. */
2276 case OP_ALLANY:
2277 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2278 { /* not be updated before SCHECK_PARTIAL. */
2279 SCHECK_PARTIAL();
2280 RRETURN(MATCH_NOMATCH);
2282 eptr++;
2283 #ifdef SUPPORT_UTF
2284 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2285 #endif
2286 ecode++;
2287 break;
2289 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2290 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2292 case OP_ANYBYTE:
2293 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2294 { /* not be updated before SCHECK_PARTIAL. */
2295 SCHECK_PARTIAL();
2296 RRETURN(MATCH_NOMATCH);
2298 eptr++;
2299 ecode++;
2300 break;
2302 case OP_NOT_DIGIT:
2303 if (eptr >= md->end_subject)
2305 SCHECK_PARTIAL();
2306 RRETURN(MATCH_NOMATCH);
2308 GETCHARINCTEST(c, eptr);
2309 if (
2310 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2311 c < 256 &&
2312 #endif
2313 (md->ctypes[c] & ctype_digit) != 0
2315 RRETURN(MATCH_NOMATCH);
2316 ecode++;
2317 break;
2319 case OP_DIGIT:
2320 if (eptr >= md->end_subject)
2322 SCHECK_PARTIAL();
2323 RRETURN(MATCH_NOMATCH);
2325 GETCHARINCTEST(c, eptr);
2326 if (
2327 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2328 c > 255 ||
2329 #endif
2330 (md->ctypes[c] & ctype_digit) == 0
2332 RRETURN(MATCH_NOMATCH);
2333 ecode++;
2334 break;
2336 case OP_NOT_WHITESPACE:
2337 if (eptr >= md->end_subject)
2339 SCHECK_PARTIAL();
2340 RRETURN(MATCH_NOMATCH);
2342 GETCHARINCTEST(c, eptr);
2343 if (
2344 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2345 c < 256 &&
2346 #endif
2347 (md->ctypes[c] & ctype_space) != 0
2349 RRETURN(MATCH_NOMATCH);
2350 ecode++;
2351 break;
2353 case OP_WHITESPACE:
2354 if (eptr >= md->end_subject)
2356 SCHECK_PARTIAL();
2357 RRETURN(MATCH_NOMATCH);
2359 GETCHARINCTEST(c, eptr);
2360 if (
2361 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2362 c > 255 ||
2363 #endif
2364 (md->ctypes[c] & ctype_space) == 0
2366 RRETURN(MATCH_NOMATCH);
2367 ecode++;
2368 break;
2370 case OP_NOT_WORDCHAR:
2371 if (eptr >= md->end_subject)
2373 SCHECK_PARTIAL();
2374 RRETURN(MATCH_NOMATCH);
2376 GETCHARINCTEST(c, eptr);
2377 if (
2378 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2379 c < 256 &&
2380 #endif
2381 (md->ctypes[c] & ctype_word) != 0
2383 RRETURN(MATCH_NOMATCH);
2384 ecode++;
2385 break;
2387 case OP_WORDCHAR:
2388 if (eptr >= md->end_subject)
2390 SCHECK_PARTIAL();
2391 RRETURN(MATCH_NOMATCH);
2393 GETCHARINCTEST(c, eptr);
2394 if (
2395 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2396 c > 255 ||
2397 #endif
2398 (md->ctypes[c] & ctype_word) == 0
2400 RRETURN(MATCH_NOMATCH);
2401 ecode++;
2402 break;
2404 case OP_ANYNL:
2405 if (eptr >= md->end_subject)
2407 SCHECK_PARTIAL();
2408 RRETURN(MATCH_NOMATCH);
2410 GETCHARINCTEST(c, eptr);
2411 switch(c)
2413 default: RRETURN(MATCH_NOMATCH);
2415 case 0x000d:
2416 if (eptr >= md->end_subject)
2418 SCHECK_PARTIAL();
2420 else if (*eptr == 0x0a) eptr++;
2421 break;
2423 case 0x000a:
2424 break;
2426 case 0x000b:
2427 case 0x000c:
2428 case 0x0085:
2429 case 0x2028:
2430 case 0x2029:
2431 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2432 break;
2434 ecode++;
2435 break;
2437 case OP_NOT_HSPACE:
2438 if (eptr >= md->end_subject)
2440 SCHECK_PARTIAL();
2441 RRETURN(MATCH_NOMATCH);
2443 GETCHARINCTEST(c, eptr);
2444 switch(c)
2446 default: break;
2447 case 0x09: /* HT */
2448 case 0x20: /* SPACE */
2449 case 0xa0: /* NBSP */
2450 case 0x1680: /* OGHAM SPACE MARK */
2451 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2452 case 0x2000: /* EN QUAD */
2453 case 0x2001: /* EM QUAD */
2454 case 0x2002: /* EN SPACE */
2455 case 0x2003: /* EM SPACE */
2456 case 0x2004: /* THREE-PER-EM SPACE */
2457 case 0x2005: /* FOUR-PER-EM SPACE */
2458 case 0x2006: /* SIX-PER-EM SPACE */
2459 case 0x2007: /* FIGURE SPACE */
2460 case 0x2008: /* PUNCTUATION SPACE */
2461 case 0x2009: /* THIN SPACE */
2462 case 0x200A: /* HAIR SPACE */
2463 case 0x202f: /* NARROW NO-BREAK SPACE */
2464 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2465 case 0x3000: /* IDEOGRAPHIC SPACE */
2466 RRETURN(MATCH_NOMATCH);
2468 ecode++;
2469 break;
2471 case OP_HSPACE:
2472 if (eptr >= md->end_subject)
2474 SCHECK_PARTIAL();
2475 RRETURN(MATCH_NOMATCH);
2477 GETCHARINCTEST(c, eptr);
2478 switch(c)
2480 default: RRETURN(MATCH_NOMATCH);
2481 case 0x09: /* HT */
2482 case 0x20: /* SPACE */
2483 case 0xa0: /* NBSP */
2484 case 0x1680: /* OGHAM SPACE MARK */
2485 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2486 case 0x2000: /* EN QUAD */
2487 case 0x2001: /* EM QUAD */
2488 case 0x2002: /* EN SPACE */
2489 case 0x2003: /* EM SPACE */
2490 case 0x2004: /* THREE-PER-EM SPACE */
2491 case 0x2005: /* FOUR-PER-EM SPACE */
2492 case 0x2006: /* SIX-PER-EM SPACE */
2493 case 0x2007: /* FIGURE SPACE */
2494 case 0x2008: /* PUNCTUATION SPACE */
2495 case 0x2009: /* THIN SPACE */
2496 case 0x200A: /* HAIR SPACE */
2497 case 0x202f: /* NARROW NO-BREAK SPACE */
2498 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2499 case 0x3000: /* IDEOGRAPHIC SPACE */
2500 break;
2502 ecode++;
2503 break;
2505 case OP_NOT_VSPACE:
2506 if (eptr >= md->end_subject)
2508 SCHECK_PARTIAL();
2509 RRETURN(MATCH_NOMATCH);
2511 GETCHARINCTEST(c, eptr);
2512 switch(c)
2514 default: break;
2515 case 0x0a: /* LF */
2516 case 0x0b: /* VT */
2517 case 0x0c: /* FF */
2518 case 0x0d: /* CR */
2519 case 0x85: /* NEL */
2520 case 0x2028: /* LINE SEPARATOR */
2521 case 0x2029: /* PARAGRAPH SEPARATOR */
2522 RRETURN(MATCH_NOMATCH);
2524 ecode++;
2525 break;
2527 case OP_VSPACE:
2528 if (eptr >= md->end_subject)
2530 SCHECK_PARTIAL();
2531 RRETURN(MATCH_NOMATCH);
2533 GETCHARINCTEST(c, eptr);
2534 switch(c)
2536 default: RRETURN(MATCH_NOMATCH);
2537 case 0x0a: /* LF */
2538 case 0x0b: /* VT */
2539 case 0x0c: /* FF */
2540 case 0x0d: /* CR */
2541 case 0x85: /* NEL */
2542 case 0x2028: /* LINE SEPARATOR */
2543 case 0x2029: /* PARAGRAPH SEPARATOR */
2544 break;
2546 ecode++;
2547 break;
2549 #ifdef SUPPORT_UCP
2550 /* Check the next character by Unicode property. We will get here only
2551 if the support is in the binary; otherwise a compile-time error occurs. */
2553 case OP_PROP:
2554 case OP_NOTPROP:
2555 if (eptr >= md->end_subject)
2557 SCHECK_PARTIAL();
2558 RRETURN(MATCH_NOMATCH);
2560 GETCHARINCTEST(c, eptr);
2562 const pcre_uint8 chartype = UCD_CHARTYPE(c);
2564 switch(ecode[1])
2566 case PT_ANY:
2567 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2568 break;
2570 case PT_LAMP:
2571 if ((chartype == ucp_Lu ||
2572 chartype == ucp_Ll ||
2573 chartype == ucp_Lt) == (op == OP_NOTPROP))
2574 RRETURN(MATCH_NOMATCH);
2575 break;
2577 case PT_GC:
2578 if ((ecode[2] != PRIV(ucp_gentype)[chartype]) == (op == OP_PROP))
2579 RRETURN(MATCH_NOMATCH);
2580 break;
2582 case PT_PC:
2583 if ((ecode[2] != chartype) == (op == OP_PROP))
2584 RRETURN(MATCH_NOMATCH);
2585 break;
2587 case PT_SC:
2588 if ((ecode[2] != UCD_SCRIPT(c)) == (op == OP_PROP))
2589 RRETURN(MATCH_NOMATCH);
2590 break;
2592 /* These are specials */
2594 case PT_ALNUM:
2595 if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
2596 PRIV(ucp_gentype)[chartype] == ucp_N) == (op == OP_NOTPROP))
2597 RRETURN(MATCH_NOMATCH);
2598 break;
2600 case PT_SPACE: /* Perl space */
2601 if ((PRIV(ucp_gentype)[chartype] == ucp_Z ||
2602 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2603 == (op == OP_NOTPROP))
2604 RRETURN(MATCH_NOMATCH);
2605 break;
2607 case PT_PXSPACE: /* POSIX space */
2608 if ((PRIV(ucp_gentype)[chartype] == ucp_Z ||
2609 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2610 c == CHAR_FF || c == CHAR_CR)
2611 == (op == OP_NOTPROP))
2612 RRETURN(MATCH_NOMATCH);
2613 break;
2615 case PT_WORD:
2616 if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
2617 PRIV(ucp_gentype)[chartype] == ucp_N ||
2618 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2619 RRETURN(MATCH_NOMATCH);
2620 break;
2622 /* This should never occur */
2624 default:
2625 RRETURN(PCRE_ERROR_INTERNAL);
2628 ecode += 3;
2630 break;
2632 /* Match an extended Unicode sequence. We will get here only if the support
2633 is in the binary; otherwise a compile-time error occurs. */
2635 case OP_EXTUNI:
2636 if (eptr >= md->end_subject)
2638 SCHECK_PARTIAL();
2639 RRETURN(MATCH_NOMATCH);
2641 GETCHARINCTEST(c, eptr);
2642 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2643 while (eptr < md->end_subject)
2645 int len = 1;
2646 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2647 if (UCD_CATEGORY(c) != ucp_M) break;
2648 eptr += len;
2650 CHECK_PARTIAL();
2651 ecode++;
2652 break;
2653 #endif
2656 /* Match a back reference, possibly repeatedly. Look past the end of the
2657 item to see if there is repeat information following. The code is similar
2658 to that for character classes, but repeated for efficiency. Then obey
2659 similar code to character type repeats - written out again for speed.
2660 However, if the referenced string is the empty string, always treat
2661 it as matched, any number of times (otherwise there could be infinite
2662 loops). */
2664 case OP_REF:
2665 case OP_REFI:
2666 caseless = op == OP_REFI;
2667 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2668 ecode += 1 + IMM2_SIZE;
2670 /* If the reference is unset, there are two possibilities:
2672 (a) In the default, Perl-compatible state, set the length negative;
2673 this ensures that every attempt at a match fails. We can't just fail
2674 here, because of the possibility of quantifiers with zero minima.
2676 (b) If the JavaScript compatibility flag is set, set the length to zero
2677 so that the back reference matches an empty string.
2679 Otherwise, set the length to the length of what was matched by the
2680 referenced subpattern. */
2682 if (offset >= offset_top || md->offset_vector[offset] < 0)
2683 length = (md->jscript_compat)? 0 : -1;
2684 else
2685 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2687 /* Set up for repetition, or handle the non-repeated case */
2689 switch (*ecode)
2691 case OP_CRSTAR:
2692 case OP_CRMINSTAR:
2693 case OP_CRPLUS:
2694 case OP_CRMINPLUS:
2695 case OP_CRQUERY:
2696 case OP_CRMINQUERY:
2697 c = *ecode++ - OP_CRSTAR;
2698 minimize = (c & 1) != 0;
2699 min = rep_min[c]; /* Pick up values from tables; */
2700 max = rep_max[c]; /* zero for max => infinity */
2701 if (max == 0) max = INT_MAX;
2702 break;
2704 case OP_CRRANGE:
2705 case OP_CRMINRANGE:
2706 minimize = (*ecode == OP_CRMINRANGE);
2707 min = GET2(ecode, 1);
2708 max = GET2(ecode, 1 + IMM2_SIZE);
2709 if (max == 0) max = INT_MAX;
2710 ecode += 1 + 2 * IMM2_SIZE;
2711 break;
2713 default: /* No repeat follows */
2714 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2716 if (length == -2) eptr = md->end_subject; /* Partial match */
2717 CHECK_PARTIAL();
2718 RRETURN(MATCH_NOMATCH);
2720 eptr += length;
2721 continue; /* With the main loop */
2724 /* Handle repeated back references. If the length of the reference is
2725 zero, just continue with the main loop. If the length is negative, it
2726 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2727 zero, we can continue at the same level without recursion. For any other
2728 minimum, carrying on will result in NOMATCH. */
2730 if (length == 0) continue;
2731 if (length < 0 && min == 0) continue;
2733 /* First, ensure the minimum number of matches are present. We get back
2734 the length of the reference string explicitly rather than passing the
2735 address of eptr, so that eptr can be a register variable. */
2737 for (i = 1; i <= min; i++)
2739 int slength;
2740 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2742 if (slength == -2) eptr = md->end_subject; /* Partial match */
2743 CHECK_PARTIAL();
2744 RRETURN(MATCH_NOMATCH);
2746 eptr += slength;
2749 /* If min = max, continue at the same level without recursion.
2750 They are not both allowed to be zero. */
2752 if (min == max) continue;
2754 /* If minimizing, keep trying and advancing the pointer */
2756 if (minimize)
2758 for (fi = min;; fi++)
2760 int slength;
2761 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2762 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2763 if (fi >= max) RRETURN(MATCH_NOMATCH);
2764 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2766 if (slength == -2) eptr = md->end_subject; /* Partial match */
2767 CHECK_PARTIAL();
2768 RRETURN(MATCH_NOMATCH);
2770 eptr += slength;
2772 /* Control never gets here */
2775 /* If maximizing, find the longest string and work backwards */
2777 else
2779 pp = eptr;
2780 for (i = min; i < max; i++)
2782 int slength;
2783 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2785 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2786 the soft partial matching case. */
2788 if (slength == -2 && md->partial != 0 &&
2789 md->end_subject > md->start_used_ptr)
2791 md->hitend = TRUE;
2792 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2794 break;
2796 eptr += slength;
2799 while (eptr >= pp)
2801 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2802 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2803 eptr -= length;
2805 RRETURN(MATCH_NOMATCH);
2807 /* Control never gets here */
2809 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2810 used when all the characters in the class have values in the range 0-255,
2811 and either the matching is caseful, or the characters are in the range
2812 0-127 when UTF-8 processing is enabled. The only difference between
2813 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2814 encountered.
2816 First, look past the end of the item to see if there is repeat information
2817 following. Then obey similar code to character type repeats - written out
2818 again for speed. */
2820 case OP_NCLASS:
2821 case OP_CLASS:
2823 /* The data variable is saved across frames, so the byte map needs to
2824 be stored there. */
2825 #define BYTE_MAP ((pcre_uint8 *)data)
2826 data = ecode + 1; /* Save for matching */
2827 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2829 switch (*ecode)
2831 case OP_CRSTAR:
2832 case OP_CRMINSTAR:
2833 case OP_CRPLUS:
2834 case OP_CRMINPLUS:
2835 case OP_CRQUERY:
2836 case OP_CRMINQUERY:
2837 c = *ecode++ - OP_CRSTAR;
2838 minimize = (c & 1) != 0;
2839 min = rep_min[c]; /* Pick up values from tables; */
2840 max = rep_max[c]; /* zero for max => infinity */
2841 if (max == 0) max = INT_MAX;
2842 break;
2844 case OP_CRRANGE:
2845 case OP_CRMINRANGE:
2846 minimize = (*ecode == OP_CRMINRANGE);
2847 min = GET2(ecode, 1);
2848 max = GET2(ecode, 1 + IMM2_SIZE);
2849 if (max == 0) max = INT_MAX;
2850 ecode += 1 + 2 * IMM2_SIZE;
2851 break;
2853 default: /* No repeat follows */
2854 min = max = 1;
2855 break;
2858 /* First, ensure the minimum number of matches are present. */
2860 #ifdef SUPPORT_UTF
2861 if (utf)
2863 for (i = 1; i <= min; i++)
2865 if (eptr >= md->end_subject)
2867 SCHECK_PARTIAL();
2868 RRETURN(MATCH_NOMATCH);
2870 GETCHARINC(c, eptr);
2871 if (c > 255)
2873 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2875 else
2876 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2879 else
2880 #endif
2881 /* Not UTF mode */
2883 for (i = 1; i <= min; i++)
2885 if (eptr >= md->end_subject)
2887 SCHECK_PARTIAL();
2888 RRETURN(MATCH_NOMATCH);
2890 c = *eptr++;
2891 #ifndef COMPILE_PCRE8
2892 if (c > 255)
2894 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2896 else
2897 #endif
2898 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2902 /* If max == min we can continue with the main loop without the
2903 need to recurse. */
2905 if (min == max) continue;
2907 /* If minimizing, keep testing the rest of the expression and advancing
2908 the pointer while it matches the class. */
2910 if (minimize)
2912 #ifdef SUPPORT_UTF
2913 if (utf)
2915 for (fi = min;; fi++)
2917 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2918 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2919 if (fi >= max) RRETURN(MATCH_NOMATCH);
2920 if (eptr >= md->end_subject)
2922 SCHECK_PARTIAL();
2923 RRETURN(MATCH_NOMATCH);
2925 GETCHARINC(c, eptr);
2926 if (c > 255)
2928 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2930 else
2931 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2934 else
2935 #endif
2936 /* Not UTF mode */
2938 for (fi = min;; fi++)
2940 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2941 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2942 if (fi >= max) RRETURN(MATCH_NOMATCH);
2943 if (eptr >= md->end_subject)
2945 SCHECK_PARTIAL();
2946 RRETURN(MATCH_NOMATCH);
2948 c = *eptr++;
2949 #ifndef COMPILE_PCRE8
2950 if (c > 255)
2952 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2954 else
2955 #endif
2956 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2959 /* Control never gets here */
2962 /* If maximizing, find the longest possible run, then work backwards. */
2964 else
2966 pp = eptr;
2968 #ifdef SUPPORT_UTF
2969 if (utf)
2971 for (i = min; i < max; i++)
2973 int len = 1;
2974 if (eptr >= md->end_subject)
2976 SCHECK_PARTIAL();
2977 break;
2979 GETCHARLEN(c, eptr, len);
2980 if (c > 255)
2982 if (op == OP_CLASS) break;
2984 else
2985 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2986 eptr += len;
2988 for (;;)
2990 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2991 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2992 if (eptr-- == pp) break; /* Stop if tried at original pos */
2993 BACKCHAR(eptr);
2996 else
2997 #endif
2998 /* Not UTF mode */
3000 for (i = min; i < max; i++)
3002 if (eptr >= md->end_subject)
3004 SCHECK_PARTIAL();
3005 break;
3007 c = *eptr;
3008 #ifndef COMPILE_PCRE8
3009 if (c > 255)
3011 if (op == OP_CLASS) break;
3013 else
3014 #endif
3015 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3016 eptr++;
3018 while (eptr >= pp)
3020 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3021 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3022 eptr--;
3026 RRETURN(MATCH_NOMATCH);
3028 #undef BYTE_MAP
3030 /* Control never gets here */
3033 /* Match an extended character class. This opcode is encountered only
3034 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3035 mode, because Unicode properties are supported in non-UTF-8 mode. */
3037 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3038 case OP_XCLASS:
3040 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3041 ecode += GET(ecode, 1); /* Advance past the item */
3043 switch (*ecode)
3045 case OP_CRSTAR:
3046 case OP_CRMINSTAR:
3047 case OP_CRPLUS:
3048 case OP_CRMINPLUS:
3049 case OP_CRQUERY:
3050 case OP_CRMINQUERY:
3051 c = *ecode++ - OP_CRSTAR;
3052 minimize = (c & 1) != 0;
3053 min = rep_min[c]; /* Pick up values from tables; */
3054 max = rep_max[c]; /* zero for max => infinity */
3055 if (max == 0) max = INT_MAX;
3056 break;
3058 case OP_CRRANGE:
3059 case OP_CRMINRANGE:
3060 minimize = (*ecode == OP_CRMINRANGE);
3061 min = GET2(ecode, 1);
3062 max = GET2(ecode, 1 + IMM2_SIZE);
3063 if (max == 0) max = INT_MAX;
3064 ecode += 1 + 2 * IMM2_SIZE;
3065 break;
3067 default: /* No repeat follows */
3068 min = max = 1;
3069 break;
3072 /* First, ensure the minimum number of matches are present. */
3074 for (i = 1; i <= min; i++)
3076 if (eptr >= md->end_subject)
3078 SCHECK_PARTIAL();
3079 RRETURN(MATCH_NOMATCH);
3081 GETCHARINCTEST(c, eptr);
3082 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3085 /* If max == min we can continue with the main loop without the
3086 need to recurse. */
3088 if (min == max) continue;
3090 /* If minimizing, keep testing the rest of the expression and advancing
3091 the pointer while it matches the class. */
3093 if (minimize)
3095 for (fi = min;; fi++)
3097 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3098 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3099 if (fi >= max) RRETURN(MATCH_NOMATCH);
3100 if (eptr >= md->end_subject)
3102 SCHECK_PARTIAL();
3103 RRETURN(MATCH_NOMATCH);
3105 GETCHARINCTEST(c, eptr);
3106 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3108 /* Control never gets here */
3111 /* If maximizing, find the longest possible run, then work backwards. */
3113 else
3115 pp = eptr;
3116 for (i = min; i < max; i++)
3118 int len = 1;
3119 if (eptr >= md->end_subject)
3121 SCHECK_PARTIAL();
3122 break;
3124 #ifdef SUPPORT_UTF
3125 GETCHARLENTEST(c, eptr, len);
3126 #else
3127 c = *eptr;
3128 #endif
3129 if (!PRIV(xclass)(c, data, utf)) break;
3130 eptr += len;
3132 for(;;)
3134 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3135 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3136 if (eptr-- == pp) break; /* Stop if tried at original pos */
3137 #ifdef SUPPORT_UTF
3138 if (utf) BACKCHAR(eptr);
3139 #endif
3141 RRETURN(MATCH_NOMATCH);
3144 /* Control never gets here */
3146 #endif /* End of XCLASS */
3148 /* Match a single character, casefully */
3150 case OP_CHAR:
3151 #ifdef SUPPORT_UTF
3152 if (utf)
3154 length = 1;
3155 ecode++;
3156 GETCHARLEN(fc, ecode, length);
3157 if (length > md->end_subject - eptr)
3159 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3160 RRETURN(MATCH_NOMATCH);
3162 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3164 else
3165 #endif
3166 /* Not UTF mode */
3168 if (md->end_subject - eptr < 1)
3170 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3171 RRETURN(MATCH_NOMATCH);
3173 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3174 ecode += 2;
3176 break;
3178 /* Match a single character, caselessly. If we are at the end of the
3179 subject, give up immediately. */
3181 case OP_CHARI:
3182 if (eptr >= md->end_subject)
3184 SCHECK_PARTIAL();
3185 RRETURN(MATCH_NOMATCH);
3188 #ifdef SUPPORT_UTF
3189 if (utf)
3191 length = 1;
3192 ecode++;
3193 GETCHARLEN(fc, ecode, length);
3195 /* If the pattern character's value is < 128, we have only one byte, and
3196 we know that its other case must also be one byte long, so we can use the
3197 fast lookup table. We know that there is at least one byte left in the
3198 subject. */
3200 if (fc < 128)
3202 if (md->lcc[fc]
3203 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3204 ecode++;
3205 eptr++;
3208 /* Otherwise we must pick up the subject character. Note that we cannot
3209 use the value of "length" to check for sufficient bytes left, because the
3210 other case of the character may have more or fewer bytes. */
3212 else
3214 unsigned int dc;
3215 GETCHARINC(dc, eptr);
3216 ecode += length;
3218 /* If we have Unicode property support, we can use it to test the other
3219 case of the character, if there is one. */
3221 if (fc != dc)
3223 #ifdef SUPPORT_UCP
3224 if (dc != UCD_OTHERCASE(fc))
3225 #endif
3226 RRETURN(MATCH_NOMATCH);
3230 else
3231 #endif /* SUPPORT_UTF */
3233 /* Not UTF mode */
3235 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3236 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3237 eptr++;
3238 ecode += 2;
3240 break;
3242 /* Match a single character repeatedly. */
3244 case OP_EXACT:
3245 case OP_EXACTI:
3246 min = max = GET2(ecode, 1);
3247 ecode += 1 + IMM2_SIZE;
3248 goto REPEATCHAR;
3250 case OP_POSUPTO:
3251 case OP_POSUPTOI:
3252 possessive = TRUE;
3253 /* Fall through */
3255 case OP_UPTO:
3256 case OP_UPTOI:
3257 case OP_MINUPTO:
3258 case OP_MINUPTOI:
3259 min = 0;
3260 max = GET2(ecode, 1);
3261 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3262 ecode += 1 + IMM2_SIZE;
3263 goto REPEATCHAR;
3265 case OP_POSSTAR:
3266 case OP_POSSTARI:
3267 possessive = TRUE;
3268 min = 0;
3269 max = INT_MAX;
3270 ecode++;
3271 goto REPEATCHAR;
3273 case OP_POSPLUS:
3274 case OP_POSPLUSI:
3275 possessive = TRUE;
3276 min = 1;
3277 max = INT_MAX;
3278 ecode++;
3279 goto REPEATCHAR;
3281 case OP_POSQUERY:
3282 case OP_POSQUERYI:
3283 possessive = TRUE;
3284 min = 0;
3285 max = 1;
3286 ecode++;
3287 goto REPEATCHAR;
3289 case OP_STAR:
3290 case OP_STARI:
3291 case OP_MINSTAR:
3292 case OP_MINSTARI:
3293 case OP_PLUS:
3294 case OP_PLUSI:
3295 case OP_MINPLUS:
3296 case OP_MINPLUSI:
3297 case OP_QUERY:
3298 case OP_QUERYI:
3299 case OP_MINQUERY:
3300 case OP_MINQUERYI:
3301 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3302 minimize = (c & 1) != 0;
3303 min = rep_min[c]; /* Pick up values from tables; */
3304 max = rep_max[c]; /* zero for max => infinity */
3305 if (max == 0) max = INT_MAX;
3307 /* Common code for all repeated single-character matches. */
3309 REPEATCHAR:
3310 #ifdef SUPPORT_UTF
3311 if (utf)
3313 length = 1;
3314 charptr = ecode;
3315 GETCHARLEN(fc, ecode, length);
3316 ecode += length;
3318 /* Handle multibyte character matching specially here. There is
3319 support for caseless matching if UCP support is present. */
3321 if (length > 1)
3323 #ifdef SUPPORT_UCP
3324 unsigned int othercase;
3325 if (op >= OP_STARI && /* Caseless */
3326 (othercase = UCD_OTHERCASE(fc)) != fc)
3327 oclength = PRIV(ord2utf)(othercase, occhars);
3328 else oclength = 0;
3329 #endif /* SUPPORT_UCP */
3331 for (i = 1; i <= min; i++)
3333 if (eptr <= md->end_subject - length &&
3334 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3335 #ifdef SUPPORT_UCP
3336 else if (oclength > 0 &&
3337 eptr <= md->end_subject - oclength &&
3338 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3339 #endif /* SUPPORT_UCP */
3340 else
3342 CHECK_PARTIAL();
3343 RRETURN(MATCH_NOMATCH);
3347 if (min == max) continue;
3349 if (minimize)
3351 for (fi = min;; fi++)
3353 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3354 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3355 if (fi >= max) RRETURN(MATCH_NOMATCH);
3356 if (eptr <= md->end_subject - length &&
3357 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3358 #ifdef SUPPORT_UCP
3359 else if (oclength > 0 &&
3360 eptr <= md->end_subject - oclength &&
3361 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3362 #endif /* SUPPORT_UCP */
3363 else
3365 CHECK_PARTIAL();
3366 RRETURN(MATCH_NOMATCH);
3369 /* Control never gets here */
3372 else /* Maximize */
3374 pp = eptr;
3375 for (i = min; i < max; i++)
3377 if (eptr <= md->end_subject - length &&
3378 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3379 #ifdef SUPPORT_UCP
3380 else if (oclength > 0 &&
3381 eptr <= md->end_subject - oclength &&
3382 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3383 #endif /* SUPPORT_UCP */
3384 else
3386 CHECK_PARTIAL();
3387 break;
3391 if (possessive) continue;
3393 for(;;)
3395 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3396 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3397 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3398 #ifdef SUPPORT_UCP
3399 eptr--;
3400 BACKCHAR(eptr);
3401 #else /* without SUPPORT_UCP */
3402 eptr -= length;
3403 #endif /* SUPPORT_UCP */
3406 /* Control never gets here */
3409 /* If the length of a UTF-8 character is 1, we fall through here, and
3410 obey the code as for non-UTF-8 characters below, though in this case the
3411 value of fc will always be < 128. */
3413 else
3414 #endif /* SUPPORT_UTF */
3415 /* When not in UTF-8 mode, load a single-byte character. */
3416 fc = *ecode++;
3418 /* The value of fc at this point is always one character, though we may
3419 or may not be in UTF mode. The code is duplicated for the caseless and
3420 caseful cases, for speed, since matching characters is likely to be quite
3421 common. First, ensure the minimum number of matches are present. If min =
3422 max, continue at the same level without recursing. Otherwise, if
3423 minimizing, keep trying the rest of the expression and advancing one
3424 matching character if failing, up to the maximum. Alternatively, if
3425 maximizing, find the maximum number of characters and work backwards. */
3427 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3428 max, (char *)eptr));
3430 if (op >= OP_STARI) /* Caseless */
3432 #ifdef COMPILE_PCRE8
3433 /* fc must be < 128 if UTF is enabled. */
3434 foc = md->fcc[fc];
3435 #else
3436 #ifdef SUPPORT_UTF
3437 #ifdef SUPPORT_UCP
3438 if (utf && fc > 127)
3439 foc = UCD_OTHERCASE(fc);
3440 #else
3441 if (utf && fc > 127)
3442 foc = fc;
3443 #endif /* SUPPORT_UCP */
3444 else
3445 #endif /* SUPPORT_UTF */
3446 foc = TABLE_GET(fc, md->fcc, fc);
3447 #endif /* COMPILE_PCRE8 */
3449 for (i = 1; i <= min; i++)
3451 if (eptr >= md->end_subject)
3453 SCHECK_PARTIAL();
3454 RRETURN(MATCH_NOMATCH);
3456 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3457 eptr++;
3459 if (min == max) continue;
3460 if (minimize)
3462 for (fi = min;; fi++)
3464 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3465 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3466 if (fi >= max) RRETURN(MATCH_NOMATCH);
3467 if (eptr >= md->end_subject)
3469 SCHECK_PARTIAL();
3470 RRETURN(MATCH_NOMATCH);
3472 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3473 eptr++;
3475 /* Control never gets here */
3477 else /* Maximize */
3479 pp = eptr;
3480 for (i = min; i < max; i++)
3482 if (eptr >= md->end_subject)
3484 SCHECK_PARTIAL();
3485 break;
3487 if (fc != *eptr && foc != *eptr) break;
3488 eptr++;
3491 if (possessive) continue;
3493 while (eptr >= pp)
3495 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3496 eptr--;
3497 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3499 RRETURN(MATCH_NOMATCH);
3501 /* Control never gets here */
3504 /* Caseful comparisons (includes all multi-byte characters) */
3506 else
3508 for (i = 1; i <= min; i++)
3510 if (eptr >= md->end_subject)
3512 SCHECK_PARTIAL();
3513 RRETURN(MATCH_NOMATCH);
3515 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3518 if (min == max) continue;
3520 if (minimize)
3522 for (fi = min;; fi++)
3524 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3525 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3526 if (fi >= max) RRETURN(MATCH_NOMATCH);
3527 if (eptr >= md->end_subject)
3529 SCHECK_PARTIAL();
3530 RRETURN(MATCH_NOMATCH);
3532 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3534 /* Control never gets here */
3536 else /* Maximize */
3538 pp = eptr;
3539 for (i = min; i < max; i++)
3541 if (eptr >= md->end_subject)
3543 SCHECK_PARTIAL();
3544 break;
3546 if (fc != *eptr) break;
3547 eptr++;
3549 if (possessive) continue;
3551 while (eptr >= pp)
3553 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3554 eptr--;
3555 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3557 RRETURN(MATCH_NOMATCH);
3560 /* Control never gets here */
3562 /* Match a negated single one-byte character. The character we are
3563 checking can be multibyte. */
3565 case OP_NOT:
3566 case OP_NOTI:
3567 if (eptr >= md->end_subject)
3569 SCHECK_PARTIAL();
3570 RRETURN(MATCH_NOMATCH);
3572 #ifdef SUPPORT_UTF
3573 if (utf)
3575 unsigned int ch, och;
3577 ecode++;
3578 GETCHARINC(ch, ecode);
3579 GETCHARINC(c, eptr);
3581 if (op == OP_NOT)
3583 if (ch == c) RRETURN(MATCH_NOMATCH);
3585 else
3587 #ifdef SUPPORT_UCP
3588 if (ch > 127)
3589 och = UCD_OTHERCASE(ch);
3590 #else
3591 if (ch > 127)
3592 och = ch;
3593 #endif /* SUPPORT_UCP */
3594 else
3595 och = TABLE_GET(ch, md->fcc, ch);
3596 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3599 else
3600 #endif
3602 unsigned int ch = ecode[1];
3603 c = *eptr++;
3604 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3605 RRETURN(MATCH_NOMATCH);
3606 ecode += 2;
3608 break;
3610 /* Match a negated single one-byte character repeatedly. This is almost a
3611 repeat of the code for a repeated single character, but I haven't found a
3612 nice way of commoning these up that doesn't require a test of the
3613 positive/negative option for each character match. Maybe that wouldn't add
3614 very much to the time taken, but character matching *is* what this is all
3615 about... */
3617 case OP_NOTEXACT:
3618 case OP_NOTEXACTI:
3619 min = max = GET2(ecode, 1);
3620 ecode += 1 + IMM2_SIZE;
3621 goto REPEATNOTCHAR;
3623 case OP_NOTUPTO:
3624 case OP_NOTUPTOI:
3625 case OP_NOTMINUPTO:
3626 case OP_NOTMINUPTOI:
3627 min = 0;
3628 max = GET2(ecode, 1);
3629 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3630 ecode += 1 + IMM2_SIZE;
3631 goto REPEATNOTCHAR;
3633 case OP_NOTPOSSTAR:
3634 case OP_NOTPOSSTARI:
3635 possessive = TRUE;
3636 min = 0;
3637 max = INT_MAX;
3638 ecode++;
3639 goto REPEATNOTCHAR;
3641 case OP_NOTPOSPLUS:
3642 case OP_NOTPOSPLUSI:
3643 possessive = TRUE;
3644 min = 1;
3645 max = INT_MAX;
3646 ecode++;
3647 goto REPEATNOTCHAR;
3649 case OP_NOTPOSQUERY:
3650 case OP_NOTPOSQUERYI:
3651 possessive = TRUE;
3652 min = 0;
3653 max = 1;
3654 ecode++;
3655 goto REPEATNOTCHAR;
3657 case OP_NOTPOSUPTO:
3658 case OP_NOTPOSUPTOI:
3659 possessive = TRUE;
3660 min = 0;
3661 max = GET2(ecode, 1);
3662 ecode += 1 + IMM2_SIZE;
3663 goto REPEATNOTCHAR;
3665 case OP_NOTSTAR:
3666 case OP_NOTSTARI:
3667 case OP_NOTMINSTAR:
3668 case OP_NOTMINSTARI:
3669 case OP_NOTPLUS:
3670 case OP_NOTPLUSI:
3671 case OP_NOTMINPLUS:
3672 case OP_NOTMINPLUSI:
3673 case OP_NOTQUERY:
3674 case OP_NOTQUERYI:
3675 case OP_NOTMINQUERY:
3676 case OP_NOTMINQUERYI:
3677 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3678 minimize = (c & 1) != 0;
3679 min = rep_min[c]; /* Pick up values from tables; */
3680 max = rep_max[c]; /* zero for max => infinity */
3681 if (max == 0) max = INT_MAX;
3683 /* Common code for all repeated single-byte matches. */
3685 REPEATNOTCHAR:
3686 GETCHARINCTEST(fc, ecode);
3688 /* The code is duplicated for the caseless and caseful cases, for speed,
3689 since matching characters is likely to be quite common. First, ensure the
3690 minimum number of matches are present. If min = max, continue at the same
3691 level without recursing. Otherwise, if minimizing, keep trying the rest of
3692 the expression and advancing one matching character if failing, up to the
3693 maximum. Alternatively, if maximizing, find the maximum number of
3694 characters and work backwards. */
3696 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3697 max, (char *)eptr));
3699 if (op >= OP_NOTSTARI) /* Caseless */
3701 #ifdef SUPPORT_UTF
3702 #ifdef SUPPORT_UCP
3703 if (utf && fc > 127)
3704 foc = UCD_OTHERCASE(fc);
3705 #else
3706 if (utf && fc > 127)
3707 foc = fc;
3708 #endif /* SUPPORT_UCP */
3709 else
3710 #endif /* SUPPORT_UTF */
3711 foc = TABLE_GET(fc, md->fcc, fc);
3713 #ifdef SUPPORT_UTF
3714 if (utf)
3716 unsigned int d;
3717 for (i = 1; i <= min; i++)
3719 if (eptr >= md->end_subject)
3721 SCHECK_PARTIAL();
3722 RRETURN(MATCH_NOMATCH);
3724 GETCHARINC(d, eptr);
3725 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3728 else
3729 #endif
3730 /* Not UTF mode */
3732 for (i = 1; i <= min; i++)
3734 if (eptr >= md->end_subject)
3736 SCHECK_PARTIAL();
3737 RRETURN(MATCH_NOMATCH);
3739 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3740 eptr++;
3744 if (min == max) continue;
3746 if (minimize)
3748 #ifdef SUPPORT_UTF
3749 if (utf)
3751 unsigned int d;
3752 for (fi = min;; fi++)
3754 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3755 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3756 if (fi >= max) RRETURN(MATCH_NOMATCH);
3757 if (eptr >= md->end_subject)
3759 SCHECK_PARTIAL();
3760 RRETURN(MATCH_NOMATCH);
3762 GETCHARINC(d, eptr);
3763 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3766 else
3767 #endif
3768 /* Not UTF mode */
3770 for (fi = min;; fi++)
3772 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3773 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3774 if (fi >= max) RRETURN(MATCH_NOMATCH);
3775 if (eptr >= md->end_subject)
3777 SCHECK_PARTIAL();
3778 RRETURN(MATCH_NOMATCH);
3780 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3781 eptr++;
3784 /* Control never gets here */
3787 /* Maximize case */
3789 else
3791 pp = eptr;
3793 #ifdef SUPPORT_UTF
3794 if (utf)
3796 unsigned int d;
3797 for (i = min; i < max; i++)
3799 int len = 1;
3800 if (eptr >= md->end_subject)
3802 SCHECK_PARTIAL();
3803 break;
3805 GETCHARLEN(d, eptr, len);
3806 if (fc == d || (unsigned int)foc == d) break;
3807 eptr += len;
3809 if (possessive) continue;
3810 for(;;)
3812 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3813 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3814 if (eptr-- == pp) break; /* Stop if tried at original pos */
3815 BACKCHAR(eptr);
3818 else
3819 #endif
3820 /* Not UTF mode */
3822 for (i = min; i < max; i++)
3824 if (eptr >= md->end_subject)
3826 SCHECK_PARTIAL();
3827 break;
3829 if (fc == *eptr || foc == *eptr) break;
3830 eptr++;
3832 if (possessive) continue;
3833 while (eptr >= pp)
3835 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3836 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3837 eptr--;
3841 RRETURN(MATCH_NOMATCH);
3843 /* Control never gets here */
3846 /* Caseful comparisons */
3848 else
3850 #ifdef SUPPORT_UTF
3851 if (utf)
3853 unsigned int d;
3854 for (i = 1; i <= min; i++)
3856 if (eptr >= md->end_subject)
3858 SCHECK_PARTIAL();
3859 RRETURN(MATCH_NOMATCH);
3861 GETCHARINC(d, eptr);
3862 if (fc == d) RRETURN(MATCH_NOMATCH);
3865 else
3866 #endif
3867 /* Not UTF mode */
3869 for (i = 1; i <= min; i++)
3871 if (eptr >= md->end_subject)
3873 SCHECK_PARTIAL();
3874 RRETURN(MATCH_NOMATCH);
3876 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3880 if (min == max) continue;
3882 if (minimize)
3884 #ifdef SUPPORT_UTF
3885 if (utf)
3887 unsigned int d;
3888 for (fi = min;; fi++)
3890 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3891 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3892 if (fi >= max) RRETURN(MATCH_NOMATCH);
3893 if (eptr >= md->end_subject)
3895 SCHECK_PARTIAL();
3896 RRETURN(MATCH_NOMATCH);
3898 GETCHARINC(d, eptr);
3899 if (fc == d) RRETURN(MATCH_NOMATCH);
3902 else
3903 #endif
3904 /* Not UTF mode */
3906 for (fi = min;; fi++)
3908 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3909 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3910 if (fi >= max) RRETURN(MATCH_NOMATCH);
3911 if (eptr >= md->end_subject)
3913 SCHECK_PARTIAL();
3914 RRETURN(MATCH_NOMATCH);
3916 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3919 /* Control never gets here */
3922 /* Maximize case */
3924 else
3926 pp = eptr;
3928 #ifdef SUPPORT_UTF
3929 if (utf)
3931 unsigned int d;
3932 for (i = min; i < max; i++)
3934 int len = 1;
3935 if (eptr >= md->end_subject)
3937 SCHECK_PARTIAL();
3938 break;
3940 GETCHARLEN(d, eptr, len);
3941 if (fc == d) break;
3942 eptr += len;
3944 if (possessive) continue;
3945 for(;;)
3947 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3948 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3949 if (eptr-- == pp) break; /* Stop if tried at original pos */
3950 BACKCHAR(eptr);
3953 else
3954 #endif
3955 /* Not UTF mode */
3957 for (i = min; i < max; i++)
3959 if (eptr >= md->end_subject)
3961 SCHECK_PARTIAL();
3962 break;
3964 if (fc == *eptr) break;
3965 eptr++;
3967 if (possessive) continue;
3968 while (eptr >= pp)
3970 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3971 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3972 eptr--;
3976 RRETURN(MATCH_NOMATCH);
3979 /* Control never gets here */
3981 /* Match a single character type repeatedly; several different opcodes
3982 share code. This is very similar to the code for single characters, but we
3983 repeat it in the interests of efficiency. */
3985 case OP_TYPEEXACT:
3986 min = max = GET2(ecode, 1);
3987 minimize = TRUE;
3988 ecode += 1 + IMM2_SIZE;
3989 goto REPEATTYPE;
3991 case OP_TYPEUPTO:
3992 case OP_TYPEMINUPTO:
3993 min = 0;
3994 max = GET2(ecode, 1);
3995 minimize = *ecode == OP_TYPEMINUPTO;
3996 ecode += 1 + IMM2_SIZE;
3997 goto REPEATTYPE;
3999 case OP_TYPEPOSSTAR:
4000 possessive = TRUE;
4001 min = 0;
4002 max = INT_MAX;
4003 ecode++;
4004 goto REPEATTYPE;
4006 case OP_TYPEPOSPLUS:
4007 possessive = TRUE;
4008 min = 1;
4009 max = INT_MAX;
4010 ecode++;
4011 goto REPEATTYPE;
4013 case OP_TYPEPOSQUERY:
4014 possessive = TRUE;
4015 min = 0;
4016 max = 1;
4017 ecode++;
4018 goto REPEATTYPE;
4020 case OP_TYPEPOSUPTO:
4021 possessive = TRUE;
4022 min = 0;
4023 max = GET2(ecode, 1);
4024 ecode += 1 + IMM2_SIZE;
4025 goto REPEATTYPE;
4027 case OP_TYPESTAR:
4028 case OP_TYPEMINSTAR:
4029 case OP_TYPEPLUS:
4030 case OP_TYPEMINPLUS:
4031 case OP_TYPEQUERY:
4032 case OP_TYPEMINQUERY:
4033 c = *ecode++ - OP_TYPESTAR;
4034 minimize = (c & 1) != 0;
4035 min = rep_min[c]; /* Pick up values from tables; */
4036 max = rep_max[c]; /* zero for max => infinity */
4037 if (max == 0) max = INT_MAX;
4039 /* Common code for all repeated single character type matches. Note that
4040 in UTF-8 mode, '.' matches a character of any length, but for the other
4041 character types, the valid characters are all one-byte long. */
4043 REPEATTYPE:
4044 ctype = *ecode++; /* Code for the character type */
4046 #ifdef SUPPORT_UCP
4047 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4049 prop_fail_result = ctype == OP_NOTPROP;
4050 prop_type = *ecode++;
4051 prop_value = *ecode++;
4053 else prop_type = -1;
4054 #endif
4056 /* First, ensure the minimum number of matches are present. Use inline
4057 code for maximizing the speed, and do the type test once at the start
4058 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4059 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4060 and single-bytes. */
4062 if (min > 0)
4064 #ifdef SUPPORT_UCP
4065 if (prop_type >= 0)
4067 switch(prop_type)
4069 case PT_ANY:
4070 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4071 for (i = 1; i <= min; i++)
4073 if (eptr >= md->end_subject)
4075 SCHECK_PARTIAL();
4076 RRETURN(MATCH_NOMATCH);
4078 GETCHARINCTEST(c, eptr);
4080 break;
4082 case PT_LAMP:
4083 for (i = 1; i <= min; i++)
4085 int chartype;
4086 if (eptr >= md->end_subject)
4088 SCHECK_PARTIAL();
4089 RRETURN(MATCH_NOMATCH);
4091 GETCHARINCTEST(c, eptr);
4092 chartype = UCD_CHARTYPE(c);
4093 if ((chartype == ucp_Lu ||
4094 chartype == ucp_Ll ||
4095 chartype == ucp_Lt) == prop_fail_result)
4096 RRETURN(MATCH_NOMATCH);
4098 break;
4100 case PT_GC:
4101 for (i = 1; i <= min; i++)
4103 if (eptr >= md->end_subject)
4105 SCHECK_PARTIAL();
4106 RRETURN(MATCH_NOMATCH);
4108 GETCHARINCTEST(c, eptr);
4109 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4110 RRETURN(MATCH_NOMATCH);
4112 break;
4114 case PT_PC:
4115 for (i = 1; i <= min; i++)
4117 if (eptr >= md->end_subject)
4119 SCHECK_PARTIAL();
4120 RRETURN(MATCH_NOMATCH);
4122 GETCHARINCTEST(c, eptr);
4123 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4124 RRETURN(MATCH_NOMATCH);
4126 break;
4128 case PT_SC:
4129 for (i = 1; i <= min; i++)
4131 if (eptr >= md->end_subject)
4133 SCHECK_PARTIAL();
4134 RRETURN(MATCH_NOMATCH);
4136 GETCHARINCTEST(c, eptr);
4137 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4138 RRETURN(MATCH_NOMATCH);
4140 break;
4142 case PT_ALNUM:
4143 for (i = 1; i <= min; i++)
4145 int category;
4146 if (eptr >= md->end_subject)
4148 SCHECK_PARTIAL();
4149 RRETURN(MATCH_NOMATCH);
4151 GETCHARINCTEST(c, eptr);
4152 category = UCD_CATEGORY(c);
4153 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4154 RRETURN(MATCH_NOMATCH);
4156 break;
4158 case PT_SPACE: /* Perl space */
4159 for (i = 1; i <= min; i++)
4161 if (eptr >= md->end_subject)
4163 SCHECK_PARTIAL();
4164 RRETURN(MATCH_NOMATCH);
4166 GETCHARINCTEST(c, eptr);
4167 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4168 c == CHAR_FF || c == CHAR_CR)
4169 == prop_fail_result)
4170 RRETURN(MATCH_NOMATCH);
4172 break;
4174 case PT_PXSPACE: /* POSIX space */
4175 for (i = 1; i <= min; i++)
4177 if (eptr >= md->end_subject)
4179 SCHECK_PARTIAL();
4180 RRETURN(MATCH_NOMATCH);
4182 GETCHARINCTEST(c, eptr);
4183 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4184 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4185 == prop_fail_result)
4186 RRETURN(MATCH_NOMATCH);
4188 break;
4190 case PT_WORD:
4191 for (i = 1; i <= min; i++)
4193 int category;
4194 if (eptr >= md->end_subject)
4196 SCHECK_PARTIAL();
4197 RRETURN(MATCH_NOMATCH);
4199 GETCHARINCTEST(c, eptr);
4200 category = UCD_CATEGORY(c);
4201 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4202 == prop_fail_result)
4203 RRETURN(MATCH_NOMATCH);
4205 break;
4207 /* This should not occur */
4209 default:
4210 RRETURN(PCRE_ERROR_INTERNAL);
4214 /* Match extended Unicode sequences. We will get here only if the
4215 support is in the binary; otherwise a compile-time error occurs. */
4217 else if (ctype == OP_EXTUNI)
4219 for (i = 1; i <= min; i++)
4221 if (eptr >= md->end_subject)
4223 SCHECK_PARTIAL();
4224 RRETURN(MATCH_NOMATCH);
4226 GETCHARINCTEST(c, eptr);
4227 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4228 while (eptr < md->end_subject)
4230 int len = 1;
4231 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4232 if (UCD_CATEGORY(c) != ucp_M) break;
4233 eptr += len;
4235 CHECK_PARTIAL();
4239 else
4240 #endif /* SUPPORT_UCP */
4242 /* Handle all other cases when the coding is UTF-8 */
4244 #ifdef SUPPORT_UTF
4245 if (utf) switch(ctype)
4247 case OP_ANY:
4248 for (i = 1; i <= min; i++)
4250 if (eptr >= md->end_subject)
4252 SCHECK_PARTIAL();
4253 RRETURN(MATCH_NOMATCH);
4255 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4256 if (md->partial != 0 &&
4257 eptr + 1 >= md->end_subject &&
4258 NLBLOCK->nltype == NLTYPE_FIXED &&
4259 NLBLOCK->nllen == 2 &&
4260 *eptr == NLBLOCK->nl[0])
4262 md->hitend = TRUE;
4263 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4265 eptr++;
4266 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4268 break;
4270 case OP_ALLANY:
4271 for (i = 1; i <= min; i++)
4273 if (eptr >= md->end_subject)
4275 SCHECK_PARTIAL();
4276 RRETURN(MATCH_NOMATCH);
4278 eptr++;
4279 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4281 break;
4283 case OP_ANYBYTE:
4284 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4285 eptr += min;
4286 break;
4288 case OP_ANYNL:
4289 for (i = 1; i <= min; i++)
4291 if (eptr >= md->end_subject)
4293 SCHECK_PARTIAL();
4294 RRETURN(MATCH_NOMATCH);
4296 GETCHARINC(c, eptr);
4297 switch(c)
4299 default: RRETURN(MATCH_NOMATCH);
4301 case 0x000d:
4302 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4303 break;
4305 case 0x000a:
4306 break;
4308 case 0x000b:
4309 case 0x000c:
4310 case 0x0085:
4311 case 0x2028:
4312 case 0x2029:
4313 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4314 break;
4317 break;
4319 case OP_NOT_HSPACE:
4320 for (i = 1; i <= min; i++)
4322 if (eptr >= md->end_subject)
4324 SCHECK_PARTIAL();
4325 RRETURN(MATCH_NOMATCH);
4327 GETCHARINC(c, eptr);
4328 switch(c)
4330 default: break;
4331 case 0x09: /* HT */
4332 case 0x20: /* SPACE */
4333 case 0xa0: /* NBSP */
4334 case 0x1680: /* OGHAM SPACE MARK */
4335 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4336 case 0x2000: /* EN QUAD */
4337 case 0x2001: /* EM QUAD */
4338 case 0x2002: /* EN SPACE */
4339 case 0x2003: /* EM SPACE */
4340 case 0x2004: /* THREE-PER-EM SPACE */
4341 case 0x2005: /* FOUR-PER-EM SPACE */
4342 case 0x2006: /* SIX-PER-EM SPACE */
4343 case 0x2007: /* FIGURE SPACE */
4344 case 0x2008: /* PUNCTUATION SPACE */
4345 case 0x2009: /* THIN SPACE */
4346 case 0x200A: /* HAIR SPACE */
4347 case 0x202f: /* NARROW NO-BREAK SPACE */
4348 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4349 case 0x3000: /* IDEOGRAPHIC SPACE */
4350 RRETURN(MATCH_NOMATCH);
4353 break;
4355 case OP_HSPACE:
4356 for (i = 1; i <= min; i++)
4358 if (eptr >= md->end_subject)
4360 SCHECK_PARTIAL();
4361 RRETURN(MATCH_NOMATCH);
4363 GETCHARINC(c, eptr);
4364 switch(c)
4366 default: RRETURN(MATCH_NOMATCH);
4367 case 0x09: /* HT */
4368 case 0x20: /* SPACE */
4369 case 0xa0: /* NBSP */
4370 case 0x1680: /* OGHAM SPACE MARK */
4371 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4372 case 0x2000: /* EN QUAD */
4373 case 0x2001: /* EM QUAD */
4374 case 0x2002: /* EN SPACE */
4375 case 0x2003: /* EM SPACE */
4376 case 0x2004: /* THREE-PER-EM SPACE */
4377 case 0x2005: /* FOUR-PER-EM SPACE */
4378 case 0x2006: /* SIX-PER-EM SPACE */
4379 case 0x2007: /* FIGURE SPACE */
4380 case 0x2008: /* PUNCTUATION SPACE */
4381 case 0x2009: /* THIN SPACE */
4382 case 0x200A: /* HAIR SPACE */
4383 case 0x202f: /* NARROW NO-BREAK SPACE */
4384 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4385 case 0x3000: /* IDEOGRAPHIC SPACE */
4386 break;
4389 break;
4391 case OP_NOT_VSPACE:
4392 for (i = 1; i <= min; i++)
4394 if (eptr >= md->end_subject)
4396 SCHECK_PARTIAL();
4397 RRETURN(MATCH_NOMATCH);
4399 GETCHARINC(c, eptr);
4400 switch(c)
4402 default: break;
4403 case 0x0a: /* LF */
4404 case 0x0b: /* VT */
4405 case 0x0c: /* FF */
4406 case 0x0d: /* CR */
4407 case 0x85: /* NEL */
4408 case 0x2028: /* LINE SEPARATOR */
4409 case 0x2029: /* PARAGRAPH SEPARATOR */
4410 RRETURN(MATCH_NOMATCH);
4413 break;
4415 case OP_VSPACE:
4416 for (i = 1; i <= min; i++)
4418 if (eptr >= md->end_subject)
4420 SCHECK_PARTIAL();
4421 RRETURN(MATCH_NOMATCH);
4423 GETCHARINC(c, eptr);
4424 switch(c)
4426 default: RRETURN(MATCH_NOMATCH);
4427 case 0x0a: /* LF */
4428 case 0x0b: /* VT */
4429 case 0x0c: /* FF */
4430 case 0x0d: /* CR */
4431 case 0x85: /* NEL */
4432 case 0x2028: /* LINE SEPARATOR */
4433 case 0x2029: /* PARAGRAPH SEPARATOR */
4434 break;
4437 break;
4439 case OP_NOT_DIGIT:
4440 for (i = 1; i <= min; i++)
4442 if (eptr >= md->end_subject)
4444 SCHECK_PARTIAL();
4445 RRETURN(MATCH_NOMATCH);
4447 GETCHARINC(c, eptr);
4448 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4449 RRETURN(MATCH_NOMATCH);
4451 break;
4453 case OP_DIGIT:
4454 for (i = 1; i <= min; i++)
4456 if (eptr >= md->end_subject)
4458 SCHECK_PARTIAL();
4459 RRETURN(MATCH_NOMATCH);
4461 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
4462 RRETURN(MATCH_NOMATCH);
4463 eptr++;
4464 /* No need to skip more bytes - we know it's a 1-byte character */
4466 break;
4468 case OP_NOT_WHITESPACE:
4469 for (i = 1; i <= min; i++)
4471 if (eptr >= md->end_subject)
4473 SCHECK_PARTIAL();
4474 RRETURN(MATCH_NOMATCH);
4476 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4477 RRETURN(MATCH_NOMATCH);
4478 eptr++;
4479 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4481 break;
4483 case OP_WHITESPACE:
4484 for (i = 1; i <= min; i++)
4486 if (eptr >= md->end_subject)
4488 SCHECK_PARTIAL();
4489 RRETURN(MATCH_NOMATCH);
4491 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
4492 RRETURN(MATCH_NOMATCH);
4493 eptr++;
4494 /* No need to skip more bytes - we know it's a 1-byte character */
4496 break;
4498 case OP_NOT_WORDCHAR:
4499 for (i = 1; i <= min; i++)
4501 if (eptr >= md->end_subject)
4503 SCHECK_PARTIAL();
4504 RRETURN(MATCH_NOMATCH);
4506 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4507 RRETURN(MATCH_NOMATCH);
4508 eptr++;
4509 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4511 break;
4513 case OP_WORDCHAR:
4514 for (i = 1; i <= min; i++)
4516 if (eptr >= md->end_subject)
4518 SCHECK_PARTIAL();
4519 RRETURN(MATCH_NOMATCH);
4521 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
4522 RRETURN(MATCH_NOMATCH);
4523 eptr++;
4524 /* No need to skip more bytes - we know it's a 1-byte character */
4526 break;
4528 default:
4529 RRETURN(PCRE_ERROR_INTERNAL);
4530 } /* End switch(ctype) */
4532 else
4533 #endif /* SUPPORT_UTF */
4535 /* Code for the non-UTF-8 case for minimum matching of operators other
4536 than OP_PROP and OP_NOTPROP. */
4538 switch(ctype)
4540 case OP_ANY:
4541 for (i = 1; i <= min; i++)
4543 if (eptr >= md->end_subject)
4545 SCHECK_PARTIAL();
4546 RRETURN(MATCH_NOMATCH);
4548 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4549 if (md->partial != 0 &&
4550 eptr + 1 >= md->end_subject &&
4551 NLBLOCK->nltype == NLTYPE_FIXED &&
4552 NLBLOCK->nllen == 2 &&
4553 *eptr == NLBLOCK->nl[0])
4555 md->hitend = TRUE;
4556 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4558 eptr++;
4560 break;
4562 case OP_ALLANY:
4563 if (eptr > md->end_subject - min)
4565 SCHECK_PARTIAL();
4566 RRETURN(MATCH_NOMATCH);
4568 eptr += min;
4569 break;
4571 case OP_ANYBYTE:
4572 if (eptr > md->end_subject - min)
4574 SCHECK_PARTIAL();
4575 RRETURN(MATCH_NOMATCH);
4577 eptr += min;
4578 break;
4580 case OP_ANYNL:
4581 for (i = 1; i <= min; i++)
4583 if (eptr >= md->end_subject)
4585 SCHECK_PARTIAL();
4586 RRETURN(MATCH_NOMATCH);
4588 switch(*eptr++)
4590 default: RRETURN(MATCH_NOMATCH);
4592 case 0x000d:
4593 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4594 break;
4596 case 0x000a:
4597 break;
4599 case 0x000b:
4600 case 0x000c:
4601 case 0x0085:
4602 #ifdef COMPILE_PCRE16
4603 case 0x2028:
4604 case 0x2029:
4605 #endif
4606 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4607 break;
4610 break;
4612 case OP_NOT_HSPACE:
4613 for (i = 1; i <= min; i++)
4615 if (eptr >= md->end_subject)
4617 SCHECK_PARTIAL();
4618 RRETURN(MATCH_NOMATCH);
4620 switch(*eptr++)
4622 default: break;
4623 case 0x09: /* HT */
4624 case 0x20: /* SPACE */
4625 case 0xa0: /* NBSP */
4626 #ifdef COMPILE_PCRE16
4627 case 0x1680: /* OGHAM SPACE MARK */
4628 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4629 case 0x2000: /* EN QUAD */
4630 case 0x2001: /* EM QUAD */
4631 case 0x2002: /* EN SPACE */
4632 case 0x2003: /* EM SPACE */
4633 case 0x2004: /* THREE-PER-EM SPACE */
4634 case 0x2005: /* FOUR-PER-EM SPACE */
4635 case 0x2006: /* SIX-PER-EM SPACE */
4636 case 0x2007: /* FIGURE SPACE */
4637 case 0x2008: /* PUNCTUATION SPACE */
4638 case 0x2009: /* THIN SPACE */
4639 case 0x200A: /* HAIR SPACE */
4640 case 0x202f: /* NARROW NO-BREAK SPACE */
4641 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4642 case 0x3000: /* IDEOGRAPHIC SPACE */
4643 #endif
4644 RRETURN(MATCH_NOMATCH);
4647 break;
4649 case OP_HSPACE:
4650 for (i = 1; i <= min; i++)
4652 if (eptr >= md->end_subject)
4654 SCHECK_PARTIAL();
4655 RRETURN(MATCH_NOMATCH);
4657 switch(*eptr++)
4659 default: RRETURN(MATCH_NOMATCH);
4660 case 0x09: /* HT */
4661 case 0x20: /* SPACE */
4662 case 0xa0: /* NBSP */
4663 #ifdef COMPILE_PCRE16
4664 case 0x1680: /* OGHAM SPACE MARK */
4665 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4666 case 0x2000: /* EN QUAD */
4667 case 0x2001: /* EM QUAD */
4668 case 0x2002: /* EN SPACE */
4669 case 0x2003: /* EM SPACE */
4670 case 0x2004: /* THREE-PER-EM SPACE */
4671 case 0x2005: /* FOUR-PER-EM SPACE */
4672 case 0x2006: /* SIX-PER-EM SPACE */
4673 case 0x2007: /* FIGURE SPACE */
4674 case 0x2008: /* PUNCTUATION SPACE */
4675 case 0x2009: /* THIN SPACE */
4676 case 0x200A: /* HAIR SPACE */
4677 case 0x202f: /* NARROW NO-BREAK SPACE */
4678 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4679 case 0x3000: /* IDEOGRAPHIC SPACE */
4680 #endif
4681 break;
4684 break;
4686 case OP_NOT_VSPACE:
4687 for (i = 1; i <= min; i++)
4689 if (eptr >= md->end_subject)
4691 SCHECK_PARTIAL();
4692 RRETURN(MATCH_NOMATCH);
4694 switch(*eptr++)
4696 default: break;
4697 case 0x0a: /* LF */
4698 case 0x0b: /* VT */
4699 case 0x0c: /* FF */
4700 case 0x0d: /* CR */
4701 case 0x85: /* NEL */
4702 #ifdef COMPILE_PCRE16
4703 case 0x2028: /* LINE SEPARATOR */
4704 case 0x2029: /* PARAGRAPH SEPARATOR */
4705 #endif
4706 RRETURN(MATCH_NOMATCH);
4709 break;
4711 case OP_VSPACE:
4712 for (i = 1; i <= min; i++)
4714 if (eptr >= md->end_subject)
4716 SCHECK_PARTIAL();
4717 RRETURN(MATCH_NOMATCH);
4719 switch(*eptr++)
4721 default: RRETURN(MATCH_NOMATCH);
4722 case 0x0a: /* LF */
4723 case 0x0b: /* VT */
4724 case 0x0c: /* FF */
4725 case 0x0d: /* CR */
4726 case 0x85: /* NEL */
4727 #ifdef COMPILE_PCRE16
4728 case 0x2028: /* LINE SEPARATOR */
4729 case 0x2029: /* PARAGRAPH SEPARATOR */
4730 #endif
4731 break;
4734 break;
4736 case OP_NOT_DIGIT:
4737 for (i = 1; i <= min; i++)
4739 if (eptr >= md->end_subject)
4741 SCHECK_PARTIAL();
4742 RRETURN(MATCH_NOMATCH);
4744 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4745 RRETURN(MATCH_NOMATCH);
4746 eptr++;
4748 break;
4750 case OP_DIGIT:
4751 for (i = 1; i <= min; i++)
4753 if (eptr >= md->end_subject)
4755 SCHECK_PARTIAL();
4756 RRETURN(MATCH_NOMATCH);
4758 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4759 RRETURN(MATCH_NOMATCH);
4760 eptr++;
4762 break;
4764 case OP_NOT_WHITESPACE:
4765 for (i = 1; i <= min; i++)
4767 if (eptr >= md->end_subject)
4769 SCHECK_PARTIAL();
4770 RRETURN(MATCH_NOMATCH);
4772 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4773 RRETURN(MATCH_NOMATCH);
4774 eptr++;
4776 break;
4778 case OP_WHITESPACE:
4779 for (i = 1; i <= min; i++)
4781 if (eptr >= md->end_subject)
4783 SCHECK_PARTIAL();
4784 RRETURN(MATCH_NOMATCH);
4786 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4787 RRETURN(MATCH_NOMATCH);
4788 eptr++;
4790 break;
4792 case OP_NOT_WORDCHAR:
4793 for (i = 1; i <= min; i++)
4795 if (eptr >= md->end_subject)
4797 SCHECK_PARTIAL();
4798 RRETURN(MATCH_NOMATCH);
4800 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4801 RRETURN(MATCH_NOMATCH);
4802 eptr++;
4804 break;
4806 case OP_WORDCHAR:
4807 for (i = 1; i <= min; i++)
4809 if (eptr >= md->end_subject)
4811 SCHECK_PARTIAL();
4812 RRETURN(MATCH_NOMATCH);
4814 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4815 RRETURN(MATCH_NOMATCH);
4816 eptr++;
4818 break;
4820 default:
4821 RRETURN(PCRE_ERROR_INTERNAL);
4825 /* If min = max, continue at the same level without recursing */
4827 if (min == max) continue;
4829 /* If minimizing, we have to test the rest of the pattern before each
4830 subsequent match. Again, separate the UTF-8 case for speed, and also
4831 separate the UCP cases. */
4833 if (minimize)
4835 #ifdef SUPPORT_UCP
4836 if (prop_type >= 0)
4838 switch(prop_type)
4840 case PT_ANY:
4841 for (fi = min;; fi++)
4843 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4844 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4845 if (fi >= max) RRETURN(MATCH_NOMATCH);
4846 if (eptr >= md->end_subject)
4848 SCHECK_PARTIAL();
4849 RRETURN(MATCH_NOMATCH);
4851 GETCHARINCTEST(c, eptr);
4852 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4854 /* Control never gets here */
4856 case PT_LAMP:
4857 for (fi = min;; fi++)
4859 int chartype;
4860 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4861 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4862 if (fi >= max) RRETURN(MATCH_NOMATCH);
4863 if (eptr >= md->end_subject)
4865 SCHECK_PARTIAL();
4866 RRETURN(MATCH_NOMATCH);
4868 GETCHARINCTEST(c, eptr);
4869 chartype = UCD_CHARTYPE(c);
4870 if ((chartype == ucp_Lu ||
4871 chartype == ucp_Ll ||
4872 chartype == ucp_Lt) == prop_fail_result)
4873 RRETURN(MATCH_NOMATCH);
4875 /* Control never gets here */
4877 case PT_GC:
4878 for (fi = min;; fi++)
4880 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4881 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4882 if (fi >= max) RRETURN(MATCH_NOMATCH);
4883 if (eptr >= md->end_subject)
4885 SCHECK_PARTIAL();
4886 RRETURN(MATCH_NOMATCH);
4888 GETCHARINCTEST(c, eptr);
4889 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4890 RRETURN(MATCH_NOMATCH);
4892 /* Control never gets here */
4894 case PT_PC:
4895 for (fi = min;; fi++)
4897 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4898 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4899 if (fi >= max) RRETURN(MATCH_NOMATCH);
4900 if (eptr >= md->end_subject)
4902 SCHECK_PARTIAL();
4903 RRETURN(MATCH_NOMATCH);
4905 GETCHARINCTEST(c, eptr);
4906 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4907 RRETURN(MATCH_NOMATCH);
4909 /* Control never gets here */
4911 case PT_SC:
4912 for (fi = min;; fi++)
4914 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4915 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4916 if (fi >= max) RRETURN(MATCH_NOMATCH);
4917 if (eptr >= md->end_subject)
4919 SCHECK_PARTIAL();
4920 RRETURN(MATCH_NOMATCH);
4922 GETCHARINCTEST(c, eptr);
4923 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4924 RRETURN(MATCH_NOMATCH);
4926 /* Control never gets here */
4928 case PT_ALNUM:
4929 for (fi = min;; fi++)
4931 int category;
4932 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4933 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4934 if (fi >= max) RRETURN(MATCH_NOMATCH);
4935 if (eptr >= md->end_subject)
4937 SCHECK_PARTIAL();
4938 RRETURN(MATCH_NOMATCH);
4940 GETCHARINCTEST(c, eptr);
4941 category = UCD_CATEGORY(c);
4942 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4943 RRETURN(MATCH_NOMATCH);
4945 /* Control never gets here */
4947 case PT_SPACE: /* Perl space */
4948 for (fi = min;; fi++)
4950 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4951 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4952 if (fi >= max) RRETURN(MATCH_NOMATCH);
4953 if (eptr >= md->end_subject)
4955 SCHECK_PARTIAL();
4956 RRETURN(MATCH_NOMATCH);
4958 GETCHARINCTEST(c, eptr);
4959 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4960 c == CHAR_FF || c == CHAR_CR)
4961 == prop_fail_result)
4962 RRETURN(MATCH_NOMATCH);
4964 /* Control never gets here */
4966 case PT_PXSPACE: /* POSIX space */
4967 for (fi = min;; fi++)
4969 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4970 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4971 if (fi >= max) RRETURN(MATCH_NOMATCH);
4972 if (eptr >= md->end_subject)
4974 SCHECK_PARTIAL();
4975 RRETURN(MATCH_NOMATCH);
4977 GETCHARINCTEST(c, eptr);
4978 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4979 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4980 == prop_fail_result)
4981 RRETURN(MATCH_NOMATCH);
4983 /* Control never gets here */
4985 case PT_WORD:
4986 for (fi = min;; fi++)
4988 int category;
4989 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4990 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4991 if (fi >= max) RRETURN(MATCH_NOMATCH);
4992 if (eptr >= md->end_subject)
4994 SCHECK_PARTIAL();
4995 RRETURN(MATCH_NOMATCH);
4997 GETCHARINCTEST(c, eptr);
4998 category = UCD_CATEGORY(c);
4999 if ((category == ucp_L ||
5000 category == ucp_N ||
5001 c == CHAR_UNDERSCORE)
5002 == prop_fail_result)
5003 RRETURN(MATCH_NOMATCH);
5005 /* Control never gets here */
5007 /* This should never occur */
5009 default:
5010 RRETURN(PCRE_ERROR_INTERNAL);
5014 /* Match extended Unicode sequences. We will get here only if the
5015 support is in the binary; otherwise a compile-time error occurs. */
5017 else if (ctype == OP_EXTUNI)
5019 for (fi = min;; fi++)
5021 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5022 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5023 if (fi >= max) RRETURN(MATCH_NOMATCH);
5024 if (eptr >= md->end_subject)
5026 SCHECK_PARTIAL();
5027 RRETURN(MATCH_NOMATCH);
5029 GETCHARINCTEST(c, eptr);
5030 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
5031 while (eptr < md->end_subject)
5033 int len = 1;
5034 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5035 if (UCD_CATEGORY(c) != ucp_M) break;
5036 eptr += len;
5038 CHECK_PARTIAL();
5041 else
5042 #endif /* SUPPORT_UCP */
5044 #ifdef SUPPORT_UTF
5045 if (utf)
5047 for (fi = min;; fi++)
5049 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5050 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5051 if (fi >= max) RRETURN(MATCH_NOMATCH);
5052 if (eptr >= md->end_subject)
5054 SCHECK_PARTIAL();
5055 RRETURN(MATCH_NOMATCH);
5057 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5058 RRETURN(MATCH_NOMATCH);
5059 GETCHARINC(c, eptr);
5060 switch(ctype)
5062 case OP_ANY: /* This is the non-NL case */
5063 if (md->partial != 0 && /* Take care with CRLF partial */
5064 eptr >= md->end_subject &&
5065 NLBLOCK->nltype == NLTYPE_FIXED &&
5066 NLBLOCK->nllen == 2 &&
5067 c == NLBLOCK->nl[0])
5069 md->hitend = TRUE;
5070 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5072 break;
5074 case OP_ALLANY:
5075 case OP_ANYBYTE:
5076 break;
5078 case OP_ANYNL:
5079 switch(c)
5081 default: RRETURN(MATCH_NOMATCH);
5082 case 0x000d:
5083 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5084 break;
5085 case 0x000a:
5086 break;
5088 case 0x000b:
5089 case 0x000c:
5090 case 0x0085:
5091 case 0x2028:
5092 case 0x2029:
5093 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5094 break;
5096 break;
5098 case OP_NOT_HSPACE:
5099 switch(c)
5101 default: break;
5102 case 0x09: /* HT */
5103 case 0x20: /* SPACE */
5104 case 0xa0: /* NBSP */
5105 case 0x1680: /* OGHAM SPACE MARK */
5106 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5107 case 0x2000: /* EN QUAD */
5108 case 0x2001: /* EM QUAD */
5109 case 0x2002: /* EN SPACE */
5110 case 0x2003: /* EM SPACE */
5111 case 0x2004: /* THREE-PER-EM SPACE */
5112 case 0x2005: /* FOUR-PER-EM SPACE */
5113 case 0x2006: /* SIX-PER-EM SPACE */
5114 case 0x2007: /* FIGURE SPACE */
5115 case 0x2008: /* PUNCTUATION SPACE */
5116 case 0x2009: /* THIN SPACE */
5117 case 0x200A: /* HAIR SPACE */
5118 case 0x202f: /* NARROW NO-BREAK SPACE */
5119 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5120 case 0x3000: /* IDEOGRAPHIC SPACE */
5121 RRETURN(MATCH_NOMATCH);
5123 break;
5125 case OP_HSPACE:
5126 switch(c)
5128 default: RRETURN(MATCH_NOMATCH);
5129 case 0x09: /* HT */
5130 case 0x20: /* SPACE */
5131 case 0xa0: /* NBSP */
5132 case 0x1680: /* OGHAM SPACE MARK */
5133 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5134 case 0x2000: /* EN QUAD */
5135 case 0x2001: /* EM QUAD */
5136 case 0x2002: /* EN SPACE */
5137 case 0x2003: /* EM SPACE */
5138 case 0x2004: /* THREE-PER-EM SPACE */
5139 case 0x2005: /* FOUR-PER-EM SPACE */
5140 case 0x2006: /* SIX-PER-EM SPACE */
5141 case 0x2007: /* FIGURE SPACE */
5142 case 0x2008: /* PUNCTUATION SPACE */
5143 case 0x2009: /* THIN SPACE */
5144 case 0x200A: /* HAIR SPACE */
5145 case 0x202f: /* NARROW NO-BREAK SPACE */
5146 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5147 case 0x3000: /* IDEOGRAPHIC SPACE */
5148 break;
5150 break;
5152 case OP_NOT_VSPACE:
5153 switch(c)
5155 default: break;
5156 case 0x0a: /* LF */
5157 case 0x0b: /* VT */
5158 case 0x0c: /* FF */
5159 case 0x0d: /* CR */
5160 case 0x85: /* NEL */
5161 case 0x2028: /* LINE SEPARATOR */
5162 case 0x2029: /* PARAGRAPH SEPARATOR */
5163 RRETURN(MATCH_NOMATCH);
5165 break;
5167 case OP_VSPACE:
5168 switch(c)
5170 default: RRETURN(MATCH_NOMATCH);
5171 case 0x0a: /* LF */
5172 case 0x0b: /* VT */
5173 case 0x0c: /* FF */
5174 case 0x0d: /* CR */
5175 case 0x85: /* NEL */
5176 case 0x2028: /* LINE SEPARATOR */
5177 case 0x2029: /* PARAGRAPH SEPARATOR */
5178 break;
5180 break;
5182 case OP_NOT_DIGIT:
5183 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5184 RRETURN(MATCH_NOMATCH);
5185 break;
5187 case OP_DIGIT:
5188 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5189 RRETURN(MATCH_NOMATCH);
5190 break;
5192 case OP_NOT_WHITESPACE:
5193 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5194 RRETURN(MATCH_NOMATCH);
5195 break;
5197 case OP_WHITESPACE:
5198 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5199 RRETURN(MATCH_NOMATCH);
5200 break;
5202 case OP_NOT_WORDCHAR:
5203 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5204 RRETURN(MATCH_NOMATCH);
5205 break;
5207 case OP_WORDCHAR:
5208 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5209 RRETURN(MATCH_NOMATCH);
5210 break;
5212 default:
5213 RRETURN(PCRE_ERROR_INTERNAL);
5217 else
5218 #endif
5219 /* Not UTF mode */
5221 for (fi = min;; fi++)
5223 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5224 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5225 if (fi >= max) RRETURN(MATCH_NOMATCH);
5226 if (eptr >= md->end_subject)
5228 SCHECK_PARTIAL();
5229 RRETURN(MATCH_NOMATCH);
5231 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5232 RRETURN(MATCH_NOMATCH);
5233 c = *eptr++;
5234 switch(ctype)
5236 case OP_ANY: /* This is the non-NL case */
5237 if (md->partial != 0 && /* Take care with CRLF partial */
5238 eptr >= md->end_subject &&
5239 NLBLOCK->nltype == NLTYPE_FIXED &&
5240 NLBLOCK->nllen == 2 &&
5241 c == NLBLOCK->nl[0])
5243 md->hitend = TRUE;
5244 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5246 break;
5248 case OP_ALLANY:
5249 case OP_ANYBYTE:
5250 break;
5252 case OP_ANYNL:
5253 switch(c)
5255 default: RRETURN(MATCH_NOMATCH);
5256 case 0x000d:
5257 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5258 break;
5260 case 0x000a:
5261 break;
5263 case 0x000b:
5264 case 0x000c:
5265 case 0x0085:
5266 #ifdef COMPILE_PCRE16
5267 case 0x2028:
5268 case 0x2029:
5269 #endif
5270 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5271 break;
5273 break;
5275 case OP_NOT_HSPACE:
5276 switch(c)
5278 default: break;
5279 case 0x09: /* HT */
5280 case 0x20: /* SPACE */
5281 case 0xa0: /* NBSP */
5282 #ifdef COMPILE_PCRE16
5283 case 0x1680: /* OGHAM SPACE MARK */
5284 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5285 case 0x2000: /* EN QUAD */
5286 case 0x2001: /* EM QUAD */
5287 case 0x2002: /* EN SPACE */
5288 case 0x2003: /* EM SPACE */
5289 case 0x2004: /* THREE-PER-EM SPACE */
5290 case 0x2005: /* FOUR-PER-EM SPACE */
5291 case 0x2006: /* SIX-PER-EM SPACE */
5292 case 0x2007: /* FIGURE SPACE */
5293 case 0x2008: /* PUNCTUATION SPACE */
5294 case 0x2009: /* THIN SPACE */
5295 case 0x200A: /* HAIR SPACE */
5296 case 0x202f: /* NARROW NO-BREAK SPACE */
5297 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5298 case 0x3000: /* IDEOGRAPHIC SPACE */
5299 #endif
5300 RRETURN(MATCH_NOMATCH);
5302 break;
5304 case OP_HSPACE:
5305 switch(c)
5307 default: RRETURN(MATCH_NOMATCH);
5308 case 0x09: /* HT */
5309 case 0x20: /* SPACE */
5310 case 0xa0: /* NBSP */
5311 #ifdef COMPILE_PCRE16
5312 case 0x1680: /* OGHAM SPACE MARK */
5313 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5314 case 0x2000: /* EN QUAD */
5315 case 0x2001: /* EM QUAD */
5316 case 0x2002: /* EN SPACE */
5317 case 0x2003: /* EM SPACE */
5318 case 0x2004: /* THREE-PER-EM SPACE */
5319 case 0x2005: /* FOUR-PER-EM SPACE */
5320 case 0x2006: /* SIX-PER-EM SPACE */
5321 case 0x2007: /* FIGURE SPACE */
5322 case 0x2008: /* PUNCTUATION SPACE */
5323 case 0x2009: /* THIN SPACE */
5324 case 0x200A: /* HAIR SPACE */
5325 case 0x202f: /* NARROW NO-BREAK SPACE */
5326 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5327 case 0x3000: /* IDEOGRAPHIC SPACE */
5328 #endif
5329 break;
5331 break;
5333 case OP_NOT_VSPACE:
5334 switch(c)
5336 default: break;
5337 case 0x0a: /* LF */
5338 case 0x0b: /* VT */
5339 case 0x0c: /* FF */
5340 case 0x0d: /* CR */
5341 case 0x85: /* NEL */
5342 #ifdef COMPILE_PCRE16
5343 case 0x2028: /* LINE SEPARATOR */
5344 case 0x2029: /* PARAGRAPH SEPARATOR */
5345 #endif
5346 RRETURN(MATCH_NOMATCH);
5348 break;
5350 case OP_VSPACE:
5351 switch(c)
5353 default: RRETURN(MATCH_NOMATCH);
5354 case 0x0a: /* LF */
5355 case 0x0b: /* VT */
5356 case 0x0c: /* FF */
5357 case 0x0d: /* CR */
5358 case 0x85: /* NEL */
5359 #ifdef COMPILE_PCRE16
5360 case 0x2028: /* LINE SEPARATOR */
5361 case 0x2029: /* PARAGRAPH SEPARATOR */
5362 #endif
5363 break;
5365 break;
5367 case OP_NOT_DIGIT:
5368 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5369 break;
5371 case OP_DIGIT:
5372 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5373 break;
5375 case OP_NOT_WHITESPACE:
5376 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5377 break;
5379 case OP_WHITESPACE:
5380 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5381 break;
5383 case OP_NOT_WORDCHAR:
5384 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5385 break;
5387 case OP_WORDCHAR:
5388 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5389 break;
5391 default:
5392 RRETURN(PCRE_ERROR_INTERNAL);
5396 /* Control never gets here */
5399 /* If maximizing, it is worth using inline code for speed, doing the type
5400 test once at the start (i.e. keep it out of the loop). Again, keep the
5401 UTF-8 and UCP stuff separate. */
5403 else
5405 pp = eptr; /* Remember where we started */
5407 #ifdef SUPPORT_UCP
5408 if (prop_type >= 0)
5410 switch(prop_type)
5412 case PT_ANY:
5413 for (i = min; i < max; i++)
5415 int len = 1;
5416 if (eptr >= md->end_subject)
5418 SCHECK_PARTIAL();
5419 break;
5421 GETCHARLENTEST(c, eptr, len);
5422 if (prop_fail_result) break;
5423 eptr+= len;
5425 break;
5427 case PT_LAMP:
5428 for (i = min; i < max; i++)
5430 int chartype;
5431 int len = 1;
5432 if (eptr >= md->end_subject)
5434 SCHECK_PARTIAL();
5435 break;
5437 GETCHARLENTEST(c, eptr, len);
5438 chartype = UCD_CHARTYPE(c);
5439 if ((chartype == ucp_Lu ||
5440 chartype == ucp_Ll ||
5441 chartype == ucp_Lt) == prop_fail_result)
5442 break;
5443 eptr+= len;
5445 break;
5447 case PT_GC:
5448 for (i = min; i < max; i++)
5450 int len = 1;
5451 if (eptr >= md->end_subject)
5453 SCHECK_PARTIAL();
5454 break;
5456 GETCHARLENTEST(c, eptr, len);
5457 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5458 eptr+= len;
5460 break;
5462 case PT_PC:
5463 for (i = min; i < max; i++)
5465 int len = 1;
5466 if (eptr >= md->end_subject)
5468 SCHECK_PARTIAL();
5469 break;
5471 GETCHARLENTEST(c, eptr, len);
5472 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5473 eptr+= len;
5475 break;
5477 case PT_SC:
5478 for (i = min; i < max; i++)
5480 int len = 1;
5481 if (eptr >= md->end_subject)
5483 SCHECK_PARTIAL();
5484 break;
5486 GETCHARLENTEST(c, eptr, len);
5487 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5488 eptr+= len;
5490 break;
5492 case PT_ALNUM:
5493 for (i = min; i < max; i++)
5495 int category;
5496 int len = 1;
5497 if (eptr >= md->end_subject)
5499 SCHECK_PARTIAL();
5500 break;
5502 GETCHARLENTEST(c, eptr, len);
5503 category = UCD_CATEGORY(c);
5504 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5505 break;
5506 eptr+= len;
5508 break;
5510 case PT_SPACE: /* Perl space */
5511 for (i = min; i < max; i++)
5513 int len = 1;
5514 if (eptr >= md->end_subject)
5516 SCHECK_PARTIAL();
5517 break;
5519 GETCHARLENTEST(c, eptr, len);
5520 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5521 c == CHAR_FF || c == CHAR_CR)
5522 == prop_fail_result)
5523 break;
5524 eptr+= len;
5526 break;
5528 case PT_PXSPACE: /* POSIX space */
5529 for (i = min; i < max; i++)
5531 int len = 1;
5532 if (eptr >= md->end_subject)
5534 SCHECK_PARTIAL();
5535 break;
5537 GETCHARLENTEST(c, eptr, len);
5538 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5539 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5540 == prop_fail_result)
5541 break;
5542 eptr+= len;
5544 break;
5546 case PT_WORD:
5547 for (i = min; i < max; i++)
5549 int category;
5550 int len = 1;
5551 if (eptr >= md->end_subject)
5553 SCHECK_PARTIAL();
5554 break;
5556 GETCHARLENTEST(c, eptr, len);
5557 category = UCD_CATEGORY(c);
5558 if ((category == ucp_L || category == ucp_N ||
5559 c == CHAR_UNDERSCORE) == prop_fail_result)
5560 break;
5561 eptr+= len;
5563 break;
5565 default:
5566 RRETURN(PCRE_ERROR_INTERNAL);
5569 /* eptr is now past the end of the maximum run */
5571 if (possessive) continue;
5572 for(;;)
5574 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5575 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5576 if (eptr-- == pp) break; /* Stop if tried at original pos */
5577 if (utf) BACKCHAR(eptr);
5581 /* Match extended Unicode sequences. We will get here only if the
5582 support is in the binary; otherwise a compile-time error occurs. */
5584 else if (ctype == OP_EXTUNI)
5586 for (i = min; i < max; i++)
5588 int len = 1;
5589 if (eptr >= md->end_subject)
5591 SCHECK_PARTIAL();
5592 break;
5594 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5595 if (UCD_CATEGORY(c) == ucp_M) break;
5596 eptr += len;
5597 while (eptr < md->end_subject)
5599 len = 1;
5600 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5601 if (UCD_CATEGORY(c) != ucp_M) break;
5602 eptr += len;
5604 CHECK_PARTIAL();
5607 /* eptr is now past the end of the maximum run */
5609 if (possessive) continue;
5611 for(;;)
5613 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5614 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5615 if (eptr-- == pp) break; /* Stop if tried at original pos */
5616 for (;;) /* Move back over one extended */
5618 if (!utf) c = *eptr; else
5620 BACKCHAR(eptr);
5621 GETCHAR(c, eptr);
5623 if (UCD_CATEGORY(c) != ucp_M) break;
5624 eptr--;
5629 else
5630 #endif /* SUPPORT_UCP */
5632 #ifdef SUPPORT_UTF
5633 if (utf)
5635 switch(ctype)
5637 case OP_ANY:
5638 if (max < INT_MAX)
5640 for (i = min; i < max; i++)
5642 if (eptr >= md->end_subject)
5644 SCHECK_PARTIAL();
5645 break;
5647 if (IS_NEWLINE(eptr)) break;
5648 if (md->partial != 0 && /* Take care with CRLF partial */
5649 eptr + 1 >= md->end_subject &&
5650 NLBLOCK->nltype == NLTYPE_FIXED &&
5651 NLBLOCK->nllen == 2 &&
5652 *eptr == NLBLOCK->nl[0])
5654 md->hitend = TRUE;
5655 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5657 eptr++;
5658 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5662 /* Handle unlimited UTF-8 repeat */
5664 else
5666 for (i = min; i < max; i++)
5668 if (eptr >= md->end_subject)
5670 SCHECK_PARTIAL();
5671 break;
5673 if (IS_NEWLINE(eptr)) break;
5674 if (md->partial != 0 && /* Take care with CRLF partial */
5675 eptr + 1 >= md->end_subject &&
5676 NLBLOCK->nltype == NLTYPE_FIXED &&
5677 NLBLOCK->nllen == 2 &&
5678 *eptr == NLBLOCK->nl[0])
5680 md->hitend = TRUE;
5681 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5683 eptr++;
5684 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5687 break;
5689 case OP_ALLANY:
5690 if (max < INT_MAX)
5692 for (i = min; i < max; i++)
5694 if (eptr >= md->end_subject)
5696 SCHECK_PARTIAL();
5697 break;
5699 eptr++;
5700 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5703 else
5705 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5706 SCHECK_PARTIAL();
5708 break;
5710 /* The byte case is the same as non-UTF8 */
5712 case OP_ANYBYTE:
5713 c = max - min;
5714 if (c > (unsigned int)(md->end_subject - eptr))
5716 eptr = md->end_subject;
5717 SCHECK_PARTIAL();
5719 else eptr += c;
5720 break;
5722 case OP_ANYNL:
5723 for (i = min; i < max; i++)
5725 int len = 1;
5726 if (eptr >= md->end_subject)
5728 SCHECK_PARTIAL();
5729 break;
5731 GETCHARLEN(c, eptr, len);
5732 if (c == 0x000d)
5734 if (++eptr >= md->end_subject) break;
5735 if (*eptr == 0x000a) eptr++;
5737 else
5739 if (c != 0x000a &&
5740 (md->bsr_anycrlf ||
5741 (c != 0x000b && c != 0x000c &&
5742 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5743 break;
5744 eptr += len;
5747 break;
5749 case OP_NOT_HSPACE:
5750 case OP_HSPACE:
5751 for (i = min; i < max; i++)
5753 BOOL gotspace;
5754 int len = 1;
5755 if (eptr >= md->end_subject)
5757 SCHECK_PARTIAL();
5758 break;
5760 GETCHARLEN(c, eptr, len);
5761 switch(c)
5763 default: gotspace = FALSE; break;
5764 case 0x09: /* HT */
5765 case 0x20: /* SPACE */
5766 case 0xa0: /* NBSP */
5767 case 0x1680: /* OGHAM SPACE MARK */
5768 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5769 case 0x2000: /* EN QUAD */
5770 case 0x2001: /* EM QUAD */
5771 case 0x2002: /* EN SPACE */
5772 case 0x2003: /* EM SPACE */
5773 case 0x2004: /* THREE-PER-EM SPACE */
5774 case 0x2005: /* FOUR-PER-EM SPACE */
5775 case 0x2006: /* SIX-PER-EM SPACE */
5776 case 0x2007: /* FIGURE SPACE */
5777 case 0x2008: /* PUNCTUATION SPACE */
5778 case 0x2009: /* THIN SPACE */
5779 case 0x200A: /* HAIR SPACE */
5780 case 0x202f: /* NARROW NO-BREAK SPACE */
5781 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5782 case 0x3000: /* IDEOGRAPHIC SPACE */
5783 gotspace = TRUE;
5784 break;
5786 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5787 eptr += len;
5789 break;
5791 case OP_NOT_VSPACE:
5792 case OP_VSPACE:
5793 for (i = min; i < max; i++)
5795 BOOL gotspace;
5796 int len = 1;
5797 if (eptr >= md->end_subject)
5799 SCHECK_PARTIAL();
5800 break;
5802 GETCHARLEN(c, eptr, len);
5803 switch(c)
5805 default: gotspace = FALSE; break;
5806 case 0x0a: /* LF */
5807 case 0x0b: /* VT */
5808 case 0x0c: /* FF */
5809 case 0x0d: /* CR */
5810 case 0x85: /* NEL */
5811 case 0x2028: /* LINE SEPARATOR */
5812 case 0x2029: /* PARAGRAPH SEPARATOR */
5813 gotspace = TRUE;
5814 break;
5816 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5817 eptr += len;
5819 break;
5821 case OP_NOT_DIGIT:
5822 for (i = min; i < max; i++)
5824 int len = 1;
5825 if (eptr >= md->end_subject)
5827 SCHECK_PARTIAL();
5828 break;
5830 GETCHARLEN(c, eptr, len);
5831 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5832 eptr+= len;
5834 break;
5836 case OP_DIGIT:
5837 for (i = min; i < max; i++)
5839 int len = 1;
5840 if (eptr >= md->end_subject)
5842 SCHECK_PARTIAL();
5843 break;
5845 GETCHARLEN(c, eptr, len);
5846 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5847 eptr+= len;
5849 break;
5851 case OP_NOT_WHITESPACE:
5852 for (i = min; i < max; i++)
5854 int len = 1;
5855 if (eptr >= md->end_subject)
5857 SCHECK_PARTIAL();
5858 break;
5860 GETCHARLEN(c, eptr, len);
5861 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5862 eptr+= len;
5864 break;
5866 case OP_WHITESPACE:
5867 for (i = min; i < max; i++)
5869 int len = 1;
5870 if (eptr >= md->end_subject)
5872 SCHECK_PARTIAL();
5873 break;
5875 GETCHARLEN(c, eptr, len);
5876 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5877 eptr+= len;
5879 break;
5881 case OP_NOT_WORDCHAR:
5882 for (i = min; i < max; i++)
5884 int len = 1;
5885 if (eptr >= md->end_subject)
5887 SCHECK_PARTIAL();
5888 break;
5890 GETCHARLEN(c, eptr, len);
5891 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5892 eptr+= len;
5894 break;
5896 case OP_WORDCHAR:
5897 for (i = min; i < max; i++)
5899 int len = 1;
5900 if (eptr >= md->end_subject)
5902 SCHECK_PARTIAL();
5903 break;
5905 GETCHARLEN(c, eptr, len);
5906 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5907 eptr+= len;
5909 break;
5911 default:
5912 RRETURN(PCRE_ERROR_INTERNAL);
5915 /* eptr is now past the end of the maximum run. If possessive, we are
5916 done (no backing up). Otherwise, match at this position; anything other
5917 than no match is immediately returned. For nomatch, back up one
5918 character, unless we are matching \R and the last thing matched was
5919 \r\n, in which case, back up two bytes. */
5921 if (possessive) continue;
5922 for(;;)
5924 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5925 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5926 if (eptr-- == pp) break; /* Stop if tried at original pos */
5927 BACKCHAR(eptr);
5928 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5929 eptr[-1] == '\r') eptr--;
5932 else
5933 #endif /* SUPPORT_UTF */
5934 /* Not UTF mode */
5936 switch(ctype)
5938 case OP_ANY:
5939 for (i = min; i < max; i++)
5941 if (eptr >= md->end_subject)
5943 SCHECK_PARTIAL();
5944 break;
5946 if (IS_NEWLINE(eptr)) break;
5947 if (md->partial != 0 && /* Take care with CRLF partial */
5948 eptr + 1 >= md->end_subject &&
5949 NLBLOCK->nltype == NLTYPE_FIXED &&
5950 NLBLOCK->nllen == 2 &&
5951 *eptr == NLBLOCK->nl[0])
5953 md->hitend = TRUE;
5954 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5956 eptr++;
5958 break;
5960 case OP_ALLANY:
5961 case OP_ANYBYTE:
5962 c = max - min;
5963 if (c > (unsigned int)(md->end_subject - eptr))
5965 eptr = md->end_subject;
5966 SCHECK_PARTIAL();
5968 else eptr += c;
5969 break;
5971 case OP_ANYNL:
5972 for (i = min; i < max; i++)
5974 if (eptr >= md->end_subject)
5976 SCHECK_PARTIAL();
5977 break;
5979 c = *eptr;
5980 if (c == 0x000d)
5982 if (++eptr >= md->end_subject) break;
5983 if (*eptr == 0x000a) eptr++;
5985 else
5987 if (c != 0x000a && (md->bsr_anycrlf ||
5988 (c != 0x000b && c != 0x000c && c != 0x0085
5989 #ifdef COMPILE_PCRE16
5990 && c != 0x2028 && c != 0x2029
5991 #endif
5992 ))) break;
5993 eptr++;
5996 break;
5998 case OP_NOT_HSPACE:
5999 for (i = min; i < max; i++)
6001 if (eptr >= md->end_subject)
6003 SCHECK_PARTIAL();
6004 break;
6006 c = *eptr;
6007 if (c == 0x09 || c == 0x20 || c == 0xa0
6008 #ifdef COMPILE_PCRE16
6009 || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A)
6010 || c == 0x202f || c == 0x205f || c == 0x3000
6011 #endif
6012 ) break;
6013 eptr++;
6015 break;
6017 case OP_HSPACE:
6018 for (i = min; i < max; i++)
6020 if (eptr >= md->end_subject)
6022 SCHECK_PARTIAL();
6023 break;
6025 c = *eptr;
6026 if (c != 0x09 && c != 0x20 && c != 0xa0
6027 #ifdef COMPILE_PCRE16
6028 && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A)
6029 && c != 0x202f && c != 0x205f && c != 0x3000
6030 #endif
6031 ) break;
6032 eptr++;
6034 break;
6036 case OP_NOT_VSPACE:
6037 for (i = min; i < max; i++)
6039 if (eptr >= md->end_subject)
6041 SCHECK_PARTIAL();
6042 break;
6044 c = *eptr;
6045 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85
6046 #ifdef COMPILE_PCRE16
6047 || c == 0x2028 || c == 0x2029
6048 #endif
6049 ) break;
6050 eptr++;
6052 break;
6054 case OP_VSPACE:
6055 for (i = min; i < max; i++)
6057 if (eptr >= md->end_subject)
6059 SCHECK_PARTIAL();
6060 break;
6062 c = *eptr;
6063 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85
6064 #ifdef COMPILE_PCRE16
6065 && c != 0x2028 && c != 0x2029
6066 #endif
6067 ) break;
6068 eptr++;
6070 break;
6072 case OP_NOT_DIGIT:
6073 for (i = min; i < max; i++)
6075 if (eptr >= md->end_subject)
6077 SCHECK_PARTIAL();
6078 break;
6080 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6081 eptr++;
6083 break;
6085 case OP_DIGIT:
6086 for (i = min; i < max; i++)
6088 if (eptr >= md->end_subject)
6090 SCHECK_PARTIAL();
6091 break;
6093 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6094 eptr++;
6096 break;
6098 case OP_NOT_WHITESPACE:
6099 for (i = min; i < max; i++)
6101 if (eptr >= md->end_subject)
6103 SCHECK_PARTIAL();
6104 break;
6106 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6107 eptr++;
6109 break;
6111 case OP_WHITESPACE:
6112 for (i = min; i < max; i++)
6114 if (eptr >= md->end_subject)
6116 SCHECK_PARTIAL();
6117 break;
6119 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6120 eptr++;
6122 break;
6124 case OP_NOT_WORDCHAR:
6125 for (i = min; i < max; i++)
6127 if (eptr >= md->end_subject)
6129 SCHECK_PARTIAL();
6130 break;
6132 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6133 eptr++;
6135 break;
6137 case OP_WORDCHAR:
6138 for (i = min; i < max; i++)
6140 if (eptr >= md->end_subject)
6142 SCHECK_PARTIAL();
6143 break;
6145 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6146 eptr++;
6148 break;
6150 default:
6151 RRETURN(PCRE_ERROR_INTERNAL);
6154 /* eptr is now past the end of the maximum run. If possessive, we are
6155 done (no backing up). Otherwise, match at this position; anything other
6156 than no match is immediately returned. For nomatch, back up one
6157 character (byte), unless we are matching \R and the last thing matched
6158 was \r\n, in which case, back up two bytes. */
6160 if (possessive) continue;
6161 while (eptr >= pp)
6163 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6164 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6165 eptr--;
6166 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
6167 eptr[-1] == '\r') eptr--;
6171 /* Get here if we can't make it match with any permitted repetitions */
6173 RRETURN(MATCH_NOMATCH);
6175 /* Control never gets here */
6177 /* There's been some horrible disaster. Arrival here can only mean there is
6178 something seriously wrong in the code above or the OP_xxx definitions. */
6180 default:
6181 DPRINTF(("Unknown opcode %d\n", *ecode));
6182 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6185 /* Do not stick any code in here without much thought; it is assumed
6186 that "continue" in the code above comes out to here to repeat the main
6187 loop. */
6189 } /* End of main loop */
6190 /* Control never reaches here */
6193 /* When compiling to use the heap rather than the stack for recursive calls to
6194 match(), the RRETURN() macro jumps here. The number that is saved in
6195 frame->Xwhere indicates which label we actually want to return to. */
6197 #ifdef NO_RECURSE
6198 #define LBL(val) case val: goto L_RM##val;
6199 HEAP_RETURN:
6200 switch (frame->Xwhere)
6202 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6203 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6204 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6205 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6206 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6207 LBL(65) LBL(66)
6208 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6209 LBL(21)
6210 #endif
6211 #ifdef SUPPORT_UTF
6212 LBL(16) LBL(18) LBL(20)
6213 LBL(22) LBL(23) LBL(28) LBL(30)
6214 LBL(32) LBL(34) LBL(42) LBL(46)
6215 #ifdef SUPPORT_UCP
6216 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6217 LBL(59) LBL(60) LBL(61) LBL(62)
6218 #endif /* SUPPORT_UCP */
6219 #endif /* SUPPORT_UTF */
6220 default:
6221 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6223 printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere);
6225 return PCRE_ERROR_INTERNAL;
6227 #undef LBL
6228 #endif /* NO_RECURSE */
6232 /***************************************************************************
6233 ****************************************************************************
6234 RECURSION IN THE match() FUNCTION
6236 Undefine all the macros that were defined above to handle this. */
/* Names that exist as macros only in the NO_RECURSE build. The directives are
independent of one another, so they are simply listed in alphabetical order.
Undefining a name that is not currently a macro is harmless. */

#ifdef NO_RECURSE
#undef callpat
#undef charptr
#undef condition
#undef ctype
#undef cur_is_word
#undef data
#undef ecode
#undef eptr
#undef eptrb
#undef flags
#undef length
#undef max
#undef min
#undef mstart
#undef new_recursive
#undef newptrb
#undef next
#undef number
#undef offset
#undef offset_top
#undef op
#undef pp
#undef prev
#undef prev_is_word
#undef save_capture_last
#undef save_offset1
#undef save_offset2
#undef save_offset3
#undef saved_eptr
#undef stacksave
#endif  /* NO_RECURSE */

/* The remaining two names are macros whether or not NO_RECURSE is set, so
they are removed unconditionally. */

#undef fi
#undef fc
6282 /***************************************************************************
6283 ***************************************************************************/
#ifdef NO_RECURSE
/*************************************************
*          Release allocated heap frames         *
*************************************************/

/* Walk the chain of frames that hangs off the base frame and hand each one
back through the stack_free hook. The base frame itself lives on the machine
stack, so it is never passed to the hook; only its successors are.

Argument: the address of the base frame
Returns:  nothing
*/

static void
release_match_heapframes (heapframe *frame_base)
{
heapframe *cursor;
heapframe *following;

for (cursor = frame_base->Xnextframe; cursor != NULL; cursor = following)
  {
  following = cursor->Xnextframe;   /* Fetch the link before freeing */
  (PUBL(stack_free))(cursor);
  }
}
#endif
6312 /*************************************************
6313 * Execute a Regular Expression *
6314 *************************************************/
6316 /* This function applies a compiled re to a subject string and picks out
6317 portions of the string if it matches. Two elements in the vector are set for
6318 each substring: the offsets to the start and end of the substring.
6320 Arguments:
6321 argument_re points to the compiled expression
6322 extra_data points to extra data or is NULL
6323 subject points to the subject string
6324 length length of subject string (may contain binary zeros)
6325 start_offset where to start in the subject string
6326 options option bits
6327 offsets points to a vector of ints to be filled in with offsets
6328 offsetcount the number of elements in the vector
6330 Returns: > 0 => success; value is the number of elements filled in
6331 = 0 => success, but offsets is not big enough
6332 -1 => failed to match
6333 < -1 => some kind of unexpected problem
6336 #ifdef COMPILE_PCRE8
6337 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6338 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6339 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6340 int offsetcount)
6341 #else
6342 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6343 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6344 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6345 int offsetcount)
6346 #endif
6348 int rc, ocount, arg_offset_max;
6349 int newline;
6350 BOOL using_temporary_offsets = FALSE;
6351 BOOL anchored;
6352 BOOL startline;
6353 BOOL firstline;
6354 BOOL utf;
6355 BOOL has_first_char = FALSE;
6356 BOOL has_req_char = FALSE;
6357 pcre_uchar first_char = 0;
6358 pcre_uchar first_char2 = 0;
6359 pcre_uchar req_char = 0;
6360 pcre_uchar req_char2 = 0;
6361 match_data match_block;
6362 match_data *md = &match_block;
6363 const pcre_uint8 *tables;
6364 const pcre_uint8 *start_bits = NULL;
6365 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6366 PCRE_PUCHAR end_subject;
6367 PCRE_PUCHAR start_partial = NULL;
6368 PCRE_PUCHAR req_char_ptr = start_match - 1;
6370 const pcre_study_data *study;
6371 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6373 #ifdef NO_RECURSE
6374 heapframe frame_zero;
6375 frame_zero.Xprevframe = NULL; /* Marks the top level */
6376 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6377 md->match_frames_base = &frame_zero;
6378 #endif
6380 /* Check for the special magic call that measures the size of the stack used
6381 per recursive call of match(). Without the funny casting for sizeof, a Windows
6382 compiler gave this error: "unary minus operator applied to unsigned type,
6383 result still unsigned". Hopefully the cast fixes that. */
6385 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6386 start_offset == -999)
6387 #ifdef NO_RECURSE
6388 return -((int)sizeof(heapframe));
6389 #else
6390 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6391 #endif
6393 /* Plausibility checks */
6395 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6396 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6397 return PCRE_ERROR_NULL;
6398 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6399 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6401 /* Check that the first field in the block is the magic number. If it is not,
6402 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6403 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6404 means that the pattern is likely compiled with different endianness. */
6406 if (re->magic_number != MAGIC_NUMBER)
6407 return re->magic_number == REVERSED_MAGIC_NUMBER?
6408 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6409 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6411 /* These two settings are used in the code for checking a UTF-8 string that
6412 follows immediately afterwards. Other values in the md block are used only
6413 during "normal" pcre_exec() processing, not when the JIT support is in use,
6414 so they are set up later. */
6416 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6417 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6418 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6419 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6421 /* Check a UTF-8 string if required. Pass back the character offset and error
6422 code for an invalid string if a results vector is available. */
6424 #ifdef SUPPORT_UTF
6425 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6427 int erroroffset;
6428 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6429 if (errorcode != 0)
6431 if (offsetcount >= 2)
6433 offsets[0] = erroroffset;
6434 offsets[1] = errorcode;
6436 #ifdef COMPILE_PCRE16
6437 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6438 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6439 #else
6440 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6441 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6442 #endif
6445 /* Check that a start_offset points to the start of a UTF character. */
6446 if (start_offset > 0 && start_offset < length &&
6447 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6448 return PCRE_ERROR_BADUTF8_OFFSET;
6450 #endif
6452 /* If the pattern was successfully studied with JIT support, run the JIT
6453 executable instead of the rest of this function. Most options must be set at
6454 compile time for the JIT code to be usable. Fallback to the normal code path if
6455 an unsupported flag is set. */
6457 #ifdef SUPPORT_JIT
6458 if (extra_data != NULL
6459 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6460 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6461 && extra_data->executable_jit != NULL
6462 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6463 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART |
6464 PCRE_PARTIAL_SOFT | PCRE_PARTIAL_HARD)) == 0)
6466 rc = PRIV(jit_exec)(re, extra_data, (const pcre_uchar *)subject, length,
6467 start_offset, options, offsets, offsetcount);
6469 /* PCRE_ERROR_NULL means that the selected normal or partial matching
6470 mode is not compiled. In this case we simply fallback to interpreter. */
6472 if (rc != PCRE_ERROR_NULL) return rc;
6474 #endif
6476 /* Carry on with non-JIT matching. This information is for finding all the
6477 numbers associated with a given name, for condition testing. */
6479 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6480 md->name_count = re->name_count;
6481 md->name_entry_size = re->name_entry_size;
6483 /* Fish out the optional data from the extra_data structure, first setting
6484 the default values. */
6486 study = NULL;
6487 md->match_limit = MATCH_LIMIT;
6488 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6489 md->callout_data = NULL;
6491 /* The table pointer is always in native byte order. */
6493 tables = re->tables;
6495 if (extra_data != NULL)
6497 unsigned int flags = extra_data->flags;
6498 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6499 study = (const pcre_study_data *)extra_data->study_data;
6500 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6501 md->match_limit = extra_data->match_limit;
6502 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6503 md->match_limit_recursion = extra_data->match_limit_recursion;
6504 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6505 md->callout_data = extra_data->callout_data;
6506 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6509 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6510 is a feature that makes it possible to save compiled regex and re-use them
6511 in other programs later. */
6513 if (tables == NULL) tables = PRIV(default_tables);
6515 /* Set up other data */
6517 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6518 startline = (re->flags & PCRE_STARTLINE) != 0;
6519 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6521 /* The code starts after the real_pcre block and the capture name table. */
6523 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6524 re->name_count * re->name_entry_size;
6526 md->start_subject = (PCRE_PUCHAR)subject;
6527 md->start_offset = start_offset;
6528 md->end_subject = md->start_subject + length;
6529 end_subject = md->end_subject;
6531 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6532 md->use_ucp = (re->options & PCRE_UCP) != 0;
6533 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6534 md->ignore_skip_arg = FALSE;
6536 /* Some options are unpacked into BOOL variables in the hope that testing
6537 them will be faster than individual option bits. */
6539 md->notbol = (options & PCRE_NOTBOL) != 0;
6540 md->noteol = (options & PCRE_NOTEOL) != 0;
6541 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6542 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6544 md->hitend = FALSE;
6545 md->mark = md->nomatch_mark = NULL; /* In case never set */
6547 md->recursive = NULL; /* No recursion at top level */
6548 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6550 md->lcc = tables + lcc_offset;
6551 md->fcc = tables + fcc_offset;
6552 md->ctypes = tables + ctypes_offset;
6554 /* Handle different \R options. */
6556 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6558 case 0:
6559 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6560 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6561 else
6562 #ifdef BSR_ANYCRLF
6563 md->bsr_anycrlf = TRUE;
6564 #else
6565 md->bsr_anycrlf = FALSE;
6566 #endif
6567 break;
6569 case PCRE_BSR_ANYCRLF:
6570 md->bsr_anycrlf = TRUE;
6571 break;
6573 case PCRE_BSR_UNICODE:
6574 md->bsr_anycrlf = FALSE;
6575 break;
6577 default: return PCRE_ERROR_BADNEWLINE;
6580 /* Handle different types of newline. The three bits give eight cases. If
6581 nothing is set at run time, whatever was used at compile time applies. */
6583 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6584 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6586 case 0: newline = NEWLINE; break; /* Compile-time default */
6587 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6588 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6589 case PCRE_NEWLINE_CR+
6590 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6591 case PCRE_NEWLINE_ANY: newline = -1; break;
6592 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6593 default: return PCRE_ERROR_BADNEWLINE;
6596 if (newline == -2)
6598 md->nltype = NLTYPE_ANYCRLF;
6600 else if (newline < 0)
6602 md->nltype = NLTYPE_ANY;
6604 else
6606 md->nltype = NLTYPE_FIXED;
6607 if (newline > 255)
6609 md->nllen = 2;
6610 md->nl[0] = (newline >> 8) & 255;
6611 md->nl[1] = newline & 255;
6613 else
6615 md->nllen = 1;
6616 md->nl[0] = newline;
6620 /* Partial matching was originally supported only for a restricted set of
6621 regexes; from release 8.00 there are no restrictions, but the bits are still
6622 defined (though never set). So there's no harm in leaving this code. */
6624 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6625 return PCRE_ERROR_BADPARTIAL;
6627 /* If the expression has got more back references than the offsets supplied can
6628 hold, we get a temporary chunk of working store to use during the matching.
6629 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6630 of 3. */
6632 ocount = offsetcount - (offsetcount % 3);
6633 arg_offset_max = (2*ocount)/3;
6635 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6637 ocount = re->top_backref * 3 + 3;
6638 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6639 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6640 using_temporary_offsets = TRUE;
6641 DPRINTF(("Got memory to hold back references\n"));
6643 else md->offset_vector = offsets;
6645 md->offset_end = ocount;
6646 md->offset_max = (2*ocount)/3;
6647 md->offset_overflow = FALSE;
6648 md->capture_last = -1;
6650 /* Reset the working variable associated with each extraction. These should
6651 never be used unless previously set, but they get saved and restored, and so we
6652 initialize them to avoid reading uninitialized locations. Also, unset the
6653 offsets for the matched string. This is really just for tidiness with callouts,
6654 in case they inspect these fields. */
6656 if (md->offset_vector != NULL)
6658 int *iptr = md->offset_vector + ocount;
6659 int *iend = iptr - re->top_bracket;
6660 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6661 while (--iptr >= iend) *iptr = -1;
6662 md->offset_vector[0] = md->offset_vector[1] = -1;
6665 /* Set up the first character to match, if available. The first_char value is
6666 never set for an anchored regular expression, but the anchoring may be forced
6667 at run time, so we have to test for anchoring. The first char may be unset for
6668 an unanchored pattern, of course. If there's no first char and the pattern was
6669 studied, there may be a bitmap of possible first characters. */
6671 if (!anchored)
6673 if ((re->flags & PCRE_FIRSTSET) != 0)
6675 has_first_char = TRUE;
6676 first_char = first_char2 = (pcre_uchar)(re->first_char);
6677 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6679 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6680 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6681 if (utf && first_char > 127)
6682 first_char2 = UCD_OTHERCASE(first_char);
6683 #endif
6686 else
6687 if (!startline && study != NULL &&
6688 (study->flags & PCRE_STUDY_MAPPED) != 0)
6689 start_bits = study->start_bits;
6692 /* For anchored or unanchored matches, there may be a "last known required
6693 character" set. */
6695 if ((re->flags & PCRE_REQCHSET) != 0)
6697 has_req_char = TRUE;
6698 req_char = req_char2 = (pcre_uchar)(re->req_char);
6699 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6701 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6702 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6703 if (utf && req_char > 127)
6704 req_char2 = UCD_OTHERCASE(req_char);
6705 #endif
6710 /* ==========================================================================*/
6712 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6713 the loop runs just once. */
6715 for(;;)
6717 PCRE_PUCHAR save_end_subject = end_subject;
6718 PCRE_PUCHAR new_start_match;
6720 /* If firstline is TRUE, the start of the match is constrained to the first
6721 line of a multiline string. That is, the match must be before or at the first
6722 newline. Implement this by temporarily adjusting end_subject so that we stop
6723 scanning at a newline. If the match fails at the newline, later code breaks
6724 this loop. */
6726 if (firstline)
6728 PCRE_PUCHAR t = start_match;
6729 #ifdef SUPPORT_UTF
6730 if (utf)
6732 while (t < md->end_subject && !IS_NEWLINE(t))
6734 t++;
6735 ACROSSCHAR(t < end_subject, *t, t++);
6738 else
6739 #endif
6740 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6741 end_subject = t;
6744 /* There are some optimizations that avoid running the match if a known
6745 starting point is not found, or if a known later character is not present.
6746 However, there is an option that disables these, for testing and for ensuring
6747 that all callouts do actually occur. The option can be set in the regex by
6748 (*NO_START_OPT) or passed in match-time options. */
6750 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6752 /* Advance to a unique first char if there is one. */
6754 if (has_first_char)
6756 if (first_char != first_char2)
6757 while (start_match < end_subject &&
6758 *start_match != first_char && *start_match != first_char2)
6759 start_match++;
6760 else
6761 while (start_match < end_subject && *start_match != first_char)
6762 start_match++;
6765 /* Or to just after a linebreak for a multiline match */
6767 else if (startline)
6769 if (start_match > md->start_subject + start_offset)
6771 #ifdef SUPPORT_UTF
6772 if (utf)
6774 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6776 start_match++;
6777 ACROSSCHAR(start_match < end_subject, *start_match,
6778 start_match++);
6781 else
6782 #endif
6783 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6784 start_match++;
6786 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6787 and we are now at a LF, advance the match position by one more character.
6790 if (start_match[-1] == CHAR_CR &&
6791 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6792 start_match < end_subject &&
6793 *start_match == CHAR_NL)
6794 start_match++;
6798 /* Or to a non-unique first byte after study */
6800 else if (start_bits != NULL)
6802 while (start_match < end_subject)
6804 unsigned int c = *start_match;
6805 #ifndef COMPILE_PCRE8
6806 if (c > 255) c = 255;
6807 #endif
6808 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6810 start_match++;
6811 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6812 /* In non 8-bit mode, the iteration will stop for
6813 characters > 255 at the beginning or not stop at all. */
6814 if (utf)
6815 ACROSSCHAR(start_match < end_subject, *start_match,
6816 start_match++);
6817 #endif
6819 else break;
6822 } /* Starting optimizations */
6824 /* Restore fudged end_subject */
6826 end_subject = save_end_subject;
6828 /* The following two optimizations are disabled for partial matching or if
6829 disabling is explicitly requested. */
6831 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6833 /* If the pattern was studied, a minimum subject length may be set. This is
6834 a lower bound; no actual string of that length may actually match the
6835 pattern. Although the value is, strictly, in characters, we treat it as
6836 bytes to avoid spending too much time in this optimization. */
6838 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6839 (pcre_uint32)(end_subject - start_match) < study->minlength)
6841 rc = MATCH_NOMATCH;
6842 break;
6845 /* If req_char is set, we know that that character must appear in the
6846 subject for the match to succeed. If the first character is set, req_char
6847 must be later in the subject; otherwise the test starts at the match point.
6848 This optimization can save a huge amount of backtracking in patterns with
6849 nested unlimited repeats that aren't going to match. Writing separate code
6850 for cased/caseless versions makes it go faster, as does using an
6851 autoincrement and backing off on a match.
6853 HOWEVER: when the subject string is very, very long, searching to its end
6854 can take a long time, and give bad performance on quite ordinary patterns.
6855 This showed up when somebody was matching something like /^\d+C/ on a
6856 32-megabyte string... so we don't do this when the string is sufficiently
6857 long. */
6859 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6861 PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6863 /* We don't need to repeat the search if we haven't yet reached the
6864 place we found it at last time. */
6866 if (p > req_char_ptr)
6868 if (req_char != req_char2)
6870 while (p < end_subject)
6872 int pp = *p++;
6873 if (pp == req_char || pp == req_char2) { p--; break; }
6876 else
6878 while (p < end_subject)
6880 if (*p++ == req_char) { p--; break; }
6884 /* If we can't find the required character, break the matching loop,
6885 forcing a match failure. */
6887 if (p >= end_subject)
6889 rc = MATCH_NOMATCH;
6890 break;
6893 /* If we have found the required character, save the point where we
6894 found it, so that we don't search again next time round the loop if
6895 the start hasn't passed this character yet. */
6897 req_char_ptr = p;
6902 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6903 printf(">>>> Match against: ");
6904 pchars(start_match, end_subject - start_match, TRUE, md);
6905 printf("\n");
6906 #endif
6908 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6909 first starting point for which a partial match was found. */
6911 md->start_match_ptr = start_match;
6912 md->start_used_ptr = start_match;
6913 md->match_call_count = 0;
6914 md->match_function_type = 0;
6915 md->end_offset_top = 0;
6916 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6917 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6919 switch(rc)
6921 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6922 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6923 entirely. The only way we can do that is to re-do the match at the same
6924 point, with a flag to force SKIP with an argument to be ignored. Just
6925 treating this case as NOMATCH does not work because it does not check other
6926 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6928 case MATCH_SKIP_ARG:
6929 new_start_match = start_match;
6930 md->ignore_skip_arg = TRUE;
6931 break;
6933 /* SKIP passes back the next starting point explicitly, but if it is the
6934 same as the match we have just done, treat it as NOMATCH. */
6936 case MATCH_SKIP:
6937 if (md->start_match_ptr != start_match)
6939 new_start_match = md->start_match_ptr;
6940 break;
6942 /* Fall through */
6944 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6945 exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */
6947 case MATCH_NOMATCH:
6948 case MATCH_PRUNE:
6949 case MATCH_THEN:
6950 md->ignore_skip_arg = FALSE;
6951 new_start_match = start_match + 1;
6952 #ifdef SUPPORT_UTF
6953 if (utf)
6954 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6955 new_start_match++);
6956 #endif
6957 break;
6959 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6961 case MATCH_COMMIT:
6962 rc = MATCH_NOMATCH;
6963 goto ENDLOOP;
6965 /* Any other return is either a match, or some kind of error. */
6967 default:
6968 goto ENDLOOP;
6971 /* Control reaches here for the various types of "no match at this point"
6972 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6974 rc = MATCH_NOMATCH;
6976 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6977 newline in the subject (though it may continue over the newline). Therefore,
6978 if we have just failed to match, starting at a newline, do not continue. */
6980 if (firstline && IS_NEWLINE(start_match)) break;
6982 /* Advance to new matching position */
6984 start_match = new_start_match;
6986 /* Break the loop if the pattern is anchored or if we have passed the end of
6987 the subject. */
6989 if (anchored || start_match > end_subject) break;
6991 /* If we have just passed a CR and we are now at a LF, and the pattern does
6992 not contain any explicit matches for \r or \n, and the newline option is CRLF
6993 or ANY or ANYCRLF, advance the match position by one more character. In
6994 normal matching start_match will aways be greater than the first position at
6995 this stage, but a failed *SKIP can cause a return at the same point, which is
6996 why the first test exists. */
6998 if (start_match > (PCRE_PUCHAR)subject + start_offset &&
6999 start_match[-1] == CHAR_CR &&
7000 start_match < end_subject &&
7001 *start_match == CHAR_NL &&
7002 (re->flags & PCRE_HASCRORLF) == 0 &&
7003 (md->nltype == NLTYPE_ANY ||
7004 md->nltype == NLTYPE_ANYCRLF ||
7005 md->nllen == 2))
7006 start_match++;
7008 md->mark = NULL; /* Reset for start of next match attempt */
7009 } /* End of for(;;) "bumpalong" loop */
7011 /* ==========================================================================*/
7013 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
7014 conditions is true:
7016 (1) The pattern is anchored or the match was failed by (*COMMIT);
7018 (2) We are past the end of the subject;
7020 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
7021 this option requests that a match occur at or before the first newline in
7022 the subject.
7024 When we have a match and the offset vector is big enough to deal with any
7025 backreferences, captured substring offsets will already be set up. In the case
7026 where we had to get some local store to hold offsets for backreference
7027 processing, copy those that we can. In this case there need not be overflow if
7028 certain parts of the pattern were not used, even though there are more
7029 capturing parentheses than vector slots. */
7031 ENDLOOP:
7033 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
7035 if (using_temporary_offsets)
7037 if (arg_offset_max >= 4)
7039 memcpy(offsets + 2, md->offset_vector + 2,
7040 (arg_offset_max - 2) * sizeof(int));
7041 DPRINTF(("Copied offsets from temporary memory\n"));
7043 if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
7044 DPRINTF(("Freeing temporary memory\n"));
7045 (PUBL(free))(md->offset_vector);
7048 /* Set the return code to the number of captured strings, or 0 if there were
7049 too many to fit into the vector. */
7051 rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
7052 0 : md->end_offset_top/2;
7054 /* If there is space in the offset vector, set any unused pairs at the end of
7055 the pattern to -1 for backwards compatibility. It is documented that this
7056 happens. In earlier versions, the whole set of potential capturing offsets
7057 was set to -1 each time round the loop, but this is handled differently now.
7058 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
7059 those at the end that need unsetting here. We can't just unset them all at
7060 the start of the whole thing because they may get set in one branch that is
7061 not the final matching branch. */
7063 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
7065 int *iptr, *iend;
7066 int resetcount = 2 + re->top_bracket * 2;
7067 if (resetcount > offsetcount) resetcount = offsetcount;
7068 iptr = offsets + md->end_offset_top;
7069 iend = offsets + resetcount;
7070 while (iptr < iend) *iptr++ = -1;
7073 /* If there is space, set up the whole thing as substring 0. The value of
7074 md->start_match_ptr might be modified if \K was encountered on the success
7075 matching path. */
7077 if (offsetcount < 2) rc = 0; else
7079 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
7080 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
7083 /* Return MARK data if requested */
7085 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7086 *(extra_data->mark) = (pcre_uchar *)md->mark;
7087 DPRINTF((">>>> returning %d\n", rc));
7088 #ifdef NO_RECURSE
7089 release_match_heapframes(&frame_zero);
7090 #endif
7091 return rc;
7094 /* Control gets here if there has been an error, or if the overall match
7095 attempt has failed at all permitted starting positions. */
7097 if (using_temporary_offsets)
7099 DPRINTF(("Freeing temporary memory\n"));
7100 (PUBL(free))(md->offset_vector);
7103 /* For anything other than nomatch or partial match, just return the code. */
7105 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
7107 DPRINTF((">>>> error: returning %d\n", rc));
7108 #ifdef NO_RECURSE
7109 release_match_heapframes(&frame_zero);
7110 #endif
7111 return rc;
7114 /* Handle partial matches - disable any mark data */
7116 if (start_partial != NULL)
7118 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
7119 md->mark = NULL;
7120 if (offsetcount > 1)
7122 offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
7123 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
7125 rc = PCRE_ERROR_PARTIAL;
7128 /* This is the classic nomatch case */
7130 else
7132 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
7133 rc = PCRE_ERROR_NOMATCH;
7136 /* Return the MARK data if it has been requested. */
7138 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7139 *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark;
7140 #ifdef NO_RECURSE
7141 release_match_heapframes(&frame_zero);
7142 #endif
7143 return rc;
7146 /* End of pcre_exec.c */