gmain: use Linux eventfd() for main context wake up
[glib.git] / glib / pcre / pcre_exec.c
blob569207cc34792e90d4fcd55b4d0f0b80dd70b65c
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
53 #include "pcre_internal.h"
55 /* Undefine some potentially clashing cpp symbols */
57 #undef min
58 #undef max
60 /* Flag bits for the match() function */
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
68 #define MATCH_MATCH 1
69 #define MATCH_NOMATCH 0
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. */
74 #define MATCH_ACCEPT (-999)
75 #define MATCH_COMMIT (-998)
76 #define MATCH_PRUNE (-997)
77 #define MATCH_SKIP (-996)
78 #define MATCH_SKIP_ARG (-995)
79 #define MATCH_THEN (-994)
81 /* This is a convenience macro for code that occurs many times. */
83 #define MRRETURN(ra) \
84 { \
85 md->mark = markptr; \
86 RRETURN(ra); \
89 /* Maximum number of ints of offset to save on the stack for recursive calls.
90 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
91 because the offset vector is always a multiple of 3 long. */
93 #define REC_STACK_SAVE_MAX 30
/* Minimum and maximum counts for the common repeats. The two tables are
indexed in parallel; in rep_max a value of 0 stands for "no upper limit". */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
102 #ifdef PCRE_DEBUG
103 /*************************************************
104 * Debugging function to print chars *
105 *************************************************/
107 /* Print a sequence of chars in printable format, stopping at the end of the
108 subject if the requested.
110 Arguments:
111 p points to characters
112 length number to print
113 is_subject TRUE if printing from within md->start_subject
114 md pointer to matching data block, if is_subject is TRUE
116 Returns: nothing
119 static void
120 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
122 unsigned int c;
123 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
124 while (length-- > 0)
125 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
127 #endif
131 /*************************************************
132 * Match a back-reference *
133 *************************************************/
135 /* If a back reference hasn't been set, the length that is passed is greater
136 than the number of characters left in the string, so the match fails.
138 Arguments:
139 offset index into the offset vector
140 eptr points into the subject
141 length length to be matched
142 md points to match data block
143 ims the ims flags
145 Returns: TRUE if matched
148 static BOOL
149 match_ref(int offset, register USPTR eptr, int length, match_data *md,
150 unsigned long int ims)
152 USPTR p = md->start_subject + md->offset_vector[offset];
154 #ifdef PCRE_DEBUG
155 if (eptr >= md->end_subject)
156 printf("matching subject <null>");
157 else
159 printf("matching subject ");
160 pchars(eptr, length, TRUE, md);
162 printf(" against backref ");
163 pchars(p, length, FALSE, md);
164 printf("\n");
165 #endif
167 /* Always fail if not enough characters left */
169 if (length > md->end_subject - eptr) return FALSE;
171 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
172 properly if Unicode properties are supported. Otherwise, we can check only
173 ASCII characters. */
175 if ((ims & PCRE_CASELESS) != 0)
177 #ifdef SUPPORT_UTF8
178 #ifdef SUPPORT_UCP
179 if (md->utf8)
181 USPTR endptr = eptr + length;
182 while (eptr < endptr)
184 int c, d;
185 GETCHARINC(c, eptr);
186 GETCHARINC(d, p);
187 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
190 else
191 #endif
192 #endif
194 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
195 is no UCP support. */
197 while (length-- > 0)
198 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
201 /* In the caseful case, we can just compare the bytes, whether or not we
202 are in UTF-8 mode. */
204 else
205 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
207 return TRUE;
212 /***************************************************************************
213 ****************************************************************************
214 RECURSION IN THE match() FUNCTION
216 The match() function is highly recursive, though not every recursive call
217 increases the recursive depth. Nevertheless, some regular expressions can cause
218 it to recurse to a great depth. I was writing for Unix, so I just let it call
219 itself recursively. This uses the stack for saving everything that has to be
220 saved for a recursive call. On Unix, the stack can be large, and this works
221 fine.
223 It turns out that on some non-Unix-like systems there are problems with
224 programs that use a lot of stack. (This despite the fact that every last chip
225 has oodles of memory these days, and techniques for extending the stack have
226 been known for decades.) So....
228 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
229 calls by keeping local variables that need to be preserved in blocks of memory
230 obtained from malloc() instead of on the stack. Macros are used to
231 achieve this so that the actual code doesn't look very different to what it
232 always used to.
234 The original heap-recursive code used longjmp(). However, it seems that this
235 can be very slow on some operating systems. Following a suggestion from Stan
236 Switzer, the use of longjmp() has been abolished, at the cost of having to
237 provide a unique number for each call to RMATCH. There is no way of generating
238 a sequence of numbers at compile time in C. I have given them names, to make
239 them stand out more clearly.
241 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
242 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
243 tests. Furthermore, not using longjmp() means that local dynamic variables
244 don't have indeterminate values; this has meant that the frame size can be
245 reduced because the result can be "passed back" by straight setting of the
246 variable instead of being passed in the frame.
247 ****************************************************************************
248 ***************************************************************************/
/* Identifying numbers for the individual RMATCH calls, counting up from 1.
When this list is changed, the code at HEAP_RETURN below must be updated in
sync. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,
  RM9,     RM10, RM11, RM12, RM13, RM14, RM15, RM16,
  RM17,    RM18, RM19, RM20, RM21, RM22, RM23, RM24,
  RM25,    RM26, RM27, RM28, RM29, RM30, RM31, RM32,
  RM33,    RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41,    RM42, RM43, RM44, RM45, RM46, RM47, RM48,
  RM49,    RM50, RM51, RM52, RM53, RM54, RM55, RM56,
  RM57,    RM58, RM59, RM60, RM61, RM62
};
261 /* These versions of the macros use the stack, as normal. There are debugging
262 versions and production versions. Note that the "rw" argument of RMATCH isn't
263 actually used in this definition. */
265 #ifndef NO_RECURSE
266 #define REGISTER register
268 #ifdef PCRE_DEBUG
269 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
271 printf("match() called in line %d\n", __LINE__); \
272 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1); \
273 printf("to line %d\n", __LINE__); \
275 #define RRETURN(ra) \
277 printf("match() returned %d from line %d ", ra, __LINE__); \
278 return ra; \
280 #else
281 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
282 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1)
283 #define RRETURN(ra) return ra
284 #endif
286 #else
289 /* These versions of the macros manage a private stack on the heap. Note that
290 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
291 argument of match(), which never changes. */
293 #define REGISTER
295 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
297 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
298 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
299 frame->Xwhere = rw; \
300 newframe->Xeptr = ra;\
301 newframe->Xecode = rb;\
302 newframe->Xmstart = mstart;\
303 newframe->Xmarkptr = markptr;\
304 newframe->Xoffset_top = rc;\
305 newframe->Xims = re;\
306 newframe->Xeptrb = rf;\
307 newframe->Xflags = rg;\
308 newframe->Xrdepth = frame->Xrdepth + 1;\
309 newframe->Xprevframe = frame;\
310 frame = newframe;\
311 DPRINTF(("restarting from line %d\n", __LINE__));\
312 goto HEAP_RECURSE;\
313 L_##rw:\
314 DPRINTF(("jumped back to line %d\n", __LINE__));\
317 #define RRETURN(ra)\
319 heapframe *oldframe = frame;\
320 frame = oldframe->Xprevframe;\
321 (pcre_stack_free)(oldframe);\
322 if (frame != NULL)\
324 rrc = ra;\
325 goto HEAP_RETURN;\
327 return ra;\
331 /* Structure for remembering the local variables in a private frame */
333 typedef struct heapframe {
334 struct heapframe *Xprevframe;
336 /* Function arguments that may change */
338 USPTR Xeptr;
339 const uschar *Xecode;
340 USPTR Xmstart;
341 USPTR Xmarkptr;
342 int Xoffset_top;
343 long int Xims;
344 eptrblock *Xeptrb;
345 int Xflags;
346 unsigned int Xrdepth;
348 /* Function local variables */
350 USPTR Xcallpat;
351 #ifdef SUPPORT_UTF8
352 USPTR Xcharptr;
353 #endif
354 USPTR Xdata;
355 USPTR Xnext;
356 USPTR Xpp;
357 USPTR Xprev;
358 USPTR Xsaved_eptr;
360 recursion_info Xnew_recursive;
362 BOOL Xcur_is_word;
363 BOOL Xcondition;
364 BOOL Xprev_is_word;
366 unsigned long int Xoriginal_ims;
368 #ifdef SUPPORT_UCP
369 int Xprop_type;
370 int Xprop_value;
371 int Xprop_fail_result;
372 int Xprop_category;
373 int Xprop_chartype;
374 int Xprop_script;
375 int Xoclength;
376 uschar Xocchars[8];
377 #endif
379 int Xcodelink;
380 int Xctype;
381 unsigned int Xfc;
382 int Xfi;
383 int Xlength;
384 int Xmax;
385 int Xmin;
386 int Xnumber;
387 int Xoffset;
388 int Xop;
389 int Xsave_capture_last;
390 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
391 int Xstacksave[REC_STACK_SAVE_MAX];
393 eptrblock Xnewptrb;
395 /* Where to jump back to */
397 int Xwhere;
399 } heapframe;
401 #endif
404 /***************************************************************************
405 ***************************************************************************/
409 /*************************************************
410 * Match from current position *
411 *************************************************/
413 /* This function is called recursively in many circumstances. Whenever it
414 returns a negative (error) response, the outer incarnation must also return the
415 same response. */
417 /* These macros pack up tests that are used for partial matching, and which
418 appear several times in the code. We set the "hit end" flag if the pointer is
419 at the end of the subject and also past the start of the subject (i.e.
420 something has been matched). For hard partial matching, we then return
421 immediately. The second one is used when we already know we are past the end of
422 the subject. */
424 #define CHECK_PARTIAL()\
425 if (md->partial != 0 && eptr >= md->end_subject && \
426 eptr > md->start_used_ptr) \
428 md->hitend = TRUE; \
429 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
432 #define SCHECK_PARTIAL()\
433 if (md->partial != 0 && eptr > md->start_used_ptr) \
435 md->hitend = TRUE; \
436 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
440 /* Performance note: It might be tempting to extract commonly used fields from
441 the md structure (e.g. utf8, end_subject) into individual variables to improve
442 performance. Tests using gcc on a SPARC disproved this; in the first case, it
443 made performance worse.
445 Arguments:
446 eptr pointer to current character in subject
447 ecode pointer to current position in compiled code
448 mstart pointer to the current match start position (can be modified
449 by encountering \K)
450 markptr pointer to the most recent MARK name, or NULL
451 offset_top current top pointer
452 md pointer to "static" info for the match
453 ims current /i, /m, and /s options
454 eptrb pointer to chain of blocks containing eptr at start of
455 brackets - for testing for empty matches
456 flags can contain
457 match_condassert - this is an assertion condition
458 match_cbegroup - this is the start of an unlimited repeat
459 group that can match an empty string
460 rdepth the recursion depth
462 Returns: MATCH_MATCH if matched ) these values are >= 0
463 MATCH_NOMATCH if failed to match )
464 a negative MATCH_xxx value for PRUNE, SKIP, etc
465 a negative PCRE_ERROR_xxx value if aborted by an error condition
466 (e.g. stopped by repeated call or recursion limit)
469 static int
470 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
471 const uschar *markptr, int offset_top, match_data *md, unsigned long int ims,
472 eptrblock *eptrb, int flags, unsigned int rdepth)
474 /* These variables do not need to be preserved over recursion in this function,
475 so they can be ordinary variables in all cases. Mark some of them with
476 "register" because they are used a lot in loops. */
478 register int rrc; /* Returns from recursive calls */
479 register int i; /* Used for loops not involving calls to RMATCH() */
480 register unsigned int c; /* Character values not kept over RMATCH() calls */
481 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
483 BOOL minimize, possessive; /* Quantifier options */
484 int condcode;
486 /* When recursion is not being used, all "local" variables that have to be
487 preserved over calls to RMATCH() are part of a "frame" which is obtained from
488 heap storage. Set up the top-level frame here; others are obtained from the
489 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
491 #ifdef NO_RECURSE
492 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
493 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
494 frame->Xprevframe = NULL; /* Marks the top level */
496 /* Copy in the original argument variables */
498 frame->Xeptr = eptr;
499 frame->Xecode = ecode;
500 frame->Xmstart = mstart;
501 frame->Xmarkptr = markptr;
502 frame->Xoffset_top = offset_top;
503 frame->Xims = ims;
504 frame->Xeptrb = eptrb;
505 frame->Xflags = flags;
506 frame->Xrdepth = rdepth;
508 /* This is where control jumps back to to effect "recursion" */
510 HEAP_RECURSE:
512 /* Macros make the argument variables come from the current frame */
514 #define eptr frame->Xeptr
515 #define ecode frame->Xecode
516 #define mstart frame->Xmstart
517 #define markptr frame->Xmarkptr
518 #define offset_top frame->Xoffset_top
519 #define ims frame->Xims
520 #define eptrb frame->Xeptrb
521 #define flags frame->Xflags
522 #define rdepth frame->Xrdepth
524 /* Ditto for the local variables */
526 #ifdef SUPPORT_UTF8
527 #define charptr frame->Xcharptr
528 #endif
529 #define callpat frame->Xcallpat
530 #define codelink frame->Xcodelink
531 #define data frame->Xdata
532 #define next frame->Xnext
533 #define pp frame->Xpp
534 #define prev frame->Xprev
535 #define saved_eptr frame->Xsaved_eptr
537 #define new_recursive frame->Xnew_recursive
539 #define cur_is_word frame->Xcur_is_word
540 #define condition frame->Xcondition
541 #define prev_is_word frame->Xprev_is_word
543 #define original_ims frame->Xoriginal_ims
545 #ifdef SUPPORT_UCP
546 #define prop_type frame->Xprop_type
547 #define prop_value frame->Xprop_value
548 #define prop_fail_result frame->Xprop_fail_result
549 #define prop_category frame->Xprop_category
550 #define prop_chartype frame->Xprop_chartype
551 #define prop_script frame->Xprop_script
552 #define oclength frame->Xoclength
553 #define occhars frame->Xocchars
554 #endif
556 #define ctype frame->Xctype
557 #define fc frame->Xfc
558 #define fi frame->Xfi
559 #define length frame->Xlength
560 #define max frame->Xmax
561 #define min frame->Xmin
562 #define number frame->Xnumber
563 #define offset frame->Xoffset
564 #define op frame->Xop
565 #define save_capture_last frame->Xsave_capture_last
566 #define save_offset1 frame->Xsave_offset1
567 #define save_offset2 frame->Xsave_offset2
568 #define save_offset3 frame->Xsave_offset3
569 #define stacksave frame->Xstacksave
571 #define newptrb frame->Xnewptrb
573 /* When recursion is being used, local variables are allocated on the stack and
574 get preserved during recursion in the normal way. In this environment, fi and
575 i, and fc and c, can be the same variables. */
577 #else /* NO_RECURSE not defined */
578 #define fi i
579 #define fc c
582 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
583 const uschar *charptr; /* in small blocks of the code. My normal */
584 #endif /* style of coding would have declared */
585 const uschar *callpat; /* them within each of those blocks. */
586 const uschar *data; /* However, in order to accommodate the */
587 const uschar *next; /* version of this code that uses an */
588 USPTR pp; /* external "stack" implemented on the */
589 const uschar *prev; /* heap, it is easier to declare them all */
590 USPTR saved_eptr; /* here, so the declarations can be cut */
591 /* out in a block. The only declarations */
592 recursion_info new_recursive; /* within blocks below are for variables */
593 /* that do not have to be preserved over */
594 BOOL cur_is_word; /* a recursive call to RMATCH(). */
595 BOOL condition;
596 BOOL prev_is_word;
598 unsigned long int original_ims;
600 #ifdef SUPPORT_UCP
601 int prop_type;
602 int prop_value;
603 int prop_fail_result;
604 int prop_category;
605 int prop_chartype;
606 int prop_script;
607 int oclength;
608 uschar occhars[8];
609 #endif
611 int codelink;
612 int ctype;
613 int length;
614 int max;
615 int min;
616 int number;
617 int offset;
618 int op;
619 int save_capture_last;
620 int save_offset1, save_offset2, save_offset3;
621 int stacksave[REC_STACK_SAVE_MAX];
623 eptrblock newptrb;
624 #endif /* NO_RECURSE */
626 /* These statements are here to stop the compiler complaining about uninitialized
627 variables. */
629 #ifdef SUPPORT_UCP
630 prop_value = 0;
631 prop_fail_result = 0;
632 #endif
635 /* This label is used for tail recursion, which is used in a few cases even
636 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
637 used. Thanks to Ian Taylor for noticing this possibility and sending the
638 original patch. */
640 TAIL_RECURSE:
642 /* OK, now we can get on with the real code of the function. Recursive calls
643 are specified by the macro RMATCH and RRETURN is used to return. When
644 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
645 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
646 defined). However, RMATCH isn't like a function call because it's quite a
647 complicated macro. It has to be used in one particular way. This shouldn't,
648 however, impact performance when true recursion is being used. */
650 #ifdef SUPPORT_UTF8
651 utf8 = md->utf8; /* Local copy of the flag */
652 #else
653 utf8 = FALSE;
654 #endif
656 /* First check that we haven't called match() too many times, or that we
657 haven't exceeded the recursive call limit. */
659 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
660 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
662 original_ims = ims; /* Save for resetting on ')' */
664 /* At the start of a group with an unlimited repeat that may match an empty
665 string, the match_cbegroup flag is set. When this is the case, add the current
666 subject pointer to the chain of such remembered pointers, to be checked when we
667 hit the closing ket, in order to break infinite loops that match no characters.
668 When match() is called in other circumstances, don't add to the chain. The
669 match_cbegroup flag must NOT be used with tail recursion, because the memory
670 block that is used is on the stack, so a new one may be required for each
671 match(). */
673 if ((flags & match_cbegroup) != 0)
675 newptrb.epb_saved_eptr = eptr;
676 newptrb.epb_prev = eptrb;
677 eptrb = &newptrb;
680 /* Now start processing the opcodes. */
682 for (;;)
684 minimize = possessive = FALSE;
685 op = *ecode;
687 switch(op)
689 case OP_MARK:
690 markptr = ecode + 2;
691 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
692 ims, eptrb, flags, RM55);
694 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
695 argument, and we must check whether that argument matches this MARK's
696 argument. It is passed back in md->start_match_ptr (an overloading of that
697 variable). If it does match, we reset that variable to the current subject
698 position and return MATCH_SKIP. Otherwise, pass back the return code
699 unaltered. */
701 if (rrc == MATCH_SKIP_ARG &&
702 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
704 md->start_match_ptr = eptr;
705 RRETURN(MATCH_SKIP);
708 if (md->mark == NULL) md->mark = markptr;
709 RRETURN(rrc);
711 case OP_FAIL:
712 MRRETURN(MATCH_NOMATCH);
714 /* COMMIT overrides PRUNE, SKIP, and THEN */
716 case OP_COMMIT:
717 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
718 ims, eptrb, flags, RM52);
719 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
720 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
721 rrc != MATCH_THEN)
722 RRETURN(rrc);
723 MRRETURN(MATCH_COMMIT);
725 /* PRUNE overrides THEN */
727 case OP_PRUNE:
728 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
729 ims, eptrb, flags, RM51);
730 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
731 MRRETURN(MATCH_PRUNE);
733 case OP_PRUNE_ARG:
734 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
735 ims, eptrb, flags, RM56);
736 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
737 md->mark = ecode + 2;
738 RRETURN(MATCH_PRUNE);
740 /* SKIP overrides PRUNE and THEN */
742 case OP_SKIP:
743 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
744 ims, eptrb, flags, RM53);
745 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
746 RRETURN(rrc);
747 md->start_match_ptr = eptr; /* Pass back current position */
748 MRRETURN(MATCH_SKIP);
750 case OP_SKIP_ARG:
751 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
752 ims, eptrb, flags, RM57);
753 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
754 RRETURN(rrc);
756 /* Pass back the current skip name by overloading md->start_match_ptr and
757 returning the special MATCH_SKIP_ARG return code. This will either be
758 caught by a matching MARK, or get to the top, where it is treated the same
759 as PRUNE. */
761 md->start_match_ptr = ecode + 2;
762 RRETURN(MATCH_SKIP_ARG);
764 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
765 the alt that is at the start of the current branch. This makes it possible
766 to skip back past alternatives that precede the THEN within the current
767 branch. */
769 case OP_THEN:
770 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
771 ims, eptrb, flags, RM54);
772 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
773 md->start_match_ptr = ecode - GET(ecode, 1);
774 MRRETURN(MATCH_THEN);
776 case OP_THEN_ARG:
777 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
778 offset_top, md, ims, eptrb, flags, RM58);
779 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
780 md->start_match_ptr = ecode - GET(ecode, 1);
781 md->mark = ecode + LINK_SIZE + 2;
782 RRETURN(MATCH_THEN);
784 /* Handle a capturing bracket. If there is space in the offset vector, save
785 the current subject position in the working slot at the top of the vector.
786 We mustn't change the current values of the data slot, because they may be
787 set from a previous iteration of this group, and be referred to by a
788 reference inside the group.
790 If the bracket fails to match, we need to restore this value and also the
791 values of the final offsets, in case they were set by a previous iteration
792 of the same bracket.
794 If there isn't enough space in the offset vector, treat this as if it were
795 a non-capturing bracket. Don't worry about setting the flag for the error
796 case here; that is handled in the code for KET. */
798 case OP_CBRA:
799 case OP_SCBRA:
800 number = GET2(ecode, 1+LINK_SIZE);
801 offset = number << 1;
803 #ifdef PCRE_DEBUG
804 printf("start bracket %d\n", number);
805 printf("subject=");
806 pchars(eptr, 16, TRUE, md);
807 printf("\n");
808 #endif
810 if (offset < md->offset_max)
812 save_offset1 = md->offset_vector[offset];
813 save_offset2 = md->offset_vector[offset+1];
814 save_offset3 = md->offset_vector[md->offset_end - number];
815 save_capture_last = md->capture_last;
817 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
818 md->offset_vector[md->offset_end - number] =
819 (int)(eptr - md->start_subject);
821 flags = (op == OP_SCBRA)? match_cbegroup : 0;
824 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
825 ims, eptrb, flags, RM1);
826 if (rrc != MATCH_NOMATCH &&
827 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
828 RRETURN(rrc);
829 md->capture_last = save_capture_last;
830 ecode += GET(ecode, 1);
832 while (*ecode == OP_ALT);
834 DPRINTF(("bracket %d failed\n", number));
836 md->offset_vector[offset] = save_offset1;
837 md->offset_vector[offset+1] = save_offset2;
838 md->offset_vector[md->offset_end - number] = save_offset3;
840 if (rrc != MATCH_THEN) md->mark = markptr;
841 RRETURN(MATCH_NOMATCH);
844 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
845 as a non-capturing bracket. */
847 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
848 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
850 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
852 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
853 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
855 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
856 final alternative within the brackets, we would return the result of a
857 recursive call to match() whatever happened. We can reduce stack usage by
858 turning this into a tail recursion, except in the case when match_cbegroup
859 is set.*/
861 case OP_BRA:
862 case OP_SBRA:
863 DPRINTF(("start non-capturing bracket\n"));
864 flags = (op >= OP_SBRA)? match_cbegroup : 0;
865 for (;;)
867 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
869 if (flags == 0) /* Not a possibly empty group */
871 ecode += _pcre_OP_lengths[*ecode];
872 DPRINTF(("bracket 0 tail recursion\n"));
873 goto TAIL_RECURSE;
876 /* Possibly empty group; can't use tail recursion. */
878 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
879 eptrb, flags, RM48);
880 if (rrc == MATCH_NOMATCH) md->mark = markptr;
881 RRETURN(rrc);
884 /* For non-final alternatives, continue the loop for a NOMATCH result;
885 otherwise return. */
887 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
888 eptrb, flags, RM2);
889 if (rrc != MATCH_NOMATCH &&
890 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
891 RRETURN(rrc);
892 ecode += GET(ecode, 1);
894 /* Control never reaches here. */
896 /* Conditional group: compilation checked that there are no more than
897 two branches. If the condition is false, skipping the first branch takes us
898 past the end if there is only one branch, but that's OK because that is
899 exactly what going to the ket would do. As there is only one branch to be
900 obeyed, we can use tail recursion to avoid using another stack frame. */
902 case OP_COND:
903 case OP_SCOND:
904 codelink= GET(ecode, 1);
906 /* Because of the way auto-callout works during compile, a callout item is
907 inserted between OP_COND and an assertion condition. */
909 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
911 if (pcre_callout != NULL)
913 pcre_callout_block cb;
914 cb.version = 1; /* Version 1 of the callout block */
915 cb.callout_number = ecode[LINK_SIZE+2];
916 cb.offset_vector = md->offset_vector;
917 cb.subject = (PCRE_SPTR)md->start_subject;
918 cb.subject_length = (int)(md->end_subject - md->start_subject);
919 cb.start_match = (int)(mstart - md->start_subject);
920 cb.current_position = (int)(eptr - md->start_subject);
921 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
922 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
923 cb.capture_top = offset_top/2;
924 cb.capture_last = md->capture_last;
925 cb.callout_data = md->callout_data;
926 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
927 if (rrc < 0) RRETURN(rrc);
929 ecode += _pcre_OP_lengths[OP_CALLOUT];
932 condcode = ecode[LINK_SIZE+1];
934 /* Now see what the actual condition is */
936 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
938 if (md->recursive == NULL) /* Not recursing => FALSE */
940 condition = FALSE;
941 ecode += GET(ecode, 1);
943 else
945 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
946 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
948 /* If the test is for recursion into a specific subpattern, and it is
949 false, but the test was set up by name, scan the table to see if the
950 name refers to any other numbers, and test them. The condition is true
951 if any one is set. */
953 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
955 uschar *slotA = md->name_table;
956 for (i = 0; i < md->name_count; i++)
958 if (GET2(slotA, 0) == recno) break;
959 slotA += md->name_entry_size;
962 /* Found a name for the number - there can be only one; duplicate
963 names for different numbers are allowed, but not vice versa. First
964 scan down for duplicates. */
966 if (i < md->name_count)
968 uschar *slotB = slotA;
969 while (slotB > md->name_table)
971 slotB -= md->name_entry_size;
972 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
974 condition = GET2(slotB, 0) == md->recursive->group_num;
975 if (condition) break;
977 else break;
980 /* Scan up for duplicates */
982 if (!condition)
984 slotB = slotA;
985 for (i++; i < md->name_count; i++)
987 slotB += md->name_entry_size;
988 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
990 condition = GET2(slotB, 0) == md->recursive->group_num;
991 if (condition) break;
993 else break;
999 /* Choose branch according to the condition */
1001 ecode += condition? 3 : GET(ecode, 1);
1005 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1007 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1008 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1010 /* If the numbered capture is unset, but the reference was by name,
1011 scan the table to see if the name refers to any other numbers, and test
1012 them. The condition is true if any one is set. This is tediously similar
1013 to the code above, but not close enough to try to amalgamate. */
1015 if (!condition && condcode == OP_NCREF)
1017 int refno = offset >> 1;
1018 uschar *slotA = md->name_table;
1020 for (i = 0; i < md->name_count; i++)
1022 if (GET2(slotA, 0) == refno) break;
1023 slotA += md->name_entry_size;
1026 /* Found a name for the number - there can be only one; duplicate names
1027 for different numbers are allowed, but not vice versa. First scan down
1028 for duplicates. */
1030 if (i < md->name_count)
1032 uschar *slotB = slotA;
1033 while (slotB > md->name_table)
1035 slotB -= md->name_entry_size;
1036 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1038 offset = GET2(slotB, 0) << 1;
1039 condition = offset < offset_top &&
1040 md->offset_vector[offset] >= 0;
1041 if (condition) break;
1043 else break;
1046 /* Scan up for duplicates */
1048 if (!condition)
1050 slotB = slotA;
1051 for (i++; i < md->name_count; i++)
1053 slotB += md->name_entry_size;
1054 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1056 offset = GET2(slotB, 0) << 1;
1057 condition = offset < offset_top &&
1058 md->offset_vector[offset] >= 0;
1059 if (condition) break;
1061 else break;
1067 /* Choose branch according to the condition */
1069 ecode += condition? 3 : GET(ecode, 1);
1072 else if (condcode == OP_DEF) /* DEFINE - always false */
1074 condition = FALSE;
1075 ecode += GET(ecode, 1);
1078 /* The condition is an assertion. Call match() to evaluate it - setting
1079 the final argument match_condassert causes it to stop at the end of an
1080 assertion. */
1082 else
1084 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
1085 match_condassert, RM3);
1086 if (rrc == MATCH_MATCH)
1088 condition = TRUE;
1089 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1090 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1092 else if (rrc != MATCH_NOMATCH &&
1093 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1095 RRETURN(rrc); /* Need braces because of following else */
1097 else
1099 condition = FALSE;
1100 ecode += codelink;
1104 /* We are now at the branch that is to be obeyed. As there is only one,
1105 we can use tail recursion to avoid using another stack frame, except when
1106 match_cbegroup is required for an unlimited repeat of a possibly empty
1107 group. If the second alternative doesn't exist, we can just plough on. */
1109 if (condition || *ecode == OP_ALT)
1111 ecode += 1 + LINK_SIZE;
1112 if (op == OP_SCOND) /* Possibly empty group */
1114 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
1115 RRETURN(rrc);
1117 else /* Group must match something */
1119 flags = 0;
1120 goto TAIL_RECURSE;
1123 else /* Condition false & no alternative */
1125 ecode += 1 + LINK_SIZE;
1127 break;
1130 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1131 to close any currently open capturing brackets. */
1133 case OP_CLOSE:
1134 number = GET2(ecode, 1);
1135 offset = number << 1;
1137 #ifdef PCRE_DEBUG
1138 printf("end bracket %d at *ACCEPT", number);
1139 printf("\n");
1140 #endif
1142 md->capture_last = number;
1143 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1145 md->offset_vector[offset] =
1146 md->offset_vector[md->offset_end - number];
1147 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1148 if (offset_top <= offset) offset_top = offset + 2;
1150 ecode += 3;
1151 break;
1154 /* End of the pattern, either real or forced. If we are in a top-level
1155 recursion, we should restore the offsets appropriately and continue from
1156 after the call. */
1158 case OP_ACCEPT:
1159 case OP_END:
1160 if (md->recursive != NULL && md->recursive->group_num == 0)
1162 recursion_info *rec = md->recursive;
1163 DPRINTF(("End of pattern in a (?0) recursion\n"));
1164 md->recursive = rec->prevrec;
1165 memmove(md->offset_vector, rec->offset_save,
1166 rec->saved_max * sizeof(int));
1167 offset_top = rec->save_offset_top;
1168 ims = original_ims;
1169 ecode = rec->after_call;
1170 break;
1173 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1174 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1175 the subject. In both cases, backtracking will then try other alternatives,
1176 if any. */
1178 if (eptr == mstart &&
1179 (md->notempty ||
1180 (md->notempty_atstart &&
1181 mstart == md->start_subject + md->start_offset)))
1182 MRRETURN(MATCH_NOMATCH);
1184 /* Otherwise, we have a match. */
1186 md->end_match_ptr = eptr; /* Record where we ended */
1187 md->end_offset_top = offset_top; /* and how many extracts were taken */
1188 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1190 /* For some reason, the macros don't work properly if an expression is
1191 given as the argument to MRRETURN when the heap is in use. */
1193 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1194 MRRETURN(rrc);
1196 /* Change option settings */
1198 case OP_OPT:
1199 ims = ecode[1];
1200 ecode += 2;
1201 DPRINTF(("ims set to %02lx\n", ims));
1202 break;
1204 /* Assertion brackets. Check the alternative branches in turn - the
1205 matching won't pass the KET for an assertion. If any one branch matches,
1206 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1207 start of each branch to move the current point backwards, so the code at
1208 this level is identical to the lookahead case. */
1210 case OP_ASSERT:
1211 case OP_ASSERTBACK:
1214 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1215 RM4);
1216 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1218 mstart = md->start_match_ptr; /* In case \K reset it */
1219 break;
1221 if (rrc != MATCH_NOMATCH &&
1222 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1223 RRETURN(rrc);
1224 ecode += GET(ecode, 1);
1226 while (*ecode == OP_ALT);
1227 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1229 /* If checking an assertion for a condition, return MATCH_MATCH. */
1231 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1233 /* Continue from after the assertion, updating the offsets high water
1234 mark, since extracts may have been taken during the assertion. */
1236 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1237 ecode += 1 + LINK_SIZE;
1238 offset_top = md->end_offset_top;
1239 continue;
1241 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1242 PRUNE, or COMMIT means we must assume failure without checking subsequent
1243 branches. */
1245 case OP_ASSERT_NOT:
1246 case OP_ASSERTBACK_NOT:
1249 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1250 RM5);
1251 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1252 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1254 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1255 break;
1257 if (rrc != MATCH_NOMATCH &&
1258 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1259 RRETURN(rrc);
1260 ecode += GET(ecode,1);
1262 while (*ecode == OP_ALT);
1264 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1266 ecode += 1 + LINK_SIZE;
1267 continue;
1269 /* Move the subject pointer back. This occurs only at the start of
1270 each branch of a lookbehind assertion. If we are too close to the start to
1271 move back, this match function fails. When working with UTF-8 we move
1272 back a number of characters, not bytes. */
1274 case OP_REVERSE:
1275 #ifdef SUPPORT_UTF8
1276 if (utf8)
1278 i = GET(ecode, 1);
1279 while (i-- > 0)
1281 eptr--;
1282 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1283 BACKCHAR(eptr);
1286 else
1287 #endif
1289 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1292 eptr -= GET(ecode, 1);
1293 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1296 /* Save the earliest consulted character, then skip to next op code */
1298 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1299 ecode += 1 + LINK_SIZE;
1300 break;
1302 /* The callout item calls an external function, if one is provided, passing
1303 details of the match so far. This is mainly for debugging, though the
1304 function is able to force a failure. */
1306 case OP_CALLOUT:
1307 if (pcre_callout != NULL)
1309 pcre_callout_block cb;
1310 cb.version = 1; /* Version 1 of the callout block */
1311 cb.callout_number = ecode[1];
1312 cb.offset_vector = md->offset_vector;
1313 cb.subject = (PCRE_SPTR)md->start_subject;
1314 cb.subject_length = (int)(md->end_subject - md->start_subject);
1315 cb.start_match = (int)(mstart - md->start_subject);
1316 cb.current_position = (int)(eptr - md->start_subject);
1317 cb.pattern_position = GET(ecode, 2);
1318 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1319 cb.capture_top = offset_top/2;
1320 cb.capture_last = md->capture_last;
1321 cb.callout_data = md->callout_data;
1322 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1323 if (rrc < 0) RRETURN(rrc);
1325 ecode += 2 + 2*LINK_SIZE;
1326 break;
1328 /* Recursion either matches the current regex, or some subexpression. The
1329 offset data is the offset to the starting bracket from the start of the
1330 whole pattern. (This is so that it works from duplicated subpatterns.)
1332 If there are any capturing brackets started but not finished, we have to
1333 save their starting points and reinstate them after the recursion. However,
1334 we don't know how many such there are (offset_top records the completed
1335 total) so we just have to save all the potential data. There may be up to
1336 65535 such values, which is too large to put on the stack, but using malloc
1337 for small numbers seems expensive. As a compromise, the stack is used when
1338 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1339 is used. If the malloc fails, no partial save is attempted: the code gives
1340 up on the recursion and returns PCRE_ERROR_NOMEMORY via RRETURN, so the
1341 saved offsets are either complete or the match is abandoned.
1343 There are also other values that have to be saved. We use a chained
1344 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1345 for the original version of this logic. */
1347 case OP_RECURSE:
1349 callpat = md->start_code + GET(ecode, 1);
1350 new_recursive.group_num = (callpat == md->start_code)? 0 :
1351 GET2(callpat, 1 + LINK_SIZE);
1353 /* Add to "recursing stack" */
1355 new_recursive.prevrec = md->recursive;
1356 md->recursive = &new_recursive;
1358 /* Find where to continue from afterwards */
1360 ecode += 1 + LINK_SIZE;
1361 new_recursive.after_call = ecode;
1363 /* Now save the offset data. */
1365 new_recursive.saved_max = md->offset_end;
1366 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1367 new_recursive.offset_save = stacksave;
1368 else
1370 new_recursive.offset_save =
1371 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1372 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1375 memcpy(new_recursive.offset_save, md->offset_vector,
1376 new_recursive.saved_max * sizeof(int));
1377 new_recursive.save_offset_top = offset_top;
1379 /* OK, now we can do the recursion. For each top-level alternative we
1380 restore the offset and recursion data. */
1382 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1383 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1386 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1387 md, ims, eptrb, flags, RM6);
1388 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1390 DPRINTF(("Recursion matched\n"));
1391 md->recursive = new_recursive.prevrec;
1392 if (new_recursive.offset_save != stacksave)
1393 (pcre_free)(new_recursive.offset_save);
1394 MRRETURN(MATCH_MATCH);
1396 else if (rrc != MATCH_NOMATCH &&
1397 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1399 DPRINTF(("Recursion gave error %d\n", rrc));
1400 if (new_recursive.offset_save != stacksave)
1401 (pcre_free)(new_recursive.offset_save);
1402 RRETURN(rrc);
1405 md->recursive = &new_recursive;
1406 memcpy(md->offset_vector, new_recursive.offset_save,
1407 new_recursive.saved_max * sizeof(int));
1408 callpat += GET(callpat, 1);
1410 while (*callpat == OP_ALT);
1412 DPRINTF(("Recursion didn't match\n"));
1413 md->recursive = new_recursive.prevrec;
1414 if (new_recursive.offset_save != stacksave)
1415 (pcre_free)(new_recursive.offset_save);
1416 MRRETURN(MATCH_NOMATCH);
1418 /* Control never reaches here */
1420 /* "Once" brackets are like assertion brackets except that after a match,
1421 the point in the subject string is not moved back. Thus there can never be
1422 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1423 Check the alternative branches in turn - the matching won't pass the KET
1424 for this kind of subpattern. If any one branch matches, we carry on as at
1425 the end of a normal bracket, leaving the subject pointer, but resetting
1426 the start-of-match value in case it was changed by \K. */
1428 case OP_ONCE:
1429 prev = ecode;
1430 saved_eptr = eptr;
1434 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1435 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1437 mstart = md->start_match_ptr;
1438 break;
1440 if (rrc != MATCH_NOMATCH &&
1441 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1442 RRETURN(rrc);
1443 ecode += GET(ecode,1);
1445 while (*ecode == OP_ALT);
1447 /* If hit the end of the group (which could be repeated), fail */
1449 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1451 /* Continue as from after the assertion, updating the offsets high water
1452 mark, since extracts may have been taken. */
1454 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1456 offset_top = md->end_offset_top;
1457 eptr = md->end_match_ptr;
1459 /* For a non-repeating ket, just continue at this level. This also
1460 happens for a repeating ket if no characters were matched in the group.
1461 This is the forcible breaking of infinite loops as implemented in Perl
1462 5.005. If there is an options reset, it will get obeyed in the normal
1463 course of events. */
1465 if (*ecode == OP_KET || eptr == saved_eptr)
1467 ecode += 1+LINK_SIZE;
1468 break;
1471 /* The repeating kets try the rest of the pattern or restart from the
1472 preceding bracket, in the appropriate order. The second "call" of match()
1473 uses tail recursion, to avoid using another stack frame. We need to reset
1474 any options that changed within the bracket before re-running it, so
1475 check the next opcode. */
1477 if (ecode[1+LINK_SIZE] == OP_OPT)
1479 ims = (ims & ~PCRE_IMS) | ecode[4];
1480 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1483 if (*ecode == OP_KETRMIN)
1485 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1486 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1487 ecode = prev;
1488 flags = 0;
1489 goto TAIL_RECURSE;
1491 else /* OP_KETRMAX */
1493 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1494 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1495 ecode += 1 + LINK_SIZE;
1496 flags = 0;
1497 goto TAIL_RECURSE;
1499 /* Control never gets here */
1501 /* An alternation is the end of a branch; scan along to find the end of the
1502 bracketed group and go to there. */
1504 case OP_ALT:
1505 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1506 break;
1508 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1509 indicating that it may occur zero times. It may repeat infinitely, or not
1510 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1511 with fixed upper repeat limits are compiled as a number of copies, with the
1512 optional ones preceded by BRAZERO or BRAMINZERO. */
1514 case OP_BRAZERO:
1516 next = ecode+1;
1517 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1518 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1519 do next += GET(next,1); while (*next == OP_ALT);
1520 ecode = next + 1 + LINK_SIZE;
1522 break;
1524 case OP_BRAMINZERO:
1526 next = ecode+1;
1527 do next += GET(next, 1); while (*next == OP_ALT);
1528 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1529 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1530 ecode++;
1532 break;
1534 case OP_SKIPZERO:
1536 next = ecode+1;
1537 do next += GET(next,1); while (*next == OP_ALT);
1538 ecode = next + 1 + LINK_SIZE;
1540 break;
1542 /* End of a group, repeated or non-repeating. */
1544 case OP_KET:
1545 case OP_KETRMIN:
1546 case OP_KETRMAX:
1547 prev = ecode - GET(ecode, 1);
1549 /* If this was a group that remembered the subject start, in order to break
1550 infinite repeats of empty string matches, retrieve the subject start from
1551 the chain. Otherwise, set it NULL. */
1553 if (*prev >= OP_SBRA)
1555 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1556 eptrb = eptrb->epb_prev; /* Backup to previous group */
1558 else saved_eptr = NULL;
1560 /* If we are at the end of an assertion group or an atomic group, stop
1561 matching and return MATCH_MATCH, but record the current high water mark for
1562 use by positive assertions. We also need to record the match start in case
1563 it was changed by \K. */
1565 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1566 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1567 *prev == OP_ONCE)
1569 md->end_match_ptr = eptr; /* For ONCE */
1570 md->end_offset_top = offset_top;
1571 md->start_match_ptr = mstart;
1572 MRRETURN(MATCH_MATCH);
1575 /* For capturing groups we have to check the group number back at the start
1576 and if necessary complete handling an extraction by setting the offsets and
1577 bumping the high water mark. Note that whole-pattern recursion is coded as
1578 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1579 when the OP_END is reached. Other recursion is handled here. */
1581 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1583 number = GET2(prev, 1+LINK_SIZE);
1584 offset = number << 1;
1586 #ifdef PCRE_DEBUG
1587 printf("end bracket %d", number);
1588 printf("\n");
1589 #endif
1591 md->capture_last = number;
1592 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1594 md->offset_vector[offset] =
1595 md->offset_vector[md->offset_end - number];
1596 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1597 if (offset_top <= offset) offset_top = offset + 2;
1600 /* Handle a recursively called group. Restore the offsets
1601 appropriately and continue from after the call. */
1603 if (md->recursive != NULL && md->recursive->group_num == number)
1605 recursion_info *rec = md->recursive;
1606 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1607 md->recursive = rec->prevrec;
1608 memcpy(md->offset_vector, rec->offset_save,
1609 rec->saved_max * sizeof(int));
1610 offset_top = rec->save_offset_top;
1611 ecode = rec->after_call;
1612 ims = original_ims;
1613 break;
1617 /* For both capturing and non-capturing groups, reset the value of the ims
1618 flags, in case they got changed during the group. */
1620 ims = original_ims;
1621 DPRINTF(("ims reset to %02lx\n", ims));
1623 /* For a non-repeating ket, just continue at this level. This also
1624 happens for a repeating ket if no characters were matched in the group.
1625 This is the forcible breaking of infinite loops as implemented in Perl
1626 5.005. If there is an options reset, it will get obeyed in the normal
1627 course of events. */
1629 if (*ecode == OP_KET || eptr == saved_eptr)
1631 ecode += 1 + LINK_SIZE;
1632 break;
1635 /* The repeating kets try the rest of the pattern or restart from the
1636 preceding bracket, in the appropriate order. In the second case, we can use
1637 tail recursion to avoid using another stack frame, unless we have an
1638 unlimited repeat of a group that can match an empty string. */
1640 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1642 if (*ecode == OP_KETRMIN)
1644 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1645 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1646 if (flags != 0) /* Could match an empty string */
1648 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1649 RRETURN(rrc);
1651 ecode = prev;
1652 goto TAIL_RECURSE;
1654 else /* OP_KETRMAX */
1656 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1657 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1658 ecode += 1 + LINK_SIZE;
1659 flags = 0;
1660 goto TAIL_RECURSE;
1662 /* Control never gets here */
1664 /* Start of subject unless notbol, or after internal newline if multiline */
1666 case OP_CIRC:
1667 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1668 if ((ims & PCRE_MULTILINE) != 0)
1670 if (eptr != md->start_subject &&
1671 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1672 MRRETURN(MATCH_NOMATCH);
1673 ecode++;
1674 break;
1676 /* ... else fall through */
1678 /* Start of subject assertion */
1680 case OP_SOD:
1681 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1682 ecode++;
1683 break;
1685 /* Start of match assertion */
1687 case OP_SOM:
1688 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1689 ecode++;
1690 break;
1692 /* Reset the start of match point */
1694 case OP_SET_SOM:
1695 mstart = eptr;
1696 ecode++;
1697 break;
1699 /* Assert before internal newline if multiline, or before a terminating
1700 newline unless endonly is set, else end of subject unless noteol is set. */
1702 case OP_DOLL:
1703 if ((ims & PCRE_MULTILINE) != 0)
1705 if (eptr < md->end_subject)
1706 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1707 else
1709 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1710 SCHECK_PARTIAL();
1712 ecode++;
1713 break;
1715 else /* Not multiline */
1717 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1718 if (!md->endonly) goto ASSERT_NL_OR_EOS;
1721 /* ... else fall through for endonly */
1723 /* End of subject assertion (\z) */
1725 case OP_EOD:
1726 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1727 SCHECK_PARTIAL();
1728 ecode++;
1729 break;
1731 /* End of subject or ending \n assertion (\Z) */
1733 case OP_EODN:
1734 ASSERT_NL_OR_EOS:
1735 if (eptr < md->end_subject &&
1736 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1737 MRRETURN(MATCH_NOMATCH);
1739 /* Either at end of string or \n before end. */
1741 SCHECK_PARTIAL();
1742 ecode++;
1743 break;
1745 /* Word boundary assertions */
1747 case OP_NOT_WORD_BOUNDARY:
1748 case OP_WORD_BOUNDARY:
1751 /* Find out if the previous and current characters are "word" characters.
1752 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1753 be "non-word" characters. Remember the earliest consulted character for
1754 partial matching. */
1756 #ifdef SUPPORT_UTF8
1757 if (utf8)
1759 /* Get status of previous character */
1761 if (eptr == md->start_subject) prev_is_word = FALSE; else
1763 USPTR lastptr = eptr - 1;
1764 while((*lastptr & 0xc0) == 0x80) lastptr--;
1765 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1766 GETCHAR(c, lastptr);
1767 #ifdef SUPPORT_UCP
1768 if (md->use_ucp)
1770 if (c == '_') prev_is_word = TRUE; else
1772 int cat = UCD_CATEGORY(c);
1773 prev_is_word = (cat == ucp_L || cat == ucp_N);
1776 else
1777 #endif
1778 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1781 /* Get status of next character */
1783 if (eptr >= md->end_subject)
1785 SCHECK_PARTIAL();
1786 cur_is_word = FALSE;
1788 else
1790 GETCHAR(c, eptr);
1791 #ifdef SUPPORT_UCP
1792 if (md->use_ucp)
1794 if (c == '_') cur_is_word = TRUE; else
1796 int cat = UCD_CATEGORY(c);
1797 cur_is_word = (cat == ucp_L || cat == ucp_N);
1800 else
1801 #endif
1802 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1805 else
1806 #endif
1808 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1809 consistency with the behaviour of \w we do use it in this case. */
1812 /* Get status of previous character */
1814 if (eptr == md->start_subject) prev_is_word = FALSE; else
1816 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1817 #ifdef SUPPORT_UCP
1818 if (md->use_ucp)
1820 c = eptr[-1];
1821 if (c == '_') prev_is_word = TRUE; else
1823 int cat = UCD_CATEGORY(c);
1824 prev_is_word = (cat == ucp_L || cat == ucp_N);
1827 else
1828 #endif
1829 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1832 /* Get status of next character */
1834 if (eptr >= md->end_subject)
1836 SCHECK_PARTIAL();
1837 cur_is_word = FALSE;
1839 else
1840 #ifdef SUPPORT_UCP
1841 if (md->use_ucp)
1843 c = *eptr;
1844 if (c == '_') cur_is_word = TRUE; else
1846 int cat = UCD_CATEGORY(c);
1847 cur_is_word = (cat == ucp_L || cat == ucp_N);
1850 else
1851 #endif
1852 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1855 /* Now see if the situation is what we want */
1857 if ((*ecode++ == OP_WORD_BOUNDARY)?
1858 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1859 MRRETURN(MATCH_NOMATCH);
1861 break;
1863 /* Match a single character type; inline for speed */
1865 case OP_ANY:
1866 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1867 /* Fall through */
1869 case OP_ALLANY:
1870 if (eptr++ >= md->end_subject)
1872 SCHECK_PARTIAL();
1873 MRRETURN(MATCH_NOMATCH);
1875 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1876 ecode++;
1877 break;
1879 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1880 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1882 case OP_ANYBYTE:
1883 if (eptr++ >= md->end_subject)
1885 SCHECK_PARTIAL();
1886 MRRETURN(MATCH_NOMATCH);
1888 ecode++;
1889 break;
1891 case OP_NOT_DIGIT:
1892 if (eptr >= md->end_subject)
1894 SCHECK_PARTIAL();
1895 MRRETURN(MATCH_NOMATCH);
1897 GETCHARINCTEST(c, eptr);
1898 if (
1899 #ifdef SUPPORT_UTF8
1900 c < 256 &&
1901 #endif
1902 (md->ctypes[c] & ctype_digit) != 0
1904 MRRETURN(MATCH_NOMATCH);
1905 ecode++;
1906 break;
1908 case OP_DIGIT:
1909 if (eptr >= md->end_subject)
1911 SCHECK_PARTIAL();
1912 MRRETURN(MATCH_NOMATCH);
1914 GETCHARINCTEST(c, eptr);
1915 if (
1916 #ifdef SUPPORT_UTF8
1917 c >= 256 ||
1918 #endif
1919 (md->ctypes[c] & ctype_digit) == 0
1921 MRRETURN(MATCH_NOMATCH);
1922 ecode++;
1923 break;
1925 case OP_NOT_WHITESPACE:
1926 if (eptr >= md->end_subject)
1928 SCHECK_PARTIAL();
1929 MRRETURN(MATCH_NOMATCH);
1931 GETCHARINCTEST(c, eptr);
1932 if (
1933 #ifdef SUPPORT_UTF8
1934 c < 256 &&
1935 #endif
1936 (md->ctypes[c] & ctype_space) != 0
1938 MRRETURN(MATCH_NOMATCH);
1939 ecode++;
1940 break;
1942 case OP_WHITESPACE:
1943 if (eptr >= md->end_subject)
1945 SCHECK_PARTIAL();
1946 MRRETURN(MATCH_NOMATCH);
1948 GETCHARINCTEST(c, eptr);
1949 if (
1950 #ifdef SUPPORT_UTF8
1951 c >= 256 ||
1952 #endif
1953 (md->ctypes[c] & ctype_space) == 0
1955 MRRETURN(MATCH_NOMATCH);
1956 ecode++;
1957 break;
1959 case OP_NOT_WORDCHAR:
1960 if (eptr >= md->end_subject)
1962 SCHECK_PARTIAL();
1963 MRRETURN(MATCH_NOMATCH);
1965 GETCHARINCTEST(c, eptr);
1966 if (
1967 #ifdef SUPPORT_UTF8
1968 c < 256 &&
1969 #endif
1970 (md->ctypes[c] & ctype_word) != 0
1972 MRRETURN(MATCH_NOMATCH);
1973 ecode++;
1974 break;
1976 case OP_WORDCHAR:
1977 if (eptr >= md->end_subject)
1979 SCHECK_PARTIAL();
1980 MRRETURN(MATCH_NOMATCH);
1982 GETCHARINCTEST(c, eptr);
1983 if (
1984 #ifdef SUPPORT_UTF8
1985 c >= 256 ||
1986 #endif
1987 (md->ctypes[c] & ctype_word) == 0
1989 MRRETURN(MATCH_NOMATCH);
1990 ecode++;
1991 break;
1993 case OP_ANYNL:
1994 if (eptr >= md->end_subject)
1996 SCHECK_PARTIAL();
1997 MRRETURN(MATCH_NOMATCH);
1999 GETCHARINCTEST(c, eptr);
2000 switch(c)
2002 default: MRRETURN(MATCH_NOMATCH);
2003 case 0x000d:
2004 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2005 break;
2007 case 0x000a:
2008 break;
2010 case 0x000b:
2011 case 0x000c:
2012 case 0x0085:
2013 case 0x2028:
2014 case 0x2029:
2015 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2016 break;
2018 ecode++;
2019 break;
2021 case OP_NOT_HSPACE:
2022 if (eptr >= md->end_subject)
2024 SCHECK_PARTIAL();
2025 MRRETURN(MATCH_NOMATCH);
2027 GETCHARINCTEST(c, eptr);
2028 switch(c)
2030 default: break;
2031 case 0x09: /* HT */
2032 case 0x20: /* SPACE */
2033 case 0xa0: /* NBSP */
2034 case 0x1680: /* OGHAM SPACE MARK */
2035 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2036 case 0x2000: /* EN QUAD */
2037 case 0x2001: /* EM QUAD */
2038 case 0x2002: /* EN SPACE */
2039 case 0x2003: /* EM SPACE */
2040 case 0x2004: /* THREE-PER-EM SPACE */
2041 case 0x2005: /* FOUR-PER-EM SPACE */
2042 case 0x2006: /* SIX-PER-EM SPACE */
2043 case 0x2007: /* FIGURE SPACE */
2044 case 0x2008: /* PUNCTUATION SPACE */
2045 case 0x2009: /* THIN SPACE */
2046 case 0x200A: /* HAIR SPACE */
2047 case 0x202f: /* NARROW NO-BREAK SPACE */
2048 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2049 case 0x3000: /* IDEOGRAPHIC SPACE */
2050 MRRETURN(MATCH_NOMATCH);
2052 ecode++;
2053 break;
2055 case OP_HSPACE:
2056 if (eptr >= md->end_subject)
2058 SCHECK_PARTIAL();
2059 MRRETURN(MATCH_NOMATCH);
2061 GETCHARINCTEST(c, eptr);
2062 switch(c)
2064 default: MRRETURN(MATCH_NOMATCH);
2065 case 0x09: /* HT */
2066 case 0x20: /* SPACE */
2067 case 0xa0: /* NBSP */
2068 case 0x1680: /* OGHAM SPACE MARK */
2069 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2070 case 0x2000: /* EN QUAD */
2071 case 0x2001: /* EM QUAD */
2072 case 0x2002: /* EN SPACE */
2073 case 0x2003: /* EM SPACE */
2074 case 0x2004: /* THREE-PER-EM SPACE */
2075 case 0x2005: /* FOUR-PER-EM SPACE */
2076 case 0x2006: /* SIX-PER-EM SPACE */
2077 case 0x2007: /* FIGURE SPACE */
2078 case 0x2008: /* PUNCTUATION SPACE */
2079 case 0x2009: /* THIN SPACE */
2080 case 0x200A: /* HAIR SPACE */
2081 case 0x202f: /* NARROW NO-BREAK SPACE */
2082 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2083 case 0x3000: /* IDEOGRAPHIC SPACE */
2084 break;
2086 ecode++;
2087 break;
2089 case OP_NOT_VSPACE:
2090 if (eptr >= md->end_subject)
2092 SCHECK_PARTIAL();
2093 MRRETURN(MATCH_NOMATCH);
2095 GETCHARINCTEST(c, eptr);
2096 switch(c)
2098 default: break;
2099 case 0x0a: /* LF */
2100 case 0x0b: /* VT */
2101 case 0x0c: /* FF */
2102 case 0x0d: /* CR */
2103 case 0x85: /* NEL */
2104 case 0x2028: /* LINE SEPARATOR */
2105 case 0x2029: /* PARAGRAPH SEPARATOR */
2106 MRRETURN(MATCH_NOMATCH);
2108 ecode++;
2109 break;
2111 case OP_VSPACE:
2112 if (eptr >= md->end_subject)
2114 SCHECK_PARTIAL();
2115 MRRETURN(MATCH_NOMATCH);
2117 GETCHARINCTEST(c, eptr);
2118 switch(c)
2120 default: MRRETURN(MATCH_NOMATCH);
2121 case 0x0a: /* LF */
2122 case 0x0b: /* VT */
2123 case 0x0c: /* FF */
2124 case 0x0d: /* CR */
2125 case 0x85: /* NEL */
2126 case 0x2028: /* LINE SEPARATOR */
2127 case 0x2029: /* PARAGRAPH SEPARATOR */
2128 break;
2130 ecode++;
2131 break;
2133 #ifdef SUPPORT_UCP
2134 /* Check the next character by Unicode property. We will get here only
2135 if the support is in the binary; otherwise a compile-time error occurs. */
2137 case OP_PROP:
2138 case OP_NOTPROP:
2139 if (eptr >= md->end_subject)
2141 SCHECK_PARTIAL();
2142 MRRETURN(MATCH_NOMATCH);
2144 GETCHARINCTEST(c, eptr);
2146 int chartype = UCD_CHARTYPE(c);
2148 switch(ecode[1])
2150 case PT_ANY:
2151 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2152 break;
2154 case PT_LAMP:
2155 if ((chartype == ucp_Lu ||
2156 chartype == ucp_Ll ||
2157 chartype == ucp_Lt) == (op == OP_NOTPROP))
2158 MRRETURN(MATCH_NOMATCH);
2159 break;
2161 case PT_GC:
2162 if ((ecode[2] != _pcre_ucp_gentype[chartype]) == (op == OP_PROP))
2163 MRRETURN(MATCH_NOMATCH);
2164 break;
2166 case PT_PC:
2167 if ((ecode[2] != chartype) == (op == OP_PROP))
2168 MRRETURN(MATCH_NOMATCH);
2169 break;
2171 case PT_SC:
2172 if ((ecode[2] != UCD_SCRIPT(c)) == (op == OP_PROP))
2173 MRRETURN(MATCH_NOMATCH);
2174 break;
2176 /* These are specials */
2178 case PT_ALNUM:
2179 if ((_pcre_ucp_gentype[chartype] == ucp_L ||
2180 _pcre_ucp_gentype[chartype] == ucp_N) == (op == OP_NOTPROP))
2181 MRRETURN(MATCH_NOMATCH);
2182 break;
2184 case PT_SPACE: /* Perl space */
2185 if ((_pcre_ucp_gentype[chartype] == ucp_Z ||
2186 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2187 == (op == OP_NOTPROP))
2188 MRRETURN(MATCH_NOMATCH);
2189 break;
2191 case PT_PXSPACE: /* POSIX space */
2192 if ((_pcre_ucp_gentype[chartype] == ucp_Z ||
2193 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2194 c == CHAR_FF || c == CHAR_CR)
2195 == (op == OP_NOTPROP))
2196 MRRETURN(MATCH_NOMATCH);
2197 break;
2199 case PT_WORD:
2200 if ((_pcre_ucp_gentype[chartype] == ucp_L ||
2201 _pcre_ucp_gentype[chartype] == ucp_N ||
2202 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2203 MRRETURN(MATCH_NOMATCH);
2204 break;
2206 /* This should never occur */
2208 default:
2209 RRETURN(PCRE_ERROR_INTERNAL);
2212 ecode += 3;
2214 break;
2216 /* Match an extended Unicode sequence. We will get here only if the support
2217 is in the binary; otherwise a compile-time error occurs. */
2219 case OP_EXTUNI:
2220 if (eptr >= md->end_subject)
2222 SCHECK_PARTIAL();
2223 MRRETURN(MATCH_NOMATCH);
2225 GETCHARINCTEST(c, eptr);
2227 int category = UCD_CATEGORY(c);
2228 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2229 while (eptr < md->end_subject)
2231 int len = 1;
2232 if (!utf8) c = *eptr; else
2234 GETCHARLEN(c, eptr, len);
2236 category = UCD_CATEGORY(c);
2237 if (category != ucp_M) break;
2238 eptr += len;
2241 ecode++;
2242 break;
2243 #endif
2246 /* Match a back reference, possibly repeatedly. Look past the end of the
2247 item to see if there is repeat information following. The code is similar
2248 to that for character classes, but repeated for efficiency. Then obey
2249 similar code to character type repeats - written out again for speed.
2250 However, if the referenced string is the empty string, always treat
2251 it as matched, any number of times (otherwise there could be infinite
2252 loops). */
2254 case OP_REF:
2256 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2257 ecode += 3;
2259 /* If the reference is unset, there are two possibilities:
2261 (a) In the default, Perl-compatible state, set the length to be longer
2262 than the amount of subject left; this ensures that every attempt at a
2263 match fails. We can't just fail here, because of the possibility of
2264 quantifiers with zero minima.
2266 (b) If the JavaScript compatibility flag is set, set the length to zero
2267 so that the back reference matches an empty string.
2269 Otherwise, set the length to the length of what was matched by the
2270 referenced subpattern. */
2272 if (offset >= offset_top || md->offset_vector[offset] < 0)
2273 length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1);
2274 else
2275 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2277 /* Set up for repetition, or handle the non-repeated case */
2279 switch (*ecode)
2281 case OP_CRSTAR:
2282 case OP_CRMINSTAR:
2283 case OP_CRPLUS:
2284 case OP_CRMINPLUS:
2285 case OP_CRQUERY:
2286 case OP_CRMINQUERY:
2287 c = *ecode++ - OP_CRSTAR;
2288 minimize = (c & 1) != 0;
2289 min = rep_min[c]; /* Pick up values from tables; */
2290 max = rep_max[c]; /* zero for max => infinity */
2291 if (max == 0) max = INT_MAX;
2292 break;
2294 case OP_CRRANGE:
2295 case OP_CRMINRANGE:
2296 minimize = (*ecode == OP_CRMINRANGE);
2297 min = GET2(ecode, 1);
2298 max = GET2(ecode, 3);
2299 if (max == 0) max = INT_MAX;
2300 ecode += 5;
2301 break;
2303 default: /* No repeat follows */
2304 if (!match_ref(offset, eptr, length, md, ims))
2306 CHECK_PARTIAL();
2307 MRRETURN(MATCH_NOMATCH);
2309 eptr += length;
2310 continue; /* With the main loop */
2313 /* If the length of the reference is zero, just continue with the
2314 main loop. */
2316 if (length == 0) continue;
2318 /* First, ensure the minimum number of matches are present. We get back
2319 the length of the reference string explicitly rather than passing the
2320 address of eptr, so that eptr can be a register variable. */
2322 for (i = 1; i <= min; i++)
2324 if (!match_ref(offset, eptr, length, md, ims))
2326 CHECK_PARTIAL();
2327 MRRETURN(MATCH_NOMATCH);
2329 eptr += length;
2332 /* If min = max, continue at the same level without recursion.
2333 They are not both allowed to be zero. */
2335 if (min == max) continue;
2337 /* If minimizing, keep trying and advancing the pointer */
2339 if (minimize)
2341 for (fi = min;; fi++)
2343 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2344 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2345 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2346 if (!match_ref(offset, eptr, length, md, ims))
2348 CHECK_PARTIAL();
2349 MRRETURN(MATCH_NOMATCH);
2351 eptr += length;
2353 /* Control never gets here */
2356 /* If maximizing, find the longest string and work backwards */
2358 else
2360 pp = eptr;
2361 for (i = min; i < max; i++)
2363 if (!match_ref(offset, eptr, length, md, ims))
2365 CHECK_PARTIAL();
2366 break;
2368 eptr += length;
2370 while (eptr >= pp)
2372 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2373 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2374 eptr -= length;
2376 MRRETURN(MATCH_NOMATCH);
2379 /* Control never gets here */
2381 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2382 used when all the characters in the class have values in the range 0-255,
2383 and either the matching is caseful, or the characters are in the range
2384 0-127 when UTF-8 processing is enabled. The only difference between
2385 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2386 encountered.
2388 First, look past the end of the item to see if there is repeat information
2389 following. Then obey similar code to character type repeats - written out
2390 again for speed. */
2392 case OP_NCLASS:
2393 case OP_CLASS:
2395 data = ecode + 1; /* Save for matching */
2396 ecode += 33; /* Advance past the item */
2398 switch (*ecode)
2400 case OP_CRSTAR:
2401 case OP_CRMINSTAR:
2402 case OP_CRPLUS:
2403 case OP_CRMINPLUS:
2404 case OP_CRQUERY:
2405 case OP_CRMINQUERY:
2406 c = *ecode++ - OP_CRSTAR;
2407 minimize = (c & 1) != 0;
2408 min = rep_min[c]; /* Pick up values from tables; */
2409 max = rep_max[c]; /* zero for max => infinity */
2410 if (max == 0) max = INT_MAX;
2411 break;
2413 case OP_CRRANGE:
2414 case OP_CRMINRANGE:
2415 minimize = (*ecode == OP_CRMINRANGE);
2416 min = GET2(ecode, 1);
2417 max = GET2(ecode, 3);
2418 if (max == 0) max = INT_MAX;
2419 ecode += 5;
2420 break;
2422 default: /* No repeat follows */
2423 min = max = 1;
2424 break;
2427 /* First, ensure the minimum number of matches are present. */
2429 #ifdef SUPPORT_UTF8
2430 /* UTF-8 mode */
2431 if (utf8)
2433 for (i = 1; i <= min; i++)
2435 if (eptr >= md->end_subject)
2437 SCHECK_PARTIAL();
2438 MRRETURN(MATCH_NOMATCH);
2440 GETCHARINC(c, eptr);
2441 if (c > 255)
2443 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2445 else
2447 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2451 else
2452 #endif
2453 /* Not UTF-8 mode */
2455 for (i = 1; i <= min; i++)
2457 if (eptr >= md->end_subject)
2459 SCHECK_PARTIAL();
2460 MRRETURN(MATCH_NOMATCH);
2462 c = *eptr++;
2463 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2467 /* If max == min we can continue with the main loop without the
2468 need to recurse. */
2470 if (min == max) continue;
2472 /* If minimizing, keep testing the rest of the expression and advancing
2473 the pointer while it matches the class. */
2475 if (minimize)
2477 #ifdef SUPPORT_UTF8
2478 /* UTF-8 mode */
2479 if (utf8)
2481 for (fi = min;; fi++)
2483 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2484 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2485 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2486 if (eptr >= md->end_subject)
2488 SCHECK_PARTIAL();
2489 MRRETURN(MATCH_NOMATCH);
2491 GETCHARINC(c, eptr);
2492 if (c > 255)
2494 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2496 else
2498 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2502 else
2503 #endif
2504 /* Not UTF-8 mode */
2506 for (fi = min;; fi++)
2508 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2509 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2511 if (eptr >= md->end_subject)
2513 SCHECK_PARTIAL();
2514 MRRETURN(MATCH_NOMATCH);
2516 c = *eptr++;
2517 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2520 /* Control never gets here */
2523 /* If maximizing, find the longest possible run, then work backwards. */
2525 else
2527 pp = eptr;
2529 #ifdef SUPPORT_UTF8
2530 /* UTF-8 mode */
2531 if (utf8)
2533 for (i = min; i < max; i++)
2535 int len = 1;
2536 if (eptr >= md->end_subject)
2538 SCHECK_PARTIAL();
2539 break;
2541 GETCHARLEN(c, eptr, len);
2542 if (c > 255)
2544 if (op == OP_CLASS) break;
2546 else
2548 if ((data[c/8] & (1 << (c&7))) == 0) break;
2550 eptr += len;
2552 for (;;)
2554 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2555 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2556 if (eptr-- == pp) break; /* Stop if tried at original pos */
2557 BACKCHAR(eptr);
2560 else
2561 #endif
2562 /* Not UTF-8 mode */
2564 for (i = min; i < max; i++)
2566 if (eptr >= md->end_subject)
2568 SCHECK_PARTIAL();
2569 break;
2571 c = *eptr;
2572 if ((data[c/8] & (1 << (c&7))) == 0) break;
2573 eptr++;
2575 while (eptr >= pp)
2577 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2578 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2579 eptr--;
2583 MRRETURN(MATCH_NOMATCH);
2586 /* Control never gets here */
2589 /* Match an extended character class. This opcode is encountered only
2590 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2591 mode, because Unicode properties are supported in non-UTF-8 mode. */
2593 #ifdef SUPPORT_UTF8
2594 case OP_XCLASS:
2596 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2597 ecode += GET(ecode, 1); /* Advance past the item */
2599 switch (*ecode)
2601 case OP_CRSTAR:
2602 case OP_CRMINSTAR:
2603 case OP_CRPLUS:
2604 case OP_CRMINPLUS:
2605 case OP_CRQUERY:
2606 case OP_CRMINQUERY:
2607 c = *ecode++ - OP_CRSTAR;
2608 minimize = (c & 1) != 0;
2609 min = rep_min[c]; /* Pick up values from tables; */
2610 max = rep_max[c]; /* zero for max => infinity */
2611 if (max == 0) max = INT_MAX;
2612 break;
2614 case OP_CRRANGE:
2615 case OP_CRMINRANGE:
2616 minimize = (*ecode == OP_CRMINRANGE);
2617 min = GET2(ecode, 1);
2618 max = GET2(ecode, 3);
2619 if (max == 0) max = INT_MAX;
2620 ecode += 5;
2621 break;
2623 default: /* No repeat follows */
2624 min = max = 1;
2625 break;
2628 /* First, ensure the minimum number of matches are present. */
2630 for (i = 1; i <= min; i++)
2632 if (eptr >= md->end_subject)
2634 SCHECK_PARTIAL();
2635 MRRETURN(MATCH_NOMATCH);
2637 GETCHARINCTEST(c, eptr);
2638 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2641 /* If max == min we can continue with the main loop without the
2642 need to recurse. */
2644 if (min == max) continue;
2646 /* If minimizing, keep testing the rest of the expression and advancing
2647 the pointer while it matches the class. */
2649 if (minimize)
2651 for (fi = min;; fi++)
2653 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2654 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2655 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2656 if (eptr >= md->end_subject)
2658 SCHECK_PARTIAL();
2659 MRRETURN(MATCH_NOMATCH);
2661 GETCHARINCTEST(c, eptr);
2662 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2664 /* Control never gets here */
2667 /* If maximizing, find the longest possible run, then work backwards. */
2669 else
2671 pp = eptr;
2672 for (i = min; i < max; i++)
2674 int len = 1;
2675 if (eptr >= md->end_subject)
2677 SCHECK_PARTIAL();
2678 break;
2680 GETCHARLENTEST(c, eptr, len);
2681 if (!_pcre_xclass(c, data)) break;
2682 eptr += len;
2684 for(;;)
2686 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2687 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2688 if (eptr-- == pp) break; /* Stop if tried at original pos */
2689 if (utf8) BACKCHAR(eptr);
2691 MRRETURN(MATCH_NOMATCH);
2694 /* Control never gets here */
2696 #endif /* End of XCLASS */
2698 /* Match a single character, casefully */
2700 case OP_CHAR:
2701 #ifdef SUPPORT_UTF8
2702 if (utf8)
2704 length = 1;
2705 ecode++;
2706 GETCHARLEN(fc, ecode, length);
2707 if (length > md->end_subject - eptr)
2709 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2710 MRRETURN(MATCH_NOMATCH);
2712 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2714 else
2715 #endif
2717 /* Non-UTF-8 mode */
2719 if (md->end_subject - eptr < 1)
2721 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2722 MRRETURN(MATCH_NOMATCH);
2724 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2725 ecode += 2;
2727 break;
2729 /* Match a single character, caselessly */
2731 case OP_CHARNC:
2732 #ifdef SUPPORT_UTF8
2733 if (utf8)
2735 length = 1;
2736 ecode++;
2737 GETCHARLEN(fc, ecode, length);
2739 if (length > md->end_subject - eptr)
2741 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2742 MRRETURN(MATCH_NOMATCH);
2745 /* If the pattern character's value is < 128, we have only one byte, and
2746 can use the fast lookup table. */
2748 if (fc < 128)
2750 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2753 /* Otherwise we must pick up the subject character */
2755 else
2757 unsigned int dc;
2758 GETCHARINC(dc, eptr);
2759 ecode += length;
2761 /* If we have Unicode property support, we can use it to test the other
2762 case of the character, if there is one. */
2764 if (fc != dc)
2766 #ifdef SUPPORT_UCP
2767 if (dc != UCD_OTHERCASE(fc))
2768 #endif
2769 MRRETURN(MATCH_NOMATCH);
2773 else
2774 #endif /* SUPPORT_UTF8 */
2776 /* Non-UTF-8 mode */
2778 if (md->end_subject - eptr < 1)
2780 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2781 MRRETURN(MATCH_NOMATCH);
2783 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2784 ecode += 2;
2786 break;
2788 /* Match a single character repeatedly. */
2790 case OP_EXACT:
2791 min = max = GET2(ecode, 1);
2792 ecode += 3;
2793 goto REPEATCHAR;
2795 case OP_POSUPTO:
2796 possessive = TRUE;
2797 /* Fall through */
2799 case OP_UPTO:
2800 case OP_MINUPTO:
2801 min = 0;
2802 max = GET2(ecode, 1);
2803 minimize = *ecode == OP_MINUPTO;
2804 ecode += 3;
2805 goto REPEATCHAR;
2807 case OP_POSSTAR:
2808 possessive = TRUE;
2809 min = 0;
2810 max = INT_MAX;
2811 ecode++;
2812 goto REPEATCHAR;
2814 case OP_POSPLUS:
2815 possessive = TRUE;
2816 min = 1;
2817 max = INT_MAX;
2818 ecode++;
2819 goto REPEATCHAR;
2821 case OP_POSQUERY:
2822 possessive = TRUE;
2823 min = 0;
2824 max = 1;
2825 ecode++;
2826 goto REPEATCHAR;
2828 case OP_STAR:
2829 case OP_MINSTAR:
2830 case OP_PLUS:
2831 case OP_MINPLUS:
2832 case OP_QUERY:
2833 case OP_MINQUERY:
2834 c = *ecode++ - OP_STAR;
2835 minimize = (c & 1) != 0;
2837 min = rep_min[c]; /* Pick up values from tables; */
2838 max = rep_max[c]; /* zero for max => infinity */
2839 if (max == 0) max = INT_MAX;
2841 /* Common code for all repeated single-character matches. */
2843 REPEATCHAR:
2844 #ifdef SUPPORT_UTF8
2845 if (utf8)
2847 length = 1;
2848 charptr = ecode;
2849 GETCHARLEN(fc, ecode, length);
2850 ecode += length;
2852 /* Handle multibyte character matching specially here. There is
2853 support for caseless matching if UCP support is present. */
2855 if (length > 1)
2857 #ifdef SUPPORT_UCP
2858 unsigned int othercase;
2859 if ((ims & PCRE_CASELESS) != 0 &&
2860 (othercase = UCD_OTHERCASE(fc)) != fc)
2861 oclength = _pcre_ord2utf8(othercase, occhars);
2862 else oclength = 0;
2863 #endif /* SUPPORT_UCP */
2865 for (i = 1; i <= min; i++)
2867 if (eptr <= md->end_subject - length &&
2868 memcmp(eptr, charptr, length) == 0) eptr += length;
2869 #ifdef SUPPORT_UCP
2870 else if (oclength > 0 &&
2871 eptr <= md->end_subject - oclength &&
2872 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2873 #endif /* SUPPORT_UCP */
2874 else
2876 CHECK_PARTIAL();
2877 MRRETURN(MATCH_NOMATCH);
2881 if (min == max) continue;
2883 if (minimize)
2885 for (fi = min;; fi++)
2887 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2888 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2889 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2890 if (eptr <= md->end_subject - length &&
2891 memcmp(eptr, charptr, length) == 0) eptr += length;
2892 #ifdef SUPPORT_UCP
2893 else if (oclength > 0 &&
2894 eptr <= md->end_subject - oclength &&
2895 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2896 #endif /* SUPPORT_UCP */
2897 else
2899 CHECK_PARTIAL();
2900 MRRETURN(MATCH_NOMATCH);
2903 /* Control never gets here */
2906 else /* Maximize */
2908 pp = eptr;
2909 for (i = min; i < max; i++)
2911 if (eptr <= md->end_subject - length &&
2912 memcmp(eptr, charptr, length) == 0) eptr += length;
2913 #ifdef SUPPORT_UCP
2914 else if (oclength > 0 &&
2915 eptr <= md->end_subject - oclength &&
2916 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2917 #endif /* SUPPORT_UCP */
2918 else
2920 CHECK_PARTIAL();
2921 break;
2925 if (possessive) continue;
2927 for(;;)
2929 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2930 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2931 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
2932 #ifdef SUPPORT_UCP
2933 eptr--;
2934 BACKCHAR(eptr);
2935 #else /* without SUPPORT_UCP */
2936 eptr -= length;
2937 #endif /* SUPPORT_UCP */
2940 /* Control never gets here */
2943 /* If the length of a UTF-8 character is 1, we fall through here, and
2944 obey the code as for non-UTF-8 characters below, though in this case the
2945 value of fc will always be < 128. */
2947 else
2948 #endif /* SUPPORT_UTF8 */
2950 /* When not in UTF-8 mode, load a single-byte character. */
2952 fc = *ecode++;
2954 /* The value of fc at this point is always less than 256, though we may or
2955 may not be in UTF-8 mode. The code is duplicated for the caseless and
2956 caseful cases, for speed, since matching characters is likely to be quite
2957 common. First, ensure the minimum number of matches are present. If min =
2958 max, continue at the same level without recursing. Otherwise, if
2959 minimizing, keep trying the rest of the expression and advancing one
2960 matching character if failing, up to the maximum. Alternatively, if
2961 maximizing, find the maximum number of characters and work backwards. */
2963 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2964 max, eptr));
2966 if ((ims & PCRE_CASELESS) != 0)
2968 fc = md->lcc[fc];
2969 for (i = 1; i <= min; i++)
2971 if (eptr >= md->end_subject)
2973 SCHECK_PARTIAL();
2974 MRRETURN(MATCH_NOMATCH);
2976 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2978 if (min == max) continue;
2979 if (minimize)
2981 for (fi = min;; fi++)
2983 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2984 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2985 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2986 if (eptr >= md->end_subject)
2988 SCHECK_PARTIAL();
2989 MRRETURN(MATCH_NOMATCH);
2991 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2993 /* Control never gets here */
2995 else /* Maximize */
2997 pp = eptr;
2998 for (i = min; i < max; i++)
3000 if (eptr >= md->end_subject)
3002 SCHECK_PARTIAL();
3003 break;
3005 if (fc != md->lcc[*eptr]) break;
3006 eptr++;
3009 if (possessive) continue;
3011 while (eptr >= pp)
3013 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
3014 eptr--;
3015 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3017 MRRETURN(MATCH_NOMATCH);
3019 /* Control never gets here */
3022 /* Caseful comparisons (includes all multi-byte characters) */
3024 else
3026 for (i = 1; i <= min; i++)
3028 if (eptr >= md->end_subject)
3030 SCHECK_PARTIAL();
3031 MRRETURN(MATCH_NOMATCH);
3033 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3036 if (min == max) continue;
3038 if (minimize)
3040 for (fi = min;; fi++)
3042 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
3043 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3044 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3045 if (eptr >= md->end_subject)
3047 SCHECK_PARTIAL();
3048 MRRETURN(MATCH_NOMATCH);
3050 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3052 /* Control never gets here */
3054 else /* Maximize */
3056 pp = eptr;
3057 for (i = min; i < max; i++)
3059 if (eptr >= md->end_subject)
3061 SCHECK_PARTIAL();
3062 break;
3064 if (fc != *eptr) break;
3065 eptr++;
3067 if (possessive) continue;
3069 while (eptr >= pp)
3071 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
3072 eptr--;
3073 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3075 MRRETURN(MATCH_NOMATCH);
3078 /* Control never gets here */
3080 /* Match a negated single one-byte character. The character we are
3081 checking can be multibyte. */
3083 case OP_NOT:
3084 if (eptr >= md->end_subject)
3086 SCHECK_PARTIAL();
3087 MRRETURN(MATCH_NOMATCH);
3089 ecode++;
3090 GETCHARINCTEST(c, eptr);
3091 if ((ims & PCRE_CASELESS) != 0)
3093 #ifdef SUPPORT_UTF8
3094 if (c < 256)
3095 #endif
3096 c = md->lcc[c];
3097 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3099 else
3101 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3103 break;
3105 /* Match a negated single one-byte character repeatedly. This is almost a
3106 repeat of the code for a repeated single character, but I haven't found a
3107 nice way of commoning these up that doesn't require a test of the
3108 positive/negative option for each character match. Maybe that wouldn't add
3109 very much to the time taken, but character matching *is* what this is all
3110 about... */
3112 case OP_NOTEXACT:
3113 min = max = GET2(ecode, 1);
3114 ecode += 3;
3115 goto REPEATNOTCHAR;
3117 case OP_NOTUPTO:
3118 case OP_NOTMINUPTO:
3119 min = 0;
3120 max = GET2(ecode, 1);
3121 minimize = *ecode == OP_NOTMINUPTO;
3122 ecode += 3;
3123 goto REPEATNOTCHAR;
3125 case OP_NOTPOSSTAR:
3126 possessive = TRUE;
3127 min = 0;
3128 max = INT_MAX;
3129 ecode++;
3130 goto REPEATNOTCHAR;
3132 case OP_NOTPOSPLUS:
3133 possessive = TRUE;
3134 min = 1;
3135 max = INT_MAX;
3136 ecode++;
3137 goto REPEATNOTCHAR;
3139 case OP_NOTPOSQUERY:
3140 possessive = TRUE;
3141 min = 0;
3142 max = 1;
3143 ecode++;
3144 goto REPEATNOTCHAR;
3146 case OP_NOTPOSUPTO:
3147 possessive = TRUE;
3148 min = 0;
3149 max = GET2(ecode, 1);
3150 ecode += 3;
3151 goto REPEATNOTCHAR;
3153 case OP_NOTSTAR:
3154 case OP_NOTMINSTAR:
3155 case OP_NOTPLUS:
3156 case OP_NOTMINPLUS:
3157 case OP_NOTQUERY:
3158 case OP_NOTMINQUERY:
3159 c = *ecode++ - OP_NOTSTAR;
3160 minimize = (c & 1) != 0;
3161 min = rep_min[c]; /* Pick up values from tables; */
3162 max = rep_max[c]; /* zero for max => infinity */
3163 if (max == 0) max = INT_MAX;
3165 /* Common code for all repeated single-byte matches. */
3167 REPEATNOTCHAR:
3168 fc = *ecode++;
3170 /* The code is duplicated for the caseless and caseful cases, for speed,
3171 since matching characters is likely to be quite common. First, ensure the
3172 minimum number of matches are present. If min = max, continue at the same
3173 level without recursing. Otherwise, if minimizing, keep trying the rest of
3174 the expression and advancing one matching character if failing, up to the
3175 maximum. Alternatively, if maximizing, find the maximum number of
3176 characters and work backwards. */
3178 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3179 max, eptr));
3181 if ((ims & PCRE_CASELESS) != 0)
3183 fc = md->lcc[fc];
3185 #ifdef SUPPORT_UTF8
3186 /* UTF-8 mode */
3187 if (utf8)
3189 register unsigned int d;
3190 for (i = 1; i <= min; i++)
3192 if (eptr >= md->end_subject)
3194 SCHECK_PARTIAL();
3195 MRRETURN(MATCH_NOMATCH);
3197 GETCHARINC(d, eptr);
3198 if (d < 256) d = md->lcc[d];
3199 if (fc == d) MRRETURN(MATCH_NOMATCH);
3202 else
3203 #endif
3205 /* Not UTF-8 mode */
3207 for (i = 1; i <= min; i++)
3209 if (eptr >= md->end_subject)
3211 SCHECK_PARTIAL();
3212 MRRETURN(MATCH_NOMATCH);
3214 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3218 if (min == max) continue;
3220 if (minimize)
3222 #ifdef SUPPORT_UTF8
3223 /* UTF-8 mode */
3224 if (utf8)
3226 register unsigned int d;
3227 for (fi = min;; fi++)
3229 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
3230 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3231 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3232 if (eptr >= md->end_subject)
3234 SCHECK_PARTIAL();
3235 MRRETURN(MATCH_NOMATCH);
3237 GETCHARINC(d, eptr);
3238 if (d < 256) d = md->lcc[d];
3239 if (fc == d) MRRETURN(MATCH_NOMATCH);
3242 else
3243 #endif
3244 /* Not UTF-8 mode */
3246 for (fi = min;; fi++)
3248 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
3249 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3250 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3251 if (eptr >= md->end_subject)
3253 SCHECK_PARTIAL();
3254 MRRETURN(MATCH_NOMATCH);
3256 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3259 /* Control never gets here */
3262 /* Maximize case */
3264 else
3266 pp = eptr;
3268 #ifdef SUPPORT_UTF8
3269 /* UTF-8 mode */
3270 if (utf8)
3272 register unsigned int d;
3273 for (i = min; i < max; i++)
3275 int len = 1;
3276 if (eptr >= md->end_subject)
3278 SCHECK_PARTIAL();
3279 break;
3281 GETCHARLEN(d, eptr, len);
3282 if (d < 256) d = md->lcc[d];
3283 if (fc == d) break;
3284 eptr += len;
3286 if (possessive) continue;
3287 for(;;)
3289 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
3290 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3291 if (eptr-- == pp) break; /* Stop if tried at original pos */
3292 BACKCHAR(eptr);
3295 else
3296 #endif
3297 /* Not UTF-8 mode */
3299 for (i = min; i < max; i++)
3301 if (eptr >= md->end_subject)
3303 SCHECK_PARTIAL();
3304 break;
3306 if (fc == md->lcc[*eptr]) break;
3307 eptr++;
3309 if (possessive) continue;
3310 while (eptr >= pp)
3312 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
3313 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3314 eptr--;
3318 MRRETURN(MATCH_NOMATCH);
3320 /* Control never gets here */
3323 /* Caseful comparisons */
3325 else
3327 #ifdef SUPPORT_UTF8
3328 /* UTF-8 mode */
3329 if (utf8)
3331 register unsigned int d;
3332 for (i = 1; i <= min; i++)
3334 if (eptr >= md->end_subject)
3336 SCHECK_PARTIAL();
3337 MRRETURN(MATCH_NOMATCH);
3339 GETCHARINC(d, eptr);
3340 if (fc == d) MRRETURN(MATCH_NOMATCH);
3343 else
3344 #endif
3345 /* Not UTF-8 mode */
3347 for (i = 1; i <= min; i++)
3349 if (eptr >= md->end_subject)
3351 SCHECK_PARTIAL();
3352 MRRETURN(MATCH_NOMATCH);
3354 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3358 if (min == max) continue;
3360 if (minimize)
3362 #ifdef SUPPORT_UTF8
3363 /* UTF-8 mode */
3364 if (utf8)
3366 register unsigned int d;
3367 for (fi = min;; fi++)
3369 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
3370 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3371 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3372 if (eptr >= md->end_subject)
3374 SCHECK_PARTIAL();
3375 MRRETURN(MATCH_NOMATCH);
3377 GETCHARINC(d, eptr);
3378 if (fc == d) MRRETURN(MATCH_NOMATCH);
3381 else
3382 #endif
3383 /* Not UTF-8 mode */
3385 for (fi = min;; fi++)
3387 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
3388 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3389 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3390 if (eptr >= md->end_subject)
3392 SCHECK_PARTIAL();
3393 MRRETURN(MATCH_NOMATCH);
3395 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3398 /* Control never gets here */
3401 /* Maximize case */
3403 else
3405 pp = eptr;
3407 #ifdef SUPPORT_UTF8
3408 /* UTF-8 mode */
3409 if (utf8)
3411 register unsigned int d;
3412 for (i = min; i < max; i++)
3414 int len = 1;
3415 if (eptr >= md->end_subject)
3417 SCHECK_PARTIAL();
3418 break;
3420 GETCHARLEN(d, eptr, len);
3421 if (fc == d) break;
3422 eptr += len;
3424 if (possessive) continue;
3425 for(;;)
3427 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3428 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3429 if (eptr-- == pp) break; /* Stop if tried at original pos */
3430 BACKCHAR(eptr);
3433 else
3434 #endif
3435 /* Not UTF-8 mode */
3437 for (i = min; i < max; i++)
3439 if (eptr >= md->end_subject)
3441 SCHECK_PARTIAL();
3442 break;
3444 if (fc == *eptr) break;
3445 eptr++;
3447 if (possessive) continue;
3448 while (eptr >= pp)
3450 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3451 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3452 eptr--;
3456 MRRETURN(MATCH_NOMATCH);
3459 /* Control never gets here */
3461 /* Match a single character type repeatedly; several different opcodes
3462 share code. This is very similar to the code for single characters, but we
3463 repeat it in the interests of efficiency. */
3465 case OP_TYPEEXACT:
3466 min = max = GET2(ecode, 1);
3467 minimize = TRUE;
3468 ecode += 3;
3469 goto REPEATTYPE;
3471 case OP_TYPEUPTO:
3472 case OP_TYPEMINUPTO:
3473 min = 0;
3474 max = GET2(ecode, 1);
3475 minimize = *ecode == OP_TYPEMINUPTO;
3476 ecode += 3;
3477 goto REPEATTYPE;
3479 case OP_TYPEPOSSTAR:
3480 possessive = TRUE;
3481 min = 0;
3482 max = INT_MAX;
3483 ecode++;
3484 goto REPEATTYPE;
3486 case OP_TYPEPOSPLUS:
3487 possessive = TRUE;
3488 min = 1;
3489 max = INT_MAX;
3490 ecode++;
3491 goto REPEATTYPE;
3493 case OP_TYPEPOSQUERY:
3494 possessive = TRUE;
3495 min = 0;
3496 max = 1;
3497 ecode++;
3498 goto REPEATTYPE;
3500 case OP_TYPEPOSUPTO:
3501 possessive = TRUE;
3502 min = 0;
3503 max = GET2(ecode, 1);
3504 ecode += 3;
3505 goto REPEATTYPE;
3507 case OP_TYPESTAR:
3508 case OP_TYPEMINSTAR:
3509 case OP_TYPEPLUS:
3510 case OP_TYPEMINPLUS:
3511 case OP_TYPEQUERY:
3512 case OP_TYPEMINQUERY:
3513 c = *ecode++ - OP_TYPESTAR;
3514 minimize = (c & 1) != 0;
3515 min = rep_min[c]; /* Pick up values from tables; */
3516 max = rep_max[c]; /* zero for max => infinity */
3517 if (max == 0) max = INT_MAX;
3519 /* Common code for all repeated single character type matches. Note that
3520 in UTF-8 mode, '.' matches a character of any length, but for the other
3521 character types, the valid characters are all one-byte long. */
3523 REPEATTYPE:
3524 ctype = *ecode++; /* Code for the character type */
3526 #ifdef SUPPORT_UCP
3527 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3529 prop_fail_result = ctype == OP_NOTPROP;
3530 prop_type = *ecode++;
3531 prop_value = *ecode++;
3533 else prop_type = -1;
3534 #endif
3536 /* First, ensure the minimum number of matches are present. Use inline
3537 code for maximizing the speed, and do the type test once at the start
3538 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3539 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3540 and single-bytes. */
3542 if (min > 0)
3544 #ifdef SUPPORT_UCP
3545 if (prop_type >= 0)
3547 switch(prop_type)
3549 case PT_ANY:
3550 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3551 for (i = 1; i <= min; i++)
3553 if (eptr >= md->end_subject)
3555 SCHECK_PARTIAL();
3556 MRRETURN(MATCH_NOMATCH);
3558 GETCHARINCTEST(c, eptr);
3560 break;
3562 case PT_LAMP:
3563 for (i = 1; i <= min; i++)
3565 if (eptr >= md->end_subject)
3567 SCHECK_PARTIAL();
3568 MRRETURN(MATCH_NOMATCH);
3570 GETCHARINCTEST(c, eptr);
3571 prop_chartype = UCD_CHARTYPE(c);
3572 if ((prop_chartype == ucp_Lu ||
3573 prop_chartype == ucp_Ll ||
3574 prop_chartype == ucp_Lt) == prop_fail_result)
3575 MRRETURN(MATCH_NOMATCH);
3577 break;
3579 case PT_GC:
3580 for (i = 1; i <= min; i++)
3582 if (eptr >= md->end_subject)
3584 SCHECK_PARTIAL();
3585 MRRETURN(MATCH_NOMATCH);
3587 GETCHARINCTEST(c, eptr);
3588 prop_category = UCD_CATEGORY(c);
3589 if ((prop_category == prop_value) == prop_fail_result)
3590 MRRETURN(MATCH_NOMATCH);
3592 break;
3594 case PT_PC:
3595 for (i = 1; i <= min; i++)
3597 if (eptr >= md->end_subject)
3599 SCHECK_PARTIAL();
3600 MRRETURN(MATCH_NOMATCH);
3602 GETCHARINCTEST(c, eptr);
3603 prop_chartype = UCD_CHARTYPE(c);
3604 if ((prop_chartype == prop_value) == prop_fail_result)
3605 MRRETURN(MATCH_NOMATCH);
3607 break;
3609 case PT_SC:
3610 for (i = 1; i <= min; i++)
3612 if (eptr >= md->end_subject)
3614 SCHECK_PARTIAL();
3615 MRRETURN(MATCH_NOMATCH);
3617 GETCHARINCTEST(c, eptr);
3618 prop_script = UCD_SCRIPT(c);
3619 if ((prop_script == prop_value) == prop_fail_result)
3620 MRRETURN(MATCH_NOMATCH);
3622 break;
3624 case PT_ALNUM:
3625 for (i = 1; i <= min; i++)
3627 if (eptr >= md->end_subject)
3629 SCHECK_PARTIAL();
3630 MRRETURN(MATCH_NOMATCH);
3632 GETCHARINCTEST(c, eptr);
3633 prop_category = UCD_CATEGORY(c);
3634 if ((prop_category == ucp_L || prop_category == ucp_N)
3635 == prop_fail_result)
3636 MRRETURN(MATCH_NOMATCH);
3638 break;
3640 case PT_SPACE: /* Perl space */
3641 for (i = 1; i <= min; i++)
3643 if (eptr >= md->end_subject)
3645 SCHECK_PARTIAL();
3646 MRRETURN(MATCH_NOMATCH);
3648 GETCHARINCTEST(c, eptr);
3649 prop_category = UCD_CATEGORY(c);
3650 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3651 c == CHAR_FF || c == CHAR_CR)
3652 == prop_fail_result)
3653 MRRETURN(MATCH_NOMATCH);
3655 break;
3657 case PT_PXSPACE: /* POSIX space */
3658 for (i = 1; i <= min; i++)
3660 if (eptr >= md->end_subject)
3662 SCHECK_PARTIAL();
3663 MRRETURN(MATCH_NOMATCH);
3665 GETCHARINCTEST(c, eptr);
3666 prop_category = UCD_CATEGORY(c);
3667 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3668 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3669 == prop_fail_result)
3670 MRRETURN(MATCH_NOMATCH);
3672 break;
3674 case PT_WORD:
3675 for (i = 1; i <= min; i++)
3677 if (eptr >= md->end_subject)
3679 SCHECK_PARTIAL();
3680 MRRETURN(MATCH_NOMATCH);
3682 GETCHARINCTEST(c, eptr);
3683 prop_category = UCD_CATEGORY(c);
3684 if ((prop_category == ucp_L || prop_category == ucp_N ||
3685 c == CHAR_UNDERSCORE)
3686 == prop_fail_result)
3687 MRRETURN(MATCH_NOMATCH);
3689 break;
3691 /* This should not occur */
3693 default:
3694 RRETURN(PCRE_ERROR_INTERNAL);
3698 /* Match extended Unicode sequences. We will get here only if the
3699 support is in the binary; otherwise a compile-time error occurs. */
3701 else if (ctype == OP_EXTUNI)
3703 for (i = 1; i <= min; i++)
3705 if (eptr >= md->end_subject)
3707 SCHECK_PARTIAL();
3708 MRRETURN(MATCH_NOMATCH);
3710 GETCHARINCTEST(c, eptr);
3711 prop_category = UCD_CATEGORY(c);
3712 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3713 while (eptr < md->end_subject)
3715 int len = 1;
3716 if (!utf8) c = *eptr;
3717 else { GETCHARLEN(c, eptr, len); }
3718 prop_category = UCD_CATEGORY(c);
3719 if (prop_category != ucp_M) break;
3720 eptr += len;
3725 else
3726 #endif /* SUPPORT_UCP */
3728 /* Handle all other cases when the coding is UTF-8 */
3730 #ifdef SUPPORT_UTF8
3731 if (utf8) switch(ctype)
3733 case OP_ANY:
3734 for (i = 1; i <= min; i++)
3736 if (eptr >= md->end_subject)
3738 SCHECK_PARTIAL();
3739 MRRETURN(MATCH_NOMATCH);
3741 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3742 eptr++;
3743 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3745 break;
3747 case OP_ALLANY:
3748 for (i = 1; i <= min; i++)
3750 if (eptr >= md->end_subject)
3752 SCHECK_PARTIAL();
3753 MRRETURN(MATCH_NOMATCH);
3755 eptr++;
3756 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3758 break;
3760 case OP_ANYBYTE:
3761 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3762 eptr += min;
3763 break;
3765 case OP_ANYNL:
3766 for (i = 1; i <= min; i++)
3768 if (eptr >= md->end_subject)
3770 SCHECK_PARTIAL();
3771 MRRETURN(MATCH_NOMATCH);
3773 GETCHARINC(c, eptr);
3774 switch(c)
3776 default: MRRETURN(MATCH_NOMATCH);
3777 case 0x000d:
3778 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3779 break;
3781 case 0x000a:
3782 break;
3784 case 0x000b:
3785 case 0x000c:
3786 case 0x0085:
3787 case 0x2028:
3788 case 0x2029:
3789 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3790 break;
3793 break;
3795 case OP_NOT_HSPACE:
3796 for (i = 1; i <= min; i++)
3798 if (eptr >= md->end_subject)
3800 SCHECK_PARTIAL();
3801 MRRETURN(MATCH_NOMATCH);
3803 GETCHARINC(c, eptr);
3804 switch(c)
3806 default: break;
3807 case 0x09: /* HT */
3808 case 0x20: /* SPACE */
3809 case 0xa0: /* NBSP */
3810 case 0x1680: /* OGHAM SPACE MARK */
3811 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3812 case 0x2000: /* EN QUAD */
3813 case 0x2001: /* EM QUAD */
3814 case 0x2002: /* EN SPACE */
3815 case 0x2003: /* EM SPACE */
3816 case 0x2004: /* THREE-PER-EM SPACE */
3817 case 0x2005: /* FOUR-PER-EM SPACE */
3818 case 0x2006: /* SIX-PER-EM SPACE */
3819 case 0x2007: /* FIGURE SPACE */
3820 case 0x2008: /* PUNCTUATION SPACE */
3821 case 0x2009: /* THIN SPACE */
3822 case 0x200A: /* HAIR SPACE */
3823 case 0x202f: /* NARROW NO-BREAK SPACE */
3824 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3825 case 0x3000: /* IDEOGRAPHIC SPACE */
3826 MRRETURN(MATCH_NOMATCH);
3829 break;
3831 case OP_HSPACE:
3832 for (i = 1; i <= min; i++)
3834 if (eptr >= md->end_subject)
3836 SCHECK_PARTIAL();
3837 MRRETURN(MATCH_NOMATCH);
3839 GETCHARINC(c, eptr);
3840 switch(c)
3842 default: MRRETURN(MATCH_NOMATCH);
3843 case 0x09: /* HT */
3844 case 0x20: /* SPACE */
3845 case 0xa0: /* NBSP */
3846 case 0x1680: /* OGHAM SPACE MARK */
3847 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3848 case 0x2000: /* EN QUAD */
3849 case 0x2001: /* EM QUAD */
3850 case 0x2002: /* EN SPACE */
3851 case 0x2003: /* EM SPACE */
3852 case 0x2004: /* THREE-PER-EM SPACE */
3853 case 0x2005: /* FOUR-PER-EM SPACE */
3854 case 0x2006: /* SIX-PER-EM SPACE */
3855 case 0x2007: /* FIGURE SPACE */
3856 case 0x2008: /* PUNCTUATION SPACE */
3857 case 0x2009: /* THIN SPACE */
3858 case 0x200A: /* HAIR SPACE */
3859 case 0x202f: /* NARROW NO-BREAK SPACE */
3860 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3861 case 0x3000: /* IDEOGRAPHIC SPACE */
3862 break;
3865 break;
3867 case OP_NOT_VSPACE:
3868 for (i = 1; i <= min; i++)
3870 if (eptr >= md->end_subject)
3872 SCHECK_PARTIAL();
3873 MRRETURN(MATCH_NOMATCH);
3875 GETCHARINC(c, eptr);
3876 switch(c)
3878 default: break;
3879 case 0x0a: /* LF */
3880 case 0x0b: /* VT */
3881 case 0x0c: /* FF */
3882 case 0x0d: /* CR */
3883 case 0x85: /* NEL */
3884 case 0x2028: /* LINE SEPARATOR */
3885 case 0x2029: /* PARAGRAPH SEPARATOR */
3886 MRRETURN(MATCH_NOMATCH);
3889 break;
3891 case OP_VSPACE:
3892 for (i = 1; i <= min; i++)
3894 if (eptr >= md->end_subject)
3896 SCHECK_PARTIAL();
3897 MRRETURN(MATCH_NOMATCH);
3899 GETCHARINC(c, eptr);
3900 switch(c)
3902 default: MRRETURN(MATCH_NOMATCH);
3903 case 0x0a: /* LF */
3904 case 0x0b: /* VT */
3905 case 0x0c: /* FF */
3906 case 0x0d: /* CR */
3907 case 0x85: /* NEL */
3908 case 0x2028: /* LINE SEPARATOR */
3909 case 0x2029: /* PARAGRAPH SEPARATOR */
3910 break;
3913 break;
3915 case OP_NOT_DIGIT:
3916 for (i = 1; i <= min; i++)
3918 if (eptr >= md->end_subject)
3920 SCHECK_PARTIAL();
3921 MRRETURN(MATCH_NOMATCH);
3923 GETCHARINC(c, eptr);
3924 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3925 MRRETURN(MATCH_NOMATCH);
3927 break;
3929 case OP_DIGIT:
3930 for (i = 1; i <= min; i++)
3932 if (eptr >= md->end_subject)
3934 SCHECK_PARTIAL();
3935 MRRETURN(MATCH_NOMATCH);
3937 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3938 MRRETURN(MATCH_NOMATCH);
3939 /* No need to skip more bytes - we know it's a 1-byte character */
3941 break;
3943 case OP_NOT_WHITESPACE:
3944 for (i = 1; i <= min; i++)
3946 if (eptr >= md->end_subject)
3948 SCHECK_PARTIAL();
3949 MRRETURN(MATCH_NOMATCH);
3951 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3952 MRRETURN(MATCH_NOMATCH);
3953 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3955 break;
3957 case OP_WHITESPACE:
3958 for (i = 1; i <= min; i++)
3960 if (eptr >= md->end_subject)
3962 SCHECK_PARTIAL();
3963 MRRETURN(MATCH_NOMATCH);
3965 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3966 MRRETURN(MATCH_NOMATCH);
3967 /* No need to skip more bytes - we know it's a 1-byte character */
3969 break;
3971 case OP_NOT_WORDCHAR:
3972 for (i = 1; i <= min; i++)
3974 if (eptr >= md->end_subject)
3976 SCHECK_PARTIAL();
3977 MRRETURN(MATCH_NOMATCH);
3979 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
3980 MRRETURN(MATCH_NOMATCH);
3981 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3983 break;
3985 case OP_WORDCHAR:
3986 for (i = 1; i <= min; i++)
3988 if (eptr >= md->end_subject)
3990 SCHECK_PARTIAL();
3991 MRRETURN(MATCH_NOMATCH);
3993 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3994 MRRETURN(MATCH_NOMATCH);
3995 /* No need to skip more bytes - we know it's a 1-byte character */
3997 break;
3999 default:
4000 RRETURN(PCRE_ERROR_INTERNAL);
4001 } /* End switch(ctype) */
4003 else
4004 #endif /* SUPPORT_UTF8 */
4006 /* Code for the non-UTF-8 case for minimum matching of operators other
4007 than OP_PROP and OP_NOTPROP. */
4009 switch(ctype)
4011 case OP_ANY:
4012 for (i = 1; i <= min; i++)
4014 if (eptr >= md->end_subject)
4016 SCHECK_PARTIAL();
4017 MRRETURN(MATCH_NOMATCH);
4019 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4020 eptr++;
4022 break;
4024 case OP_ALLANY:
4025 if (eptr > md->end_subject - min)
4027 SCHECK_PARTIAL();
4028 MRRETURN(MATCH_NOMATCH);
4030 eptr += min;
4031 break;
4033 case OP_ANYBYTE:
4034 if (eptr > md->end_subject - min)
4036 SCHECK_PARTIAL();
4037 MRRETURN(MATCH_NOMATCH);
4039 eptr += min;
4040 break;
4042 case OP_ANYNL:
4043 for (i = 1; i <= min; i++)
4045 if (eptr >= md->end_subject)
4047 SCHECK_PARTIAL();
4048 MRRETURN(MATCH_NOMATCH);
4050 switch(*eptr++)
4052 default: MRRETURN(MATCH_NOMATCH);
4053 case 0x000d:
4054 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4055 break;
4056 case 0x000a:
4057 break;
4059 case 0x000b:
4060 case 0x000c:
4061 case 0x0085:
4062 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4063 break;
4066 break;
4068 case OP_NOT_HSPACE:
4069 for (i = 1; i <= min; i++)
4071 if (eptr >= md->end_subject)
4073 SCHECK_PARTIAL();
4074 MRRETURN(MATCH_NOMATCH);
4076 switch(*eptr++)
4078 default: break;
4079 case 0x09: /* HT */
4080 case 0x20: /* SPACE */
4081 case 0xa0: /* NBSP */
4082 MRRETURN(MATCH_NOMATCH);
4085 break;
4087 case OP_HSPACE:
4088 for (i = 1; i <= min; i++)
4090 if (eptr >= md->end_subject)
4092 SCHECK_PARTIAL();
4093 MRRETURN(MATCH_NOMATCH);
4095 switch(*eptr++)
4097 default: MRRETURN(MATCH_NOMATCH);
4098 case 0x09: /* HT */
4099 case 0x20: /* SPACE */
4100 case 0xa0: /* NBSP */
4101 break;
4104 break;
4106 case OP_NOT_VSPACE:
4107 for (i = 1; i <= min; i++)
4109 if (eptr >= md->end_subject)
4111 SCHECK_PARTIAL();
4112 MRRETURN(MATCH_NOMATCH);
4114 switch(*eptr++)
4116 default: break;
4117 case 0x0a: /* LF */
4118 case 0x0b: /* VT */
4119 case 0x0c: /* FF */
4120 case 0x0d: /* CR */
4121 case 0x85: /* NEL */
4122 MRRETURN(MATCH_NOMATCH);
4125 break;
4127 case OP_VSPACE:
4128 for (i = 1; i <= min; i++)
4130 if (eptr >= md->end_subject)
4132 SCHECK_PARTIAL();
4133 MRRETURN(MATCH_NOMATCH);
4135 switch(*eptr++)
4137 default: MRRETURN(MATCH_NOMATCH);
4138 case 0x0a: /* LF */
4139 case 0x0b: /* VT */
4140 case 0x0c: /* FF */
4141 case 0x0d: /* CR */
4142 case 0x85: /* NEL */
4143 break;
4146 break;
4148 case OP_NOT_DIGIT:
4149 for (i = 1; i <= min; i++)
4151 if (eptr >= md->end_subject)
4153 SCHECK_PARTIAL();
4154 MRRETURN(MATCH_NOMATCH);
4156 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4158 break;
4160 case OP_DIGIT:
4161 for (i = 1; i <= min; i++)
4163 if (eptr >= md->end_subject)
4165 SCHECK_PARTIAL();
4166 MRRETURN(MATCH_NOMATCH);
4168 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4170 break;
4172 case OP_NOT_WHITESPACE:
4173 for (i = 1; i <= min; i++)
4175 if (eptr >= md->end_subject)
4177 SCHECK_PARTIAL();
4178 MRRETURN(MATCH_NOMATCH);
4180 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4182 break;
4184 case OP_WHITESPACE:
4185 for (i = 1; i <= min; i++)
4187 if (eptr >= md->end_subject)
4189 SCHECK_PARTIAL();
4190 MRRETURN(MATCH_NOMATCH);
4192 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4194 break;
4196 case OP_NOT_WORDCHAR:
4197 for (i = 1; i <= min; i++)
4199 if (eptr >= md->end_subject)
4201 SCHECK_PARTIAL();
4202 MRRETURN(MATCH_NOMATCH);
4204 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4205 MRRETURN(MATCH_NOMATCH);
4207 break;
4209 case OP_WORDCHAR:
4210 for (i = 1; i <= min; i++)
4212 if (eptr >= md->end_subject)
4214 SCHECK_PARTIAL();
4215 MRRETURN(MATCH_NOMATCH);
4217 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4218 MRRETURN(MATCH_NOMATCH);
4220 break;
4222 default:
4223 RRETURN(PCRE_ERROR_INTERNAL);
4227 /* If min = max, continue at the same level without recursing */
4229 if (min == max) continue;
4231 /* If minimizing, we have to test the rest of the pattern before each
4232 subsequent match. Again, separate the UTF-8 case for speed, and also
4233 separate the UCP cases. */
4235 if (minimize)
4237 #ifdef SUPPORT_UCP
4238 if (prop_type >= 0)
4240 switch(prop_type)
4242 case PT_ANY:
4243 for (fi = min;; fi++)
4245 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
4246 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4247 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4248 if (eptr >= md->end_subject)
4250 SCHECK_PARTIAL();
4251 MRRETURN(MATCH_NOMATCH);
4253 GETCHARINCTEST(c, eptr);
4254 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4256 /* Control never gets here */
4258 case PT_LAMP:
4259 for (fi = min;; fi++)
4261 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
4262 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4263 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4264 if (eptr >= md->end_subject)
4266 SCHECK_PARTIAL();
4267 MRRETURN(MATCH_NOMATCH);
4269 GETCHARINCTEST(c, eptr);
4270 prop_chartype = UCD_CHARTYPE(c);
4271 if ((prop_chartype == ucp_Lu ||
4272 prop_chartype == ucp_Ll ||
4273 prop_chartype == ucp_Lt) == prop_fail_result)
4274 MRRETURN(MATCH_NOMATCH);
4276 /* Control never gets here */
4278 case PT_GC:
4279 for (fi = min;; fi++)
4281 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
4282 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4283 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4284 if (eptr >= md->end_subject)
4286 SCHECK_PARTIAL();
4287 MRRETURN(MATCH_NOMATCH);
4289 GETCHARINCTEST(c, eptr);
4290 prop_category = UCD_CATEGORY(c);
4291 if ((prop_category == prop_value) == prop_fail_result)
4292 MRRETURN(MATCH_NOMATCH);
4294 /* Control never gets here */
4296 case PT_PC:
4297 for (fi = min;; fi++)
4299 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4300 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4301 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4302 if (eptr >= md->end_subject)
4304 SCHECK_PARTIAL();
4305 MRRETURN(MATCH_NOMATCH);
4307 GETCHARINCTEST(c, eptr);
4308 prop_chartype = UCD_CHARTYPE(c);
4309 if ((prop_chartype == prop_value) == prop_fail_result)
4310 MRRETURN(MATCH_NOMATCH);
4312 /* Control never gets here */
4314 case PT_SC:
4315 for (fi = min;; fi++)
4317 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
4318 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4319 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4320 if (eptr >= md->end_subject)
4322 SCHECK_PARTIAL();
4323 MRRETURN(MATCH_NOMATCH);
4325 GETCHARINCTEST(c, eptr);
4326 prop_script = UCD_SCRIPT(c);
4327 if ((prop_script == prop_value) == prop_fail_result)
4328 MRRETURN(MATCH_NOMATCH);
4330 /* Control never gets here */
4332 case PT_ALNUM:
4333 for (fi = min;; fi++)
4335 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM59);
4336 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4337 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4338 if (eptr >= md->end_subject)
4340 SCHECK_PARTIAL();
4341 MRRETURN(MATCH_NOMATCH);
4343 GETCHARINCTEST(c, eptr);
4344 prop_category = UCD_CATEGORY(c);
4345 if ((prop_category == ucp_L || prop_category == ucp_N)
4346 == prop_fail_result)
4347 MRRETURN(MATCH_NOMATCH);
4349 /* Control never gets here */
4351 case PT_SPACE: /* Perl space */
4352 for (fi = min;; fi++)
4354 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM60);
4355 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4356 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4357 if (eptr >= md->end_subject)
4359 SCHECK_PARTIAL();
4360 MRRETURN(MATCH_NOMATCH);
4362 GETCHARINCTEST(c, eptr);
4363 prop_category = UCD_CATEGORY(c);
4364 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4365 c == CHAR_FF || c == CHAR_CR)
4366 == prop_fail_result)
4367 MRRETURN(MATCH_NOMATCH);
4369 /* Control never gets here */
4371 case PT_PXSPACE: /* POSIX space */
4372 for (fi = min;; fi++)
4374 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM61);
4375 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4376 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4377 if (eptr >= md->end_subject)
4379 SCHECK_PARTIAL();
4380 MRRETURN(MATCH_NOMATCH);
4382 GETCHARINCTEST(c, eptr);
4383 prop_category = UCD_CATEGORY(c);
4384 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4385 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4386 == prop_fail_result)
4387 MRRETURN(MATCH_NOMATCH);
4389 /* Control never gets here */
4391 case PT_WORD:
4392 for (fi = min;; fi++)
4394 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM62);
4395 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4396 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4397 if (eptr >= md->end_subject)
4399 SCHECK_PARTIAL();
4400 MRRETURN(MATCH_NOMATCH);
4402 GETCHARINCTEST(c, eptr);
4403 prop_category = UCD_CATEGORY(c);
4404 if ((prop_category == ucp_L ||
4405 prop_category == ucp_N ||
4406 c == CHAR_UNDERSCORE)
4407 == prop_fail_result)
4408 MRRETURN(MATCH_NOMATCH);
4410 /* Control never gets here */
4412 /* This should never occur */
4414 default:
4415 RRETURN(PCRE_ERROR_INTERNAL);
4419 /* Match extended Unicode sequences. We will get here only if the
4420 support is in the binary; otherwise a compile-time error occurs. */
4422 else if (ctype == OP_EXTUNI)
4424 for (fi = min;; fi++)
4426 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
4427 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4428 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4429 if (eptr >= md->end_subject)
4431 SCHECK_PARTIAL();
4432 MRRETURN(MATCH_NOMATCH);
4434 GETCHARINCTEST(c, eptr);
4435 prop_category = UCD_CATEGORY(c);
4436 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4437 while (eptr < md->end_subject)
4439 int len = 1;
4440 if (!utf8) c = *eptr;
4441 else { GETCHARLEN(c, eptr, len); }
4442 prop_category = UCD_CATEGORY(c);
4443 if (prop_category != ucp_M) break;
4444 eptr += len;
4449 else
4450 #endif /* SUPPORT_UCP */
4452 #ifdef SUPPORT_UTF8
4453 /* UTF-8 mode */
4454 if (utf8)
4456 for (fi = min;; fi++)
4458 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
4459 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4460 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4461 if (eptr >= md->end_subject)
4463 SCHECK_PARTIAL();
4464 MRRETURN(MATCH_NOMATCH);
4466 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4467 MRRETURN(MATCH_NOMATCH);
4468 GETCHARINC(c, eptr);
4469 switch(ctype)
4471 case OP_ANY: /* This is the non-NL case */
4472 case OP_ALLANY:
4473 case OP_ANYBYTE:
4474 break;
4476 case OP_ANYNL:
4477 switch(c)
4479 default: MRRETURN(MATCH_NOMATCH);
4480 case 0x000d:
4481 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4482 break;
4483 case 0x000a:
4484 break;
4486 case 0x000b:
4487 case 0x000c:
4488 case 0x0085:
4489 case 0x2028:
4490 case 0x2029:
4491 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4492 break;
4494 break;
4496 case OP_NOT_HSPACE:
4497 switch(c)
4499 default: break;
4500 case 0x09: /* HT */
4501 case 0x20: /* SPACE */
4502 case 0xa0: /* NBSP */
4503 case 0x1680: /* OGHAM SPACE MARK */
4504 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4505 case 0x2000: /* EN QUAD */
4506 case 0x2001: /* EM QUAD */
4507 case 0x2002: /* EN SPACE */
4508 case 0x2003: /* EM SPACE */
4509 case 0x2004: /* THREE-PER-EM SPACE */
4510 case 0x2005: /* FOUR-PER-EM SPACE */
4511 case 0x2006: /* SIX-PER-EM SPACE */
4512 case 0x2007: /* FIGURE SPACE */
4513 case 0x2008: /* PUNCTUATION SPACE */
4514 case 0x2009: /* THIN SPACE */
4515 case 0x200A: /* HAIR SPACE */
4516 case 0x202f: /* NARROW NO-BREAK SPACE */
4517 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4518 case 0x3000: /* IDEOGRAPHIC SPACE */
4519 MRRETURN(MATCH_NOMATCH);
4521 break;
4523 case OP_HSPACE:
4524 switch(c)
4526 default: MRRETURN(MATCH_NOMATCH);
4527 case 0x09: /* HT */
4528 case 0x20: /* SPACE */
4529 case 0xa0: /* NBSP */
4530 case 0x1680: /* OGHAM SPACE MARK */
4531 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4532 case 0x2000: /* EN QUAD */
4533 case 0x2001: /* EM QUAD */
4534 case 0x2002: /* EN SPACE */
4535 case 0x2003: /* EM SPACE */
4536 case 0x2004: /* THREE-PER-EM SPACE */
4537 case 0x2005: /* FOUR-PER-EM SPACE */
4538 case 0x2006: /* SIX-PER-EM SPACE */
4539 case 0x2007: /* FIGURE SPACE */
4540 case 0x2008: /* PUNCTUATION SPACE */
4541 case 0x2009: /* THIN SPACE */
4542 case 0x200A: /* HAIR SPACE */
4543 case 0x202f: /* NARROW NO-BREAK SPACE */
4544 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4545 case 0x3000: /* IDEOGRAPHIC SPACE */
4546 break;
4548 break;
4550 case OP_NOT_VSPACE:
4551 switch(c)
4553 default: break;
4554 case 0x0a: /* LF */
4555 case 0x0b: /* VT */
4556 case 0x0c: /* FF */
4557 case 0x0d: /* CR */
4558 case 0x85: /* NEL */
4559 case 0x2028: /* LINE SEPARATOR */
4560 case 0x2029: /* PARAGRAPH SEPARATOR */
4561 MRRETURN(MATCH_NOMATCH);
4563 break;
4565 case OP_VSPACE:
4566 switch(c)
4568 default: MRRETURN(MATCH_NOMATCH);
4569 case 0x0a: /* LF */
4570 case 0x0b: /* VT */
4571 case 0x0c: /* FF */
4572 case 0x0d: /* CR */
4573 case 0x85: /* NEL */
4574 case 0x2028: /* LINE SEPARATOR */
4575 case 0x2029: /* PARAGRAPH SEPARATOR */
4576 break;
4578 break;
4580 case OP_NOT_DIGIT:
4581 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4582 MRRETURN(MATCH_NOMATCH);
4583 break;
4585 case OP_DIGIT:
4586 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4587 MRRETURN(MATCH_NOMATCH);
4588 break;
4590 case OP_NOT_WHITESPACE:
4591 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4592 MRRETURN(MATCH_NOMATCH);
4593 break;
4595 case OP_WHITESPACE:
4596 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4597 MRRETURN(MATCH_NOMATCH);
4598 break;
4600 case OP_NOT_WORDCHAR:
4601 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4602 MRRETURN(MATCH_NOMATCH);
4603 break;
4605 case OP_WORDCHAR:
4606 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4607 MRRETURN(MATCH_NOMATCH);
4608 break;
4610 default:
4611 RRETURN(PCRE_ERROR_INTERNAL);
4615 else
4616 #endif
4617 /* Not UTF-8 mode */
4619 for (fi = min;; fi++)
4621 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4622 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4623 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4624 if (eptr >= md->end_subject)
4626 SCHECK_PARTIAL();
4627 MRRETURN(MATCH_NOMATCH);
4629 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4630 MRRETURN(MATCH_NOMATCH);
4631 c = *eptr++;
4632 switch(ctype)
4634 case OP_ANY: /* This is the non-NL case */
4635 case OP_ALLANY:
4636 case OP_ANYBYTE:
4637 break;
4639 case OP_ANYNL:
4640 switch(c)
4642 default: MRRETURN(MATCH_NOMATCH);
4643 case 0x000d:
4644 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4645 break;
4647 case 0x000a:
4648 break;
4650 case 0x000b:
4651 case 0x000c:
4652 case 0x0085:
4653 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4654 break;
4656 break;
4658 case OP_NOT_HSPACE:
4659 switch(c)
4661 default: break;
4662 case 0x09: /* HT */
4663 case 0x20: /* SPACE */
4664 case 0xa0: /* NBSP */
4665 MRRETURN(MATCH_NOMATCH);
4667 break;
4669 case OP_HSPACE:
4670 switch(c)
4672 default: MRRETURN(MATCH_NOMATCH);
4673 case 0x09: /* HT */
4674 case 0x20: /* SPACE */
4675 case 0xa0: /* NBSP */
4676 break;
4678 break;
4680 case OP_NOT_VSPACE:
4681 switch(c)
4683 default: break;
4684 case 0x0a: /* LF */
4685 case 0x0b: /* VT */
4686 case 0x0c: /* FF */
4687 case 0x0d: /* CR */
4688 case 0x85: /* NEL */
4689 MRRETURN(MATCH_NOMATCH);
4691 break;
4693 case OP_VSPACE:
4694 switch(c)
4696 default: MRRETURN(MATCH_NOMATCH);
4697 case 0x0a: /* LF */
4698 case 0x0b: /* VT */
4699 case 0x0c: /* FF */
4700 case 0x0d: /* CR */
4701 case 0x85: /* NEL */
4702 break;
4704 break;
4706 case OP_NOT_DIGIT:
4707 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4708 break;
4710 case OP_DIGIT:
4711 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4712 break;
4714 case OP_NOT_WHITESPACE:
4715 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4716 break;
4718 case OP_WHITESPACE:
4719 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4720 break;
4722 case OP_NOT_WORDCHAR:
4723 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4724 break;
4726 case OP_WORDCHAR:
4727 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4728 break;
4730 default:
4731 RRETURN(PCRE_ERROR_INTERNAL);
4735 /* Control never gets here */
4738 /* If maximizing, it is worth using inline code for speed, doing the type
4739 test once at the start (i.e. keep it out of the loop). Again, keep the
4740 UTF-8 and UCP stuff separate. */
4742 else
4744 pp = eptr; /* Remember where we started */
4746 #ifdef SUPPORT_UCP
4747 if (prop_type >= 0)
4749 switch(prop_type)
4751 case PT_ANY:
4752 for (i = min; i < max; i++)
4754 int len = 1;
4755 if (eptr >= md->end_subject)
4757 SCHECK_PARTIAL();
4758 break;
4760 GETCHARLENTEST(c, eptr, len);
4761 if (prop_fail_result) break;
4762 eptr+= len;
4764 break;
4766 case PT_LAMP:
4767 for (i = min; i < max; i++)
4769 int len = 1;
4770 if (eptr >= md->end_subject)
4772 SCHECK_PARTIAL();
4773 break;
4775 GETCHARLENTEST(c, eptr, len);
4776 prop_chartype = UCD_CHARTYPE(c);
4777 if ((prop_chartype == ucp_Lu ||
4778 prop_chartype == ucp_Ll ||
4779 prop_chartype == ucp_Lt) == prop_fail_result)
4780 break;
4781 eptr+= len;
4783 break;
4785 case PT_GC:
4786 for (i = min; i < max; i++)
4788 int len = 1;
4789 if (eptr >= md->end_subject)
4791 SCHECK_PARTIAL();
4792 break;
4794 GETCHARLENTEST(c, eptr, len);
4795 prop_category = UCD_CATEGORY(c);
4796 if ((prop_category == prop_value) == prop_fail_result)
4797 break;
4798 eptr+= len;
4800 break;
4802 case PT_PC:
4803 for (i = min; i < max; i++)
4805 int len = 1;
4806 if (eptr >= md->end_subject)
4808 SCHECK_PARTIAL();
4809 break;
4811 GETCHARLENTEST(c, eptr, len);
4812 prop_chartype = UCD_CHARTYPE(c);
4813 if ((prop_chartype == prop_value) == prop_fail_result)
4814 break;
4815 eptr+= len;
4817 break;
4819 case PT_SC:
4820 for (i = min; i < max; i++)
4822 int len = 1;
4823 if (eptr >= md->end_subject)
4825 SCHECK_PARTIAL();
4826 break;
4828 GETCHARLENTEST(c, eptr, len);
4829 prop_script = UCD_SCRIPT(c);
4830 if ((prop_script == prop_value) == prop_fail_result)
4831 break;
4832 eptr+= len;
4834 break;
4836 case PT_ALNUM:
4837 for (i = min; i < max; i++)
4839 int len = 1;
4840 if (eptr >= md->end_subject)
4842 SCHECK_PARTIAL();
4843 break;
4845 GETCHARLENTEST(c, eptr, len);
4846 prop_category = UCD_CATEGORY(c);
4847 if ((prop_category == ucp_L || prop_category == ucp_N)
4848 == prop_fail_result)
4849 break;
4850 eptr+= len;
4852 break;
4854 case PT_SPACE: /* Perl space */
4855 for (i = min; i < max; i++)
4857 int len = 1;
4858 if (eptr >= md->end_subject)
4860 SCHECK_PARTIAL();
4861 break;
4863 GETCHARLENTEST(c, eptr, len);
4864 prop_category = UCD_CATEGORY(c);
4865 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4866 c == CHAR_FF || c == CHAR_CR)
4867 == prop_fail_result)
4868 break;
4869 eptr+= len;
4871 break;
4873 case PT_PXSPACE: /* POSIX space */
4874 for (i = min; i < max; i++)
4876 int len = 1;
4877 if (eptr >= md->end_subject)
4879 SCHECK_PARTIAL();
4880 break;
4882 GETCHARLENTEST(c, eptr, len);
4883 prop_category = UCD_CATEGORY(c);
4884 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4885 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4886 == prop_fail_result)
4887 break;
4888 eptr+= len;
4890 break;
4892 case PT_WORD:
4893 for (i = min; i < max; i++)
4895 int len = 1;
4896 if (eptr >= md->end_subject)
4898 SCHECK_PARTIAL();
4899 break;
4901 GETCHARLENTEST(c, eptr, len);
4902 prop_category = UCD_CATEGORY(c);
4903 if ((prop_category == ucp_L || prop_category == ucp_N ||
4904 c == CHAR_UNDERSCORE) == prop_fail_result)
4905 break;
4906 eptr+= len;
4908 break;
4910 default:
4911 RRETURN(PCRE_ERROR_INTERNAL);
4914 /* eptr is now past the end of the maximum run */
4916 if (possessive) continue;
4917 for(;;)
4919 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4920 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4921 if (eptr-- == pp) break; /* Stop if tried at original pos */
4922 if (utf8) BACKCHAR(eptr);
4926 /* Match extended Unicode sequences. We will get here only if the
4927 support is in the binary; otherwise a compile-time error occurs. */
4929 else if (ctype == OP_EXTUNI)
4931 for (i = min; i < max; i++)
4933 if (eptr >= md->end_subject)
4935 SCHECK_PARTIAL();
4936 break;
4938 GETCHARINCTEST(c, eptr);
4939 prop_category = UCD_CATEGORY(c);
4940 if (prop_category == ucp_M) break;
4941 while (eptr < md->end_subject)
4943 int len = 1;
4944 if (!utf8) c = *eptr; else
4946 GETCHARLEN(c, eptr, len);
4948 prop_category = UCD_CATEGORY(c);
4949 if (prop_category != ucp_M) break;
4950 eptr += len;
4954 /* eptr is now past the end of the maximum run */
4956 if (possessive) continue;
4958 for(;;)
4960 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4961 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4962 if (eptr-- == pp) break; /* Stop if tried at original pos */
4963 for (;;) /* Move back over one extended */
4965 int len = 1;
4966 if (!utf8) c = *eptr; else
4968 BACKCHAR(eptr);
4969 GETCHARLEN(c, eptr, len);
4971 prop_category = UCD_CATEGORY(c);
4972 if (prop_category != ucp_M) break;
4973 eptr--;
4978 else
4979 #endif /* SUPPORT_UCP */
4981 #ifdef SUPPORT_UTF8
4982 /* UTF-8 mode */
4984 if (utf8)
4986 switch(ctype)
4988 case OP_ANY:
4989 if (max < INT_MAX)
4991 for (i = min; i < max; i++)
4993 if (eptr >= md->end_subject)
4995 SCHECK_PARTIAL();
4996 break;
4998 if (IS_NEWLINE(eptr)) break;
4999 eptr++;
5000 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5004 /* Handle unlimited UTF-8 repeat */
5006 else
5008 for (i = min; i < max; i++)
5010 if (eptr >= md->end_subject)
5012 SCHECK_PARTIAL();
5013 break;
5015 if (IS_NEWLINE(eptr)) break;
5016 eptr++;
5017 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5020 break;
5022 case OP_ALLANY:
5023 if (max < INT_MAX)
5025 for (i = min; i < max; i++)
5027 if (eptr >= md->end_subject)
5029 SCHECK_PARTIAL();
5030 break;
5032 eptr++;
5033 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5036 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5037 break;
5039 /* The byte case is the same as non-UTF8 */
5041 case OP_ANYBYTE:
5042 c = max - min;
5043 if (c > (unsigned int)(md->end_subject - eptr))
5045 eptr = md->end_subject;
5046 SCHECK_PARTIAL();
5048 else eptr += c;
5049 break;
5051 case OP_ANYNL:
5052 for (i = min; i < max; i++)
5054 int len = 1;
5055 if (eptr >= md->end_subject)
5057 SCHECK_PARTIAL();
5058 break;
5060 GETCHARLEN(c, eptr, len);
5061 if (c == 0x000d)
5063 if (++eptr >= md->end_subject) break;
5064 if (*eptr == 0x000a) eptr++;
5066 else
5068 if (c != 0x000a &&
5069 (md->bsr_anycrlf ||
5070 (c != 0x000b && c != 0x000c &&
5071 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5072 break;
5073 eptr += len;
5076 break;
5078 case OP_NOT_HSPACE:
5079 case OP_HSPACE:
5080 for (i = min; i < max; i++)
5082 BOOL gotspace;
5083 int len = 1;
5084 if (eptr >= md->end_subject)
5086 SCHECK_PARTIAL();
5087 break;
5089 GETCHARLEN(c, eptr, len);
5090 switch(c)
5092 default: gotspace = FALSE; break;
5093 case 0x09: /* HT */
5094 case 0x20: /* SPACE */
5095 case 0xa0: /* NBSP */
5096 case 0x1680: /* OGHAM SPACE MARK */
5097 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5098 case 0x2000: /* EN QUAD */
5099 case 0x2001: /* EM QUAD */
5100 case 0x2002: /* EN SPACE */
5101 case 0x2003: /* EM SPACE */
5102 case 0x2004: /* THREE-PER-EM SPACE */
5103 case 0x2005: /* FOUR-PER-EM SPACE */
5104 case 0x2006: /* SIX-PER-EM SPACE */
5105 case 0x2007: /* FIGURE SPACE */
5106 case 0x2008: /* PUNCTUATION SPACE */
5107 case 0x2009: /* THIN SPACE */
5108 case 0x200A: /* HAIR SPACE */
5109 case 0x202f: /* NARROW NO-BREAK SPACE */
5110 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5111 case 0x3000: /* IDEOGRAPHIC SPACE */
5112 gotspace = TRUE;
5113 break;
5115 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5116 eptr += len;
5118 break;
5120 case OP_NOT_VSPACE:
5121 case OP_VSPACE:
5122 for (i = min; i < max; i++)
5124 BOOL gotspace;
5125 int len = 1;
5126 if (eptr >= md->end_subject)
5128 SCHECK_PARTIAL();
5129 break;
5131 GETCHARLEN(c, eptr, len);
5132 switch(c)
5134 default: gotspace = FALSE; break;
5135 case 0x0a: /* LF */
5136 case 0x0b: /* VT */
5137 case 0x0c: /* FF */
5138 case 0x0d: /* CR */
5139 case 0x85: /* NEL */
5140 case 0x2028: /* LINE SEPARATOR */
5141 case 0x2029: /* PARAGRAPH SEPARATOR */
5142 gotspace = TRUE;
5143 break;
5145 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5146 eptr += len;
5148 break;
5150 case OP_NOT_DIGIT:
5151 for (i = min; i < max; i++)
5153 int len = 1;
5154 if (eptr >= md->end_subject)
5156 SCHECK_PARTIAL();
5157 break;
5159 GETCHARLEN(c, eptr, len);
5160 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5161 eptr+= len;
5163 break;
5165 case OP_DIGIT:
5166 for (i = min; i < max; i++)
5168 int len = 1;
5169 if (eptr >= md->end_subject)
5171 SCHECK_PARTIAL();
5172 break;
5174 GETCHARLEN(c, eptr, len);
5175 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5176 eptr+= len;
5178 break;
5180 case OP_NOT_WHITESPACE:
5181 for (i = min; i < max; i++)
5183 int len = 1;
5184 if (eptr >= md->end_subject)
5186 SCHECK_PARTIAL();
5187 break;
5189 GETCHARLEN(c, eptr, len);
5190 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5191 eptr+= len;
5193 break;
5195 case OP_WHITESPACE:
5196 for (i = min; i < max; i++)
5198 int len = 1;
5199 if (eptr >= md->end_subject)
5201 SCHECK_PARTIAL();
5202 break;
5204 GETCHARLEN(c, eptr, len);
5205 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5206 eptr+= len;
5208 break;
5210 case OP_NOT_WORDCHAR:
5211 for (i = min; i < max; i++)
5213 int len = 1;
5214 if (eptr >= md->end_subject)
5216 SCHECK_PARTIAL();
5217 break;
5219 GETCHARLEN(c, eptr, len);
5220 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5221 eptr+= len;
5223 break;
5225 case OP_WORDCHAR:
5226 for (i = min; i < max; i++)
5228 int len = 1;
5229 if (eptr >= md->end_subject)
5231 SCHECK_PARTIAL();
5232 break;
5234 GETCHARLEN(c, eptr, len);
5235 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5236 eptr+= len;
5238 break;
5240 default:
5241 RRETURN(PCRE_ERROR_INTERNAL);
5244 /* eptr is now past the end of the maximum run */
5246 if (possessive) continue;
5247 for(;;)
5249 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
5250 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5251 if (eptr-- == pp) break; /* Stop if tried at original pos */
5252 BACKCHAR(eptr);
5255 else
5256 #endif /* SUPPORT_UTF8 */
5258 /* Not UTF-8 mode */
5260 switch(ctype)
5262 case OP_ANY:
5263 for (i = min; i < max; i++)
5265 if (eptr >= md->end_subject)
5267 SCHECK_PARTIAL();
5268 break;
5270 if (IS_NEWLINE(eptr)) break;
5271 eptr++;
5273 break;
5275 case OP_ALLANY:
5276 case OP_ANYBYTE:
5277 c = max - min;
5278 if (c > (unsigned int)(md->end_subject - eptr))
5280 eptr = md->end_subject;
5281 SCHECK_PARTIAL();
5283 else eptr += c;
5284 break;
5286 case OP_ANYNL:
5287 for (i = min; i < max; i++)
5289 if (eptr >= md->end_subject)
5291 SCHECK_PARTIAL();
5292 break;
5294 c = *eptr;
5295 if (c == 0x000d)
5297 if (++eptr >= md->end_subject) break;
5298 if (*eptr == 0x000a) eptr++;
5300 else
5302 if (c != 0x000a &&
5303 (md->bsr_anycrlf ||
5304 (c != 0x000b && c != 0x000c && c != 0x0085)))
5305 break;
5306 eptr++;
5309 break;
5311 case OP_NOT_HSPACE:
5312 for (i = min; i < max; i++)
5314 if (eptr >= md->end_subject)
5316 SCHECK_PARTIAL();
5317 break;
5319 c = *eptr;
5320 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5321 eptr++;
5323 break;
5325 case OP_HSPACE:
5326 for (i = min; i < max; i++)
5328 if (eptr >= md->end_subject)
5330 SCHECK_PARTIAL();
5331 break;
5333 c = *eptr;
5334 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5335 eptr++;
5337 break;
5339 case OP_NOT_VSPACE:
5340 for (i = min; i < max; i++)
5342 if (eptr >= md->end_subject)
5344 SCHECK_PARTIAL();
5345 break;
5347 c = *eptr;
5348 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5349 break;
5350 eptr++;
5352 break;
5354 case OP_VSPACE:
5355 for (i = min; i < max; i++)
5357 if (eptr >= md->end_subject)
5359 SCHECK_PARTIAL();
5360 break;
5362 c = *eptr;
5363 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5364 break;
5365 eptr++;
5367 break;
5369 case OP_NOT_DIGIT:
5370 for (i = min; i < max; i++)
5372 if (eptr >= md->end_subject)
5374 SCHECK_PARTIAL();
5375 break;
5377 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5378 eptr++;
5380 break;
5382 case OP_DIGIT:
5383 for (i = min; i < max; i++)
5385 if (eptr >= md->end_subject)
5387 SCHECK_PARTIAL();
5388 break;
5390 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5391 eptr++;
5393 break;
5395 case OP_NOT_WHITESPACE:
5396 for (i = min; i < max; i++)
5398 if (eptr >= md->end_subject)
5400 SCHECK_PARTIAL();
5401 break;
5403 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5404 eptr++;
5406 break;
5408 case OP_WHITESPACE:
5409 for (i = min; i < max; i++)
5411 if (eptr >= md->end_subject)
5413 SCHECK_PARTIAL();
5414 break;
5416 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5417 eptr++;
5419 break;
5421 case OP_NOT_WORDCHAR:
5422 for (i = min; i < max; i++)
5424 if (eptr >= md->end_subject)
5426 SCHECK_PARTIAL();
5427 break;
5429 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5430 eptr++;
5432 break;
5434 case OP_WORDCHAR:
5435 for (i = min; i < max; i++)
5437 if (eptr >= md->end_subject)
5439 SCHECK_PARTIAL();
5440 break;
5442 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5443 eptr++;
5445 break;
5447 default:
5448 RRETURN(PCRE_ERROR_INTERNAL);
5451 /* eptr is now past the end of the maximum run */
5453 if (possessive) continue;
5454 while (eptr >= pp)
5456 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
5457 eptr--;
5458 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5462 /* Get here if we can't make it match with any permitted repetitions */
5464 MRRETURN(MATCH_NOMATCH);
5466 /* Control never gets here */
5468 /* There's been some horrible disaster. Arrival here can only mean there is
5469 something seriously wrong in the code above or the OP_xxx definitions. */
5471 default:
5472 DPRINTF(("Unknown opcode %d\n", *ecode));
5473 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5476 /* Do not stick any code in here without much thought; it is assumed
5477 that "continue" in the code above comes out to here to repeat the main
5478 loop. */
5480 } /* End of main loop */
5481 /* Control never reaches here */
5484 /* When compiling to use the heap rather than the stack for recursive calls to
5485 match(), the RRETURN() macro jumps here. The number that is saved in
5486 frame->Xwhere indicates which label we actually want to return to. */
5488 #ifdef NO_RECURSE
5489 #define LBL(val) case val: goto L_RM##val;
5490 HEAP_RETURN:
5491 switch (frame->Xwhere)
5493 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5494 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5495 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5496 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5497 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58)
5498 #ifdef SUPPORT_UTF8
5499 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5500 LBL(32) LBL(34) LBL(42) LBL(46)
5501 #ifdef SUPPORT_UCP
5502 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5503 LBL(59) LBL(60) LBL(61) LBL(62)
5504 #endif /* SUPPORT_UCP */
5505 #endif /* SUPPORT_UTF8 */
5506 default:
5507 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5508 return PCRE_ERROR_INTERNAL;
5510 #undef LBL
5511 #endif /* NO_RECURSE */
5515 /***************************************************************************
5516 ****************************************************************************
5517 RECURSION IN THE match() FUNCTION
5519 Undefine all the macros that were defined above to handle this. */
/* Clean-up of the names used by match(). The group inside the conditional is
undefined only when NO_RECURSE is defined; the definitions themselves are
earlier in the file and are not repeated here. */
5521 #ifdef NO_RECURSE
5522 #undef eptr
5523 #undef ecode
5524 #undef mstart
5525 #undef offset_top
5526 #undef ims
5527 #undef eptrb
5528 #undef flags
5530 #undef callpat
5531 #undef charptr
5532 #undef data
5533 #undef next
5534 #undef pp
5535 #undef prev
5536 #undef saved_eptr
5538 #undef new_recursive
5540 #undef cur_is_word
5541 #undef condition
5542 #undef prev_is_word
5544 #undef original_ims
5546 #undef ctype
5547 #undef length
5548 #undef max
5549 #undef min
5550 #undef number
5551 #undef offset
5552 #undef op
5553 #undef save_capture_last
5554 #undef save_offset1
5555 #undef save_offset2
5556 #undef save_offset3
5557 #undef stacksave
5559 #undef newptrb
5561 #endif
5563 /* These two are defined as macros in both cases */
/* They are therefore undefined unconditionally, outside the NO_RECURSE
conditional, so that code following match() can use the plain names. */
5565 #undef fc
5566 #undef fi
5568 /***************************************************************************
5569 ***************************************************************************/
5573 /*************************************************
5574 * Execute a Regular Expression *
5575 *************************************************/
5577 /* This function applies a compiled re to a subject string and picks out
5578 portions of the string if it matches. Two elements in the vector are set for
5579 each substring: the offsets to the start and end of the substring.
5581 Arguments:
5582 argument_re points to the compiled expression
5583 extra_data points to extra data or is NULL
5584 subject points to the subject string
5585 length length of subject string (may contain binary zeros)
5586 start_offset where to start in the subject string
5587 options option bits
5588 offsets points to a vector of ints to be filled in with offsets
5589 offsetcount the number of elements in the vector
5591 Returns: > 0 => success; value is the number of elements filled in
5592 = 0 => success, but offsets is not big enough
5593 -1 => failed to match
5594 < -1 => some kind of unexpected problem
5597 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5598 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5599 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5600 int offsetcount)
5602 int rc, resetcount, ocount;
5603 int first_byte = -1;
5604 int req_byte = -1;
5605 int req_byte2 = -1;
5606 int newline;
5607 unsigned long int ims;
5608 BOOL using_temporary_offsets = FALSE;
5609 BOOL anchored;
5610 BOOL startline;
5611 BOOL firstline;
5612 BOOL first_byte_caseless = FALSE;
5613 BOOL req_byte_caseless = FALSE;
5614 BOOL utf8;
5615 match_data match_block;
5616 match_data *md = &match_block;
5617 const uschar *tables;
5618 const uschar *start_bits = NULL;
5619 USPTR start_match = (USPTR)subject + start_offset;
5620 USPTR end_subject;
5621 USPTR start_partial = NULL;
5622 USPTR req_byte_ptr = start_match - 1;
5624 pcre_study_data internal_study;
5625 const pcre_study_data *study;
5627 real_pcre internal_re;
5628 const real_pcre *external_re = (const real_pcre *)argument_re;
5629 const real_pcre *re = external_re;
5631 /* Plausibility checks */
5632 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5633 if (re == NULL || subject == NULL ||
5634 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5635 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5636 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5638 /* This information is for finding all the numbers associated with a given
5639 name, for condition testing. */
5641 md->name_table = (uschar *)re + re->name_table_offset;
5642 md->name_count = re->name_count;
5643 md->name_entry_size = re->name_entry_size;
5645 /* Fish out the optional data from the extra_data structure, first setting
5646 the default values. */
5648 study = NULL;
5649 md->match_limit = MATCH_LIMIT;
5650 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5651 md->callout_data = NULL;
5653 /* The table pointer is always in native byte order. */
5655 tables = external_re->tables;
5657 if (extra_data != NULL)
5659 register unsigned int flags = extra_data->flags;
5660 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5661 study = (const pcre_study_data *)extra_data->study_data;
5662 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5663 md->match_limit = extra_data->match_limit;
5664 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5665 md->match_limit_recursion = extra_data->match_limit_recursion;
5666 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5667 md->callout_data = extra_data->callout_data;
5668 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5671 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5672 is a feature that makes it possible to save compiled regex and re-use them
5673 in other programs later. */
5675 if (tables == NULL) tables = _pcre_default_tables;
5677 /* Check that the first field in the block is the magic number. If it is not,
5678 test for a regex that was compiled on a host of opposite endianness. If this is
5679 the case, flipped values are put in internal_re and internal_study if there was
5680 study data too. */
5682 if (re->magic_number != MAGIC_NUMBER)
5684 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5685 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5686 if (study != NULL) study = &internal_study;
5689 /* Set up other data */
5691 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5692 startline = (re->flags & PCRE_STARTLINE) != 0;
5693 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5695 /* The code starts after the real_pcre block and the capture name table. */
5697 md->start_code = (const uschar *)external_re + re->name_table_offset +
5698 re->name_count * re->name_entry_size;
5700 md->start_subject = (USPTR)subject;
5701 md->start_offset = start_offset;
5702 md->end_subject = md->start_subject + length;
5703 end_subject = md->end_subject;
5705 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5706 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5707 md->use_ucp = (re->options & PCRE_UCP) != 0;
5708 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5710 md->notbol = (options & PCRE_NOTBOL) != 0;
5711 md->noteol = (options & PCRE_NOTEOL) != 0;
5712 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5713 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5714 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5715 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5716 md->hitend = FALSE;
5717 md->mark = NULL; /* In case never set */
5719 md->recursive = NULL; /* No recursion at top level */
5721 md->lcc = tables + lcc_offset;
5722 md->ctypes = tables + ctypes_offset;
5724 /* Handle different \R options. */
5726 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5728 case 0:
5729 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5730 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5731 else
5732 #ifdef BSR_ANYCRLF
5733 md->bsr_anycrlf = TRUE;
5734 #else
5735 md->bsr_anycrlf = FALSE;
5736 #endif
5737 break;
5739 case PCRE_BSR_ANYCRLF:
5740 md->bsr_anycrlf = TRUE;
5741 break;
5743 case PCRE_BSR_UNICODE:
5744 md->bsr_anycrlf = FALSE;
5745 break;
5747 default: return PCRE_ERROR_BADNEWLINE;
5750 /* Handle different types of newline. The three bits give eight cases. If
5751 nothing is set at run time, whatever was used at compile time applies. */
5753 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5754 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5756 case 0: newline = NEWLINE; break; /* Compile-time default */
5757 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5758 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5759 case PCRE_NEWLINE_CR+
5760 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5761 case PCRE_NEWLINE_ANY: newline = -1; break;
5762 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5763 default: return PCRE_ERROR_BADNEWLINE;
5766 if (newline == -2)
5768 md->nltype = NLTYPE_ANYCRLF;
5770 else if (newline < 0)
5772 md->nltype = NLTYPE_ANY;
5774 else
5776 md->nltype = NLTYPE_FIXED;
5777 if (newline > 255)
5779 md->nllen = 2;
5780 md->nl[0] = (newline >> 8) & 255;
5781 md->nl[1] = newline & 255;
5783 else
5785 md->nllen = 1;
5786 md->nl[0] = newline;
5790 /* Partial matching was originally supported only for a restricted set of
5791 regexes; from release 8.00 there are no restrictions, but the bits are still
5792 defined (though never set). So there's no harm in leaving this code. */
5794 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5795 return PCRE_ERROR_BADPARTIAL;
5797 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5798 back the character offset. */
5800 #ifdef SUPPORT_UTF8
5801 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5803 int tb;
5804 if ((tb = _pcre_valid_utf8((USPTR)subject, length)) >= 0)
5805 return (tb == length && md->partial > 1)?
5806 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
5807 if (start_offset > 0 && start_offset < length)
5809 tb = ((USPTR)subject)[start_offset] & 0xc0;
5810 if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
5813 #endif
5815 /* The ims options can vary during the matching as a result of the presence
5816 of (?ims) items in the pattern. They are kept in a local variable so that
5817 restoring at the exit of a group is easy. */
5819 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5821 /* If the expression has got more back references than the offsets supplied can
5822 hold, we get a temporary chunk of working store to use during the matching.
5823 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5824 of 3. */
5826 ocount = offsetcount - (offsetcount % 3);
5828 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5830 ocount = re->top_backref * 3 + 3;
5831 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5832 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5833 using_temporary_offsets = TRUE;
5834 DPRINTF(("Got memory to hold back references\n"));
5836 else md->offset_vector = offsets;
5838 md->offset_end = ocount;
5839 md->offset_max = (2*ocount)/3;
5840 md->offset_overflow = FALSE;
5841 md->capture_last = -1;
5843 /* Compute the minimum number of offsets that we need to reset each time. Doing
5844 this makes a huge difference to execution time when there aren't many brackets
5845 in the pattern. */
5847 resetcount = 2 + re->top_bracket * 2;
5848 if (resetcount > offsetcount) resetcount = ocount;
5850 /* Reset the working variable associated with each extraction. These should
5851 never be used unless previously set, but they get saved and restored, and so we
5852 initialize them to avoid reading uninitialized locations. */
5854 if (md->offset_vector != NULL)
5856 register int *iptr = md->offset_vector + ocount;
5857 register int *iend = iptr - resetcount/2 + 1;
5858 while (--iptr >= iend) *iptr = -1;
5861 /* Set up the first character to match, if available. The first_byte value is
5862 never set for an anchored regular expression, but the anchoring may be forced
5863 at run time, so we have to test for anchoring. The first char may be unset for
5864 an unanchored pattern, of course. If there's no first char and the pattern was
5865 studied, there may be a bitmap of possible first characters. */
5867 if (!anchored)
5869 if ((re->flags & PCRE_FIRSTSET) != 0)
5871 first_byte = re->first_byte & 255;
5872 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5873 first_byte = md->lcc[first_byte];
5875 else
5876 if (!startline && study != NULL &&
5877 (study->flags & PCRE_STUDY_MAPPED) != 0)
5878 start_bits = study->start_bits;
5881 /* For anchored or unanchored matches, there may be a "last known required
5882 character" set. */
5884 if ((re->flags & PCRE_REQCHSET) != 0)
5886 req_byte = re->req_byte & 255;
5887 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5888 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5892 /* ==========================================================================*/
5894 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5895 the loop runs just once. */
5897 for(;;)
5899 USPTR save_end_subject = end_subject;
5900 USPTR new_start_match;
5902 /* Reset the maximum number of extractions we might see. */
5904 if (md->offset_vector != NULL)
5906 register int *iptr = md->offset_vector;
5907 register int *iend = iptr + resetcount;
5908 while (iptr < iend) *iptr++ = -1;
5911 /* If firstline is TRUE, the start of the match is constrained to the first
5912 line of a multiline string. That is, the match must be before or at the first
5913 newline. Implement this by temporarily adjusting end_subject so that we stop
5914 scanning at a newline. If the match fails at the newline, later code breaks
5915 this loop. */
5917 if (firstline)
5919 USPTR t = start_match;
5920 #ifdef SUPPORT_UTF8
5921 if (utf8)
5923 while (t < md->end_subject && !IS_NEWLINE(t))
5925 t++;
5926 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5929 else
5930 #endif
5931 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5932 end_subject = t;
5935 /* There are some optimizations that avoid running the match if a known
5936 starting point is not found, or if a known later character is not present.
5937 However, there is an option that disables these, for testing and for ensuring
5938 that all callouts do actually occur. The option can be set in the regex by
5939 (*NO_START_OPT) or passed in match-time options. */
5941 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
5943 /* Advance to a unique first byte if there is one. */
5945 if (first_byte >= 0)
5947 if (first_byte_caseless)
5948 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5949 start_match++;
5950 else
5951 while (start_match < end_subject && *start_match != first_byte)
5952 start_match++;
5955 /* Or to just after a linebreak for a multiline match */
5957 else if (startline)
5959 if (start_match > md->start_subject + start_offset)
5961 #ifdef SUPPORT_UTF8
5962 if (utf8)
5964 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5966 start_match++;
5967 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5968 start_match++;
5971 else
5972 #endif
5973 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5974 start_match++;
5976 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5977 and we are now at a LF, advance the match position by one more character.
5980 if (start_match[-1] == CHAR_CR &&
5981 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5982 start_match < end_subject &&
5983 *start_match == CHAR_NL)
5984 start_match++;
5988 /* Or to a non-unique first byte after study */
5990 else if (start_bits != NULL)
5992 while (start_match < end_subject)
5994 register unsigned int c = *start_match;
5995 if ((start_bits[c/8] & (1 << (c&7))) == 0)
5997 start_match++;
5998 #ifdef SUPPORT_UTF8
5999 if (utf8)
6000 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6001 start_match++;
6002 #endif
6004 else break;
6007 } /* Starting optimizations */
6009 /* Restore fudged end_subject */
6011 end_subject = save_end_subject;
6013 /* The following two optimizations are disabled for partial matching or if
6014 disabling is explicitly requested. */
6016 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6018 /* If the pattern was studied, a minimum subject length may be set. This is
6019 a lower bound; no actual string of that length may actually match the
6020 pattern. Although the value is, strictly, in characters, we treat it as
6021 bytes to avoid spending too much time in this optimization. */
6023 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6024 (pcre_uint32)(end_subject - start_match) < study->minlength)
6026 rc = MATCH_NOMATCH;
6027 break;
6030 /* If req_byte is set, we know that that character must appear in the
6031 subject for the match to succeed. If the first character is set, req_byte
6032 must be later in the subject; otherwise the test starts at the match point.
6033 This optimization can save a huge amount of backtracking in patterns with
6034 nested unlimited repeats that aren't going to match. Writing separate code
6035 for cased/caseless versions makes it go faster, as does using an
6036 autoincrement and backing off on a match.
6038 HOWEVER: when the subject string is very, very long, searching to its end
6039 can take a long time, and give bad performance on quite ordinary patterns.
6040 This showed up when somebody was matching something like /^\d+C/ on a
6041 32-megabyte string... so we don't do this when the string is sufficiently
6042 long. */
6044 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6046 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6048 /* We don't need to repeat the search if we haven't yet reached the
6049 place we found it at last time. */
6051 if (p > req_byte_ptr)
6053 if (req_byte_caseless)
6055 while (p < end_subject)
6057 register int pp = *p++;
6058 if (pp == req_byte || pp == req_byte2) { p--; break; }
6061 else
6063 while (p < end_subject)
6065 if (*p++ == req_byte) { p--; break; }
6069 /* If we can't find the required character, break the matching loop,
6070 forcing a match failure. */
6072 if (p >= end_subject)
6074 rc = MATCH_NOMATCH;
6075 break;
6078 /* If we have found the required character, save the point where we
6079 found it, so that we don't search again next time round the loop if
6080 the start hasn't passed this character yet. */
6082 req_byte_ptr = p;
6087 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6088 printf(">>>> Match against: ");
6089 pchars(start_match, end_subject - start_match, TRUE, md);
6090 printf("\n");
6091 #endif
6093 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6094 first starting point for which a partial match was found. */
6096 md->start_match_ptr = start_match;
6097 md->start_used_ptr = start_match;
6098 md->match_call_count = 0;
6099 rc = match(start_match, md->start_code, start_match, NULL, 2, md, ims, NULL,
6100 0, 0);
6101 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6103 switch(rc)
6105 /* SKIP passes back the next starting point explicitly, but if it is the
6106 same as the match we have just done, treat it as NOMATCH. */
6108 case MATCH_SKIP:
6109 if (md->start_match_ptr != start_match)
6111 new_start_match = md->start_match_ptr;
6112 break;
6114 /* Fall through */
6116 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6117 the SKIP's arg was not found. We also treat this as NOMATCH. */
6119 case MATCH_SKIP_ARG:
6120 /* Fall through */
6122 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6123 exactly like PRUNE. */
6125 case MATCH_NOMATCH:
6126 case MATCH_PRUNE:
6127 case MATCH_THEN:
6128 new_start_match = start_match + 1;
6129 #ifdef SUPPORT_UTF8
6130 if (utf8)
6131 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6132 new_start_match++;
6133 #endif
6134 break;
6136 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6138 case MATCH_COMMIT:
6139 rc = MATCH_NOMATCH;
6140 goto ENDLOOP;
6142 /* Any other return is either a match, or some kind of error. */
6144 default:
6145 goto ENDLOOP;
6148 /* Control reaches here for the various types of "no match at this point"
6149 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6151 rc = MATCH_NOMATCH;
6153 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6154 newline in the subject (though it may continue over the newline). Therefore,
6155 if we have just failed to match, starting at a newline, do not continue. */
6157 if (firstline && IS_NEWLINE(start_match)) break;
6159 /* Advance to new matching position */
6161 start_match = new_start_match;
6163 /* Break the loop if the pattern is anchored or if we have passed the end of
6164 the subject. */
6166 if (anchored || start_match > end_subject) break;
6168 /* If we have just passed a CR and we are now at a LF, and the pattern does
6169 not contain any explicit matches for \r or \n, and the newline option is CRLF
6170 or ANY or ANYCRLF, advance the match position by one more character. */
6172 if (start_match[-1] == CHAR_CR &&
6173 start_match < end_subject &&
6174 *start_match == CHAR_NL &&
6175 (re->flags & PCRE_HASCRORLF) == 0 &&
6176 (md->nltype == NLTYPE_ANY ||
6177 md->nltype == NLTYPE_ANYCRLF ||
6178 md->nllen == 2))
6179 start_match++;
6181 md->mark = NULL; /* Reset for start of next match attempt */
6182 } /* End of for(;;) "bumpalong" loop */
6184 /* ==========================================================================*/
6186 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6187 conditions is true:
6189 (1) The pattern is anchored or the match was failed by (*COMMIT);
6191 (2) We are past the end of the subject;
6193 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6194 this option requests that a match occur at or before the first newline in
6195 the subject.
6197 When we have a match and the offset vector is big enough to deal with any
6198 backreferences, captured substring offsets will already be set up. In the case
6199 where we had to get some local store to hold offsets for backreference
6200 processing, copy those that we can. In this case there need not be overflow if
6201 certain parts of the pattern were not used, even though there are more
6202 capturing parentheses than vector slots. */
6204 ENDLOOP:
6206 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6208 if (using_temporary_offsets)
6210 if (offsetcount >= 4)
6212 memcpy(offsets + 2, md->offset_vector + 2,
6213 (offsetcount - 2) * sizeof(int));
6214 DPRINTF(("Copied offsets from temporary memory\n"));
6216 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6217 DPRINTF(("Freeing temporary memory\n"));
6218 (pcre_free)(md->offset_vector);
6221 /* Set the return code to the number of captured strings, or 0 if there are
6222 too many to fit into the vector. */
6224 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6226 /* If there is space, set up the whole thing as substring 0. The value of
6227 md->start_match_ptr might be modified if \K was encountered on the success
6228 matching path. */
6230 if (offsetcount < 2) rc = 0; else
6232 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6233 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6236 DPRINTF((">>>> returning %d\n", rc));
6237 goto RETURN_MARK;
6240 /* Control gets here if there has been an error, or if the overall match
6241 attempt has failed at all permitted starting positions. */
6243 if (using_temporary_offsets)
6245 DPRINTF(("Freeing temporary memory\n"));
6246 (pcre_free)(md->offset_vector);
6249 /* For anything other than nomatch or partial match, just return the code. */
6251 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6253 DPRINTF((">>>> error: returning %d\n", rc));
6254 return rc;
6257 /* Handle partial matches - disable any mark data */
6259 if (start_partial != NULL)
6261 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6262 md->mark = NULL;
6263 if (offsetcount > 1)
6265 offsets[0] = (int)(start_partial - (USPTR)subject);
6266 offsets[1] = (int)(end_subject - (USPTR)subject);
6268 rc = PCRE_ERROR_PARTIAL;
6271 /* This is the classic nomatch case */
6273 else
6275 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6276 rc = PCRE_ERROR_NOMATCH;
6279 /* Return the MARK data if it has been requested. */
6281 RETURN_MARK:
6283 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6284 *(extra_data->mark) = (unsigned char *)(md->mark);
6285 return rc;
6288 /* End of pcre_exec.c */