330 #define DPRINTF(p) /*nothing */
331 #define DPRINTF(p) printf p
332 #define GETCHAR(c, eptr) c = *eptr;
333 #define GETCHARINC(c, eptr) c = *eptr++;
334 #define class pcre_class
335 #define match_condassert 0x01 /* Called to check a condition assertion */
336 #define match_isgroup 0x02 /* Set if start of bracketed group */
354 #ifdef DEBUG /* Sigh. Some compilers never learn. */
355 #ifdef DEBUG /* Sigh. Some compilers never learn. */
370 #include "internal.h"
371 && length - re->max_match_size > start_offset)
372 ((*ecode++ == OP_BEG_WORD) ? prev_is_word : cur_is_word))
373 ((md->ctypes[*eptr] & ctype_word) != 0);
374 ((md->ctypes[*eptr] & ctype_word) != 0);
375 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
376 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
377 (eptr == md->end_subject - 1 && *eptr != '\n'))
378 (eptr == md->end_subject - 1 && *eptr != '\n'))
379 (i.e. keep it out of the loop). Also we can test that there are at least
380 (md->ctypes[*eptr++] & ctype_digit) != 0)
381 (md->ctypes[*eptr++] & ctype_digit) == 0)
382 (md->ctypes[*eptr++] & ctype_space) != 0)
383 (md->ctypes[*eptr++] & ctype_space) == 0)
384 (md->ctypes[*eptr++] & ctype_word) != 0)
385 (md->ctypes[*eptr++] & ctype_word) == 0)
386 (offsetcount - 2) * sizeof (int));
387 (offsets == NULL && offsetcount > 0))
388 (pcre_free) (match_block.offset_vector);
389 (pcre_free) (match_block.offset_vector);
391 (re->tables + fcc_offset)[req_char] : req_char;
392 * Match a back-reference *
393 * Execute a Regular Expression *
394 * Match from current position *
395 * Debugging function to print chars *
396 * Perl-Compatible Regular Expressions *
397 * Macros and tables for character handling *
398 *************************************************/
399 *************************************************/
400 *************************************************/
401 *************************************************/
402 *************************************************/
403 *************************************************/
411 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
413 -----------------------------------------------------------------------------
414 -----------------------------------------------------------------------------
415 -1 => failed to match
417 /* "Once" brackets are like assertion brackets except that after a match,
418 /* ... else fall through */
419 /* ... else fall through */
420 /* Advance to a possible match for an initial string after study */
421 /* Allow compilation as C++ source code, should anybody want to do that. */
422 /* Always fail if not enough characters left */
423 /* An alternation is the end of a branch; scan along to find the end of the
424 /* Assert before internal newline if multiline, or before a terminating
425 /* Assertion brackets. Check the alternative branches in turn - the
426 /* At the start of a bracketed group, add the current subject pointer to the
427 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
428 /* Caseful comparisons */
429 /* Caseful comparisons */
430 /* Change option settings */
431 /* Common code for all repeated single character type matches */
432 /* Common code for all repeated single-character matches. We can give
433 /* Common code for all repeated single-character matches. We can give
434 /* Compute the minimum number of offsets that we need to reset each time. Doing
435 /* Conditional group: compilation checked that there are no more than
436 /* Continue as from after the assertion, updating the offsets high water
437 /* Continue from after the assertion, updating the offsets high water
438 /* Control never gets here */
439 /* Control never gets here */
440 /* Control never gets here */
441 /* Control never gets here */
442 /* Control never gets here */
443 /* Control never gets here */
444 /* Control never gets here */
445 /* Control never gets here */
446 /* Control never gets here */
447 /* Control never gets here */
448 /* Control never gets here */
449 /* Control never gets here */
450 /* Control never gets here */
451 /* Control never gets here */
452 /* Control never reaches here */
453 /* Control never reaches here */
454 /* Copy the offset information from temporary store if necessary */
455 /* Do a single test if no case difference is set up */
456 /* Do not stick any code in here without much thought; it is assumed
457 /* End of a group, repeated or non-repeating. If we are at the end of
458 /* End of subject assertion (\z) */
459 /* End of subject or ending \n assertion (\Z) */
460 /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
461 /* First, ensure the minimum number of matches are present. */
462 /* First, ensure the minimum number of matches are present. Use inline
463 /* First, ensure the minimum number of matches are present. We get back
464 /* Flag bits for the match() function */
465 /* For a non-repeating ket, just continue at this level. This also
466 /* For a non-repeating ket, just continue at this level. This also
467 /* For anchored or unanchored matches, there may be a "last known required
468 /* For extended extraction brackets (large number), we have to fish out
469 /* For extended extraction brackets (large number), we have to fish out the
470 /* For matches anchored to the end of the pattern, we can often avoid
471 /* If a back reference hasn't been set, the length that is passed is greater
472 /* If checking an assertion for a condition, return TRUE. */
473 /* If hit the end of the group (which could be repeated), fail */
474 /* If max == min we can continue with the main loop without the
475 /* If maximizing it is worth using inline code for speed, doing the type
476 /* If maximizing, find the longest possible run, then work backwards. */
477 /* If maximizing, find the longest string and work backwards */
478 /* If min = max, continue at the same level without recursing */
479 /* If min = max, continue at the same level without recursion.
480 /* If minimizing, keep testing the rest of the expression and advancing
481 /* If minimizing, keep trying and advancing the pointer */
482 /* If minimizing, we have to test the rest of the pattern before each
483 /* If req_char is set, we know that that character must appear in the subject
484 /* If the expression has got more back references than the offsets supplied can
485 /* If the length of the reference is zero, just continue with the
486 /* If the reference is unset, set the length to be longer than the amount
487 /* If we can't find the required character, break the matching loop */
488 /* If we have found the required character, save the point where we
489 /* In all other cases except a conditional group we have to check the
490 /* In case the recursion has set more capturing values, save the final
491 /* Include the internals header, which itself includes Standard C headers plus
492 /* Insufficient room for saving captured contents */
493 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
494 /* Match a back reference, possibly repeatedly. Look past the end of the
495 /* Match a character class, possibly repeatedly. Look past the end of the
496 /* Match a negated single character */
497 /* Match a negated single character repeatedly. This is almost a repeat of
498 /* Match a run of characters */
499 /* Match a single character repeatedly; different opcodes share code. */
500 /* Match a single character type repeatedly; several different opcodes
501 /* Match a single character type; inline for speed */
502 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
503 /* Move the subject pointer back. This occurs only at the start of
504 /* Negative assertion: all branches must fail to match */
505 /* Now start processing the operations. */
508 /* On entry ecode points to the first opcode, and eptr to the first character
509 /* Opening capturing bracket. If there is space in the offset vector, save
510 /* Or to a non-unique first char after study */
511 /* Or to a unique first char if possible */
512 /* Or to just after \n for a multiline match if possible */
513 /* Other types of node can be handled by a switch */
514 /* Otherwise test for either case */
515 /* Print a sequence of chars in printable format, stopping at the end of the
516 /* Recursion matches the current regex, nested. If there are any capturing
517 /* Reset the maximum number of extractions we might see. */
518 /* Reset the value of the ims flags, in case they got changed during
519 /* Reset the working variable associated with each extraction. These should
520 /* Separate the caselesss case for speed */
521 /* Set up for repetition, or handle the non-repeated case */
522 /* Set up the first character to match, if available. The first_char value is
523 /* Skip over conditional reference data or large extraction number data if
524 /* Start of subject assertion */
525 /* Start of subject unless notbol, or after internal newline if multiline */
526 /* Structure for building a chain of data that actually lives on the
527 /* The code is duplicated for the caseless and caseful cases, for speed,
528 /* The code is duplicated for the caseless and caseful cases, for speed,
529 /* The condition is an assertion. Call match() to evaluate it - setting
530 /* The ims options can vary during the matching as a result of the presence
531 /* The repeating kets try the rest of the pattern or restart from the
532 /* The repeating kets try the rest of the pattern or restart from the
533 /* There's been some horrible disaster. */
534 /* This "while" is the end of the "do" above */
535 /* This function applies a compiled re to a subject string and picks out
536 /* Use a macro for debugging printing, 'cause that limits the use of #ifdef
537 /* We don't need to repeat the search if we haven't yet reached the
538 /* When a match occurs, substrings will be set for all internal extractions;
539 /* Word boundary assertions */
540 /*************************************************
541 /*************************************************
542 /*************************************************
543 /*************************************************
544 /*************************************************
545 /*************************************************
546 1. This software is distributed in the hope that it will be useful,
547 2. The origin of this software must not be misrepresented, either by
548 3. Altered versions must be plainly marked as such, and must not be
549 4. If PCRE is embedded in any software that is released under the GNU
550 5.005. If there is an options reset, it will get obeyed in the normal
551 5.005. If there is an options reset, it will get obeyed in the normal
552 6 : 3 + (ecode[1] << 8) + ecode[2]),
553 < -1 => some kind of unexpected problem
554 = 0 => success, but offsets is not big enough
560 BOOL cur_is_word = (eptr < md->end_subject) &&
561 BOOL cur_is_word = (eptr < md->end_subject) &&
563 BOOL minimize = FALSE;
564 BOOL prev_is_word = (eptr != md->start_subject) &&
565 BOOL prev_is_word = (eptr != md->start_subject) &&
568 BOOL using_temporary_offsets = FALSE;
569 Copyright (c) 1997-2000 University of Cambridge
570 DPRINTF ((">>>> returning %d\n", match_block.errorcode));
571 DPRINTF ((">>>> returning %d\n", rc));
572 DPRINTF (("Copied offsets from temporary memory\n"));
573 DPRINTF (("Freeing temporary memory\n"));
574 DPRINTF (("Freeing temporary memory\n"));
575 DPRINTF (("Got memory to hold back references\n"));
576 DPRINTF (("Unknown opcode %d\n", *ecode));
577 DPRINTF (("bracket %d failed\n", number));
578 DPRINTF (("bracket 0 failed\n"));
579 DPRINTF (("ims reset to %02lx\n", ims));
580 DPRINTF (("ims set to %02lx at group repeat\n", ims));
581 DPRINTF (("ims set to %02lx\n", ims));
582 DPRINTF (("matching %c{%d,%d} against subject %.*s\n", c, min, max,
583 DPRINTF (("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
584 DPRINTF (("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
585 DPRINTF (("start bracket 0\n"));
586 GETCHAR (c, eptr) /* Get character */
587 GETCHARINC (c, eptr) /* Get character; increment eptr */
588 GETCHARINC (c, eptr) /* Get character; increment eptr */
589 General Purpose Licence (GPL), then the terms of that licence shall
590 However, if the referenced string is the empty string, always treat
591 If the bracket fails to match, we need to restore this value and also the
592 If there isn't enough space in the offset vector, treat this as if it were a
593 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
594 Otherwise, we can use the vector supplied, rounding down its size to a multiple
595 Permission is granted to anyone to use this software for any purpose on any
599 Returns: > 0 => success; value is the number of elements filled in
600 Returns: TRUE if matched
601 Returns: TRUE if matched
603 They are not both allowed to be zero. */
604 This is a library of functions to support regular expressions whose syntax
605 This is the forcible breaking of infinite loops as implemented in Perl
606 This is the forcible breaking of infinite loops as implemented in Perl
607 Writing separate code makes it go faster, as does using an autoincrement and
608 Written by: Philip Hazel <ph10@cam.ac.uk>
609 a move back into the brackets. Check the alternative branches in turn - the
610 address of eptr, so that eptr can be a register variable. */
611 an assertion "group", stop matching and return TRUE, but record the
612 an empty string - recursion will then try other alternatives, if any. */
613 an error. Save the top 15 values on the stack, and accept that the rest
614 an unanchored pattern, of course. If there's no first char and the pattern was
615 analyzing most of the pattern. length > re->max_match_size is
616 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
617 and advance one byte in the pattern code. */
618 and reinstate them after the recursion. However, we don't know how many
619 and semantics are as close as possible to those of the Perl 5 language. See
620 and the required character in fact is caseful. */
621 at run time, so we have to test for anchoring. The first char may be unset for
622 avoid duplicate testing (which takes significant time). This covers the vast
623 backing off on a match. */
624 bmtable = extra->data.bmtable;
625 both cases of the character. Otherwise set the two values the same, which will
626 bracketed group and go to there. */
627 brackets - for testing for empty matches
628 brackets started but not finished, we have to save their starting points
702 but WITHOUT ANY WARRANTY; without even the implied warranty of
703 c != md->lcc[*eptr++])
704 c = *ecode++ - OP_CRSTAR;
705 c = *ecode++ - OP_CRSTAR;
706 c = *ecode++ - OP_NOTSTAR;
707 c = *ecode++ - OP_STAR;
708 c = *ecode++ - OP_TYPESTAR;
714 c = md->end_subject - eptr;
718 c == md->lcc[*eptr++])
719 can't just fail here, because of the possibility of quantifiers with zero
727 case OP_ASSERTBACK_NOT:
730 case OP_BRA: /* Non-capturing bracket: optimized */
786 case OP_NOT_WHITESPACE:
787 case OP_NOT_WHITESPACE:
788 case OP_NOT_WHITESPACE:
789 case OP_NOT_WHITESPACE:
790 case OP_NOT_WORDCHAR:
791 case OP_NOT_WORDCHAR:
792 case OP_NOT_WORDCHAR:
793 case OP_NOT_WORDCHAR:
794 case OP_NOT_WORD_BOUNDARY:
806 case OP_TYPEMINQUERY:
822 case OP_WORD_BOUNDARY:
823 case matching may be when this character is hit, so test for it in both its
824 caselessly, or if there are any changes of this flag within the regex, set up
825 cases if necessary. However, the different cased versions will not be set up
826 character" set. If the PCRE_CASELESS is set, implying that the match starts
827 characters and work backwards. */
828 characters and work backwards. */
829 code for maximizing the speed, and do the type test once at the start
830 code to character type repeats - written out again for speed. */
831 commoning these up that doesn't require a test of the positive/negative
832 computer system, and to redistribute it freely, subject to the following
835 const pcre_extra *extra;
836 const uschar *bmtable = NULL;
837 const uschar *data = ecode + 1; /* Save for matching */
838 const uschar *end_subject;
839 const uschar *next = ecode + 1;
840 const uschar *next = ecode + 1;
841 const uschar *p = md->start_subject + md->offset_vector[offset];
843 const uschar *pp = eptr;
844 const uschar *pp = eptr;
845 const uschar *pp = eptr;
846 const uschar *pp = eptr;
847 const uschar *pp = eptr;
848 const uschar *pp = eptr;
849 const uschar *pp = eptr;
850 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
851 const uschar *prev = ecode;
852 const uschar *req_char_ptr = start_match - 1;
853 const uschar *saved_eptr = eptr;
854 const uschar *saved_eptr = eptrb->saved_eptr;
855 const uschar *saved_eptr;
856 const uschar *start_bits = NULL;
857 const uschar *start_match = (const uschar *) subject + start_offset;
858 continue; /* With the main loop */
874 ctype = *ecode++; /* Code for the character type */
875 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
876 current high water mark for use by positive assertions. Do this also
877 default: /* No repeat follows */
878 default: /* No repeat follows */
891 each branch of a lookbehind assertion. If we are too close to the start to
892 each substring: the offsets to the start and end of the substring.
893 ecode position in code
894 ecode + ((offset < offset_top && md->offset_vector[offset] >= 0) ?
895 ecode += (ecode[1] << 8) + ecode[2];
896 ecode += (ecode[1] << 8) + ecode[2];
897 ecode += (ecode[1] << 8) + ecode[2];
898 ecode += (ecode[1] << 8) + ecode[2];
899 ecode += (ecode[1] << 8) + ecode[2];
900 ecode += (ecode[1] << 8) + ecode[2];
901 ecode += (ecode[1] << 8) + ecode[2];
902 ecode += (ecode[1] << 8) + ecode[2];
903 ecode += (ecode[1] << 8) + ecode[2];
904 ecode += (ecode[1] << 8) + ecode[2];
907 ecode += 3 + (ecode[4] << 8) + ecode[5];
908 ecode += 33; /* Advance past the item */
909 ecode += 3; /* Advance past the item */
973 else if ((extra->options & PCRE_STUDY_BM) != 0)
974 else if (first_char >= 0)
975 else if (start_bits != NULL)
978 end_subject = match_block.end_subject;
979 eptr pointer in subject
980 eptr points into the subject
987 eptr -= (ecode[1] << 8) + ecode[2];
989 eptr = md->end_match_ptr;
990 eptr = md->end_match_ptr;
1003 eptrb pointer to chain of blocks containing eptr at start of
1005 eptrb = eptrb->prev; /* Back up the stack of bracket start pointers */
1009 exactly what going to the ket would do. */
1010 explicit claim or by omission.
1011 external_extra points to "hints" from pcre_study() or is NULL
1012 external_re points to the compiled expression
1013 extraction by setting the offsets and bumping the high water mark. */
1014 first_char = match_block.lcc[first_char];
1015 first_char = re->first_char;
1018 for (i = 1; i <= c; i++)
1019 for (i = 1; i <= c; i++)
1020 for (i = 1; i <= min; i++)
1021 for (i = 1; i <= min; i++)
1022 for (i = 1; i <= min; i++)
1023 for (i = 1; i <= min; i++)
1024 for (i = 1; i <= min; i++)
1025 for (i = 1; i <= min; i++)
1026 for (i = 1; i <= min; i++)
1027 for (i = 1; i <= min; i++)
1028 for (i = 1; i <= min; i++)
1029 for (i = 1; i <= min; i++)
1030 for (i = 1; i <= min; i++)
1031 for (i = 1; i <= min; i++)
1032 for (i = 1; i <= min; i++)
1033 for (i = min; i < max; i++)
1034 for (i = min; i < max; i++)
1035 for (i = min; i < max; i++)
1036 for (i = min; i < max; i++)
1037 for (i = min; i < max; i++)
1038 for (i = min; i < max; i++)
1039 for (i = min; i < max; i++)
1040 for (i = min; i < max; i++)
1041 for (i = min; i < max; i++)
1042 for (i = min; i < max; i++)
1043 for (i = min; i < max; i++)
1044 for (i = min; i < max; i++)
1045 for (i = min; i < max; i++)
1053 for the "once" (not-backup up) groups. */
1054 for the match to succeed. If the first character is set, req_char must be
1055 found it, so that we don't search again next time round the loop if
1056 from a previous iteration of this group, and be referred to by a reference
1063 group number back at the start and if necessary complete handling an
1064 happens for a repeating ket if no characters were matched in the group.
1065 happens for a repeating ket if no characters were matched in the group.
1066 here; that is handled in the code for KET. */
1067 hold, we get a temporary bit of working store to use during the matching.
1068 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1070 if (!match (start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
1071 if (!match_ref (offset, eptr, length, md, ims))
1072 if (!match_ref (offset, eptr, length, md, ims))
1073 if (!match_ref (offset, eptr, length, md, ims))
1076 if (!startline && extra != NULL)
1077 if ((*ecode++ == OP_WORD_BOUNDARY) ?
1078 if ((data[c / 8] & (1 << (c & 7))) != 0)
1079 if ((data[c / 8] & (1 << (c & 7))) != 0)
1080 if ((data[c / 8] & (1 << (c & 7))) == 0)
1081 if ((extra->options & PCRE_STUDY_MAPPED) != 0)
1082 if ((flags & match_condassert) != 0)
1083 if ((flags & match_condassert) != 0)
1084 if ((flags & match_isgroup) != 0)
1085 if ((ims & PCRE_CASELESS) != 0)
1086 if ((ims & PCRE_CASELESS) != 0)
1087 if ((ims & PCRE_CASELESS) != 0)
1088 if ((ims & PCRE_CASELESS) != 0)
1089 if ((ims & PCRE_CASELESS) != 0)
1090 if ((ims & PCRE_CASELESS) != 0)
1091 if ((ims & PCRE_CASELESS) != 0)
1092 if ((ims & PCRE_DOTALL) == 0 && c == '\n')
1093 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
1094 if ((ims & PCRE_DOTALL) == 0)
1095 if ((ims & PCRE_DOTALL) == 0)
1096 if ((ims & PCRE_MULTILINE) != 0)
1097 if ((ims & PCRE_MULTILINE) != 0)
1098 if ((md->ctypes[*eptr++] & ctype_digit) != 0)
1099 if ((md->ctypes[*eptr++] & ctype_digit) == 0)
1100 if ((md->ctypes[*eptr++] & ctype_space) != 0)
1101 if ((md->ctypes[*eptr++] & ctype_space) == 0)
1102 if ((md->ctypes[*eptr++] & ctype_word) != 0)
1103 if ((md->ctypes[*eptr++] & ctype_word) == 0)
1104 if ((md->ctypes[c] & ctype_digit) != 0)
1105 if ((md->ctypes[c] & ctype_digit) == 0)
1106 if ((md->ctypes[c] & ctype_space) != 0)
1107 if ((md->ctypes[c] & ctype_space) == 0)
1108 if ((md->ctypes[c] & ctype_word) != 0)
1109 if ((md->ctypes[c] & ctype_word) == 0)
1110 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0)
1111 if ((re->options & PCRE_FIRSTSET) != 0)
1112 if ((re->options & PCRE_REQCHSET) != 0)
1113 if ((start_bits[c / 8] & (1 << (c & 7))) == 0)
1114 if (*ecode != OP_ONCE && *ecode != OP_ALT)
1115 if (*ecode == OP_KET || eptr == saved_eptr)
1116 if (*ecode == OP_KET || eptr == saved_eptr)
1117 if (*ecode == OP_KET)
1118 if (*ecode == OP_KETRMIN)
1119 if (*ecode == OP_KETRMIN)
1120 if (*ecode++ != *eptr++)
1121 if (*ecode++ == *eptr++)
1123 if (*eptr++ == '\n')
1124 if (*p++ != *eptr++)
1125 if (*p++ == req_char)
1126 if (*prev != OP_COND)
1127 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1128 if (bmtable != NULL)
1129 if (bmtable[*start_match])
1131 if (c != md->lcc[*eptr++])
1134 if (c == md->lcc[*eptr++])
1135 if (c > md->end_subject - eptr)
1136 if (cur_is_word == prev_is_word ||
1137 if (ecode[3] == OP_CREF) /* Condition is extraction test */
1138 if (ecode[3] == OP_OPT)
1139 if (eptr != md->start_subject && eptr[-1] != '\n')
1140 if (eptr != md->start_subject)
1141 if (eptr < md->end_subject - 1 ||
1142 if (eptr < md->end_subject - 1 ||
1143 if (eptr < md->end_subject)
1144 if (eptr < md->end_subject)
1145 if (eptr < md->start_subject)
1146 if (eptr >= md->end_subject ||
1147 if (eptr >= md->end_subject ||
1148 if (eptr >= md->end_subject ||
1149 if (eptr >= md->end_subject ||
1150 if (eptr >= md->end_subject ||
1151 if (eptr >= md->end_subject ||
1152 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
1153 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
1154 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
1155 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
1156 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
1157 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
1158 if (eptr >= md->end_subject || *eptr == '\n')
1159 if (eptr >= md->end_subject || c != *eptr)
1160 if (eptr >= md->end_subject || c != md->lcc[*eptr])
1161 if (eptr >= md->end_subject || c == *eptr)
1162 if (eptr >= md->end_subject || c == md->lcc[*eptr])
1163 if (eptr >= md->end_subject)
1164 if (eptr >= md->end_subject)
1165 if (eptr >= md->end_subject)
1166 if (eptr >= md->end_subject)
1167 if (eptr >= md->end_subject)
1168 if (eptr++ >= md->end_subject)
1169 if (i >= max || !match_ref (offset, eptr, length, md, ims))
1170 if (i >= max || eptr >= md->end_subject ||
1171 if (i >= max || eptr >= md->end_subject ||
1172 if (i >= max || eptr >= md->end_subject || c != *eptr++)
1173 if (i >= max || eptr >= md->end_subject || c == *eptr++)
1174 if (i >= max || eptr >= md->end_subject)
1175 if (i >= max || eptr >= md->end_subject)
1176 if (is_subject && length > md->end_subject - p)
1177 if (isprint (c = *(p++)))
1179 if (length > md->end_subject - eptr)
1180 if (length > md->end_subject - eptr)
1181 if (match (eptr, ecode + 3, offset_top, md, ims, NULL,
1182 if (match (eptr, ecode + 3, offset_top, md, ims, NULL, match_isgroup))
1183 if (match (eptr, ecode + 3, offset_top, md, ims, NULL, match_isgroup))
1184 if (match (eptr, ecode + 3, offset_top, md, ims, eptrb, 0) ||
1185 if (match (eptr, ecode + 3, offset_top, md, ims, eptrb, 0) ||
1186 if (match (eptr, ecode + 3, offset_top, md, ims, eptrb, match_isgroup))
1187 if (match (eptr, ecode + 3, offset_top, md, ims, eptrb, match_isgroup))
1188 if (match (eptr, ecode + 3, offset_top, md, ims, eptrb, match_isgroup))
1189 if (match (eptr, ecode, offset_top, md, ims, eptrb, 0))
1190 if (match (eptr, ecode, offset_top, md, ims, eptrb, 0))
1191 if (match (eptr, ecode, offset_top, md, ims, eptrb, 0))
1192 if (match (eptr, ecode, offset_top, md, ims, eptrb, 0))
1193 if (match (eptr, ecode, offset_top, md, ims, eptrb, 0))
1194 if (match (eptr, ecode, offset_top, md, ims, eptrb, 0))
1195 if (match (eptr, ecode, offset_top, md, ims, eptrb, 0))
1196 if (match (eptr, ecode, offset_top, md, ims, eptrb, 0))
1197 if (match (eptr, next + 3, offset_top, md, ims, eptrb, match_isgroup))
1198 if (match (eptr, next, offset_top, md, ims, eptrb, match_isgroup))
1199 if (match (eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
1200 if (match (eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
1201 if (match (eptr--, ecode, offset_top, md, ims, eptrb, 0))
1202 if (match (eptr--, ecode, offset_top, md, ims, eptrb, 0))
1203 if (match (eptr--, ecode, offset_top, md, ims, eptrb, 0))
1204 if (match (eptr--, ecode, offset_top, md, ims, eptrb, 0))
1205 if (match (eptr--, ecode, offset_top, md, ims, eptrb, 0))
1206 if (match (eptr--, ecode, offset_top, md, ims, eptrb, 0))
1207 if (match_block.end_offset_top > offsetcount)
1208 if (match_block.offset_vector != NULL)
1209 if (match_block.offset_vector == NULL)
1217 if (md->lcc[*ecode++] != md->lcc[*eptr++])
1218 if (md->lcc[*ecode++] == md->lcc[*eptr++])
1219 if (md->lcc[*p++] != md->lcc[*eptr++])
1220 if (md->notbol && eptr == md->start_subject)
1221 if (md->notempty && eptr == md->start_match)
1232 if (min > md->end_subject - eptr)
1233 if (min > md->end_subject - eptr)
1234 if (min > md->end_subject - eptr)
1243 if (number > EXTRACT_BASIC_MAX)
1244 if (number > EXTRACT_BASIC_MAX)
1245 if (offset < md->offset_max)
1246 if (offset >= md->offset_max)
1247 if (offset_top <= offset)
1248 if (offsetcount < 2)
1249 if (offsetcount >= 4)
1251 if (p > req_char_ptr)
1252 if (p >= end_subject)
1253 if (pp == req_char || pp == req_char2)
1254 if (re == NULL || subject == NULL ||
1255 if (re->magic_number != MAGIC_NUMBER)
1256 if (re->max_match_size >= 0
1257 if (re->top_backref > 0 && re->top_backref >= ocount / 3)
1258 if (req_char == req_char2)
1260 if (resetcount > offsetcount)
1261 if (save != stacksave)
1264 if (start_match + bmtable[256] > end_subject)
1265 if (start_match > match_block.start_subject + start_offset)
1266 if (using_temporary_offsets)
1267 if (using_temporary_offsets)
1268 if certain parts of the pattern were not used. */
1269 if the malloc fails ... there is no way of returning to the top level with
1270 implied in the second condition, because start_offset > 0. */
1271 ims current /i, /m, and /s options
1273 ims = (ims & ~PCRE_IMS) | ecode[4];
1276 ims = re->options & (PCRE_CASELESS | PCRE_MULTILINE | PCRE_DOTALL);
1278 in the subject string, while eptrb holds the value of eptr at the start of the
1279 initialize them to avoid reading uninitialized locations. */
1280 inline, and there are *still* stupid compilers about that don't like indented
1286 int first_char = -1;
1292 int min, max, ctype;
1293 int number = *prev - OP_BRA;
1294 int number = op - OP_BRA;
1295 int offset = (ecode[1] << 9) | (ecode[2] << 1); /* Doubled reference number */
1296 int offset = (ecode[4] << 9) | (ecode[5] << 1); /* Doubled reference number */
1302 int op = (int) *ecode;
1307 int resetcount, ocount;
1308 int save_offset1 = md->offset_vector[offset];
1309 int save_offset2 = md->offset_vector[offset + 1];
1310 int save_offset3 = md->offset_vector[md->offset_end - number];
1311 int skipped_chars = 0;
1314 is a bit large to put on the stack, but using malloc for small numbers
1315 is_subject TRUE if printing from within md->start_subject
1316 it as matched, any number of times (otherwise there could be infinite
1317 item to see if there is repeat information following. The code is similar
1318 item to see if there is repeat information following. Then obey similar
1319 last bracketed group - used for breaking infinite loops matching zero-length
1320 later in the subject; otherwise the test starts at the match point. This
1321 length length of subject string (may contain binary zeros)
1322 length length to be matched
1323 length number to print
1324 length = (offset >= offset_top || md->offset_vector[offset] < 0) ?
1325 length = md->end_subject - p;
1326 level without recursing. Otherwise, if minimizing, keep trying the rest of
1327 level without recursing. Otherwise, if minimizing, keep trying the rest of
1331 majority of cases. It will be suboptimal when the case flag changes in a regex
1332 mark, since extracts may have been taken during the assertion. */
1333 mark, since extracts may have been taken. */
1334 match (eptr, ecode + 3, offset_top, md, ims, eptrb, 0))
1335 match (eptr, ecode + 3, offset_top, md, ims, eptrb, 0))
1336 match (eptr, ecode, offset_top, md, ims, eptrb, flags)
1337 match (eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
1338 match (eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
1339 match_block.ctypes = re->tables + ctypes_offset;
1340 match_block.end_subject = match_block.start_subject + length;
1341 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
1342 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
1343 match_block.errorcode == PCRE_ERROR_NOMATCH &&
1344 match_block.lcc = re->tables + lcc_offset;
1345 match_block.lcc[*start_match] != first_char)
1346 match_block.notbol = (options & PCRE_NOTBOL) != 0;
1347 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
1348 match_block.noteol = (options & PCRE_NOTEOL) != 0;
1349 match_block.offset_end = ocount;
1350 match_block.offset_max = (2 * ocount) / 3;
1351 match_block.offset_overflow = FALSE;
1352 match_block.offset_overflow = TRUE;
1353 match_block.offset_vector = (int *) (pcre_malloc) (ocount * sizeof (int));
1354 match_block.offset_vector = offsets;
1355 match_block.start_match = start_match;
1356 match_block.start_pattern = re->code;
1357 match_block.start_subject = (const uschar *) subject;
1358 match_condassert - this is an assertion condition
1359 match_condassert | match_isgroup))
1363 match_data match_block;
1364 match_isgroup - this is the start of a bracketed group
1366 match_ref (offset, eptr, length, md, ims)
1367 matches, we carry on as at the end of a normal bracket, leaving the subject
1368 matching won't pass the KET for an assertion. If any one branch matches,
1369 matching won't pass the KET for this kind of subpattern. If any one branch
1370 max = (ecode[1] << 8) + ecode[2];
1371 max = (ecode[1] << 8) + ecode[2];
1372 max = (ecode[1] << 8) + ecode[2];
1373 max = (ecode[3] << 8) + ecode[4];
1374 max = (ecode[3] << 8) + ecode[4];
1382 max = rep_max[c]; /* zero for max => infinity */
1383 max = rep_max[c]; /* zero for max => infinity */
1384 max = rep_max[c]; /* zero for max => infinity */
1385 max = rep_max[c]; /* zero for max => infinity */
1386 max = rep_max[c]; /* zero for max => infinity */
1389 maximum. Alternatively, if maximizing, find the maximum number of
1390 maximum. Alternatively, if maximizing, find the maximum number of
1392 md pointer to "static" info for the match
1393 md pointer to matching data block, if is_subject is TRUE
1394 md points to match data block
1395 md->end_match_ptr = eptr; /* For ONCE */
1396 md->end_match_ptr = eptr; /* Record where we ended */
1397 md->end_offset_top = offset_top; /* and how many extracts were taken */
1398 md->end_offset_top = offset_top;
1399 md->end_subject - eptr + 1 :
1400 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
1401 md->offset_overflow = TRUE;
1402 md->offset_vector[md->offset_end - i] = save[i];
1403 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
1404 md->offset_vector[md->offset_end - number] = save_offset3;
1405 md->offset_vector[md->offset_end - number];
1406 md->offset_vector[offset + 1] - md->offset_vector[offset];
1407 md->offset_vector[offset + 1] = eptr - md->start_subject;
1408 md->offset_vector[offset + 1] = save_offset2;
1409 md->offset_vector[offset] =
1410 md->offset_vector[offset] = save_offset1;
1411 memcpy (offsets + 2, match_block.offset_vector + 2,
1412 min = (ecode[1] << 8) + ecode[2];
1413 min = (ecode[1] << 8) + ecode[2];
1417 min = max = (ecode[1] << 8) + ecode[2];
1418 min = max = (ecode[1] << 8) + ecode[2];
1419 min = max = (ecode[1] << 8) + ecode[2];
1421 min = rep_min[c]; /* Pick up values from tables; */
1422 min = rep_min[c]; /* Pick up values from tables; */
1423 min = rep_min[c]; /* Pick up values from tables; */
1424 min = rep_min[c]; /* Pick up values from tables; */
1425 min = rep_min[c]; /* Pick up values from tables; */
1427 minimize = (*ecode == OP_CRMINRANGE);
1428 minimize = (*ecode == OP_CRMINRANGE);
1429 minimize = (c & 1) != 0;
1430 minimize = (c & 1) != 0;
1431 minimize = (c & 1) != 0;
1432 minimize = (c & 1) != 0;
1433 minimize = (c & 1) != 0;
1434 minimize = *ecode == OP_MINUPTO;
1435 minimize = *ecode == OP_NOTMINUPTO;
1436 minimize = *ecode == OP_TYPEMINUPTO;
1438 minimum number of matches are present. If min = max, continue at the same
1439 minimum number of matches are present. If min = max, continue at the same
1440 misrepresented as being the original software.
1441 move back, this match function fails. */
1442 mustn't change the current values of the data slot, because they may be set
1444 never be used unless previously set, but they get saved and restored, and so we
1445 never set for an anchored regular expression, but the anchoring may be forced
1446 newline unless endonly is set, else end of subject unless noteol is set. */
1447 newptrb.prev = eptrb;
1448 newptrb.saved_eptr = eptr;
1449 next += (next[1] << 8) + next[2];
1450 next += (next[1] << 8) + next[2];
1451 non-capturing bracket. Don't worry about setting the flag for the error case
1452 number = (ecode[4] << 8) | ecode[5];
1453 number = (prev[4] << 8) | prev[5];
1454 number from a dummy opcode at the start. */
1455 number, then move along the subject till after the recursive match,
1456 ocount = offsetcount - (offsetcount % 3);
1457 ocount = re->top_backref * 3 + 3;
1458 of (?ims) items in the pattern. They are kept in a local variable so that
1460 of subject left; this ensures that every attempt at a match fails. We
1461 offset index into the offset vector
1462 offset = number << 1;
1463 offset = number << 1;
1464 offset_top current top pointer
1465 offset_top = md->end_offset_top;
1466 offset_top = md->end_offset_top;
1467 offset_top = md->end_offset_top;
1468 offset_top = offset + 2;
1469 offset_top, md, ims, eptrb, match_isgroup);
1470 offsetcount the number of elements in the vector
1471 offsets points to a vector of ints to be filled in with offsets
1472 offsets[0] = start_match - match_block.start_subject;
1473 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
1476 optimization can save a huge amount of backtracking in patterns with nested
1477 option for each character match. Maybe that wouldn't add very much to the
1479 p points to characters
1482 past the end if there is only one branch, but that's OK because that is
1483 pchars (ecode, length, FALSE, md);
1484 pchars (eptr, 16, TRUE, md);
1485 pchars (eptr, length, TRUE, md);
1486 pchars (eptr, length, TRUE, md);
1487 pchars (p, length, FALSE, md);
1488 pchars (p, length, is_subject, md)
1489 pchars (start_match, end_subject - start_match, TRUE, &match_block);
1490 pcre_exec (re, extra, subject, length, start_offset, options, offsets, offsetcount)
1491 place we found it at last time. */
1493 portions of the string if it matches. Two elements in the vector are set for
1494 pre-processor statements. I suppose it's only been 10 years... */
1495 preceded by BRAZERO or BRAMINZERO. */
1496 preceding bracket, in the appropriate order. */
1497 preceding bracket, in the appropriate order. We need to reset any options
1498 printf (" against backref ");
1499 printf (" against pattern ");
1501 printf (">>>> Match against: ");
1502 printf (">>>>> Skipped %d chars to reach first character\n",
1503 printf ("\\x%02x", c);
1509 printf ("end bracket %d", number);
1510 printf ("matching subject ");
1511 printf ("matching subject ");
1512 printf ("matching subject <null> against pattern ");
1513 printf ("matching subject <null>");
1514 printf ("start bracket %d subject=", number);
1516 rc = match (eptr, md->start_pattern, offset_top, md, ims, eptrb,
1517 rc = match_block.offset_overflow ? 0 : match_block.end_offset_top / 2;
1518 register const uschar *ecode;
1519 register const uschar *eptr;
1520 register const uschar *eptr;
1521 register const uschar *p = start_match + ((first_char >= 0) ? 1 : 0);
1522 register int *iend = iptr + resetcount;
1523 register int *iend = iptr - resetcount / 2 + 1;
1524 register int *iptr = match_block.offset_vector + ocount;
1525 register int *iptr = match_block.offset_vector;
1526 register int c = *start_match;
1529 register int length = ecode[1];
1530 register int pp = *p++;
1531 repeat it in the interests of efficiency. */
1532 repeat limits are compiled as a number of copies, with the optional ones
1533 req_char = re->req_char;
1534 req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0) ?
1536 resetcount = 2 + re->top_bracket * 2;
1537 resetcount = ocount;
1538 restoring at the exit of a group is easy. */
1619 return PCRE_ERROR_BADMAGIC;
1620 return PCRE_ERROR_BADOPTION;
1621 return PCRE_ERROR_NOMATCH;
1622 return PCRE_ERROR_NOMEMORY;
1623 return PCRE_ERROR_NULL;
1652 return match (eptr, ecode + 3, offset_top, md, ims, eptrb, match_isgroup);
1653 return match_block.errorcode;
1655 save = (int *) (pcre_malloc) ((c + 1) * sizeof (int));
1658 save[i] = md->offset_vector[md->offset_end - i];
1659 seems expensive. As a compromise, the stack is used when there are fewer
1660 share code. This is very similar to the code for single characters, but we
1661 similar code to character type repeats - written out again for speed.
1662 since matching characters is likely to be quite common. First, ensure the
1663 since matching characters is likely to be quite common. First, ensure the
1664 skipped_chars += bmtable[*start_match],
1665 skipped_chars += bmtable[256] - 1;
1666 skipped_chars -= bmtable[256] - 1;
1672 stack of such pointers, to be re-instated at the end of the group when we hit
1673 stack, for holding the values of the subject pointer at the start of each
1674 start of each branch to move the current point backwards, so the code at
1675 start_bits = extra->data.start_bits;
1676 start_match += bmtable[*start_match];
1677 start_match += bmtable[256] - 1;
1678 start_match -= bmtable[256] - 1;
1679 start_match = (const uschar *) subject + length - re->max_match_size;
1680 start_match++ < end_subject);
1685 start_offset where to start in the subject string
1686 startline = (re->options & PCRE_STARTLINE) != 0;
1689 static const char rep_max[] =
1690 static const char rep_min[] =
1693 struct eptrblock *prev;
1694 studied, there may be a bitmap of possible first characters. */
1695 subject points to the subject string
1696 subject if the requested.
1697 subpattern - to break infinite loops. */
1698 subpattern, so as to detect when an empty string has been matched by a
1699 subsequent match. */
1700 such there are (offset_top records the completed total) so we just have
1701 supersede any condition above with which it is incompatible.
1708 test once at the start (i.e. keep it out of the loop). */
1709 than 16 values to store; otherwise malloc is used. A problem is what to do
1710 than the number of characters left in the string, so the match fails.
1711 that "continue" in the code above comes out to here to repeat the main
1712 that changed within the bracket before re-running it, so check the next
1713 that it may occur zero times. It may repeat infinitely, or not at all -
1714 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1715 the closing ket. When match() is called in other circumstances, we don't add to
1716 the code for a repeated single character, but I haven't found a nice way of
1717 the current subject position in the working slot at the top of the vector. We
1718 the expression and advancing one matching character if failing, up to the
1719 the expression and advancing one matching character if failing, up to the
1720 the external pcre header. */
1721 the file Tech.Notes for some information on the internals.
1722 the final argument TRUE causes it to stop at the end of an assertion. */
1724 the length of the reference string explicitly rather than passing the
1725 the loop runs just once. */
1726 the minimum number of bytes before we start. */
1727 the number from a dummy opcode at the start. */
1728 the point in the subject string is not moved back. Thus there can never be
1729 the pointer while it matches the class. */
1732 the start hasn't passed this character yet. */
1735 there were too many extractions, set the return code to zero. In the case
1736 this level is identical to the lookahead case. */
1737 this makes a huge difference to execution time when there aren't many brackets
1738 those back references that we can. In this case there need not be overflow
1739 time taken, but character matching *is* what this is all about... */
1740 to save all the potential data. There may be up to 99 such values, which
1741 to that for character classes, but repeated for efficiency. Then obey
1742 two branches. If the condition is false, skipping the first branch takes us
1743 typedef struct eptrblock
1744 unless PCRE_CASELESS was given or the casing state changes within the regex.
1745 unlimited repeats that aren't going to match. We don't know what the state of
1746 unsigned long int ims = 0;
1747 unsigned long int ims;
1748 unsigned long int ims;
1749 unsigned long int original_ims = ims; /* Save for resetting on ')' */
1750 up quickly if there are fewer than the minimum number of characters left in
1751 up quickly if there are fewer than the minimum number of characters left in
1752 using_temporary_offsets = TRUE;
1753 values of the final offsets, in case they were set by a previous iteration of
1754 we just need to set up the whole thing as substring 0 before returning. If
1755 where we had to get some local store to hold offsets for backreferences, copy
1757 while (*ecode == OP_ALT)
1758 while (*ecode == OP_ALT);
1759 while (*ecode == OP_ALT);
1760 while (*ecode == OP_ALT);
1761 while (*ecode == OP_ALT);
1762 while (*ecode == OP_ALT);
1763 while (*ecode == OP_ALT);
1764 while (*ecode == OP_ALT);
1765 while (*ecode == OP_ALT);
1766 while (*next == OP_ALT);
1767 while (*next == OP_ALT);
1768 while (--iptr >= iend)
1777 while (length-- > 0)
1778 while (length-- > 0)
1779 while (length-- > 0)
1780 while (length-- > 0)
1781 while (length-- > 0)
1782 while (p < end_subject)
1783 while (p < end_subject)
1784 while (start_match < end_subject &&
1785 while (start_match < end_subject && *start_match != first_char)
1786 while (start_match < end_subject && start_match[-1] != '\n')
1787 while (start_match < end_subject)
1788 while (start_match < end_subject)
1925 } /* End of main loop */