2 /*************************************************
3 * Perl-Compatible Regular Expressions *
4 *************************************************/
6 /* DO NOT EDIT THIS FILE! */
8 /* This file is automatically written by the merge-files.py script
9 included with the PCRE distribution for Python; it's produced from
10 several C files, and code is removed in the process. If you want to
11 modify the code or track down bugs, it will be much easier to work
12 with the code in its original, multiple-file form. Don't edit this
13 file by hand, or submit patches to it.
15 The Python-specific PCRE distribution can be retrieved from
16 http://starship.skyport.net/crew/amk/regex/
18 The unmodified original PCRE distribution is available at
19 ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
20 written by: Philip Hazel <ph10@cam.ac.uk>
22 Extensively modified by the Python String-SIG: <string-sig@python.org>
23 Send bug reports to: <string-sig@python.org>
24 (They'll figure out if it's a bug in PCRE or in the Python-specific
27 Copyright (c) 1997 University of Cambridge
29 -----------------------------------------------------------------------------
30 Permission is granted to anyone to use this software for any purpose on any
31 computer system, and to redistribute it freely, subject to the following
34 1. This software is distributed in the hope that it will be useful,
35 but WITHOUT ANY WARRANTY; without even the implied warranty of
36 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
38 2. The origin of this software must not be misrepresented, either by
39 explicit claim or by omission.
41 3. Altered versions must be plainly marked as such, and must not be
42 misrepresented as being the original software.
43 -----------------------------------------------------------------------------
53 /*************************************************
54 * Perl-Compatible Regular Expressions *
55 *************************************************/
57 /* This file is automatically written by the makechartables auxiliary
58 program. If you edit it by hand, you might like to edit the Makefile to
59 prevent its ever being regenerated. */
/* Lower-casing table, indexed by character code (0-255). Entries 65-90
('A'-'Z') hold 97-122 ('a'-'z'); every other entry holds its own index. The
explicit bound guarantees that any unsigned char index is in range and makes
the compiler reject an over-long initializer. */

unsigned char pcre_lcc[256] = {
    0,  1,  2,  3,  4,  5,  6,  7,
    8,  9, 10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23,
   24, 25, 26, 27, 28, 29, 30, 31,
   32, 33, 34, 35, 36, 37, 38, 39,
   40, 41, 42, 43, 44, 45, 46, 47,
   48, 49, 50, 51, 52, 53, 54, 55,
   56, 57, 58, 59, 60, 61, 62, 63,
   64, 97, 98, 99,100,101,102,103,   /* '@', then 'A'-'G' -> 'a'-'g' */
  104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,
  120,121,122, 91, 92, 93, 94, 95,   /* 'X'-'Z' -> 'x'-'z', then '[' - '_' */
   96, 97, 98, 99,100,101,102,103,
  104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,
  120,121,122,123,124,125,126,127,
  128,129,130,131,132,133,134,135,
  136,137,138,139,140,141,142,143,
  144,145,146,147,148,149,150,151,
  152,153,154,155,156,157,158,159,
  160,161,162,163,164,165,166,167,
  168,169,170,171,172,173,174,175,
  176,177,178,179,180,181,182,183,
  184,185,186,187,188,189,190,191,
  192,193,194,195,196,197,198,199,
  200,201,202,203,204,205,206,207,
  208,209,210,211,212,213,214,215,
  216,217,218,219,220,221,222,223,
  224,225,226,227,228,229,230,231,
  232,233,234,235,236,237,238,239,
  240,241,242,243,244,245,246,247,
  248,249,250,251,252,253,254,255 };
/* Case-flipping table, indexed by character code (0-255). Entries 65-90
('A'-'Z') hold the matching lower-case code, entries 97-122 ('a'-'z') hold the
matching upper-case code, and every other entry holds its own index. The
explicit bound guarantees that any unsigned char index is in range and makes
the compiler reject an over-long initializer. */

unsigned char pcre_fcc[256] = {
    0,  1,  2,  3,  4,  5,  6,  7,
    8,  9, 10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23,
   24, 25, 26, 27, 28, 29, 30, 31,
   32, 33, 34, 35, 36, 37, 38, 39,
   40, 41, 42, 43, 44, 45, 46, 47,
   48, 49, 50, 51, 52, 53, 54, 55,
   56, 57, 58, 59, 60, 61, 62, 63,
   64, 97, 98, 99,100,101,102,103,   /* '@', then 'A'-'G' -> 'a'-'g' */
  104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,
  120,121,122, 91, 92, 93, 94, 95,   /* 'X'-'Z' -> 'x'-'z', then '[' - '_' */
   96, 65, 66, 67, 68, 69, 70, 71,   /* '`', then 'a'-'g' -> 'A'-'G' */
   72, 73, 74, 75, 76, 77, 78, 79,
   80, 81, 82, 83, 84, 85, 86, 87,
   88, 89, 90,123,124,125,126,127,   /* 'x'-'z' -> 'X'-'Z', then '{' - 127 */
  128,129,130,131,132,133,134,135,
  136,137,138,139,140,141,142,143,
  144,145,146,147,148,149,150,151,
  152,153,154,155,156,157,158,159,
  160,161,162,163,164,165,166,167,
  168,169,170,171,172,173,174,175,
  176,177,178,179,180,181,182,183,
  184,185,186,187,188,189,190,191,
  192,193,194,195,196,197,198,199,
  200,201,202,203,204,205,206,207,
  208,209,210,211,212,213,214,215,
  216,217,218,219,220,221,222,223,
  224,225,226,227,228,229,230,231,
  232,233,234,235,236,237,238,239,
  240,241,242,243,244,245,246,247,
  248,249,250,251,252,253,254,255 };
/* This table contains four bit maps stored back to back, in this order:
digits, letters, 'word' chars (letters, digits and '_'), and white space.
Each map is 32 bytes (256 bits) long and the bits run from the least
significant end of each byte: character c belongs to a map when bit (c & 7)
of that map's byte (c / 8) is set. */

unsigned char pcre_cbits[128] = {
  /* digits: '0'-'9' */
  0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,

  /* letters: 'A'-'Z' and 'a'-'z' */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,

  /* 'word' chars: digits, letters and '_' */
  0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
  0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,

  /* white space: codes 9-13 and 32 */
  0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
/* This table identifies various classes of character by individual bits,
indexed by character code (0-255):
  0x01   white space character
  0x02   letter
  0x04   decimal digit
  0x08   hexadecimal digit
  0x10   alphanumeric or '_'
  0x20   octal digit
  0x80   regular expression metacharacter or binary zero
The explicit bound guarantees that any unsigned char index is in range and
makes the compiler reject an over-long initializer. */

unsigned char pcre_ctypes[256] = {
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* space - ' */
  0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /*  ( - /  */
  0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /*  8 - ?  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  @ - G  */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  H - O  */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  P - W  */
  0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /*  X - _  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  ` - g  */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  h - o  */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  p - w  */
  0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
201 /* End of chartables.c */
202 /*************************************************
203 * Perl-Compatible Regular Expressions *
204 *************************************************/
207 This is a library of functions to support regular expressions whose syntax
208 and semantics are as close as possible to those of the Perl 5 language. See
209 the file Tech.Notes for some information on the internals.
211 Written by: Philip Hazel <ph10@cam.ac.uk>
213 Copyright (c) 1998 University of Cambridge
215 -----------------------------------------------------------------------------
216 Permission is granted to anyone to use this software for any purpose on any
217 computer system, and to redistribute it freely, subject to the following
220 1. This software is distributed in the hope that it will be useful,
221 but WITHOUT ANY WARRANTY; without even the implied warranty of
222 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
224 2. The origin of this software must not be misrepresented, either by
225 explicit claim or by omission.
227 3. Altered versions must be plainly marked as such, and must not be
228 misrepresented as being the original software.
229 -----------------------------------------------------------------------------
233 /* Include the internals header, which itself includes Standard C headers plus
234 the external pcre header. */
239 /*************************************************
240 * Create bitmap of starting chars *
241 *************************************************/
243 /* This function scans a compiled unanchored expression and attempts to build a
244 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
245 goes by, we may be able to get more clever at doing this.
248 code points to an expression
249 start_bits points to a 32-byte table, initialized to 0
251 Returns: TRUE if table built, FALSE otherwise
255 set_start_bits(const uschar
*code
, uschar
*start_bits
)
262 const uschar
*tcode
= code
+ 3;
263 BOOL try_next
= TRUE
;
269 if ((int)*tcode
>= OP_BRA
|| *tcode
== OP_ASSERT
)
271 if (!set_start_bits(tcode
, start_bits
)) return FALSE
;
279 /* BRAZERO does the bracket, but carries on. */
283 if (!set_start_bits(++tcode
, start_bits
)) return FALSE
;
285 do tcode
+= (tcode
[1] << 8) + tcode
[2]; while (*tcode
== OP_ALT
);
290 /* Single-char * or ? sets the bit and tries the next item */
296 start_bits
[tcode
[1]/8] |= (1 << (tcode
[1]&7));
301 /* Single-char upto sets the bit and tries the next */
305 start_bits
[tcode
[3]/8] |= (1 << (tcode
[3]&7));
310 /* At least one single char sets the bit and stops */
312 case OP_EXACT
: /* Fall through */
315 case OP_CHARS
: /* Fall through */
320 start_bits
[tcode
[1]/8] |= (1 << (tcode
[1]&7));
323 /* Single character type sets the bits and stops */
326 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= ~pcre_cbits
[c
+cbit_digit
];
330 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= pcre_cbits
[c
+cbit_digit
];
333 case OP_NOT_WHITESPACE
:
334 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= ~pcre_cbits
[c
+cbit_space
];
338 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= pcre_cbits
[c
+cbit_space
];
341 case OP_NOT_WORDCHAR
:
342 for (c
= 0; c
< 32; c
++)
343 start_bits
[c
] |= ~(pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
347 for (c
= 0; c
< 32; c
++)
348 start_bits
[c
] |= (pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
351 /* One or more character type fudges the pointer and restarts, knowing
352 it will hit a single character type and stop there. */
365 /* Zero or more repeats of character types set the bits and then
370 tcode
+= 2; /* Fall through */
375 case OP_TYPEMINQUERY
:
379 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= ~pcre_cbits
[c
+cbit_digit
];
383 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= pcre_cbits
[c
+cbit_digit
];
386 case OP_NOT_WHITESPACE
:
387 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= ~pcre_cbits
[c
+cbit_space
];
391 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= pcre_cbits
[c
+cbit_space
];
394 case OP_NOT_WORDCHAR
:
395 for (c
= 0; c
< 32; c
++)
396 start_bits
[c
] |= ~(pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
400 for (c
= 0; c
< 32; c
++)
401 start_bits
[c
] |= (pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
409 /* Character class: set the bits and either carry on or not,
410 according to the repeat count. */
416 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= tcode
[c
];
430 if (((tcode
[1] << 8) + tcode
[2]) == 0)
438 break; /* End of class handling */
440 } /* End of switch */
441 } /* End of try_next loop */
443 code
+= (code
[1] << 8) + code
[2]; /* Advance to next branch */
445 while (*code
== OP_ALT
);
451 /*************************************************
452 * Study a compiled expression *
453 *************************************************/
455 /* This function is handed a compiled expression that it must study to produce
456 information that will speed up the matching. It returns a pcre_extra block
457 which then gets handed back to pcre_exec().
460 re points to the compiled expression
461 options contains option bits
462 errorptr points to where to place error messages;
463 set NULL unless error
465 Returns: pointer to a pcre_extra block,
466 NULL on error or if no optimization possible
470 pcre_study(const pcre
*external_re
, int options
, const char **errorptr
)
473 uschar start_bits
[32];
474 real_pcre_extra
*extra
;
475 const real_pcre
*re
= (const real_pcre
*)external_re
;
479 if (re
== NULL
|| re
->magic_number
!= MAGIC_NUMBER
)
481 *errorptr
= "argument is not a compiled regular expression";
485 if ((options
& ~PUBLIC_STUDY_OPTIONS
) != 0)
487 *errorptr
= "unknown or incorrect option bit(s) set";
491 /* Caseless can either be from the compiled regex or from options. */
493 caseless
= ((re
->options
| options
) & PCRE_CASELESS
) != 0;
495 /* For an anchored pattern, or an unanchored pattern that has a first char, or a
496 multiline pattern that matches only at "line starts", no further processing at
499 if ((re
->options
& (PCRE_ANCHORED
|PCRE_FIRSTSET
|PCRE_STARTLINE
)) != 0)
502 /* See if we can find a fixed set of initial characters for the pattern. */
504 memset(start_bits
, 0, 32 * sizeof(uschar
));
505 if (!set_start_bits(re
->code
, start_bits
)) return NULL
;
507 /* If this studying is caseless, scan the created bit map and duplicate the
508 bits for any letters. */
513 for (c
= 0; c
< 256; c
++)
515 if ((start_bits
[c
/8] & (1 << (c
&7))) != 0 &&
516 (pcre_ctypes
[c
] & ctype_letter
) != 0)
519 start_bits
[d
/8] |= (1 << (d
&7));
524 /* Get an "extra" block and put the information therein. */
526 extra
= (real_pcre_extra
*)(pcre_malloc
)(sizeof(real_pcre_extra
));
530 *errorptr
= "failed to get memory";
534 extra
->options
= PCRE_STUDY_MAPPED
| (caseless
? PCRE_STUDY_CASELESS
: 0);
535 memcpy(extra
->start_bits
, start_bits
, sizeof(start_bits
));
537 return (pcre_extra
*)extra
;
541 /*************************************************
542 * Perl-Compatible Regular Expressions *
543 *************************************************/
546 This is a library of functions to support regular expressions whose syntax
547 and semantics are as close as possible to those of the Perl 5 language. See
548 the file Tech.Notes for some information on the internals.
550 Written by: Philip Hazel <ph10@cam.ac.uk>
552 Copyright (c) 1998 University of Cambridge
554 -----------------------------------------------------------------------------
555 Permission is granted to anyone to use this software for any purpose on any
556 computer system, and to redistribute it freely, subject to the following
559 1. This software is distributed in the hope that it will be useful,
560 but WITHOUT ANY WARRANTY; without even the implied warranty of
561 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
563 2. The origin of this software must not be misrepresented, either by
564 explicit claim or by omission.
566 3. Altered versions must be plainly marked as such, and must not be
567 misrepresented as being the original software.
568 -----------------------------------------------------------------------------
572 /* Define DEBUG to get debugging output on stdout. */
576 /* Use a macro for debugging printing, 'cause that eliminates the use
577 of #ifdef inline, and there are *still* stupid compilers about that don't like
578 indented pre-processor statements. I suppose it's only been 10 years... */
581 #define DPRINTF(p) printf p
583 #define DPRINTF(p) /*nothing*/
586 /* Include the internals header, which itself includes Standard C headers plus
587 the external pcre header. */
592 #ifndef Py_eval_input
593 /* For Python 1.4, graminit.h has to be explicitly included */
594 #define Py_eval_input eval_input
596 #endif /* FOR_PYTHON */
598 /* Allow compilation as C++ source code, should anybody want to do that. */
601 #define class pcre_class
/* Min and max values for the common repeats; for the maxima, 0 => infinity.
Both tables have one entry per repeat kind and are indexed in parallel. The
values fit the order *, *?, +, +?, ?, ?? (presumably the same order as the
corresponding opcodes - confirm against the OP_ definitions). */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
610 /* Text forms of OP_ values and things, for debugging (not all used) */
613 static const char *OP_names
[] = {
614 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
615 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
616 "localized \\B", "localized \\b", "localized \\W", "localized \\w",
617 "^", "$", "Any", "chars",
619 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
620 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
621 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
622 "*", "*?", "+", "+?", "?", "??", "{", "{",
623 "class", "negclass", "classL", "Ref",
624 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
625 "Brazero", "Braminzero", "Bra"
629 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
630 are simple data values; negative values are for special things like \d and so
631 on. Zero means further processing is needed (for things like \x), or the escape
634 static const short int escapes
[] = {
635 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
636 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
637 '@', -ESC_A
, -ESC_B
, 0, -ESC_D
, 0, 0, 0, /* @ - G */
638 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
639 0, 0, 0, -ESC_S
, 0, 0, 0, -ESC_W
, /* P - W */
640 0, 0, -ESC_Z
, '[', '\\', ']', '^', '_', /* X - _ */
641 '`', 7, -ESC_b
, 0, -ESC_d
, 0, '\f', 0, /* ` - g */
642 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
643 0, 0, '\r', -ESC_s
, '\t', 0, '\v', -ESC_w
, /* p - w */
647 /* Definition to allow mutual recursion */
650 compile_regex(int, int *, uschar
**, const uschar
**, const char **,
653 /* Structure for passing "static" information around between the functions
654 doing the matching, so that they are thread-safe. */
656 typedef struct match_data
{
657 int errorcode
; /* As it says */
658 int *offset_vector
; /* Offset vector */
659 int offset_end
; /* One past the end */
660 BOOL offset_overflow
; /* Set if too many extractions */
661 BOOL caseless
; /* Case-independent flag */
662 BOOL runtime_caseless
; /* Caseless forced at run time */
663 BOOL multiline
; /* Multiline flag */
664 BOOL notbol
; /* NOTBOL flag */
665 BOOL noteol
; /* NOTEOL flag */
666 BOOL dotall
; /* Dot matches any char */
667 BOOL endonly
; /* Dollar not before final \n */
668 const uschar
*start_subject
; /* Start of the subject string */
669 const uschar
*end_subject
; /* End of the subject string */
670 jmp_buf fail_env
; /* Environment for longjump() break out */
671 const uschar
*end_match_ptr
; /* Subject position at end match */
672 int end_offset_top
; /* Highwater mark at end of match */
673 jmp_buf error_env
; /* For longjmp() if an error occurs deep inside a
674 matching operation */
675 int length
; /* Length of the allocated stacks */
676 int point
; /* Point to add next item pushed onto stacks */
677 /* Pointers to the 6 stacks */
678 int *off_num
, *offset_top
, *r1
, *r2
;
679 const uschar
**eptr
, **ecode
;
684 /*************************************************
686 *************************************************/
/* PCRE is thread-clean and doesn't use any global variables in the normal
sense. However, it calls memory allocation and free functions via the two
indirections below, which can be changed by the caller, but are shared
between all threads. They default to the standard malloc() and free(). */

void *(*pcre_malloc)(size_t) = malloc;
void  (*pcre_free)(void *) = free;
699 /*************************************************
700 * Return version string *
701 *************************************************/
712 /*************************************************
713 * Return info about a compiled pattern *
714 *************************************************/
716 /* This function picks potentially useful data out of the private
720 external_re points to compiled code
721 optptr where to pass back the options
722 first_char where to pass back the first character,
723 or -1 if multiline and all branches start ^,
726 Returns: number of identifying extraction brackets
727 or negative values on error
731 pcre_info(const pcre
*external_re
, int *optptr
, int *first_char
)
733 const real_pcre
*re
= (real_pcre
*)external_re
;
734 if (re
== NULL
) return PCRE_ERROR_NULL
;
735 if (re
->magic_number
!= MAGIC_NUMBER
) return PCRE_ERROR_BADMAGIC
;
736 if (optptr
!= NULL
) *optptr
= (re
->options
& PUBLIC_OPTIONS
);
737 if (first_char
!= NULL
)
738 *first_char
= ((re
->options
& PCRE_FIRSTSET
) != 0)? re
->first_char
:
739 ((re
->options
& PCRE_STARTLINE
) != 0)? -1 : -2;
740 return re
->top_bracket
;
747 /*************************************************
748 * Debugging function to print chars *
749 *************************************************/
751 /* Print a sequence of chars in printable format, stopping at the end of the
752 subject if requested.
755 p points to characters
756 length number to print
757 is_subject TRUE if printing from within md->start_subject
758 md pointer to matching data block, if is_subject is TRUE
764 pchars(const uschar
*p
, int length
, BOOL is_subject
, match_data
*md
)
767 if (is_subject
&& length
> md
->end_subject
- p
) length
= md
->end_subject
- p
;
769 if (isprint(c
= *(p
++))) printf("%c", c
); else printf("\\x%02x", c
);
776 /*************************************************
777 * Check subpattern for empty operand *
778 *************************************************/
780 /* This function checks a bracketed subpattern to see if any of the paths
781 through it could match an empty string. This is used to diagnose an error if
782 such a subpattern is followed by a quantifier with an unlimited upper bound.
785 code points to the opening bracket
787 Returns: TRUE or FALSE
791 could_be_empty(uschar
*code
)
794 uschar
*cc
= code
+ 3;
796 /* Scan along the opcodes for this branch; as soon as we find something
797 that matches a non-empty string, break out and advance to test the next
798 branch. If we get to the end of the branch, return TRUE for the whole
803 /* Test an embedded subpattern; if it could not be empty, break the
804 loop. Otherwise carry on in the branch. */
806 if ((int)(*cc
) >= OP_BRA
|| (int)(*cc
) == OP_ONCE
)
808 if (!could_be_empty(cc
)) break;
809 do cc
+= (cc
[1] << 8) + cc
[2]; while (*cc
== OP_ALT
);
815 /* Reached end of a branch: the subpattern may match the empty string */
823 /* Skip over entire bracket groups with zero lower bound */
830 /* Skip over assertive subpatterns */
834 do cc
+= (cc
[1] << 8) + cc
[2]; while (*cc
== OP_ALT
);
838 /* Skip over things that don't match chars */
844 case OP_NOT_WORD_BOUNDARY
:
845 case OP_WORD_BOUNDARY
:
846 case OP_NOT_WORD_BOUNDARY_L
:
847 case OP_WORD_BOUNDARY_L
:
851 /* Skip over simple repeats with zero lower bound */
864 case OP_TYPEMINQUERY
:
868 /* Skip over UPTOs (lower bound is zero) */
877 /* Check a class or a back reference for a zero minimum */
885 case (OP_REF
): cc
+= 2; break;
886 case (OP_CLASS
): case (OP_NEGCLASS
): cc
+= 1+32; break;
887 case (OP_CLASS_L
): cc
+= 1+1+32; break;
901 if ((cc
[1] << 8) + cc
[2] != 0) goto NEXT_BRANCH
;
910 /* Anything else matches at least one character */
918 code
+= (code
[1] << 8) + code
[2];
920 while (*code
== OP_ALT
);
922 /* No branches match the empty string */
927 /* Determine the length of a group ID in an expression like
930 ptr pattern position pointer (say that 3 times fast)
931 finalchar the character that will mark the end of the ID
932 errorptr points to the pointer to the error message
936 get_group_id(const uschar
*ptr
, char finalchar
, const char **errorptr
)
938 const uschar
*start
= ptr
;
940 /* If the first character is not in \w, or is in \w but is a digit,
942 if (!(pcre_ctypes
[*ptr
] & ctype_word
) ||
943 (pcre_ctypes
[*ptr
++] & ctype_digit
))
945 *errorptr
= "(?P identifier must start with a letter or underscore";
949 /* Increment ptr until we either hit a null byte, the desired
950 final character, or a non-word character */
951 for(; (*ptr
!= 0) && (*ptr
!= finalchar
) &&
952 (pcre_ctypes
[*ptr
] & ctype_word
); ptr
++)
954 /* Empty loop body */
960 *errorptr
= "unterminated (?P identifier";
963 *errorptr
= "illegal character in (?P identifier";
967 /*************************************************
969 *************************************************/
971 /* This function is called when a \ has been encountered. It either returns a
972 positive value for a simple escape such as \n, or a negative value which
973 encodes one of the more complicated things such as \d. On entry, ptr is
974 pointing at the \. On exit, it is on the final character of the escape
978 ptrptr points to the pattern position pointer
979 errorptr points to the pointer to the error message
980 bracount number of previous extracting brackets
981 options the options bits
982 isclass TRUE if inside a character class
984 Returns: zero or positive => a data character
985 negative => a special escape sequence
986 on error, errorptr is set
990 check_escape(const uschar
**ptrptr
, const char **errorptr
, int bracount
,
991 int options
, BOOL isclass
)
993 const uschar
*ptr
= *ptrptr
;
994 int c
= *(++ptr
) & 255; /* Ensure > 0 on signed-char systems */
997 if (c
== 0) *errorptr
= ERR1
;
999 /* Digits or letters may have special meaning; all others are literals. */
1001 else if (c
< '0' || c
> 'z') {}
1003 /* Do an initial lookup in a table. A non-zero result is something that can be
1004 returned immediately. Otherwise further processing may be required. */
1006 else if ((i
= escapes
[c
- '0']) != 0) c
= i
;
1008 /* Escapes that need further processing, or are illegal. */
1015 /* The handling of escape sequences consisting of a string of digits
1016 starting with one that is not zero is not straightforward. By experiment,
1017 the way Perl works seems to be as follows:
1019 Outside a character class, the digits are read as a decimal number. If the
1020 number is less than 10, or if there are that many previous extracting
1021 left brackets, then it is a back reference. Otherwise, up to three octal
1022 digits are read to form an escaped byte. Thus \123 is likely to be octal
1023 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
1024 value is greater than 377, the least significant 8 bits are taken. Inside a
1025 character class, \ followed by a digit is always an octal number. */
1027 case '1': case '2': case '3': case '4': case '5':
1028 case '6': case '7': case '8': case '9':
1031 /* PYTHON: Try to compute an octal value for a character */
1032 for(c
=0, i
=0; ptr
[i
]!=0 && i
<3; i
++)
1034 if (( pcre_ctypes
[ ptr
[i
] ] & ctype_odigit
) != 0)
1035 c
= (c
* 8 + ptr
[i
]-'0') & 255;
1037 break; /* Non-octal character--break out of the loop */
1039 /* It's a character if there were exactly 3 octal digits, or if
1040 we're inside a character class and there was at least one
1042 if ( (i
== 3) || (isclass
&& i
!=0) )
1047 c
= ptr
[0]; /* Restore the first character after the \ */
1049 while (i
<2 && (pcre_ctypes
[ptr
[1]] & ctype_digit
) != 0)
1051 c
= c
* 10 + ptr
[1] - '0';
1054 if (c
> 255 - ESC_REF
) *errorptr
= "back reference too big";
1059 /* \0 always starts an octal number, but we may drop through to here with a
1060 larger first octal digit */
1064 while(i
++ < 2 && (pcre_ctypes
[ptr
[1]] & ctype_digit
) != 0 &&
1065 ptr
[1] != '8' && ptr
[1] != '9')
1066 c
= (c
* 8 + *(++ptr
) - '0') & 255;
1069 /* Special escapes not starting with a digit are straightforward */
1073 while ( (pcre_ctypes
[ptr
[1]] & ctype_xdigit
) != 0)
1076 c
= c
* 16 + pcre_lcc
[*ptr
] -
1077 (((pcre_ctypes
[*ptr
] & ctype_digit
) != 0)? '0' : 'W');
1083 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1084 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1085 for Perl compatibility, it is a literal. */
1088 if ((options
& PCRE_EXTRA
) != 0) switch(c
)
1091 c
= -ESC_X
; /* This could be a lookup if it ever got into Perl */
1108 /*************************************************
1109 * Check for counted repeat *
1110 *************************************************/
1112 /* This function is called when a '{' is encountered in a place where it might
1113 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1114 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1115 where the ddds are digits.
1118 p pointer to the first char after '{'
1120 Returns: TRUE or FALSE
1124 is_counted_repeat(const uschar
*p
)
1126 if ((pcre_ctypes
[*p
++] & ctype_digit
) == 0) return FALSE
;
1127 while ((pcre_ctypes
[*p
] & ctype_digit
) != 0) p
++;
1128 if (*p
== '}') return TRUE
;
1130 if (*p
++ != ',') return FALSE
;
1131 if (*p
== '}') return TRUE
;
1133 if ((pcre_ctypes
[*p
++] & ctype_digit
) == 0) return FALSE
;
1134 while ((pcre_ctypes
[*p
] & ctype_digit
) != 0) p
++;
1140 /*************************************************
1141 * Read repeat counts *
1142 *************************************************/
1144 /* Read an item of the form {n,m} and return the values. This is called only
1145 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1146 so the syntax is guaranteed to be correct, but we need to check the values.
1149 p pointer to first char after '{'
1150 minp pointer to int for min
1151 maxp pointer to int for max
1152 returned as -1 if no max
1153 errorptr points to pointer to error message
1155 Returns: pointer to '}' on success;
1156 current ptr on error, with errorptr set
1159 static const uschar
*
1160 read_repeat_counts(const uschar
*p
, int *minp
, int *maxp
, const char **errorptr
)
1165 while ((pcre_ctypes
[*p
] & ctype_digit
) != 0) min
= min
* 10 + *p
++ - '0';
1167 if (*p
== '}') max
= min
; else
1172 while((pcre_ctypes
[*p
] & ctype_digit
) != 0) max
= max
* 10 + *p
++ - '0';
1181 /* Do paranoid checks, then fill in the required variables, and pass back the
1182 pointer to the terminating '}'. */
1184 if (min
> 65535 || max
> 65535)
1196 /*************************************************
1197 * Compile one branch *
1198 *************************************************/
1200 /* Scan the pattern, compiling it into the code vector.
1203 options the option bits
1204 bracket points to number of brackets used
1205 code points to the pointer to the current code point
1206 ptrptr points to the current pattern pointer
1207 errorptr points to pointer to error message
1209 Returns: TRUE on success
1210 FALSE, with *errorptr set on error
1214 compile_branch(int options
, int *brackets
, uschar
**codeptr
,
1215 const uschar
**ptrptr
, const char **errorptr
, PyObject
*dictionary
)
1217 int repeat_type
, op_type
;
1218 int repeat_min
, repeat_max
;
1219 int bravalue
, length
;
1220 int greedy_default
, greedy_non_default
;
1222 register uschar
*code
= *codeptr
;
1223 const uschar
*ptr
= *ptrptr
;
1224 const uschar
*oldptr
;
1225 uschar
*previous
= NULL
;
1227 uschar
*class_flag
; /* Pointer to the single-byte flag for OP_CLASS_L */
1229 /* Set up the default and non-default settings for greediness */
1231 greedy_default
= ((options
& PCRE_UNGREEDY
) != 0);
1232 greedy_non_default
= greedy_default
^ 1;
1234 /* Switch on next character until the end of the branch */
1239 int class_charcount
;
1243 if ((options
& PCRE_EXTENDED
) != 0)
1245 if ((pcre_ctypes
[c
] & ctype_space
) != 0) continue;
1248 while ((c
= *(++ptr
)) != 0 && c
!= '\n');
1255 /* The branch terminates at end of string, |, or ). */
1264 /* Handle single-character metacharacters */
1281 /* Character classes. These always build a 32-byte bitmap of the permitted
1282 characters, except in the special case where there is only one character.
1283 For negated classes, we build the map as usual, then invert it at the end.
1288 if (options
& PCRE_LOCALE
)
1290 *code
++ = OP_CLASS_L
;
1291 /* Set the flag for localized classes (like \w) to 0 */
1301 /* If the first character is '^', set the negation flag, and use a
1302 different opcode. This only matters if caseless matching is specified at
1305 if ((c
= *(++ptr
)) == '^')
1307 negate_class
= TRUE
;
1308 if (*(code
-1)==OP_CLASS
) *(code
-1) = OP_NEGCLASS
;
1311 else negate_class
= FALSE
;
1313 /* Keep a count of chars so that we can optimize the case of just a single
1316 class_charcount
= 0;
1317 class_lastchar
= -1;
1319 /* Initialize the 32-char bit map to all zeros. We have to build the
1320 map in a temporary bit of store, in case the class contains only 1
1321 character, because in that case the compiled code doesn't use the
1324 memset(class, 0, 32 * sizeof(uschar
));
1326 /* Process characters until ] is reached. By writing this as a "do" it
1327 means that an initial ] is taken as a data character. */
1337 /* Backslash may introduce a single character, or it may introduce one
1338 of the specials, which just set a flag. Escaped items are checked for
1339 validity in the pre-compiling pass. The sequence \b is a special case.
1340 Inside a class (and only there) it is treated as backspace. Elsewhere
1341 it marks a word boundary. Other escapes have preset maps ready to
1342 or into the one we are building. We assume they have more than one
1343 character in them, so set class_count bigger than one. */
1347 c
= check_escape(&ptr
, errorptr
, *brackets
, options
, TRUE
);
1348 if (-c
== ESC_b
) c
= '\b';
1351 class_charcount
= 10;
1356 for (c
= 0; c
< 32; c
++) class[c
] |= pcre_cbits
[c
+cbit_digit
];
1362 for (c
= 0; c
< 32; c
++) class[c
] |= ~pcre_cbits
[c
+cbit_digit
];
1367 if (options
& PCRE_LOCALE
)
1373 for (c
= 0; c
< 32; c
++)
1374 class[c
] |= (pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
1379 if (options
& PCRE_LOCALE
)
1385 for (c
= 0; c
< 32; c
++)
1386 class[c
] |= ~(pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
1392 for (c
= 0; c
< 32; c
++) class[c
] |= pcre_cbits
[c
+cbit_space
];
1398 for (c
= 0; c
< 32; c
++) class[c
] |= ~pcre_cbits
[c
+cbit_space
];
1407 /* Fall through if single character */
1410 /* A single character may be followed by '-' to form a range. However,
1411 Perl does not permit ']' to be the end of the range. A '-' character
1412 here is treated as a literal. */
1414 if (ptr
[1] == '-' && ptr
[2] != ']')
1426 /* The second part of a range can be a single-character escape, but
1427 not any of the other escapes. */
1431 d
= check_escape(&ptr
, errorptr
, *brackets
, options
, TRUE
);
1434 if (d
== -ESC_b
) d
= '\b'; else
1450 class[c
/8] |= (1 << (c
&7));
1451 if ((options
& PCRE_CASELESS
) != 0)
1453 int uc
= pcre_fcc
[c
]; /* flip case */
1454 class[uc
/8] |= (1 << (uc
&7));
1456 class_charcount
++; /* in case a one-char range */
1459 continue; /* Go get the next char in the class */
1462 /* Handle a lone single character - we can get here for a normal
1463 non-escape char, or after \ that introduces a single character. */
1465 class [c
/8] |= (1 << (c
&7));
1466 if ((options
& PCRE_CASELESS
) != 0)
1468 c
= pcre_fcc
[c
]; /* flip case */
1469 class[c
/8] |= (1 << (c
&7));
1475 /* Loop until ']' reached; the check for end of string happens inside the
1476 loop. This "while" is the end of the "do" above. */
1478 while ((c
= *(++ptr
)) != ']');
1480 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1481 precisely one character. This doesn't need the whole 32-byte bit map.
1482 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1485 if (class_charcount
== 1 && class_lastchar
>= 0)
1493 code
[-1] = OP_CHARS
;
1496 *code
++ = class_lastchar
;
1499 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1504 /* If this is a localized opcode, bump the code pointer up */
1505 if (class_flag
) code
++;
1508 if (class_flag
) *class_flag
= (*class_flag
) ^ 63;
1509 for (c
= 0; c
< 32; c
++) code
[c
] = ~class[c
];
1512 memcpy(code
, class, 32);
1517 /* Various kinds of repeat */
1520 if (!is_counted_repeat(ptr
+1)) goto NORMAL_CHAR
;
1521 ptr
= read_repeat_counts(ptr
+1, &repeat_min
, &repeat_max
, errorptr
);
1522 if (*errorptr
!= NULL
) goto FAILED
;
1540 if (previous
== NULL
)
1546 /* If the next character is '?' this is a minimizing repeat, by default,
1547 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1551 { repeat_type
= greedy_non_default
; ptr
++; }
1552 else repeat_type
= greedy_default
;
1554 /* If the maximum is zero then the minimum must also be zero; Perl allows
1555 this case, so we do too - by simply omitting the item altogether. */
1557 if (repeat_max
== 0) code
= previous
;
1559 /* If previous was a string of characters, chop off the last one and use it
1560 as the subject of the repeat. If there was only one character, we can
1561 abolish the previous item altogether. */
1563 else if (*previous
== OP_CHARS
)
1565 int len
= previous
[1];
1573 c
= previous
[len
+1];
1577 op_type
= 0; /* Use single-char op codes */
1578 goto OUTPUT_SINGLE_REPEAT
; /* Code shared with single character types */
1581 /* If previous was a single negated character ([^a] or similar), we use
1582 one of the special opcodes, replacing it. The code is shared with single-
1583 character repeats by adding a suitable offset into repeat_type. */
1585 else if ((int)*previous
== OP_NOT
)
1587 op_type
= OP_NOTSTAR
- OP_STAR
; /* Use "not" opcodes */
1590 goto OUTPUT_SINGLE_REPEAT
;
1593 /* If previous was a character type match (\d or similar), abolish it and
1594 create a suitable repeat item. The code is shared with single-character
1595 repeats by adding a suitable offset into repeat_type. */
1597 else if ((int)*previous
< OP_CIRC
|| *previous
== OP_ANY
)
1599 op_type
= OP_TYPESTAR
- OP_STAR
; /* Use type opcodes */
1603 OUTPUT_SINGLE_REPEAT
:
1604 repeat_type
+= op_type
; /* Combine both values for many cases */
1606 /* A minimum of zero is handled either as the special case * or ?, or as
1607 an UPTO, with the maximum given. */
1609 if (repeat_min
== 0)
1611 if (repeat_max
== -1) *code
++ = OP_STAR
+ repeat_type
;
1612 else if (repeat_max
== 1) *code
++ = OP_QUERY
+ repeat_type
;
1615 *code
++ = OP_UPTO
+ repeat_type
;
1616 *code
++ = repeat_max
>> 8;
1617 *code
++ = (repeat_max
& 255);
1621 /* The case {1,} is handled as the special case + */
1623 else if (repeat_min
== 1 && repeat_max
== -1)
1624 *code
++ = OP_PLUS
+ repeat_type
;
1626 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1627 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1631 if (repeat_min
!= 1)
1633 *code
++ = OP_EXACT
+ op_type
; /* NB EXACT doesn't have repeat_type */
1634 *code
++ = repeat_min
>> 8;
1635 *code
++ = (repeat_min
& 255);
1638 /* If the minimum is 1 and the previous item was a character string,
1639 we either have to put back the item that got canceled if the string
1640 length was 1, or add the character back onto the end of a longer
1641 string. For a character type nothing need be done; it will just get
1642 put back naturally. Note that the final character is always going to
1645 else if (*previous
== OP_CHARS
)
1647 if (code
== previous
) code
+= 2; else previous
[1]++;
1650 /* For a single negated character we also have to put back the
1651 item that got canceled. */
1653 else if (*previous
== OP_NOT
) code
++;
1655 /* If the maximum is unlimited, insert an OP_STAR. */
1660 *code
++ = OP_STAR
+ repeat_type
;
1663 /* Else insert an UPTO if the max is greater than the min. */
1665 else if (repeat_max
!= repeat_min
)
1668 repeat_max
-= repeat_min
;
1669 *code
++ = OP_UPTO
+ repeat_type
;
1670 *code
++ = repeat_max
>> 8;
1671 *code
++ = (repeat_max
& 255);
1675 /* The character or character type itself comes last in all cases. */
1680 /* If previous was a character class or a back reference, we put the repeat
1683 else if (*previous
== OP_CLASS
|| *previous
== OP_NEGCLASS
||
1684 *previous
==OP_CLASS_L
|| *previous
== OP_REF
)
1686 if (repeat_min
== 0 && repeat_max
== -1)
1687 *code
++ = OP_CRSTAR
+ repeat_type
;
1688 else if (repeat_min
== 1 && repeat_max
== -1)
1689 *code
++ = OP_CRPLUS
+ repeat_type
;
1690 else if (repeat_min
== 0 && repeat_max
== 1)
1691 *code
++ = OP_CRQUERY
+ repeat_type
;
1694 *code
++ = OP_CRRANGE
+ repeat_type
;
1695 *code
++ = repeat_min
>> 8;
1696 *code
++ = repeat_min
& 255;
1697 if (repeat_max
== -1) repeat_max
= 0; /* 2-byte encoding for max */
1698 *code
++ = repeat_max
>> 8;
1699 *code
++ = repeat_max
& 255;
1703 /* If previous was a bracket group, we may have to replicate it in certain
1704 cases. If the maximum repeat count is unlimited, check that the bracket
1705 group cannot match the empty string, and diagnose an error if it can. */
1707 else if ((int)*previous
>= OP_BRA
)
1710 int len
= code
- previous
;
1712 if (repeat_max
== -1 && could_be_empty(previous
))
1718 /* If the minimum is greater than zero, and the maximum is unlimited or
1719 equal to the minimum, the first copy remains where it is, and is
1720 replicated up to the minimum number of times. This case includes the +
1721 repeat, but of course no replication is needed in that case. */
1723 if (repeat_min
> 0 && (repeat_max
== -1 || repeat_max
== repeat_min
))
1725 for (i
= 1; i
< repeat_min
; i
++)
1727 memcpy(code
, previous
, len
);
1732 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1733 Then, if there is a fixed upper limit, replicated up to that many times,
1734 sticking BRAZERO in front of all the optional ones. */
1738 if (repeat_min
== 0)
1740 memmove(previous
+1, previous
, len
);
1742 *previous
++ = OP_BRAZERO
+ repeat_type
;
1745 for (i
= 1; i
< repeat_min
; i
++)
1747 memcpy(code
, previous
, len
);
1751 for (i
= (repeat_min
> 0)? repeat_min
: 1; i
< repeat_max
; i
++)
1753 *code
++ = OP_BRAZERO
+ repeat_type
;
1754 memcpy(code
, previous
, len
);
1759 /* If the maximum is unlimited, set a repeater in the final copy. */
1761 if (repeat_max
== -1) code
[-3] = OP_KETRMAX
+ repeat_type
;
1764 /* Else there's some kind of shambles */
1772 /* In all case we no longer have a previous item. */
1778 /* Start of nested bracket sub-expression, or comment or lookahead.
1779 First deal with special things that can come after a bracket; all are
1780 introduced by ?, and the appearance of any of them means that this is not a
1781 referencing group. They were checked for validity in the first pass over
1782 the string, so we don't have to check for syntax errors here. */
1785 previous
= code
; /* Only real brackets can be repeated */
1786 if (*(++ptr
) == '?')
1799 while (*ptr
!= ')') ptr
++;
1803 case ':': /* Non-extracting bracket */
1807 case '=': /* Assertions can't be repeated */
1808 bravalue
= OP_ASSERT
;
1814 bravalue
= OP_ASSERT_NOT
;
1823 /* (?P<groupname>...) */
1825 PyObject
*string
, *intobj
;
1828 idlen
= get_group_id(ptr
, '>', errorptr
);
1832 string
= PyString_FromStringAndSize((char*)ptr
, idlen
);
1833 intobj
= PyInt_FromLong( brackets
[0] + 1 );
1834 if (intobj
== NULL
|| string
== NULL
)
1838 *errorptr
= "exception raised";
1841 PyDict_SetItem(dictionary
, string
, intobj
);
1842 Py_DECREF(string
); Py_DECREF(intobj
); /* XXX DECREF commented out! */
1843 ptr
+= idlen
+1; /* Point to rest of expression */
1844 goto do_grouping_bracket
;
1848 /* (?P=groupname) */
1850 PyObject
*string
, *intobj
;
1853 idlen
= get_group_id(ptr
, ')', errorptr
);
1857 string
= PyString_FromStringAndSize((char *)ptr
, idlen
);
1859 *errorptr
= "exception raised";
1862 intobj
= PyDict_GetItem(dictionary
, string
);
1865 *errorptr
= "?P= group identifier isn't defined";
1869 refnum
= PyInt_AsLong(intobj
);
1871 /* The caller doesn't own the reference to the value
1872 returned from PyDict_GetItem, so intobj is not
1877 /* The continue will cause the top-level for() loop to
1878 be resumed, so ptr will be immediately incremented.
1879 Therefore, the following line adds just idlen, not
1884 /* The character after ?P is neither < nor =, so
1885 report an error. Add more Python-extensions here. */
1886 *errorptr
="unknown after (?P";
1889 case '>': /* "Match once" brackets */
1890 if ((options
& PCRE_EXTRA
) != 0) /* Not yet standard */
1897 /* Else fall through */
1905 /* Else we have a referencing group */
1909 do_grouping_bracket
:
1910 if (++(*brackets
) > EXTRACT_MAX
)
1915 bravalue
= OP_BRA
+ *brackets
;
1918 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1919 code into a non-register variable in order to be able to pass its address
1920 because some compilers complain otherwise. */
1924 uschar
*mcode
= code
;
1925 if (!compile_regex(options
, brackets
, &mcode
, &ptr
, errorptr
, dictionary
))
1937 /* Check \ for being a real metacharacter; if not, fall through and handle
1938 it as a data character at the start of a string. Escape items are checked
1939 for validity in the pre-compiling pass. */
1943 c
= check_escape(&ptr
, errorptr
, *brackets
, options
, FALSE
);
1945 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1946 are arranged to be the negation of the corresponding OP_values. For the
1947 back references, the values are ESC_REF plus the reference number. Only
1948 back references and those types that consume a character may be repeated.
1949 We can test for values between ESC_b and ESC_Z for the latter; this may
1950 have to change if any new ones are ever created. */
1956 int refnum
= -c
- ESC_REF
;
1957 if (*brackets
< refnum
)
1968 previous
= (-c
> ESC_b
&& -c
< ESC_X
)? code
: NULL
;
1969 if ( (options
& PCRE_LOCALE
) != 0)
1973 case (-ESC_b
): c
= -OP_WORD_BOUNDARY_L
; break;
1974 case (-ESC_B
): c
= -OP_NOT_WORD_BOUNDARY_L
; break;
1975 case (-ESC_w
): c
= -OP_WORDCHAR_L
; break;
1976 case (-ESC_W
): c
= -OP_NOT_WORDCHAR_L
; break;
1984 /* Data character: Reset and fall through */
1989 /* Handle a run of data characters until a metacharacter is encountered.
1990 The first character is guaranteed not to be whitespace or # when the
1991 extended flag is set. */
2002 if ((options
& PCRE_EXTENDED
) != 0)
2004 if ((pcre_ctypes
[c
] & ctype_space
) != 0) continue;
2007 while ((c
= *(++ptr
)) != 0 && c
!= '\n');
2013 /* Backslash may introduce a data char or a metacharacter. Escaped items
2014 are checked for validity in the pre-compiling pass. Stop the string
2015 before a metaitem. */
2020 c
= check_escape(&ptr
, errorptr
, *brackets
, options
, FALSE
);
2021 if (c
< 0) { ptr
= oldptr
; break; }
2024 /* Ordinary character or single-char escape */
2030 /* This "while" is the end of the "do" above. */
2032 while (length
< 255 && (pcre_ctypes
[c
= *(++ptr
)] & ctype_meta
) == 0);
2034 /* Compute the length and set it in the data vector, and advance to
2037 previous
[1] = length
;
2038 if (length
< 255) ptr
--;
2041 } /* end of big loop */
2043 /* Control never reaches here by falling through, only by a goto for all the
2044 error states. Pass back the position in the pattern so that it can be displayed
2045 to the user for diagnosing the error. */
2055 /*************************************************
2056 * Compile sequence of alternatives *
2057 *************************************************/
2059 /* On entry, ptr is pointing past the bracket character, but on return
2060 it points to the closing bracket, or vertical bar, or end of string.
2061 The code variable is pointing at the byte into which the BRA operator has been
2065 options the option bits
2066 brackets -> int containing the number of extracting brackets used
2067 codeptr -> the address of the current code pointer
2068 ptrptr -> the address of the current pattern pointer
2069 errorptr -> pointer to error message
2071 Returns: TRUE on success
2075 compile_regex(int options
, int *brackets
, uschar
**codeptr
,
2076 const uschar
**ptrptr
, const char **errorptr
, PyObject
*dictionary
)
2078 const uschar
*ptr
= *ptrptr
;
2079 uschar
*code
= *codeptr
;
2080 uschar
*start_bracket
= code
;
2085 uschar
*last_branch
= code
;
2088 if (!compile_branch(options
, brackets
, &code
, &ptr
, errorptr
, dictionary
))
2094 /* Fill in the length of the last branch */
2096 length
= code
- last_branch
;
2097 last_branch
[1] = length
>> 8;
2098 last_branch
[2] = length
& 255;
2100 /* Reached end of expression, either ')' or end of pattern. Insert a
2101 terminating ket and the length of the whole bracketed item, and return,
2102 leaving the pointer at the terminating char. */
2106 length
= code
- start_bracket
;
2108 *code
++ = length
>> 8;
2109 *code
++ = length
& 255;
2115 /* Another branch follows; insert an "or" node and advance the pointer. */
2120 /* Control never reaches here */
2125 /*************************************************
2126 * Check for anchored expression *
2127 *************************************************/
2129 /* Try to find out if this is an anchored regular expression. Consider each
2130 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2131 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2132 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2133 counts, since OP_CIRC can match in the middle.
2135 A branch is also implicitly anchored if it starts with .* because that will try
2136 the rest of the pattern at all possible matching points, so there is no point
2139 Argument: points to start of expression (the bracket)
2140 Returns: TRUE or FALSE
2144 is_anchored(register const uschar
*code
, BOOL multiline
)
2147 int op
= (int)code
[3];
2148 if (op
>= OP_BRA
|| op
== OP_ASSERT
|| op
== OP_ONCE
)
2149 { if (!is_anchored(code
+3, multiline
)) return FALSE
; }
2150 else if (op
== OP_TYPESTAR
|| op
== OP_TYPEMINSTAR
)
2151 { if (code
[4] != OP_ANY
) return FALSE
; }
2152 else if (op
!= OP_SOD
&& (multiline
|| op
!= OP_CIRC
)) return FALSE
;
2153 code
+= (code
[1] << 8) + code
[2];
2155 while (*code
== OP_ALT
);
2161 /*************************************************
2162 * Check for start with \n line expression *
2163 *************************************************/
2165 /* This is called for multiline expressions to try to find out if every branch
2166 starts with ^ so that "first char" processing can be done to speed things up.
2168 Argument: points to start of expression (the bracket)
2169 Returns: TRUE or FALSE
2173 is_startline(const uschar
*code
)
2176 if ((int)code
[3] >= OP_BRA
|| code
[3] == OP_ASSERT
)
2177 { if (!is_startline(code
+3)) return FALSE
; }
2178 else if (code
[3] != OP_CIRC
) return FALSE
;
2179 code
+= (code
[1] << 8) + code
[2];
2181 while (*code
== OP_ALT
);
2187 /*************************************************
2188 * Check for fixed first char *
2189 *************************************************/
2191 /* Try to find out if there is a fixed first character. This is called for
2192 unanchored expressions, as it speeds up their processing quite considerably.
2193 Consider each alternative branch. If they all start with the same char, or with
2194 a bracket all of whose alternatives start with the same char (recurse ad lib),
2195 then we return that char, otherwise -1.
2197 Argument: points to start of expression (the bracket)
2198 Returns: -1 or the fixed first char
2202 find_firstchar(uschar
*code
)
2204 register int c
= -1;
2207 register int charoffset
= 4;
2209 if ((int)code
[3] >= OP_BRA
|| code
[3] == OP_ASSERT
)
2212 if ((d
= find_firstchar(code
+3)) < 0) return -1;
2213 if (c
< 0) c
= d
; else if (c
!= d
) return -1;
2216 else switch(code
[3])
2221 case OP_EXACT
: /* Fall through */
2224 case OP_CHARS
: /* Fall through */
2229 if (c
< 0) c
= code
[charoffset
]; else if (c
!= code
[charoffset
]) return -1;
2232 code
+= (code
[1] << 8) + code
[2];
2234 while (*code
== OP_ALT
);
2240 /*************************************************
2241 * Compile a Regular Expression *
2242 *************************************************/
2244 /* This function takes a string and returns a pointer to a block of store
2245 holding a compiled version of the expression.
2248 pattern the regular expression
2249 options various option bits
2250 errorptr pointer to pointer to error text
2251 erroroffset ptr offset in pattern where error was detected
2253 Returns: pointer to compiled data block, or NULL on error,
2254 with errorptr and erroroffset set
2258 pcre_compile(const char *pattern
, int options
, const char **errorptr
,
2259 int *erroroffset
, PyObject
*dictionary
)
2263 int length
= 3; /* For initial BRA plus length */
2268 int top_backref
= 0;
2269 unsigned int brastackptr
= 0;
2274 uschar
*code_base
, *code_end
;
2277 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2278 can do is just return NULL. */
2280 if (errorptr
== NULL
) return NULL
;
2283 /* However, we can give a message for this error */
2285 if (erroroffset
== NULL
)
2292 if ((options
& ~PUBLIC_OPTIONS
) != 0)
2298 DPRINTF(("------------------------------------------------------------------\n"));
2299 DPRINTF(("%s\n", pattern
));
2301 /* The first thing to do is to make a pass over the pattern to compute the
2302 amount of store required to hold the compiled code. This does not have to be
2303 perfect as long as errors are overestimates. At the same time we can detect any
2304 internal flag settings. Make an attempt to correct for any counted white space
2305 if an "extended" flag setting appears late in the pattern. We can't be so
2306 clever for #-comments. */
2308 ptr
= (const uschar
*)(pattern
- 1);
2309 while ((c
= *(++ptr
)) != 0)
2312 int class_charcount
;
2314 if ((pcre_ctypes
[c
] & ctype_space
) != 0)
2316 if ((options
& PCRE_EXTENDED
) != 0) continue;
2320 if (c
== '#' && (options
& PCRE_EXTENDED
) != 0)
2322 while ((c
= *(++ptr
)) != 0 && c
!= '\n');
2328 /* A backslashed item may be an escaped "normal" character or a
2329 character type. For a "normal" character, put the pointers and
2330 character back so that tests for whitespace etc. in the input
2331 are done correctly. */
2335 const uschar
*save_ptr
= ptr
;
2336 c
= check_escape(&ptr
, errorptr
, bracount
, options
, FALSE
);
2337 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2347 /* A back reference needs an additional char, plus either one or 5
2348 bytes for a repeat. We also need to keep the value of the highest
2353 int refnum
= -c
- ESC_REF
;
2354 if (refnum
> top_backref
) top_backref
= refnum
;
2355 length
++; /* For single back reference */
2356 if (ptr
[1] == '{' && is_counted_repeat(ptr
+2))
2358 ptr
= read_repeat_counts(ptr
+2, &min
, &max
, errorptr
);
2359 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2360 if ((min
== 0 && (max
== 1 || max
== -1)) ||
2361 (min
== 1 && max
== -1))
2364 if (ptr
[1] == '?') ptr
++;
2372 case '*': /* These repeats won't be after brackets; */
2373 case '+': /* those are handled separately */
2378 /* This covers the cases of repeats after a single char, metachar, class,
2379 or back reference. */
2382 if (!is_counted_repeat(ptr
+1)) goto NORMAL_CHAR
;
2383 ptr
= read_repeat_counts(ptr
+1, &min
, &max
, errorptr
);
2384 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2385 if ((min
== 0 && (max
== 1 || max
== -1)) ||
2386 (min
== 1 && max
== -1))
2390 length
--; /* Uncount the original char or metachar */
2391 if (min
== 1) length
++; else if (min
> 0) length
+= 4;
2392 if (max
> 0) length
+= 4; else length
+= 2;
2394 if (ptr
[1] == '?') ptr
++;
2397 /* An alternation contains an offset to the next branch or ket. */
2402 /* A character class uses 33 characters. Don't worry about character types
2403 that aren't allowed in classes - they'll get picked up during the compile.
2404 A character class that contains only one character uses 2 or 3 bytes,
2405 depending on whether it is negated or not. Notice this where we can. */
2408 class_charcount
= 0;
2409 if (*(++ptr
) == '^') ptr
++;
2414 int ch
= check_escape(&ptr
, errorptr
, bracount
, options
, TRUE
);
2415 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2416 if (-ch
== ESC_b
) class_charcount
++; else class_charcount
= 10;
2418 else class_charcount
++;
2421 while (*ptr
!= 0 && *ptr
!= ']');
2423 /* Repeats for negated single chars are handled by the general code */
2425 if (class_charcount
== 1) length
+= 3; else
2428 if (options
& PCRE_LOCALE
) length
++; /* Add a byte for the localization flag */
2430 /* A repeat needs either 1 or 5 bytes. */
2432 if (*ptr
!= 0 && ptr
[1] == '{' && is_counted_repeat(ptr
+2))
2434 ptr
= read_repeat_counts(ptr
+2, &min
, &max
, errorptr
);
2435 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2436 if ((min
== 0 && (max
== 1 || max
== -1)) ||
2437 (min
== 1 && max
== -1))
2440 if (ptr
[1] == '?') ptr
++;
2445 /* Brackets may be genuine groups or special things */
2449 /* Handle special forms of bracket, which all start (? */
2451 if (ptr
[1] == '?') switch (c
= ptr
[2])
2453 /* Skip over comments entirely */
2456 while (*ptr
!= 0 && *ptr
!= ')') ptr
++;
2460 goto PCRE_ERROR_RETURN
;
2464 /* Non-referencing groups and lookaheads just move the pointer on, and
2465 then behave like a non-special bracket, except that they don't increment
2466 the count of extracting brackets. */
2479 idlen
= get_group_id(ptr
++, '>', errorptr
);
2480 if (*errorptr
) goto PCRE_ERROR_RETURN
;
2484 idlen
= get_group_id(ptr
++, ')', errorptr
);
2485 if (*errorptr
) goto PCRE_ERROR_RETURN
;
2493 /* Ditto for the "once only" bracket, allowed only if the extra bit
2497 if ((options
& PCRE_EXTRA
) != 0)
2502 /* Else fall through */
2504 /* Else loop setting valid options until ) is met. Anything else is an
2511 if ((c
= *ptr
) == 'i')
2513 options
|= PCRE_CASELESS
;
2516 else if ((c
= *ptr
) == 'L')
2518 options
|= PCRE_LOCALE
;
2521 else if ((c
= *ptr
) == 'm')
2523 options
|= PCRE_MULTILINE
;
2528 options
|= PCRE_DOTALL
;
2533 options
|= PCRE_EXTENDED
;
2534 length
-= spaces
; /* Already counted spaces */
2537 else if (c
== ')') break;
2540 goto PCRE_ERROR_RETURN
;
2542 continue; /* End of this bracket handling */
2545 /* Extracting brackets must be counted so we can process escapes in a
2550 /* Non-special forms of bracket. Save length for computing whole length
2551 at end if there's a repeat that requires duplication of the group. */
2553 if (brastackptr
>= sizeof(brastack
)/sizeof(int))
2556 goto PCRE_ERROR_RETURN
;
2559 brastack
[brastackptr
++] = length
;
2563 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2564 have to replicate this bracket up to that many times. If brastackptr is
2565 0 this is an unmatched bracket which will generate an error, but take care
2566 not to try to access brastack[-1]. */
2573 int duplength
= (brastackptr
> 0)? length
- brastack
[--brastackptr
] : 0;
2575 /* Leave ptr at the final char; for read_repeat_counts this happens
2576 automatically; for the others we need an increment. */
2578 if ((c
= ptr
[1]) == '{' && is_counted_repeat(ptr
+2))
2580 ptr
= read_repeat_counts(ptr
+2, &minval
, &maxval
, errorptr
);
2581 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2583 else if (c
== '*') { minval
= 0; maxval
= -1; ptr
++; }
2584 else if (c
== '+') { maxval
= -1; ptr
++; }
2585 else if (c
== '?') { minval
= 0; ptr
++; }
2587 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2588 if there is a limited maximum we have to replicate up to maxval-1 times
2589 and allow for a BRAZERO item before each optional copy, as we also have
2590 to do before the first copy if the minimum is zero. */
2592 if (minval
== 0) length
++;
2593 else if (minval
> 1) length
+= (minval
- 1) * duplength
;
2594 if (maxval
> minval
) length
+= (maxval
- minval
) * (duplength
+ 1);
2598 /* Non-special character. For a run of such characters the length required
2599 is the number of characters + 2, except that the maximum run length is 255.
2600 We won't get a skipped space or a non-data escape or the start of a #
2601 comment as the first character, so the length can't be zero. */
2609 if ((pcre_ctypes
[c
] & ctype_space
) != 0)
2611 if ((options
& PCRE_EXTENDED
) != 0) continue;
2615 if (c
== '#' && (options
& PCRE_EXTENDED
) != 0)
2617 while ((c
= *(++ptr
)) != 0 && c
!= '\n');
2621 /* Backslash may introduce a data char or a metacharacter; stop the
2622 string before the latter. */
2626 const uschar
*saveptr
= ptr
;
2627 c
= check_escape(&ptr
, errorptr
, bracount
, options
, FALSE
);
2628 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2629 if (c
< 0) { ptr
= saveptr
; break; }
2632 /* Ordinary character or single-char escape */
2637 /* This "while" is the end of the "do" above. */
2639 while (runlength
< 255 && (pcre_ctypes
[c
= *(++ptr
)] & ctype_meta
) == 0);
2642 length
+= runlength
;
2647 length
+= 4; /* For final KET and END */
2655 /* Compute the size of data block needed and get it, either from malloc or
2656 externally provided function. We specify "code[0]" in the offsetof() expression
2657 rather than just "code", because it has been reported that one broken compiler
2658 fails on "code" because it is also an independent variable. It should make no
2659 difference to the value of the offsetof(). */
2661 size
= length
+ offsetof(real_pcre
, code
[0]);
2662 re
= (real_pcre
*)(pcre_malloc
)(size
+50);
2670 /* Put in the magic number and the options. */
2672 re
->magic_number
= MAGIC_NUMBER
;
2673 re
->options
= options
;
2675 /* Set up a starting, non-extracting bracket, then compile the expression. On
2676 error, *errorptr will be set non-NULL, so we don't need to look at the result
2677 of the function here. */
2679 ptr
= (const uschar
*)pattern
;
2683 (void)compile_regex(options
, &bracount
, &code
, &ptr
, errorptr
, dictionary
);
2684 re
->top_bracket
= bracount
;
2685 re
->top_backref
= top_backref
;
2687 /* If not reached end of pattern on success, there's an excess bracket. */
2689 if (*errorptr
== NULL
&& *ptr
!= 0) *errorptr
= ERR22
;
2691 /* Fill in the terminating state and check for disastrous overflow, but
2692 if debugging, leave the test till after things are printed out. */
2698 if (code
- re
->code
> length
) *errorptr
= ERR23
;
2701 /* Failed to compile */
2703 if (*errorptr
!= NULL
)
2707 *erroroffset
= ptr
- (const uschar
*)pattern
;
2711 /* If the anchored option was not passed, set flag if we can determine that it
2712 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2713 we can determine what the first character has to be, because that speeds up
2714 unanchored matches no end. In the case of multiline matches, an alternative is
2715 to set the PCRE_STARTLINE flag if all branches start with ^. */
2717 if ((options
& PCRE_ANCHORED
) == 0)
2719 if (is_anchored(re
->code
, (options
& PCRE_MULTILINE
) != 0))
2720 re
->options
|= PCRE_ANCHORED
;
2723 int ch
= find_firstchar(re
->code
);
2726 re
->first_char
= ch
;
2727 re
->options
|= PCRE_FIRSTSET
;
2729 else if (is_startline(re
->code
))
2730 re
->options
|= PCRE_STARTLINE
;
2734 /* Print out the compiled data for debugging */
2738 printf("Length = %d top_bracket = %d top_backref=%d\n",
2739 length
, re
->top_bracket
, re
->top_backref
);
2741 if (re
->options
!= 0)
2743 printf("%s%s%s%s%s%s%s%s\n",
2744 ((re
->options
& PCRE_ANCHORED
) != 0)? "anchored " : "",
2745 ((re
->options
& PCRE_CASELESS
) != 0)? "caseless " : "",
2746 ((re
->options
& PCRE_EXTENDED
) != 0)? "extended " : "",
2747 ((re
->options
& PCRE_MULTILINE
) != 0)? "multiline " : "",
2748 ((re
->options
& PCRE_DOTALL
) != 0)? "dotall " : "",
2749 ((re
->options
& PCRE_DOLLAR_ENDONLY
) != 0)? "endonly " : "",
2750 ((re
->options
& PCRE_EXTRA
) != 0)? "extra " : "",
2751 ((re
->options
& PCRE_UNGREEDY
) != 0)? "ungreedy " : "");
2754 if ((re
->options
& PCRE_FIRSTSET
) != 0)
2756 if (isprint(re
->first_char
)) printf("First char = %c\n", re
->first_char
);
2757 else printf("First char = \\x%02x\n", re
->first_char
);
2761 code_base
= code
= re
->code
;
2763 while (code
< code_end
)
2767 printf("%3d ", code
- code_base
);
2769 if (*code
>= OP_BRA
)
2771 printf("%3d Bra %d", (code
[1] << 8) + code
[2], *code
- OP_BRA
);
2778 charlength
= *(++code
);
2779 printf("%3d ", charlength
);
2780 while (charlength
-- > 0)
2781 if (isprint(c
= *(++code
))) printf("%c", c
); else printf("\\x%02x", c
);
2791 printf("%3d %s", (code
[1] << 8) + code
[2], OP_names
[*code
]);
2802 case OP_TYPEMINSTAR
:
2804 case OP_TYPEMINPLUS
:
2806 case OP_TYPEMINQUERY
:
2807 if (*code
>= OP_TYPESTAR
)
2808 printf(" %s", OP_names
[code
[1]]);
2809 else if (isprint(c
= code
[1])) printf(" %c", c
);
2810 else printf(" \\x%02x", c
);
2811 printf("%s", OP_names
[*code
++]);
2817 if (isprint(c
= code
[3])) printf(" %c{", c
);
2818 else printf(" \\x%02x{", c
);
2819 if (*code
!= OP_EXACT
) printf("0,");
2820 printf("%d}", (code
[1] << 8) + code
[2]);
2821 if (*code
== OP_MINUPTO
) printf("?");
2827 case OP_TYPEMINUPTO
:
2828 printf(" %s{", OP_names
[code
[3]]);
2829 if (*code
!= OP_TYPEEXACT
) printf(",");
2830 printf("%d}", (code
[1] << 8) + code
[2]);
2831 if (*code
== OP_TYPEMINUPTO
) printf("?");
2836 if (isprint(c
= *(++code
))) printf(" [^%c]", c
);
2837 else printf(" [^\\x%02x]", c
);
2845 case OP_NOTMINQUERY
:
2846 if (isprint(c
= code
[1])) printf(" [^%c]", c
);
2847 else printf(" [^\\x%02x]", c
);
2848 printf("%s", OP_names
[*code
++]);
2854 if (isprint(c
= code
[3])) printf(" [^%c]{", c
);
2855 else printf(" [^\\x%02x]{", c
);
2856 if (*code
!= OP_NOTEXACT
) printf(",");
2857 printf("%d}", (code
[1] << 8) + code
[2]);
2858 if (*code
== OP_NOTMINUPTO
) printf("?");
2863 printf(" \\%d", *(++code
));
2865 goto CLASS_REF_REPEAT
;
2873 if (*code
==OP_CLASS_L
)
2876 printf("Locflag = %i ", *code
++);
2881 if (*code
++ == OP_CLASS
) printf(" [");
2886 for (i
= 0; i
< 256; i
++)
2888 if ((code
[i
/8] & (1 << (i
&7))) != 0)
2891 for (j
= i
+1; j
< 256; j
++)
2892 if ((code
[j
/8] & (1 << (j
&7))) == 0) break;
2893 if (i
== '-' || i
== ']') printf("\\");
2894 if (isprint(i
)) printf("%c", i
); else printf("\\x%02x", i
);
2898 if (j
== '-' || j
== ']') printf("\\");
2899 if (isprint(j
)) printf("%c", j
); else printf("\\x%02x", j
);
2918 printf("%s", OP_names
[*code
]);
2923 min
= (code
[1] << 8) + code
[2];
2924 max
= (code
[3] << 8) + code
[4];
2925 if (max
== 0) printf("{%d,}", min
);
2926 else printf("{%d,%d}", min
, max
);
2927 if (*code
== OP_CRMINRANGE
) printf("?");
2937 /* Anything else is just a one-node item */
2940 printf(" %s", OP_names
[*code
]);
2947 printf("------------------------------------------------------------------\n");
2949 /* This check is done here in the debugging case so that the code that
2950 was compiled can be seen. */
2952 if (code
- re
->code
> length
)
2954 printf("length=%i, code length=%i\n", length
, code
-re
->code
);
2957 *erroroffset
= ptr
- (uschar
*)pattern
;
2967 /*************************************************
2968 * Match a character type *
2969 *************************************************/
2971 /* Not used in all the places it might be as it's sometimes faster
2972 to put the code inline.
2975 type the character type
2977 dotall the dotall flag
2979 Returns: TRUE if character is of the type
2983 match_type(int type
, int c
, BOOL dotall
)
2987 if (isprint(c
)) printf("matching subject %c against ", c
);
2988 else printf("matching subject \\x%02x against ", c
);
2989 printf("%s\n", OP_names
[type
]);
2994 case OP_ANY
: return dotall
|| c
!= '\n';
2995 case OP_NOT_DIGIT
: return (pcre_ctypes
[c
] & ctype_digit
) == 0;
2996 case OP_DIGIT
: return (pcre_ctypes
[c
] & ctype_digit
) != 0;
2997 case OP_NOT_WHITESPACE
: return (pcre_ctypes
[c
] & ctype_space
) == 0;
2998 case OP_WHITESPACE
: return (pcre_ctypes
[c
] & ctype_space
) != 0;
2999 case OP_NOT_WORDCHAR
: return (pcre_ctypes
[c
] & ctype_word
) == 0;
3000 case OP_WORDCHAR
: return (pcre_ctypes
[c
] & ctype_word
) != 0;
3001 case OP_NOT_WORDCHAR_L
: return (c
!='_' && !isalnum(c
));
3002 case OP_WORDCHAR_L
: return (c
=='_' || isalnum(c
));
3009 /*************************************************
3010 * Match a back-reference *
3011 *************************************************/
3013 /* If a back reference hasn't been set, the match fails.
3016 number reference number
3017 eptr points into the subject
3018 length length to be matched
3019 md points to match data block
3021 Returns: TRUE if matched
3025 match_ref(int number
, register const uschar
*eptr
, int length
, match_data
*md
)
3027 const uschar
*p
= md
->start_subject
+ md
->offset_vector
[number
];
3030 if (eptr
>= md
->end_subject
)
3031 printf("matching subject <null>");
3034 printf("matching subject ");
3035 pchars(eptr
, length
, TRUE
, md
);
3037 printf(" against backref ");
3038 pchars(p
, length
, FALSE
, md
);
3042 /* Always fail if not enough characters left */
3044 if (length
> md
->end_subject
- p
) return FALSE
;
3046 /* Separate the caseless case for speed */
3049 { while (length
-- > 0) if (pcre_lcc
[*p
++] != pcre_lcc
[*eptr
++]) return FALSE
; }
3051 { while (length
-- > 0) if (*p
++ != *eptr
++) return FALSE
; }
3056 static int free_stack(match_data
*md
)
3058 /* Free any stack space that was allocated by the call to match(). */
3059 if (md
->off_num
) PyMem_DEL(md
->off_num
);
3060 if (md
->offset_top
) PyMem_DEL(md
->offset_top
);
3061 if (md
->r1
) PyMem_DEL(md
->r1
);
3062 if (md
->r2
) PyMem_DEL(md
->r2
);
3063 if (md
->eptr
) PyMem_DEL((char *)md
->eptr
);
3064 if (md
->ecode
) PyMem_DEL((char *)md
->ecode
);
3068 static int grow_stack(match_data
*md
)
3070 if (md
->length
!= 0)
3072 md
->length
= md
->length
+ md
->length
/2;
3076 int string_len
= md
->end_subject
- md
->start_subject
+ 1;
3077 if (string_len
< 80) {md
->length
= string_len
; }
3078 else {md
->length
= 80;}
3080 PyMem_RESIZE(md
->offset_top
, int, md
->length
);
3081 PyMem_RESIZE(md
->eptr
, const uschar
*, md
->length
);
3082 PyMem_RESIZE(md
->ecode
, const uschar
*, md
->length
);
3083 PyMem_RESIZE(md
->off_num
, int, md
->length
);
3084 PyMem_RESIZE(md
->r1
, int, md
->length
);
3085 PyMem_RESIZE(md
->r2
, int, md
->length
);
3086 if (md
->offset_top
== NULL
|| md
->eptr
== NULL
|| md
->ecode
== NULL
||
3087 md
->off_num
== NULL
|| md
->r1
== NULL
|| md
->r2
== NULL
)
3090 longjmp(md
->error_env
, 1);
3096 /*************************************************
3097 * Match from current position *
3098 *************************************************/
3100 /* On entry ecode points to the first opcode, and eptr to the first character.
3103 eptr pointer in subject
3104 ecode position in code
3105 offset_top current top pointer
3106 md pointer to "static" info for the match
3108 Returns: TRUE if matched
3112 match(register const uschar
*eptr
, register const uschar
*ecode
, int offset_top
,
3115 int save_stack_position
= md
->point
;
3118 #define SUCCEED goto succeed
3119 #define FAIL goto fail
3123 int min
, max
, ctype
;
3126 BOOL minimize
= FALSE
;
3128 /* Opening bracket. Check the alternative branches in turn, failing if none
3129 match. We have to set the start offset if required and there is space
3130 in the offset vector so that it is available for subsequent back references
3131 if the bracket matches. However, if the bracket fails, we must put back the
3132 previous value of both offsets in case they were set by a previous copy of
3133 the same bracket. Don't worry about setting the flag for the error case here;
3134 that is handled in the code for KET. */
3136 if ((int)*ecode
>= OP_BRA
)
3138 int number
= (*ecode
- OP_BRA
) << 1;
3139 int save_offset1
= 0, save_offset2
= 0;
3141 DPRINTF(("start bracket %d\n", number
/2));
3143 if (number
> 0 && number
< md
->offset_end
)
3145 save_offset1
= md
->offset_vector
[number
];
3146 save_offset2
= md
->offset_vector
[number
+1];
3147 md
->offset_vector
[number
] = eptr
- md
->start_subject
;
3149 DPRINTF(("saving %d %d\n", save_offset1
, save_offset2
));
3152 /* Recurse for all the alternatives. */
3156 if (match(eptr
, ecode
+3, offset_top
, md
)) SUCCEED
;
3157 ecode
+= (ecode
[1] << 8) + ecode
[2];
3159 while (*ecode
== OP_ALT
);
3161 DPRINTF(("bracket %d failed\n", number
/2));
3163 if (number
> 0 && number
< md
->offset_end
)
3165 md
->offset_vector
[number
] = save_offset1
;
3166 md
->offset_vector
[number
+1] = save_offset2
;
3172 /* Other types of node can be handled by a switch */
3177 md
->end_match_ptr
= eptr
; /* Record where we ended */
3178 md
->end_offset_top
= offset_top
; /* and how many extracts were taken */
3181 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3182 whole thing doesn't match, so we have to get out via a longjmp(). */
3185 if (match(eptr
, ecode
+1, offset_top
, md
)) SUCCEED
;
3186 longjmp(md
->fail_env
, 1);
3188 /* Assertion brackets. Check the alternative branches in turn - the
3189 matching won't pass the KET for an assertion. If any one branch matches,
3190 the assertion is true. */
3195 if (match(eptr
, ecode
+3, offset_top
, md
)) break;
3196 ecode
+= (ecode
[1] << 8) + ecode
[2];
3198 while (*ecode
== OP_ALT
);
3199 if (*ecode
== OP_KET
) FAIL
;
3201 /* Continue from after the assertion, updating the offsets high water
3202 mark, since extracts may have been taken during the assertion. */
3204 do ecode
+= (ecode
[1] << 8) + ecode
[2]; while (*ecode
== OP_ALT
);
3206 offset_top
= md
->end_offset_top
;
3209 /* Negative assertion: all branches must fail to match */
3214 if (match(eptr
, ecode
+3, offset_top
, md
)) FAIL
;
3215 ecode
+= (ecode
[1] << 8) + ecode
[2];
3217 while (*ecode
== OP_ALT
);
3221 /* "Once" brackets are like assertion brackets except that after a match,
3222 the point in the subject string is not moved back. Thus there can never be
3223 a move back into the brackets. Check the alternative branches in turn - the
3224 matching won't pass the KET for this kind of subpattern. If any one branch
3225 matches, we carry on, leaving the subject pointer. */
3230 if (match(eptr
, ecode
+3, offset_top
, md
)) break;
3231 ecode
+= (ecode
[1] << 8) + ecode
[2];
3233 while (*ecode
== OP_ALT
);
3234 if (*ecode
== OP_KET
) FAIL
;
3236 /* Continue as from after the assertion, updating the offsets high water
3237 mark, since extracts may have been taken. */
3239 do ecode
+= (ecode
[1] << 8) + ecode
[2]; while (*ecode
== OP_ALT
);
3241 offset_top
= md
->end_offset_top
;
3242 eptr
= md
->end_match_ptr
;
3245 /* An alternation is the end of a branch; scan along to find the end of the
3246 bracketed group and go to there. */
3249 do ecode
+= (ecode
[1] << 8) + ecode
[2]; while (*ecode
== OP_ALT
);
3252 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3253 that it may occur zero times. It may repeat infinitely, or not at all -
3254 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3255 repeat limits are compiled as a number of copies, with the optional ones
3256 preceded by BRAZERO or BRAMINZERO. */
3260 const uschar
*next
= ecode
+1;
3261 if (match(eptr
, next
, offset_top
, md
)) SUCCEED
;
3262 do next
+= (next
[1] << 8) + next
[2]; while (*next
== OP_ALT
);
3269 const uschar
*next
= ecode
+1;
3270 do next
+= (next
[1] << 8) + next
[2]; while (*next
== OP_ALT
);
3271 if (match(eptr
, next
+3, offset_top
, md
)) SUCCEED
;
3276 /* End of a group, repeated or non-repeating. If we are at the end of
3277 an assertion "group", stop matching and SUCCEED, but record the
3278 current high water mark for use by positive assertions. */
3285 const uschar
*prev
= ecode
- (ecode
[1] << 8) - ecode
[2];
3287 if (*prev
== OP_ASSERT
|| *prev
== OP_ASSERT_NOT
|| *prev
== OP_ONCE
)
3289 md
->end_match_ptr
= eptr
; /* For ONCE */
3290 md
->end_offset_top
= offset_top
;
3294 /* In all other cases we have to check the group number back at the
3295 start and if necessary complete handling an extraction by setting the
3296 final offset and bumping the high water mark. */
3298 number
= (*prev
- OP_BRA
) << 1;
3300 DPRINTF(("end bracket %d\n", number
/2));
3304 if (number
>= md
->offset_end
) md
->offset_overflow
= TRUE
; else
3306 md
->offset_vector
[number
+1] = eptr
- md
->start_subject
;
3307 if (offset_top
<= number
) offset_top
= number
+ 2;
3311 /* For a non-repeating ket, just advance to the next node and continue at this level. */
3314 if (*ecode
== OP_KET
)
3320 /* The repeating kets try the rest of the pattern or restart from the
3321 preceding bracket, in the appropriate order. */
3323 if (*ecode
== OP_KETRMIN
)
3326 if (match(eptr
, ecode
+3, offset_top
, md
)) goto succeed
;
3327 /* Handle alternation inside the BRA...KET; push the additional
3328 alternatives onto the stack */
3331 ptr
+= (ptr
[1]<<8)+ ptr
[2];
3334 if (md
->length
== md
->point
)
3338 md
->offset_top
[md
->point
] = offset_top
;
3339 md
->eptr
[md
->point
] = eptr
;
3340 md
->ecode
[md
->point
] = ptr
+3;
3341 md
->r1
[md
->point
] = 0;
3342 md
->r2
[md
->point
] = 0;
3343 md
->off_num
[md
->point
] = 0;
3346 } while (*ptr
==OP_ALT
);
3347 ecode
=prev
+3; goto match_loop
;
3349 else /* OP_KETRMAX */
3352 /*int points_pushed=0;*/
3354 /* Push one failure point, that will resume matching at the code after
3355 the KETRMAX opcode. */
3356 if (md
->length
== md
->point
)
3360 md
->offset_top
[md
->point
] = offset_top
;
3361 md
->eptr
[md
->point
] = eptr
;
3362 md
->ecode
[md
->point
] = ecode
+3;
3363 md
->r1
[md
->point
] = md
->offset_vector
[number
];
3364 md
->r2
[md
->point
] = md
->offset_vector
[number
+1];
3365 md
->off_num
[md
->point
] = number
;
3368 md
->offset_vector
[number
] = eptr
- md
->start_subject
;
3369 /* Handle alternation inside the BRA...KET; push each of the
3370 additional alternatives onto the stack */
3373 ptr
+= (ptr
[1]<<8)+ ptr
[2];
3376 if (md
->length
== md
->point
)
3377 if (md
->length
== md
->point
)
3381 md
->offset_top
[md
->point
] = offset_top
;
3382 md
->eptr
[md
->point
] = eptr
;
3383 md
->ecode
[md
->point
] = ptr
+3;
3384 md
->r1
[md
->point
] = 0;
3385 md
->r2
[md
->point
] = 0;
3386 md
->off_num
[md
->point
] = 0;
3388 /*points_pushed++;*/
3390 } while (*ptr
==OP_ALT
);
3391 /* Jump to the first (or only) alternative and resume trying to match */
3392 ecode
=prev
+3; goto match_loop
;
3396 /* Start of subject unless notbol, or after internal newline if multiline */
3399 if (md
->notbol
&& eptr
== md
->start_subject
) FAIL
;
3402 if (eptr
!= md
->start_subject
&& eptr
[-1] != '\n') FAIL
;
3406 /* ... else fall through */
3408 /* Start of subject assertion */
3411 if (eptr
!= md
->start_subject
) FAIL
;
3415 /* Assert before internal newline if multiline, or before
3416 a terminating newline unless endonly is set, else end of subject unless noteol is set. */
3420 if (md
->noteol
&& eptr
>= md
->end_subject
) FAIL
;
3423 if (eptr
< md
->end_subject
&& *eptr
!= '\n') FAIL
;
3427 else if (!md
->endonly
)
3429 if (eptr
< md
->end_subject
- 1 ||
3430 (eptr
== md
->end_subject
- 1 && *eptr
!= '\n')) FAIL
;
3434 /* ... else fall through */
3436 /* End of subject assertion */
3439 if (eptr
< md
->end_subject
) FAIL
;
3443 /* Word boundary assertions */
3445 case OP_NOT_WORD_BOUNDARY
:
3446 case OP_WORD_BOUNDARY
:
3448 BOOL prev_is_word
= (eptr
!= md
->start_subject
) &&
3449 ((pcre_ctypes
[eptr
[-1]] & ctype_word
) != 0);
3450 BOOL cur_is_word
= (eptr
< md
->end_subject
) &&
3451 ((pcre_ctypes
[*eptr
] & ctype_word
) != 0);
3452 if ((*ecode
++ == OP_WORD_BOUNDARY
)?
3453 cur_is_word
== prev_is_word
: cur_is_word
!= prev_is_word
)
3458 case OP_NOT_WORD_BOUNDARY_L
:
3459 case OP_WORD_BOUNDARY_L
:
3461 BOOL prev_is_word
= (eptr
!= md
->start_subject
) &&
3462 (isalnum(eptr
[-1]) || eptr
[-1]=='_');
3463 BOOL cur_is_word
= (eptr
< md
->end_subject
) &&
3464 (isalnum(*eptr
) || *eptr
=='_');
3465 if ((*ecode
++ == OP_WORD_BOUNDARY_L
)?
3466 cur_is_word
== prev_is_word
: cur_is_word
!= prev_is_word
)
3472 /* Match a single character type; inline for speed */
3475 if (!md
->dotall
&& eptr
< md
->end_subject
&& *eptr
== '\n') FAIL
;
3476 if (eptr
++ >= md
->end_subject
) FAIL
;
3481 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_digit
) != 0)
3487 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_digit
) == 0)
3492 case OP_NOT_WHITESPACE
:
3493 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_space
) != 0)
3499 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_space
) == 0)
3504 case OP_NOT_WORDCHAR
:
3505 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_word
) != 0)
3511 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_word
) == 0)
3516 case OP_NOT_WORDCHAR_L
:
3517 if (eptr
>= md
->end_subject
|| (*eptr
=='_' || isalnum(*eptr
) ))
3524 if (eptr
>= md
->end_subject
|| (*eptr
!='_' && !isalnum(*eptr
) ))
3530 /* Match a back reference, possibly repeatedly. Look past the end of the
3531 item to see if there is repeat information following. The code is similar
3532 to that for character classes, but repeated for efficiency. Then obey
3533 similar code to character type repeats - written out again for speed.
3534 However, if the referenced string is the empty string, always treat
3535 it as matched, any number of times (otherwise there could be infinite loops). */
3541 int number
= ecode
[1] << 1; /* Doubled reference number */
3542 ecode
+= 2; /* Advance past the item */
3544 if (number
>= offset_top
|| md
->offset_vector
[number
] < 0)
3546 md
->errorcode
= PCRE_ERROR_BADREF
;
3550 length
= md
->offset_vector
[number
+1] - md
->offset_vector
[number
];
3560 c
= *ecode
++ - OP_CRSTAR
;
3561 minimize
= (c
& 1) != 0;
3562 min
= rep_min
[c
]; /* Pick up values from tables; */
3563 max
= rep_max
[c
]; /* zero for max => infinity */
3564 if (max
== 0) max
= INT_MAX
;
3569 minimize
= (*ecode
== OP_CRMINRANGE
);
3570 min
= (ecode
[1] << 8) + ecode
[2];
3571 max
= (ecode
[3] << 8) + ecode
[4];
3572 if (max
== 0) max
= INT_MAX
;
3576 default: /* No repeat follows */
3577 if (!match_ref(number
, eptr
, length
, md
)) FAIL
;
3579 continue; /* With the main loop */
3582 /* If the length of the reference is zero, just continue with the main loop. */
3585 if (length
== 0) continue;
3587 /* First, ensure the minimum number of matches are present. We get back
3588 the length of the reference string explicitly rather than passing the
3589 address of eptr, so that eptr can be a register variable. */
3591 for (i
= 1; i
<= min
; i
++)
3593 if (!match_ref(number
, eptr
, length
, md
)) FAIL
;
3597 /* If min = max, continue at the same level without recursion.
3598 They are not both allowed to be zero. */
3600 if (min
== max
) continue;
3602 /* If minimizing, keep trying and advancing the pointer */
3608 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
3609 if (i
>= max
|| !match_ref(number
, eptr
, length
, md
))
3613 /* Control never gets here */
3616 /* If maximizing, find the longest string and work backwards */
3620 const uschar
*pp
= eptr
;
3621 for (i
= min
; i
< max
; i
++)
3623 if (!match_ref(number
, eptr
, length
, md
)) break;
3628 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
3634 /* Control never gets here */
3636 /* Match a character class, possibly repeatedly. Look past the end of the
3637 item to see if there is repeat information following. Then obey similar
3638 code to character type repeats - written out again for speed. If caseless
3639 matching was set at runtime but not at compile time, we have to check both
3640 versions of a character, and we have to behave differently for positive and
3641 negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
3642 treated differently. */
3647 BOOL nasty_case
= *ecode
== OP_NEGCLASS
&& md
->runtime_caseless
;
3648 const uschar
*data
= ecode
+ 1; /* Save for matching */
3649 ecode
+= 33; /* Advance past the item */
3659 c
= *ecode
++ - OP_CRSTAR
;
3660 minimize
= (c
& 1) != 0;
3661 min
= rep_min
[c
]; /* Pick up values from tables; */
3662 max
= rep_max
[c
]; /* zero for max => infinity */
3663 if (max
== 0) max
= INT_MAX
;
3668 minimize
= (*ecode
== OP_CRMINRANGE
);
3669 min
= (ecode
[1] << 8) + ecode
[2];
3670 max
= (ecode
[3] << 8) + ecode
[4];
3671 if (max
== 0) max
= INT_MAX
;
3675 default: /* No repeat follows */
3680 /* First, ensure the minimum number of matches are present. */
3682 for (i
= 1; i
<= min
; i
++)
3684 if (eptr
>= md
->end_subject
) FAIL
;
3687 /* Either not runtime caseless, or it was a positive class. For
3688 runtime caseless, continue if either case is in the map. */
3692 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3693 if (md
->runtime_caseless
)
3696 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3700 /* Runtime caseless and it was a negative class. Continue only if
3701 both cases are in the map. */
3705 if ((data
[c
/8] & (1 << (c
&7))) == 0) FAIL
;
3707 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3713 /* If max == min we can continue with the main loop without the need to recurse. */
3716 if (min
== max
) continue;
3718 /* If minimizing, keep testing the rest of the expression and advancing
3719 the pointer while it matches the class. */
3725 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
3726 if (i
>= max
|| eptr
>= md
->end_subject
) FAIL
;
3729 /* Either not runtime caseless, or it was a positive class. For
3730 runtime caseless, continue if either case is in the map. */
3734 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3735 if (md
->runtime_caseless
)
3738 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3742 /* Runtime caseless and it was a negative class. Continue only if
3743 both cases are in the map. */
3747 if ((data
[c
/8] & (1 << (c
&7))) == 0) return FALSE
;
3749 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3754 /* Control never gets here */
3757 /* If maximizing, find the longest possible run, then work backwards. */
3761 const uschar
*pp
= eptr
;
3762 for (i
= min
; i
< max
; eptr
++, i
++)
3764 if (eptr
>= md
->end_subject
) break;
3767 /* Either not runtime caseless, or it was a positive class. For
3768 runtime caseless, continue if either case is in the map. */
3772 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3773 if (md
->runtime_caseless
)
3776 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3780 /* Runtime caseless and it was a negative class. Continue only if
3781 both cases are in the map. */
3785 if ((data
[c
/8] & (1 << (c
&7))) == 0) break;
3787 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3794 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
3798 /* Control never gets here */
3800 /* OP_CLASS_L opcode: handles localized character classes */
3804 const uschar
*data
= ecode
+ 1; /* Save for matching */
3805 const uschar locale_flag
= *data
;
3806 ecode
++; data
++; /* The localization support adds an extra byte */
3808 ecode
+= 33; /* Advance past the item */
3818 c
= *ecode
++ - OP_CRSTAR
;
3819 minimize
= (c
& 1) != 0;
3820 min
= rep_min
[c
]; /* Pick up values from tables; */
3821 max
= rep_max
[c
]; /* zero for max => infinity */
3822 if (max
== 0) max
= INT_MAX
;
3827 minimize
= (*ecode
== OP_CRMINRANGE
);
3828 min
= (ecode
[1] << 8) + ecode
[2];
3829 max
= (ecode
[3] << 8) + ecode
[4];
3830 if (max
== 0) max
= INT_MAX
;
3834 default: /* No repeat follows */
3835 if (eptr
>= md
->end_subject
) FAIL
;
3837 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue; /* With main loop */
3838 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3839 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3841 if ( (locale_flag
& 4) && isdigit(c
) ) continue; /* Locale \d */
3842 if ( (locale_flag
& 8) && !isdigit(c
) ) continue; /* Locale \D */
3843 if ( (locale_flag
& 16) && isspace(c
) ) continue; /* Locale \s */
3844 if ( (locale_flag
& 32) && !isspace(c
) ) continue; /* Locale \S */
3847 if (md
->runtime_caseless
)
3850 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue; /* With main loop */
3852 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3853 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3858 /* First, ensure the minimum number of matches are present. */
3860 for (i
= 1; i
<= min
; i
++)
3862 if (eptr
>= md
->end_subject
) FAIL
;
3864 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3865 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3866 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3868 if (md
->runtime_caseless
)
3871 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3872 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3873 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3878 /* If max == min we can continue with the main loop without the need to recurse. */
3881 if (min
== max
) continue;
3883 /* If minimizing, keep testing the rest of the expression and advancing
3884 the pointer while it matches the class. */
3890 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
3891 if (i
>= max
|| eptr
>= md
->end_subject
) FAIL
;
3893 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3894 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3895 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3897 if (md
->runtime_caseless
)
3900 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3901 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3902 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3906 /* Control never gets here */
3909 /* If maximizing, find the longest possible run, then work backwards. */
3913 const uschar
*pp
= eptr
;
3914 for (i
= min
; i
< max
; eptr
++, i
++)
3916 if (eptr
>= md
->end_subject
) break;
3918 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3919 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3920 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3921 if (md
->runtime_caseless
)
3924 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3925 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3926 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3932 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
3936 /* Control never gets here */
3938 /* Match a run of characters */
3942 register int length
= ecode
[1];
3945 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3946 if (eptr
>= md
->end_subject
)
3947 printf("matching subject <null> against pattern ");
3950 printf("matching subject ");
3951 pchars(eptr
, length
, TRUE
, md
);
3952 printf(" against pattern ");
3954 pchars(ecode
, length
, FALSE
, md
);
3958 if (length
> md
->end_subject
- eptr
) FAIL
;
3961 while (length
-- > 0) if (pcre_lcc
[*ecode
++] != pcre_lcc
[*eptr
++]) FAIL
;
3965 while (length
-- > 0) if (*ecode
++ != *eptr
++) FAIL
;
3970 /* Match a single character repeatedly; different opcodes share code. */
3973 min
= max
= (ecode
[1] << 8) + ecode
[2];
3980 max
= (ecode
[1] << 8) + ecode
[2];
3981 minimize
= *ecode
== OP_MINUPTO
;
3991 c
= *ecode
++ - OP_STAR
;
3992 minimize
= (c
& 1) != 0;
3993 min
= rep_min
[c
]; /* Pick up values from tables; */
3994 max
= rep_max
[c
]; /* zero for max => infinity */
3995 if (max
== 0) max
= INT_MAX
;
3997 /* Common code for all repeated single-character matches. We can give
3998 up quickly if there are fewer than the minimum number of characters left in the subject. */
4002 if (min
> md
->end_subject
- eptr
) FAIL
;
4005 /* The code is duplicated for the caseless and caseful cases, for speed,
4006 since matching characters is likely to be quite common. First, ensure the
4007 minimum number of matches are present. If min = max, continue at the same
4008 level without recursing. Otherwise, if minimizing, keep trying the rest of
4009 the expression and advancing one matching character if failing, up to the
4010 maximum. Alternatively, if maximizing, find the maximum number of
4011 characters and work backwards. */
4013 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c
, min
, max
,
4019 for (i
= 1; i
<= min
; i
++) if (c
!= pcre_lcc
[*eptr
++]) FAIL
;
4020 if (min
== max
) continue;
4025 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
4026 if (i
>= max
|| eptr
>= md
->end_subject
|| c
!= pcre_lcc
[*eptr
++])
4029 /* Control never gets here */
4033 const uschar
*pp
= eptr
;
4034 for (i
= min
; i
< max
; i
++)
4036 if (eptr
>= md
->end_subject
|| c
!= pcre_lcc
[*eptr
]) break;
4040 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
4043 /* Control never gets here */
4046 /* Caseful comparisons */
4050 for (i
= 1; i
<= min
; i
++) if (c
!= *eptr
++) FAIL
;
4051 if (min
== max
) continue;
4056 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
4057 if (i
>= max
|| eptr
>= md
->end_subject
|| c
!= *eptr
++) FAIL
;
4059 /* Control never gets here */
4063 const uschar
*pp
= eptr
;
4064 for (i
= min
; i
< max
; i
++)
4066 if (eptr
>= md
->end_subject
|| c
!= *eptr
) break;
4070 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
4074 /* Control never gets here */
4076 /* Match a negated single character */
4079 if (eptr
>= md
->end_subject
) FAIL
;
4083 if (pcre_lcc
[*ecode
++] == pcre_lcc
[*eptr
++]) FAIL
;
4087 if (*ecode
++ == *eptr
++) FAIL
;
4091 /* Match a negated single character repeatedly. This is almost a repeat of
4092 the code for a repeated single character, but I haven't found a nice way of
4093 commoning these up that doesn't require a test of the positive/negative
4094 option for each character match. Maybe that wouldn't add very much to the
4095 time taken, but character matching *is* what this is all about... */
4098 min
= max
= (ecode
[1] << 8) + ecode
[2];
4105 max
= (ecode
[1] << 8) + ecode
[2];
4106 minimize
= *ecode
== OP_NOTMINUPTO
;
4115 case OP_NOTMINQUERY
:
4116 c
= *ecode
++ - OP_NOTSTAR
;
4117 minimize
= (c
& 1) != 0;
4118 min
= rep_min
[c
]; /* Pick up values from tables; */
4119 max
= rep_max
[c
]; /* zero for max => infinity */
4120 if (max
== 0) max
= INT_MAX
;
4122 /* Common code for all repeated single-character matches. We can give
4123 up quickly if there are fewer than the minimum number of characters left in the subject. */
4127 if (min
> md
->end_subject
- eptr
) FAIL
;
4130 /* The code is duplicated for the caseless and caseful cases, for speed,
4131 since matching characters is likely to be quite common. First, ensure the
4132 minimum number of matches are present. If min = max, continue at the same
4133 level without recursing. Otherwise, if minimizing, keep trying the rest of
4134 the expression and advancing one matching character if failing, up to the
4135 maximum. Alternatively, if maximizing, find the maximum number of
4136 characters and work backwards. */
4138 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c
, min
, max
,
4144 for (i
= 1; i
<= min
; i
++) if (c
== pcre_lcc
[*eptr
++]) FAIL
;
4145 if (min
== max
) continue;
4150 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
4151 if (i
>= max
|| eptr
>= md
->end_subject
|| c
== pcre_lcc
[*eptr
++])
4154 /* Control never gets here */
4158 const uschar
*pp
= eptr
;
4159 for (i
= min
; i
< max
; i
++)
4161 if (eptr
>= md
->end_subject
|| c
== pcre_lcc
[*eptr
]) break;
4165 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
4168 /* Control never gets here */
4171 /* Caseful comparisons */
4175 for (i
= 1; i
<= min
; i
++) if (c
== *eptr
++) FAIL
;
4176 if (min
== max
) continue;
4181 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
4182 if (i
>= max
|| eptr
>= md
->end_subject
|| c
== *eptr
++) FAIL
;
4184 /* Control never gets here */
4188 const uschar
*pp
= eptr
;
4189 for (i
= min
; i
< max
; i
++)
4191 if (eptr
>= md
->end_subject
|| c
== *eptr
) break;
4195 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
4199 /* Control never gets here */
4201 /* Match a single character type repeatedly; several different opcodes
4202 share code. This is very similar to the code for single characters, but we
4203 repeat it in the interests of efficiency. */
4206 min
= max
= (ecode
[1] << 8) + ecode
[2];
4212 case OP_TYPEMINUPTO
:
4214 max
= (ecode
[1] << 8) + ecode
[2];
4215 minimize
= *ecode
== OP_TYPEMINUPTO
;
4220 case OP_TYPEMINSTAR
:
4222 case OP_TYPEMINPLUS
:
4224 case OP_TYPEMINQUERY
:
4225 c
= *ecode
++ - OP_TYPESTAR
;
4226 minimize
= (c
& 1) != 0;
4227 min
= rep_min
[c
]; /* Pick up values from tables; */
4228 max
= rep_max
[c
]; /* zero for max => infinity */
4229 if (max
== 0) max
= INT_MAX
;
4231 /* Common code for all repeated single character type matches */
4234 ctype
= *ecode
++; /* Code for the character type */
4236 /* First, ensure the minimum number of matches are present. Use inline
4237 code for maximizing the speed, and do the type test once at the start
4238 (i.e. keep it out of the loop). Also test that there are at least the
4239 minimum number of characters before we start. */
4241 if (min
> md
->end_subject
- eptr
) FAIL
;
4242 if (min
> 0) switch(ctype
)
4246 { for (i
= 1; i
<= min
; i
++) if (*eptr
++ == '\n') FAIL
; }
4251 for (i
= 1; i
<= min
; i
++)
4252 if ((pcre_ctypes
[*eptr
++] & ctype_digit
) != 0) FAIL
;
4256 for (i
= 1; i
<= min
; i
++)
4257 if ((pcre_ctypes
[*eptr
++] & ctype_digit
) == 0) FAIL
;
4260 case OP_NOT_WHITESPACE
:
4261 for (i
= 1; i
<= min
; i
++)
4262 if ((pcre_ctypes
[*eptr
++] & ctype_space
) != 0) FAIL
;
4266 for (i
= 1; i
<= min
; i
++)
4267 if ((pcre_ctypes
[*eptr
++] & ctype_space
) == 0) FAIL
;
4270 case OP_NOT_WORDCHAR
:
4271 for (i
= 1; i
<= min
; i
++) if ((pcre_ctypes
[*eptr
++] & ctype_word
) != 0)
4276 for (i
= 1; i
<= min
; i
++) if ((pcre_ctypes
[*eptr
++] & ctype_word
) == 0)
4280 case OP_NOT_WORDCHAR_L
:
4281 for (i
= 1; i
<= min
; i
++, eptr
++) if (*eptr
=='_' || isalnum(*eptr
))
4286 for (i
= 1; i
<= min
; i
++, eptr
++) if (*eptr
!='_' && !isalnum(*eptr
))
4291 /* If min = max, continue at the same level without recursing */
4293 if (min
== max
) continue;
4295 /* If minimizing, we have to test the rest of the pattern before each
4296 subsequent match, so inlining isn't much help; just use the function. */
4302 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
4303 if (i
>= max
|| eptr
>= md
->end_subject
||
4304 !match_type(ctype
, *eptr
++, md
->dotall
))
4307 /* Control never gets here */
4310 /* If maximizing it is worth using inline code for speed, doing the type
4311 test once at the start (i.e. keep it out of the loop). */
4315 const uschar
*pp
= eptr
;
4321 for (i
= min
; i
< max
; i
++)
4323 if (eptr
>= md
->end_subject
|| *eptr
== '\n') break;
4330 if (c
> md
->end_subject
- eptr
) c
= md
->end_subject
- eptr
;
4336 for (i
= min
; i
< max
; i
++)
4338 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_digit
) != 0)
4345 for (i
= min
; i
< max
; i
++)
4347 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_digit
) == 0)
4353 case OP_NOT_WHITESPACE
:
4354 for (i
= min
; i
< max
; i
++)
4356 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_space
) != 0)
4363 for (i
= min
; i
< max
; i
++)
4365 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_space
) == 0)
4371 case OP_NOT_WORDCHAR
:
4372 for (i
= min
; i
< max
; i
++)
4374 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_word
) != 0)
4381 for (i
= min
; i
< max
; i
++)
4383 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_word
) == 0)
4388 case OP_NOT_WORDCHAR_L
:
4389 for (i
= min
; i
< max
; i
++)
4391 if (eptr
>= md
->end_subject
|| (*eptr
=='_' || isalnum(*eptr
) ) )
4398 for (i
= min
; i
< max
; i
++)
4400 if (eptr
>= md
->end_subject
|| (*eptr
!='_' && !isalnum(*eptr
) ) )
4408 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
4411 /* Control never gets here */
4413 /* There's been some horrible disaster. */
4416 DPRINTF(("Unknown opcode %d\n", *ecode
));
4417 md
->errorcode
= PCRE_ERROR_UNKNOWN_NODE
;
4421 /* Do not stick any code in here without much thought; it is assumed
4422 that "continue" in the code above comes out to here to repeat the main
4425 } /* End of main loop */
4426 /* Control never reaches here */
4429 if (md
->point
> save_stack_position
)
4431 /* If there are still points remaining on the stack, pop the next one off */
4435 offset_top
= md
->offset_top
[md
->point
];
4436 eptr
= md
->eptr
[md
->point
];
4437 ecode
= md
->ecode
[md
->point
];
4438 off_num
= md
->off_num
[md
->point
];
4439 md
->offset_vector
[off_num
] = md
->r1
[md
->point
];
4440 md
->offset_vector
[off_num
+1] = md
->r2
[md
->point
];
4443 /* Failure, and nothing left on the stack, so end this function call */
4445 /* Restore the top of the stack to where it was before this function
4446 call. This lets us use one stack for everything; recursive calls
4447 can push and pop information, and may increase the stack. When
4448 the call returns, the parent function can resume pushing and
4449 popping wherever it was. */
4451 md
->point
= save_stack_position
;
4460 /*************************************************
4461 * Segregate setjmp() *
4462 *************************************************/
4464 /* The -Wall option of gcc gives warnings for all local variables when setjmp()
4465 is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
4466 hide it in a separate function. This is called only when PCRE_EXTRA is set,
4467 since it's needed only for the extension \X option, and with any luck, a good
4468 compiler will spot the tail recursion and compile it efficiently.
4471 eptr pointer in subject
4472 ecode position in code
4473 offset_top current top pointer
4474 md pointer to "static" info for the match
4476 Returns: TRUE if matched
4480 match_with_setjmp(const uschar
*eptr
, const uschar
*ecode
, int offset_top
,
4481 match_data
*match_block
)
4483 return setjmp(match_block
->fail_env
) == 0 &&
4484 match(eptr
, ecode
, offset_top
, match_block
);
4489 /*************************************************
4490 * Execute a Regular Expression *
4491 *************************************************/
4493 /* This function applies a compiled re to a subject string and picks out
4494 portions of the string if it matches. Two elements in the vector are set for
4495 each substring: the offsets to the start and end of the substring.
4498 external_re points to the compiled expression
4499 external_extra points to "hints" from pcre_study() or is NULL
4500 subject points to the subject string
4501 length length of subject string (may contain binary zeros)
4503 offsets points to a vector of ints to be filled in with offsets
4504 offsetcount the number of elements in the vector
4506 Returns: > 0 => success; value is the number of elements filled in
4507 = 0 => success, but offsets is not big enough
4508 -1 => failed to match
4509 < -1 => some kind of unexpected problem
4513 pcre_exec(const pcre
*external_re
, const pcre_extra
*external_extra
,
4514 const char *subject
, int length
, int start_pos
, int options
,
4515 int *offsets
, int offsetcount
)
4517 /* The "volatile" directives are to make gcc -Wall stop complaining
4518 that these variables can be clobbered by the longjmp. Hopefully
4519 they won't cost too much performance. */
4520 volatile int resetcount
, ocount
;
4521 volatile int first_char
= -1;
4522 const uschar
* volatile start_bits
= NULL
;
4523 const uschar
* volatile start_match
= (const uschar
*)subject
+ start_pos
;
4524 match_data match_block
;
4525 const uschar
*end_subject
;
4526 const real_pcre
*re
= (const real_pcre
*)external_re
;
4527 const real_pcre_extra
*extra
= (const real_pcre_extra
*)external_extra
;
4528 volatile BOOL using_temporary_offsets
= FALSE
;
4529 volatile BOOL anchored
= ((re
->options
| options
) & PCRE_ANCHORED
) != 0;
4530 volatile BOOL startline
= (re
->options
& PCRE_STARTLINE
) != 0;
4532 if ((options
& ~PUBLIC_EXEC_OPTIONS
) != 0) return PCRE_ERROR_BADOPTION
;
4534 if (re
== NULL
|| subject
== NULL
||
4535 (offsets
== NULL
&& offsetcount
> 0)) return PCRE_ERROR_NULL
;
4536 if (re
->magic_number
!= MAGIC_NUMBER
) return PCRE_ERROR_BADMAGIC
;
4538 match_block
.start_subject
= (const uschar
*)subject
;
4539 match_block
.end_subject
= match_block
.start_subject
+ length
;
4540 end_subject
= match_block
.end_subject
;
4542 match_block
.caseless
= ((re
->options
| options
) & PCRE_CASELESS
) != 0;
4543 match_block
.runtime_caseless
= match_block
.caseless
&&
4544 (re
->options
& PCRE_CASELESS
) == 0;
4546 match_block
.multiline
= ((re
->options
| options
) & PCRE_MULTILINE
) != 0;
4547 match_block
.dotall
= ((re
->options
| options
) & PCRE_DOTALL
) != 0;
4548 match_block
.endonly
= ((re
->options
| options
) & PCRE_DOLLAR_ENDONLY
) != 0;
4550 match_block
.notbol
= (options
& PCRE_NOTBOL
) != 0;
4551 match_block
.noteol
= (options
& PCRE_NOTEOL
) != 0;
4553 match_block
.errorcode
= PCRE_ERROR_NOMATCH
; /* Default error */
4555 /* Set the stack state to empty */
4556 match_block
.off_num
= match_block
.offset_top
= NULL
;
4557 match_block
.r1
= match_block
.r2
= NULL
;
4558 match_block
.eptr
= match_block
.ecode
= NULL
;
4559 match_block
.point
= match_block
.length
= 0;
4561 /* If the expression has got more back references than the offsets supplied can
4562 hold, we get a temporary bit of working store to use during the matching.
4563 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4566 ocount
= offsetcount
& (-2);
4567 if (re
->top_backref
> 0 && re
->top_backref
>= ocount
/2)
4569 ocount
= re
->top_backref
* 2 + 2;
4570 match_block
.offset_vector
= (int *)(pcre_malloc
)(ocount
* sizeof(int));
4571 if (match_block
.offset_vector
== NULL
) return PCRE_ERROR_NOMEMORY
;
4572 using_temporary_offsets
= TRUE
;
4573 DPRINTF(("Got memory to hold back references\n"));
4575 else match_block
.offset_vector
= offsets
;
4577 match_block
.offset_end
= ocount
;
4578 match_block
.offset_overflow
= FALSE
;
4580 /* Compute the minimum number of offsets that we need to reset each time. Doing
4581 this makes a huge difference to execution time when there aren't many brackets
4584 resetcount
= 2 + re
->top_bracket
* 2;
4585 if (resetcount
> offsetcount
) resetcount
= ocount
;
4587 /* If MULTILINE is set at exec time but was not set at compile time, and the
4588 anchored flag is set, we must re-check because a setting provoked by ^ in the
4589 pattern is not right in multi-line mode. Calling is_anchored() again here does
4590 the right check, because multiline is now set. If it now yields FALSE, the
4591 expression must have had ^ starting some of its branches. Check to see if
4592 that is true for *all* branches, and if so, set the startline flag. */
4594 if (match_block
.multiline
&& anchored
&& (re
->options
& PCRE_MULTILINE
) == 0 &&
4595 !is_anchored(re
->code
, match_block
.multiline
))
4598 if (is_startline(re
->code
)) startline
= TRUE
;
4601 /* Set up the first character to match, if available. The first_char value is
4602 never set for an anchored regular expression, but the anchoring may be forced
4603 at run time, so we have to test for anchoring. The first char may be unset for
4604 an unanchored pattern, of course. If there's no first char and the pattern was
4605 studied, the may be a bitmap of possible first characters. However, we can
4606 use this only if the caseless state of the studying was correct. */
4610 if ((re
->options
& PCRE_FIRSTSET
) != 0)
4612 first_char
= re
->first_char
;
4613 if (match_block
.caseless
) first_char
= pcre_lcc
[first_char
];
4616 if (!startline
&& extra
!= NULL
&&
4617 (extra
->options
& PCRE_STUDY_MAPPED
) != 0 &&
4618 ((extra
->options
& PCRE_STUDY_CASELESS
) != 0) == match_block
.caseless
)
4619 start_bits
= extra
->start_bits
;
4622 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4627 register int *iptr
= match_block
.offset_vector
;
4628 register int *iend
= iptr
+ resetcount
;
4630 /* Reset the maximum number of extractions we might see. */
4632 while (iptr
< iend
) *iptr
++ = -1;
4634 /* Advance to a unique first char if possible */
4636 if (first_char
>= 0)
4638 if (match_block
.caseless
)
4639 while (start_match
< end_subject
&& pcre_lcc
[*start_match
] != first_char
)
4642 while (start_match
< end_subject
&& *start_match
!= first_char
)
4646 /* Or to just after \n for a multiline match if possible */
4650 if (start_match
> match_block
.start_subject
)
4652 while (start_match
< end_subject
&& start_match
[-1] != '\n')
4657 /* Or to a non-unique first char */
4659 else if (start_bits
!= NULL
)
4661 while (start_match
< end_subject
)
4663 register int c
= *start_match
;
4664 if ((start_bits
[c
/8] & (1 << (c
&7))) == 0) start_match
++; else break;
4668 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4669 printf(">>>> Match against: ");
4670 pchars(start_match
, end_subject
- start_match
, TRUE
, &match_block
);
4674 /* When a match occurs, substrings will be set for all internal extractions;
4675 we just need to set up the whole thing as substring 0 before returning. If
4676 there were too many extractions, set the return code to zero. In the case
4677 where we had to get some local store to hold offsets for backreferences, copy
4678 those back references that we can. In this case there need not be overflow
4679 if certain parts of the pattern were not used.
4681 Before starting the match, we have to set up a longjmp() target to enable
4682 the "cut" operation to fail a match completely without backtracking. This
4683 is done in a separate function to avoid compiler warnings. We need not do
4684 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
4687 /* To handle errors such as running out of memory for the failure
4688 stack, we need to save this location via setjmp(), so
4689 error-handling code can call longjmp() to jump out of deeply-nested code. */
4690 if (setjmp(match_block
.error_env
)==0)
4693 if ((re
->options
& PCRE_EXTRA
) != 0)
4695 if (!match_with_setjmp(start_match
, re
->code
, 2, &match_block
))
4698 else if (!match(start_match
, re
->code
, 2, &match_block
)) continue;
4700 /* Copy the offset information from temporary store if necessary */
4702 if (using_temporary_offsets
)
4704 if (offsetcount
>= 4)
4706 memcpy(offsets
+ 2, match_block
.offset_vector
+ 2,
4707 (offsetcount
- 2) * sizeof(int));
4708 DPRINTF(("Copied offsets from temporary memory\n"));
4710 if (match_block
.end_offset_top
> offsetcount
)
4711 match_block
.offset_overflow
= TRUE
;
4713 DPRINTF(("Freeing temporary memory\n"));
4714 (pcre_free
)(match_block
.offset_vector
);
4717 rc
= match_block
.offset_overflow
? 0 : match_block
.end_offset_top
/2;
4719 if (match_block
.offset_end
< 2) rc
= 0; else
4721 offsets
[0] = start_match
- match_block
.start_subject
;
4722 offsets
[1] = match_block
.end_match_ptr
- match_block
.start_subject
;
4725 DPRINTF((">>>> returning %d\n", rc
));
4726 free_stack(&match_block
);
4728 } /* End of (if setjmp(match_block.error_env)...) */
4729 free_stack(&match_block
);
4731 /* Return an error code; pcremodule.c will preserve the exception */
4732 if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY
;
4735 match_block
.errorcode
== PCRE_ERROR_NOMATCH
&&
4736 start_match
++ < end_subject
);
4738 if (using_temporary_offsets
)
4740 DPRINTF(("Freeing temporary memory\n"));
4741 (pcre_free
)(match_block
.offset_vector
);
4745 printf(">>>> returning %d\n", match_block
.errorcode
);
4748 free_stack(&match_block
);
4749 return match_block
.errorcode
;