2 /*************************************************
3 * Perl-Compatible Regular Expressions *
4 *************************************************/
6 /* DO NOT EDIT THIS FILE! */
8 /* This file is automatically written by the merge-files.py script
9 included with the PCRE distribution for Python; it's produced from
10 several C files, and code is removed in the process. If you want to
11 modify the code or track down bugs, it will be much easier to work
12 with the code in its original, multiple-file form. Don't edit this
13 file by hand, or submit patches to it.
15 The Python-specific PCRE distribution can be retrieved from
16 http://starship.skyport.net/crew/amk/regex/
18 The unmodified original PCRE distribution is available at
19 ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
20 written by: Philip Hazel <ph10@cam.ac.uk>
22 Extensively modified by the Python String-SIG: <string-sig@python.org>
23 Send bug reports to: <string-sig@python.org>
24 (They'll figure out if it's a bug in PCRE or in the Python-specific
27 Copyright (c) 1997 University of Cambridge
29 -----------------------------------------------------------------------------
30 Permission is granted to anyone to use this software for any purpose on any
31 computer system, and to redistribute it freely, subject to the following
34 1. This software is distributed in the hope that it will be useful,
35 but WITHOUT ANY WARRANTY; without even the implied warranty of
36 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
38 2. The origin of this software must not be misrepresented, either by
39 explicit claim or by omission.
41 3. Altered versions must be plainly marked as such, and must not be
42 misrepresented as being the original software.
43 -----------------------------------------------------------------------------
53 /*************************************************
54 * Perl-Compatible Regular Expressions *
55 *************************************************/
57 /* This file is automatically written by the makechartables auxiliary
58 program. If you edit it by hand, you might like to edit the Makefile to
59 prevent its ever being regenerated. */
/* Lower-casing table: pcre_lcc[c] is c itself for every byte value except
the ASCII upper-case letters 'A'-'Z' (65-90), which map to 'a'-'z' (97-122).
The table has one entry for each of the 256 possible byte values. */

unsigned char pcre_lcc[] = {
    0,  1,  2,  3,  4,  5,  6,  7,   /*   0-  7 */
    8,  9, 10, 11, 12, 13, 14, 15,   /*   8- 15 */
   16, 17, 18, 19, 20, 21, 22, 23,   /*  16- 23 */
   24, 25, 26, 27, 28, 29, 30, 31,   /*  24- 31 */
   32, 33, 34, 35, 36, 37, 38, 39,   /*  32- 39 */
   40, 41, 42, 43, 44, 45, 46, 47,   /*  40- 47 */
   48, 49, 50, 51, 52, 53, 54, 55,   /*  48- 55 */
   56, 57, 58, 59, 60, 61, 62, 63,   /*  56- 63 */
   64, 97, 98, 99,100,101,102,103,   /*  @ A-G  -> lower case */
  104,105,106,107,108,109,110,111,   /*  H-O    -> lower case */
  112,113,114,115,116,117,118,119,   /*  P-W    -> lower case */
  120,121,122, 91, 92, 93, 94, 95,   /*  X-Z    -> lower case, then [ \ ] ^ _ */
   96, 97, 98, 99,100,101,102,103,   /*  96-103 */
  104,105,106,107,108,109,110,111,   /* 104-111 */
  112,113,114,115,116,117,118,119,   /* 112-119 */
  120,121,122,123,124,125,126,127,   /* 120-127 */
  128,129,130,131,132,133,134,135,   /* 128-135 */
  136,137,138,139,140,141,142,143,   /* 136-143 */
  144,145,146,147,148,149,150,151,   /* 144-151 */
  152,153,154,155,156,157,158,159,   /* 152-159 */
  160,161,162,163,164,165,166,167,   /* 160-167 */
  168,169,170,171,172,173,174,175,   /* 168-175 */
  176,177,178,179,180,181,182,183,   /* 176-183 */
  184,185,186,187,188,189,190,191,   /* 184-191 */
  192,193,194,195,196,197,198,199,   /* 192-199 */
  200,201,202,203,204,205,206,207,   /* 200-207 */
  208,209,210,211,212,213,214,215,   /* 208-215 */
  216,217,218,219,220,221,222,223,   /* 216-223 */
  224,225,226,227,228,229,230,231,   /* 224-231 */
  232,233,234,235,236,237,238,239,   /* 232-239 */
  240,241,242,243,244,245,246,247,   /* 240-247 */
  248,249,250,251,252,253,254,255 }; /* 248-255 */
/* Case-flipping table: pcre_fcc[c] swaps the case of ASCII letters
('A'-'Z' <-> 'a'-'z') and leaves every other byte value unchanged.
The table has one entry for each of the 256 possible byte values. */

unsigned char pcre_fcc[] = {
    0,  1,  2,  3,  4,  5,  6,  7,   /*   0-  7 */
    8,  9, 10, 11, 12, 13, 14, 15,   /*   8- 15 */
   16, 17, 18, 19, 20, 21, 22, 23,   /*  16- 23 */
   24, 25, 26, 27, 28, 29, 30, 31,   /*  24- 31 */
   32, 33, 34, 35, 36, 37, 38, 39,   /*  32- 39 */
   40, 41, 42, 43, 44, 45, 46, 47,   /*  40- 47 */
   48, 49, 50, 51, 52, 53, 54, 55,   /*  48- 55 */
   56, 57, 58, 59, 60, 61, 62, 63,   /*  56- 63 */
   64, 97, 98, 99,100,101,102,103,   /*  @ A-G  -> lower case */
  104,105,106,107,108,109,110,111,   /*  H-O    -> lower case */
  112,113,114,115,116,117,118,119,   /*  P-W    -> lower case */
  120,121,122, 91, 92, 93, 94, 95,   /*  X-Z    -> lower case, then [ \ ] ^ _ */
   96, 65, 66, 67, 68, 69, 70, 71,   /*  ` a-g  -> upper case */
   72, 73, 74, 75, 76, 77, 78, 79,   /*  h-o    -> upper case */
   80, 81, 82, 83, 84, 85, 86, 87,   /*  p-w    -> upper case */
   88, 89, 90,123,124,125,126,127,   /*  x-z    -> upper case, then { | } ~ DEL */
  128,129,130,131,132,133,134,135,   /* 128-135 */
  136,137,138,139,140,141,142,143,   /* 136-143 */
  144,145,146,147,148,149,150,151,   /* 144-151 */
  152,153,154,155,156,157,158,159,   /* 152-159 */
  160,161,162,163,164,165,166,167,   /* 160-167 */
  168,169,170,171,172,173,174,175,   /* 168-175 */
  176,177,178,179,180,181,182,183,   /* 176-183 */
  184,185,186,187,188,189,190,191,   /* 184-191 */
  192,193,194,195,196,197,198,199,   /* 192-199 */
  200,201,202,203,204,205,206,207,   /* 200-207 */
  208,209,210,211,212,213,214,215,   /* 208-215 */
  216,217,218,219,220,221,222,223,   /* 216-223 */
  224,225,226,227,228,229,230,231,   /* 224-231 */
  232,233,234,235,236,237,238,239,   /* 232-239 */
  240,241,242,243,244,245,246,247,   /* 240-247 */
  248,249,250,251,252,253,254,255 }; /* 248-255 */
/* This table contains four consecutive bit maps: digits, letters, 'word'
characters, and white space, in that order. Each map is 32 bytes long (one bit
per byte value 0-255) and the bits run from the least significant end of each
byte, so character c is bit (c & 7) of byte (c / 8) within its map. */

unsigned char pcre_cbits[] = {
  /* Digits: '0'-'9' */
  0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,

  /* Letters: 'A'-'Z' and 'a'-'z' */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,

  /* 'Word' characters: digits, letters and '_' */
  0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
  0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,

  /* White space: characters 9-13 and ' ' */
  0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
/* This table identifies various classes of character by individual bits.
As set in the entries below:
  0x01   white space character (9-13 and ' ')
  0x02   letter ('A'-'Z', 'a'-'z')
  0x04   decimal digit ('0'-'9')
  0x08   hexadecimal digit
  0x10   alphanumeric or '_'
  0x20   octal digit ('0'-'7')
  0x80   regular expression metacharacter or binary zero
There is one entry for each of the 256 possible byte values; all entries
from 128 upwards are zero. */

unsigned char pcre_ctypes[] = {
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* space - ' */
  0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /*  ( - /  */
  0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /*  8 - ?  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  @ - G  */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  H - O  */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  P - W  */
  0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /*  X - _  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  ` - g  */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  h - o  */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  p - w  */
  0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
201 /* End of chartables.c */
202 /*************************************************
203 * Perl-Compatible Regular Expressions *
204 *************************************************/
207 This is a library of functions to support regular expressions whose syntax
208 and semantics are as close as possible to those of the Perl 5 language. See
209 the file Tech.Notes for some information on the internals.
211 Written by: Philip Hazel <ph10@cam.ac.uk>
213 Copyright (c) 1998 University of Cambridge
215 -----------------------------------------------------------------------------
216 Permission is granted to anyone to use this software for any purpose on any
217 computer system, and to redistribute it freely, subject to the following
220 1. This software is distributed in the hope that it will be useful,
221 but WITHOUT ANY WARRANTY; without even the implied warranty of
222 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
224 2. The origin of this software must not be misrepresented, either by
225 explicit claim or by omission.
227 3. Altered versions must be plainly marked as such, and must not be
228 misrepresented as being the original software.
229 -----------------------------------------------------------------------------
233 /* Include the internals header, which itself includes Standard C headers plus
234 the external pcre header. */
239 /*************************************************
240 * Create bitmap of starting chars *
241 *************************************************/
243 /* This function scans a compiled unanchored expression and attempts to build a
244 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
245 goes by, we may be able to get more clever at doing this.
248 code points to an expression
249 start_bits points to a 32-byte table, initialized to 0
251 Returns: TRUE if table built, FALSE otherwise
255 set_start_bits(const uschar
*code
, uschar
*start_bits
)
262 const uschar
*tcode
= code
+ 3;
263 BOOL try_next
= TRUE
;
269 if ((int)*tcode
>= OP_BRA
|| *tcode
== OP_ASSERT
)
271 if (!set_start_bits(tcode
, start_bits
)) return FALSE
;
279 /* BRAZERO does the bracket, but carries on. */
283 if (!set_start_bits(++tcode
, start_bits
)) return FALSE
;
285 do tcode
+= (tcode
[1] << 8) + tcode
[2]; while (*tcode
== OP_ALT
);
290 /* Single-char * or ? sets the bit and tries the next item */
296 start_bits
[tcode
[1]/8] |= (1 << (tcode
[1]&7));
301 /* Single-char upto sets the bit and tries the next */
305 start_bits
[tcode
[3]/8] |= (1 << (tcode
[3]&7));
310 /* At least one single char sets the bit and stops */
312 case OP_EXACT
: /* Fall through */
315 case OP_CHARS
: /* Fall through */
320 start_bits
[tcode
[1]/8] |= (1 << (tcode
[1]&7));
323 /* Single character type sets the bits and stops */
326 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= ~pcre_cbits
[c
+cbit_digit
];
330 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= pcre_cbits
[c
+cbit_digit
];
333 case OP_NOT_WHITESPACE
:
334 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= ~pcre_cbits
[c
+cbit_space
];
338 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= pcre_cbits
[c
+cbit_space
];
341 case OP_NOT_WORDCHAR
:
342 for (c
= 0; c
< 32; c
++)
343 start_bits
[c
] |= ~(pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
347 for (c
= 0; c
< 32; c
++)
348 start_bits
[c
] |= (pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
351 /* One or more character type fudges the pointer and restarts, knowing
352 it will hit a single character type and stop there. */
365 /* Zero or more repeats of character types set the bits and then
370 tcode
+= 2; /* Fall through */
375 case OP_TYPEMINQUERY
:
379 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= ~pcre_cbits
[c
+cbit_digit
];
383 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= pcre_cbits
[c
+cbit_digit
];
386 case OP_NOT_WHITESPACE
:
387 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= ~pcre_cbits
[c
+cbit_space
];
391 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= pcre_cbits
[c
+cbit_space
];
394 case OP_NOT_WORDCHAR
:
395 for (c
= 0; c
< 32; c
++)
396 start_bits
[c
] |= ~(pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
400 for (c
= 0; c
< 32; c
++)
401 start_bits
[c
] |= (pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
409 /* Character class: set the bits and either carry on or not,
410 according to the repeat count. */
416 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= tcode
[c
];
430 if (((tcode
[1] << 8) + tcode
[2]) == 0)
438 break; /* End of class handling */
440 } /* End of switch */
441 } /* End of try_next loop */
443 code
+= (code
[1] << 8) + code
[2]; /* Advance to next branch */
445 while (*code
== OP_ALT
);
451 /*************************************************
452 * Study a compiled expression *
453 *************************************************/
455 /* This function is handed a compiled expression that it must study to produce
456 information that will speed up the matching. It returns a pcre_extra block
457 which then gets handed back to pcre_exec().
460 re points to the compiled expression
461 options contains option bits
462 errorptr points to where to place error messages;
463 set NULL unless error
465 Returns: pointer to a pcre_extra block,
466 NULL on error or if no optimization possible
470 pcre_study(const pcre
*external_re
, int options
, const char **errorptr
)
473 uschar start_bits
[32];
474 real_pcre_extra
*extra
;
475 const real_pcre
*re
= (const real_pcre
*)external_re
;
479 if (re
== NULL
|| re
->magic_number
!= MAGIC_NUMBER
)
481 *errorptr
= "argument is not a compiled regular expression";
485 if ((options
& ~PUBLIC_STUDY_OPTIONS
) != 0)
487 *errorptr
= "unknown or incorrect option bit(s) set";
491 /* Caseless can either be from the compiled regex or from options. */
493 caseless
= ((re
->options
| options
) & PCRE_CASELESS
) != 0;
495 /* For an anchored pattern, or an unanchored pattern that has a first char, or a
496 multiline pattern that matches only at "line starts", no further processing at
499 if ((re
->options
& (PCRE_ANCHORED
|PCRE_FIRSTSET
|PCRE_STARTLINE
)) != 0)
502 /* See if we can find a fixed set of initial characters for the pattern. */
504 memset(start_bits
, 0, 32 * sizeof(uschar
));
505 if (!set_start_bits(re
->code
, start_bits
)) return NULL
;
507 /* If this studying is caseless, scan the created bit map and duplicate the
508 bits for any letters. */
513 for (c
= 0; c
< 256; c
++)
515 if ((start_bits
[c
/8] & (1 << (c
&7))) != 0 &&
516 (pcre_ctypes
[c
] & ctype_letter
) != 0)
519 start_bits
[d
/8] |= (1 << (d
&7));
524 /* Get an "extra" block and put the information therein. */
526 extra
= (real_pcre_extra
*)(pcre_malloc
)(sizeof(real_pcre_extra
));
530 *errorptr
= "failed to get memory";
534 extra
->options
= PCRE_STUDY_MAPPED
| (caseless
? PCRE_STUDY_CASELESS
: 0);
535 memcpy(extra
->start_bits
, start_bits
, sizeof(start_bits
));
537 return (pcre_extra
*)extra
;
541 /*************************************************
542 * Perl-Compatible Regular Expressions *
543 *************************************************/
546 This is a library of functions to support regular expressions whose syntax
547 and semantics are as close as possible to those of the Perl 5 language. See
548 the file Tech.Notes for some information on the internals.
550 Written by: Philip Hazel <ph10@cam.ac.uk>
552 Copyright (c) 1998 University of Cambridge
554 -----------------------------------------------------------------------------
555 Permission is granted to anyone to use this software for any purpose on any
556 computer system, and to redistribute it freely, subject to the following
559 1. This software is distributed in the hope that it will be useful,
560 but WITHOUT ANY WARRANTY; without even the implied warranty of
561 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
563 2. The origin of this software must not be misrepresented, either by
564 explicit claim or by omission.
566 3. Altered versions must be plainly marked as such, and must not be
567 misrepresented as being the original software.
568 -----------------------------------------------------------------------------
572 /* Define DEBUG to get debugging output on stdout. */
576 /* Use a macro for debugging printing, 'cause that eliminates the use
577 of #ifdef inline, and there are *still* stupid compilers about that don't like
578 indented pre-processor statements. I suppose it's only been 10 years... */
582 #define DPRINTF(p) printf p
584 #define DPRINTF(p) /*nothing*/
587 /* Include the internals header, which itself includes Standard C headers plus
588 the external pcre header. */
593 #ifndef Py_eval_input
594 /* For Python 1.4, graminit.h has to be explicitly included */
595 #define Py_eval_input eval_input
597 #endif /* FOR_PYTHON */
599 /* Allow compilation as C++ source code, should anybody want to do that. */
602 #define class pcre_class
/* Min and max values for the common repeats; for the maxima, 0 => infinity.
Each table has six entries. NOTE(review): presumably indexed in the order
*, *?, +, +?, ?, ?? (the order these appear in OP_names) -- confirm against
the code that reads them. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
611 /* Text forms of OP_ values and things, for debugging (not all used) */
614 static const char *OP_names
[] = {
615 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
616 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
617 "localized \\B", "localized \\b", "localized \\W", "localized \\w",
618 "^", "$", "Any", "chars",
620 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
621 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
622 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
623 "*", "*?", "+", "+?", "?", "??", "{", "{",
624 "class", "negclass", "classL", "Ref",
625 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
626 "Brazero", "Braminzero", "Bra"
630 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
631 are simple data values; negative values are for special things like \d and so
632 on. Zero means further processing is needed (for things like \x), or the escape
635 static const short int escapes
[] = {
636 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
637 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
638 '@', -ESC_A
, -ESC_B
, 0, -ESC_D
, 0, 0, 0, /* @ - G */
639 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
640 0, 0, 0, -ESC_S
, 0, 0, 0, -ESC_W
, /* P - W */
641 0, 0, -ESC_Z
, '[', '\\', ']', '^', '_', /* X - _ */
642 '`', 7, -ESC_b
, 0, -ESC_d
, 0, '\f', 0, /* ` - g */
643 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
644 0, 0, '\r', -ESC_s
, '\t', 0, '\v', -ESC_w
, /* p - w */
648 /* Definition to allow mutual recursion */
651 compile_regex(int, int *, uschar
**, const uschar
**, const char **,
654 /* Structure for passing "static" information around between the functions
655 doing the matching, so that they are thread-safe. */
657 typedef struct match_data
{
658 int errorcode
; /* As it says */
659 int *offset_vector
; /* Offset vector */
660 int offset_end
; /* One past the end */
661 BOOL offset_overflow
; /* Set if too many extractions */
662 BOOL caseless
; /* Case-independent flag */
663 BOOL runtime_caseless
; /* Caseless forced at run time */
664 BOOL multiline
; /* Multiline flag */
665 BOOL notbol
; /* NOTBOL flag */
666 BOOL noteol
; /* NOTEOL flag */
667 BOOL dotall
; /* Dot matches any char */
668 BOOL endonly
; /* Dollar not before final \n */
669 const uschar
*start_subject
; /* Start of the subject string */
670 const uschar
*end_subject
; /* End of the subject string */
671 jmp_buf fail_env
; /* Environment for longjump() break out */
672 const uschar
*end_match_ptr
; /* Subject position at end match */
673 int end_offset_top
; /* Highwater mark at end of match */
674 jmp_buf error_env
; /* For longjmp() if an error occurs deep inside a
675 matching operation */
676 int length
; /* Length of the allocated stacks */
677 int point
; /* Point to add next item pushed onto stacks */
678 /* Pointers to the 6 stacks */
679 int *off_num
, *offset_top
, *r1
, *r2
;
680 const uschar
**eptr
, **ecode
;
685 /*************************************************
687 *************************************************/
/* PCRE is thread-clean and doesn't use any global variables in the normal
sense. However, it calls memory allocation and free functions via the two
indirections below, which can be changed by the caller, but are shared
between all threads. They default to the C library's malloc() and free(). */

void *(*pcre_malloc)(size_t) = malloc;
void  (*pcre_free)(void *) = free;
700 /*************************************************
701 * Return version string *
702 *************************************************/
713 /*************************************************
714 * Return info about a compiled pattern *
715 *************************************************/
717 /* This function picks potentially useful data out of the private
721 external_re points to compiled code
722 optptr where to pass back the options
723 first_char where to pass back the first character,
724 or -1 if multiline and all branches start ^,
727 Returns: number of identifying extraction brackets
728 or negative values on error
732 pcre_info(const pcre
*external_re
, int *optptr
, int *first_char
)
734 const real_pcre
*re
= (real_pcre
*)external_re
;
735 if (re
== NULL
) return PCRE_ERROR_NULL
;
736 if (re
->magic_number
!= MAGIC_NUMBER
) return PCRE_ERROR_BADMAGIC
;
737 if (optptr
!= NULL
) *optptr
= (re
->options
& PUBLIC_OPTIONS
);
738 if (first_char
!= NULL
)
739 *first_char
= ((re
->options
& PCRE_FIRSTSET
) != 0)? re
->first_char
:
740 ((re
->options
& PCRE_STARTLINE
) != 0)? -1 : -2;
741 return re
->top_bracket
;
748 /*************************************************
749 * Debugging function to print chars *
750 *************************************************/
752 /* Print a sequence of chars in printable format, stopping at the end of the
753 subject if requested.
756 p points to characters
757 length number to print
758 is_subject TRUE if printing from within md->start_subject
759 md pointer to matching data block, if is_subject is TRUE
765 pchars(const uschar
*p
, int length
, BOOL is_subject
, match_data
*md
)
768 if (is_subject
&& length
> md
->end_subject
- p
) length
= md
->end_subject
- p
;
770 if (isprint(c
= *(p
++))) printf("%c", c
); else printf("\\x%02x", c
);
777 /*************************************************
778 * Check subpattern for empty operand *
779 *************************************************/
781 /* This function checks a bracketed subpattern to see if any of the paths
782 through it could match an empty string. This is used to diagnose an error if
783 such a subpattern is followed by a quantifier with an unlimited upper bound.
786 code points to the opening bracket
788 Returns: TRUE or FALSE
792 could_be_empty(uschar
*code
)
795 uschar
*cc
= code
+ 3;
797 /* Scan along the opcodes for this branch; as soon as we find something
798 that matches a non-empty string, break out and advance to test the next
799 branch. If we get to the end of the branch, return TRUE for the whole
804 /* Test an embedded subpattern; if it could not be empty, break the
805 loop. Otherwise carry on in the branch. */
807 if ((int)(*cc
) >= OP_BRA
|| (int)(*cc
) == OP_ONCE
)
809 if (!could_be_empty(cc
)) break;
810 do cc
+= (cc
[1] << 8) + cc
[2]; while (*cc
== OP_ALT
);
816 /* Reached end of a branch: the subpattern may match the empty string */
824 /* Skip over entire bracket groups with zero lower bound */
831 /* Skip over assertive subpatterns */
835 do cc
+= (cc
[1] << 8) + cc
[2]; while (*cc
== OP_ALT
);
839 /* Skip over things that don't match chars */
845 case OP_NOT_WORD_BOUNDARY
:
846 case OP_WORD_BOUNDARY
:
847 case OP_NOT_WORD_BOUNDARY_L
:
848 case OP_WORD_BOUNDARY_L
:
852 /* Skip over simple repeats with zero lower bound */
865 case OP_TYPEMINQUERY
:
869 /* Skip over UPTOs (lower bound is zero) */
878 /* Check a class or a back reference for a zero minimum */
886 case (OP_REF
): cc
+= 2; break;
887 case (OP_CLASS
): case (OP_NEGCLASS
): cc
+= 1+32; break;
888 case (OP_CLASS_L
): cc
+= 1+1+32; break;
902 if ((cc
[1] << 8) + cc
[2] != 0) goto NEXT_BRANCH
;
911 /* Anything else matches at least one character */
919 code
+= (code
[1] << 8) + code
[2];
921 while (*code
== OP_ALT
);
923 /* No branches match the empty string */
928 /* Determine the length of a group ID in an expression like
931 ptr pattern position pointer (say that 3 times fast)
932 finalchar the character that will mark the end of the ID
933 errorptr points to the pointer to the error message
937 get_group_id(const uschar
*ptr
, char finalchar
, const char **errorptr
)
939 const uschar
*start
= ptr
;
941 /* If the first character is not in \w, or is in \w but is a digit,
943 if (!(pcre_ctypes
[*ptr
] & ctype_word
) ||
944 (pcre_ctypes
[*ptr
++] & ctype_digit
))
946 *errorptr
= "(?P identifier must start with a letter or underscore";
950 /* Increment ptr until we either hit a null byte, the desired
951 final character, or a non-word character */
952 for(; (*ptr
!= 0) && (*ptr
!= finalchar
) &&
953 (pcre_ctypes
[*ptr
] & ctype_word
); ptr
++)
955 /* Empty loop body */
961 *errorptr
= "unterminated (?P identifier";
964 *errorptr
= "illegal character in (?P identifier";
968 /*************************************************
970 *************************************************/
972 /* This function is called when a \ has been encountered. It either returns a
973 positive value for a simple escape such as \n, or a negative value which
974 encodes one of the more complicated things such as \d. On entry, ptr is
975 pointing at the \. On exit, it is on the final character of the escape
979 ptrptr points to the pattern position pointer
980 errorptr points to the pointer to the error message
981 bracount number of previous extracting brackets
982 options the options bits
983 isclass TRUE if inside a character class
985 Returns: zero or positive => a data character
986 negative => a special escape sequence
987 on error, errorptr is set
991 check_escape(const uschar
**ptrptr
, const char **errorptr
, int bracount
,
992 int options
, BOOL isclass
)
994 const uschar
*ptr
= *ptrptr
;
995 int c
= *(++ptr
) & 255; /* Ensure > 0 on signed-char systems */
998 if (c
== 0) *errorptr
= ERR1
;
1000 /* Digits or letters may have special meaning; all others are literals. */
1002 else if (c
< '0' || c
> 'z') {}
1004 /* Do an initial lookup in a table. A non-zero result is something that can be
1005 returned immediately. Otherwise further processing may be required. */
1007 else if ((i
= escapes
[c
- '0']) != 0) c
= i
;
1009 /* Escapes that need further processing, or are illegal. */
1016 /* The handling of escape sequences consisting of a string of digits
1017 starting with one that is not zero is not straightforward. By experiment,
1018 the way Perl works seems to be as follows:
1020 Outside a character class, the digits are read as a decimal number. If the
1021 number is less than 10, or if there are that many previous extracting
1022 left brackets, then it is a back reference. Otherwise, up to three octal
1023 digits are read to form an escaped byte. Thus \123 is likely to be octal
1024 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
1025 value is greater than 377, the least significant 8 bits are taken. Inside a
1026 character class, \ followed by a digit is always an octal number. */
1028 case '1': case '2': case '3': case '4': case '5':
1029 case '6': case '7': case '8': case '9':
1032 /* PYTHON: Try to compute an octal value for a character */
1033 for(c
=0, i
=0; ptr
[i
]!=0 && i
<3; i
++)
1035 if (( pcre_ctypes
[ ptr
[i
] ] & ctype_odigit
) != 0)
1036 c
= (c
* 8 + ptr
[i
]-'0') & 255;
1038 break; /* Non-octal character--break out of the loop */
1040 /* It's a character if there were exactly 3 octal digits, or if
1041 we're inside a character class and there was at least one
1043 if ( (i
== 3) || (isclass
&& i
!=0) )
1048 c
= ptr
[0]; /* Restore the first character after the \ */
1050 while (i
<2 && (pcre_ctypes
[ptr
[1]] & ctype_digit
) != 0)
1052 c
= c
* 10 + ptr
[1] - '0';
1055 if (c
> 255 - ESC_REF
) *errorptr
= "back reference too big";
1060 /* \0 always starts an octal number, but we may drop through to here with a
1061 larger first octal digit */
1065 while(i
++ < 2 && (pcre_ctypes
[ptr
[1]] & ctype_digit
) != 0 &&
1066 ptr
[1] != '8' && ptr
[1] != '9')
1067 c
= (c
* 8 + *(++ptr
) - '0') & 255;
1070 /* Special escapes not starting with a digit are straightforward */
1074 while ( (pcre_ctypes
[ptr
[1]] & ctype_xdigit
) != 0)
1077 c
= c
* 16 + pcre_lcc
[*ptr
] -
1078 (((pcre_ctypes
[*ptr
] & ctype_digit
) != 0)? '0' : 'W');
1084 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1085 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1086 for Perl compatibility, it is a literal. */
1089 if ((options
& PCRE_EXTRA
) != 0) switch(c
)
1092 c
= -ESC_X
; /* This could be a lookup if it ever got into Perl */
1109 /*************************************************
1110 * Check for counted repeat *
1111 *************************************************/
1113 /* This function is called when a '{' is encountered in a place where it might
1114 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1115 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1116 where the ddds are digits.
1119 p pointer to the first char after '{'
1121 Returns: TRUE or FALSE
1125 is_counted_repeat(const uschar
*p
)
1127 if ((pcre_ctypes
[*p
++] & ctype_digit
) == 0) return FALSE
;
1128 while ((pcre_ctypes
[*p
] & ctype_digit
) != 0) p
++;
1129 if (*p
== '}') return TRUE
;
1131 if (*p
++ != ',') return FALSE
;
1132 if (*p
== '}') return TRUE
;
1134 if ((pcre_ctypes
[*p
++] & ctype_digit
) == 0) return FALSE
;
1135 while ((pcre_ctypes
[*p
] & ctype_digit
) != 0) p
++;
1141 /*************************************************
1142 * Read repeat counts *
1143 *************************************************/
1145 /* Read an item of the form {n,m} and return the values. This is called only
1146 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1147 so the syntax is guaranteed to be correct, but we need to check the values.
1150 p pointer to first char after '{'
1151 minp pointer to int for min
1152 maxp pointer to int for max
1153 returned as -1 if no max
1154 errorptr points to pointer to error message
1156 Returns: pointer to '}' on success;
1157 current ptr on error, with errorptr set
1160 static const uschar
*
1161 read_repeat_counts(const uschar
*p
, int *minp
, int *maxp
, const char **errorptr
)
1166 while ((pcre_ctypes
[*p
] & ctype_digit
) != 0) min
= min
* 10 + *p
++ - '0';
1168 if (*p
== '}') max
= min
; else
1173 while((pcre_ctypes
[*p
] & ctype_digit
) != 0) max
= max
* 10 + *p
++ - '0';
1182 /* Do paranoid checks, then fill in the required variables, and pass back the
1183 pointer to the terminating '}'. */
1185 if (min
> 65535 || max
> 65535)
1197 /*************************************************
1198 * Compile one branch *
1199 *************************************************/
1201 /* Scan the pattern, compiling it into the code vector.
1204 options the option bits
1205 bracket points to number of brackets used
1206 code points to the pointer to the current code point
1207 ptrptr points to the current pattern pointer
1208 errorptr points to pointer to error message
1210 Returns: TRUE on success
1211 FALSE, with *errorptr set on error
1215 compile_branch(int options
, int *brackets
, uschar
**codeptr
,
1216 const uschar
**ptrptr
, const char **errorptr
, PyObject
*dictionary
)
1218 int repeat_type
, op_type
;
1219 int repeat_min
, repeat_max
;
1220 int bravalue
, length
;
1221 int greedy_default
, greedy_non_default
;
1223 register uschar
*code
= *codeptr
;
1224 const uschar
*ptr
= *ptrptr
;
1225 const uschar
*oldptr
;
1226 uschar
*previous
= NULL
;
1228 uschar
*class_flag
; /* Pointer to the single-byte flag for OP_CLASS_L */
1230 /* Set up the default and non-default settings for greediness */
1232 greedy_default
= ((options
& PCRE_UNGREEDY
) != 0);
1233 greedy_non_default
= greedy_default
^ 1;
1235 /* Switch on next character until the end of the branch */
1240 int class_charcount
;
1244 if ((options
& PCRE_EXTENDED
) != 0)
1246 if ((pcre_ctypes
[c
] & ctype_space
) != 0) continue;
1249 while ((c
= *(++ptr
)) != 0 && c
!= '\n');
1256 /* The branch terminates at end of string, |, or ). */
1265 /* Handle single-character metacharacters */
1282 /* Character classes. These always build a 32-byte bitmap of the permitted
1283 characters, except in the special case where there is only one character.
1284 For negated classes, we build the map as usual, then invert it at the end.
1289 if (options
& PCRE_LOCALE
)
1291 *code
++ = OP_CLASS_L
;
1292 /* Set the flag for localized classes (like \w) to 0 */
1302 /* If the first character is '^', set the negation flag, and use a
1303 different opcode. This only matters if caseless matching is specified at
1306 if ((c
= *(++ptr
)) == '^')
1308 negate_class
= TRUE
;
1309 if (*(code
-1)==OP_CLASS
) *(code
-1) = OP_NEGCLASS
;
1312 else negate_class
= FALSE
;
1314 /* Keep a count of chars so that we can optimize the case of just a single
1317 class_charcount
= 0;
1318 class_lastchar
= -1;
1320 /* Initialize the 32-char bit map to all zeros. We have to build the
1321 map in a temporary bit of store, in case the class contains only 1
1322 character, because in that case the compiled code doesn't use the
1325 memset(class, 0, 32 * sizeof(uschar
));
1327 /* Process characters until ] is reached. By writing this as a "do" it
1328 means that an initial ] is taken as a data character. */
1338 /* Backslash may introduce a single character, or it may introduce one
1339 of the specials, which just set a flag. Escaped items are checked for
1340 validity in the pre-compiling pass. The sequence \b is a special case.
1341 Inside a class (and only there) it is treated as backspace. Elsewhere
1342 it marks a word boundary. Other escapes have preset maps ready to
1343 or into the one we are building. We assume they have more than one
1344 character in them, so set class_charcount bigger than one. */
1348 c
= check_escape(&ptr
, errorptr
, *brackets
, options
, TRUE
);
1349 if (-c
== ESC_b
) c
= '\b';
1352 class_charcount
= 10;
1357 for (c
= 0; c
< 32; c
++) class[c
] |= pcre_cbits
[c
+cbit_digit
];
1363 for (c
= 0; c
< 32; c
++) class[c
] |= ~pcre_cbits
[c
+cbit_digit
];
1368 if (options
& PCRE_LOCALE
)
1374 for (c
= 0; c
< 32; c
++)
1375 class[c
] |= (pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
1380 if (options
& PCRE_LOCALE
)
1386 for (c
= 0; c
< 32; c
++)
1387 class[c
] |= ~(pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
1393 for (c
= 0; c
< 32; c
++) class[c
] |= pcre_cbits
[c
+cbit_space
];
1399 for (c
= 0; c
< 32; c
++) class[c
] |= ~pcre_cbits
[c
+cbit_space
];
1408 /* Fall through if single character */
1411 /* A single character may be followed by '-' to form a range. However,
1412 Perl does not permit ']' to be the end of the range. A '-' character
1413 here is treated as a literal. */
1415 if (ptr
[1] == '-' && ptr
[2] != ']')
1427 /* The second part of a range can be a single-character escape, but
1428 not any of the other escapes. */
1432 d
= check_escape(&ptr
, errorptr
, *brackets
, options
, TRUE
);
1435 if (d
== -ESC_b
) d
= '\b'; else
1451 class[c
/8] |= (1 << (c
&7));
1452 if ((options
& PCRE_CASELESS
) != 0)
1454 int uc
= pcre_fcc
[c
]; /* flip case */
1455 class[uc
/8] |= (1 << (uc
&7));
1457 class_charcount
++; /* in case a one-char range */
1460 continue; /* Go get the next char in the class */
1463 /* Handle a lone single character - we can get here for a normal
1464 non-escape char, or after \ that introduces a single character. */
1466 class [c
/8] |= (1 << (c
&7));
1467 if ((options
& PCRE_CASELESS
) != 0)
1469 c
= pcre_fcc
[c
]; /* flip case */
1470 class[c
/8] |= (1 << (c
&7));
1476 /* Loop until ']' reached; the check for end of string happens inside the
1477 loop. This "while" is the end of the "do" above. */
1479 while ((c
= *(++ptr
)) != ']');
1481 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1482 precisely one character. This doesn't need the whole 32-byte bit map.
1483 We turn it into a 1-character OP_CHARS if it's positive, or OP_NOT if
1486 if (class_charcount
== 1 && class_lastchar
>= 0)
1494 code
[-1] = OP_CHARS
;
1497 *code
++ = class_lastchar
;
1500 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1505 /* If this is a localized opcode, bump the code pointer up */
1506 if (class_flag
) code
++;
1509 if (class_flag
) *class_flag
= (*class_flag
) ^ 63;
1510 for (c
= 0; c
< 32; c
++) code
[c
] = ~class[c
];
1513 memcpy(code
, class, 32);
1518 /* Various kinds of repeat */
1521 if (!is_counted_repeat(ptr
+1)) goto NORMAL_CHAR
;
1522 ptr
= read_repeat_counts(ptr
+1, &repeat_min
, &repeat_max
, errorptr
);
1523 if (*errorptr
!= NULL
) goto FAILED
;
1541 if (previous
== NULL
)
1547 /* If the next character is '?' this is a minimizing repeat, by default,
1548 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1552 { repeat_type
= greedy_non_default
; ptr
++; }
1553 else repeat_type
= greedy_default
;
1555 /* If the maximum is zero then the minimum must also be zero; Perl allows
1556 this case, so we do too - by simply omitting the item altogether. */
1558 if (repeat_max
== 0) code
= previous
;
1560 /* If previous was a string of characters, chop off the last one and use it
1561 as the subject of the repeat. If there was only one character, we can
1562 abolish the previous item altogether. */
1564 else if (*previous
== OP_CHARS
)
1566 int len
= previous
[1];
1574 c
= previous
[len
+1];
1578 op_type
= 0; /* Use single-char op codes */
1579 goto OUTPUT_SINGLE_REPEAT
; /* Code shared with single character types */
1582 /* If previous was a single negated character ([^a] or similar), we use
1583 one of the special opcodes, replacing it. The code is shared with single-
1584 character repeats by adding a suitable offset into repeat_type. */
1586 else if ((int)*previous
== OP_NOT
)
1588 op_type
= OP_NOTSTAR
- OP_STAR
; /* Use "not" opcodes */
1591 goto OUTPUT_SINGLE_REPEAT
;
1594 /* If previous was a character type match (\d or similar), abolish it and
1595 create a suitable repeat item. The code is shared with single-character
1596 repeats by adding a suitable offset into repeat_type. */
1598 else if ((int)*previous
< OP_CIRC
|| *previous
== OP_ANY
)
1600 op_type
= OP_TYPESTAR
- OP_STAR
; /* Use type opcodes */
1604 OUTPUT_SINGLE_REPEAT
:
1605 repeat_type
+= op_type
; /* Combine both values for many cases */
1607 /* A minimum of zero is handled either as the special case * or ?, or as
1608 an UPTO, with the maximum given. */
1610 if (repeat_min
== 0)
1612 if (repeat_max
== -1) *code
++ = OP_STAR
+ repeat_type
;
1613 else if (repeat_max
== 1) *code
++ = OP_QUERY
+ repeat_type
;
1616 *code
++ = OP_UPTO
+ repeat_type
;
1617 *code
++ = repeat_max
>> 8;
1618 *code
++ = (repeat_max
& 255);
1622 /* The case {1,} is handled as the special case + */
1624 else if (repeat_min
== 1 && repeat_max
== -1)
1625 *code
++ = OP_PLUS
+ repeat_type
;
1627 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1628 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1632 if (repeat_min
!= 1)
1634 *code
++ = OP_EXACT
+ op_type
; /* NB EXACT doesn't have repeat_type */
1635 *code
++ = repeat_min
>> 8;
1636 *code
++ = (repeat_min
& 255);
1639 /* If the minimum is 1 and the previous item was a character string,
1640 we either have to put back the item that got canceled if the string
1641 length was 1, or add the character back onto the end of a longer
1642 string. For a character type nothing need be done; it will just get
1643 put back naturally. Note that the final character is always going to
1646 else if (*previous
== OP_CHARS
)
1648 if (code
== previous
) code
+= 2; else previous
[1]++;
1651 /* For a single negated character we also have to put back the
1652 item that got canceled. */
1654 else if (*previous
== OP_NOT
) code
++;
1656 /* If the maximum is unlimited, insert an OP_STAR. */
1661 *code
++ = OP_STAR
+ repeat_type
;
1664 /* Else insert an UPTO if the max is greater than the min. */
1666 else if (repeat_max
!= repeat_min
)
1669 repeat_max
-= repeat_min
;
1670 *code
++ = OP_UPTO
+ repeat_type
;
1671 *code
++ = repeat_max
>> 8;
1672 *code
++ = (repeat_max
& 255);
1676 /* The character or character type itself comes last in all cases. */
1681 /* If previous was a character class or a back reference, we put the repeat
1684 else if (*previous
== OP_CLASS
|| *previous
== OP_NEGCLASS
||
1685 *previous
==OP_CLASS_L
|| *previous
== OP_REF
)
1687 if (repeat_min
== 0 && repeat_max
== -1)
1688 *code
++ = OP_CRSTAR
+ repeat_type
;
1689 else if (repeat_min
== 1 && repeat_max
== -1)
1690 *code
++ = OP_CRPLUS
+ repeat_type
;
1691 else if (repeat_min
== 0 && repeat_max
== 1)
1692 *code
++ = OP_CRQUERY
+ repeat_type
;
1695 *code
++ = OP_CRRANGE
+ repeat_type
;
1696 *code
++ = repeat_min
>> 8;
1697 *code
++ = repeat_min
& 255;
1698 if (repeat_max
== -1) repeat_max
= 0; /* 2-byte encoding for max */
1699 *code
++ = repeat_max
>> 8;
1700 *code
++ = repeat_max
& 255;
1704 /* If previous was a bracket group, we may have to replicate it in certain
1705 cases. If the maximum repeat count is unlimited, check that the bracket
1706 group cannot match the empty string, and diagnose an error if it can. */
1708 else if ((int)*previous
>= OP_BRA
)
1711 int len
= code
- previous
;
1713 if (repeat_max
== -1 && could_be_empty(previous
))
1719 /* If the minimum is greater than zero, and the maximum is unlimited or
1720 equal to the minimum, the first copy remains where it is, and is
1721 replicated up to the minimum number of times. This case includes the +
1722 repeat, but of course no replication is needed in that case. */
1724 if (repeat_min
> 0 && (repeat_max
== -1 || repeat_max
== repeat_min
))
1726 for (i
= 1; i
< repeat_min
; i
++)
1728 memcpy(code
, previous
, len
);
1733 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1734 Then, if there is a fixed upper limit, replicate up to that many times,
1735 sticking BRAZERO in front of all the optional ones. */
1739 if (repeat_min
== 0)
1741 memmove(previous
+1, previous
, len
);
1743 *previous
++ = OP_BRAZERO
+ repeat_type
;
1746 for (i
= 1; i
< repeat_min
; i
++)
1748 memcpy(code
, previous
, len
);
1752 for (i
= (repeat_min
> 0)? repeat_min
: 1; i
< repeat_max
; i
++)
1754 *code
++ = OP_BRAZERO
+ repeat_type
;
1755 memcpy(code
, previous
, len
);
1760 /* If the maximum is unlimited, set a repeater in the final copy. */
1762 if (repeat_max
== -1) code
[-3] = OP_KETRMAX
+ repeat_type
;
1765 /* Else there's some kind of shambles */
1773 /* In all cases we no longer have a previous item. */
1779 /* Start of nested bracket sub-expression, or comment or lookahead.
1780 First deal with special things that can come after a bracket; all are
1781 introduced by ?, and the appearance of any of them means that this is not a
1782 referencing group. They were checked for validity in the first pass over
1783 the string, so we don't have to check for syntax errors here. */
1786 previous
= code
; /* Only real brackets can be repeated */
1787 if (*(++ptr
) == '?')
1800 while (*ptr
!= ')') ptr
++;
1804 case ':': /* Non-extracting bracket */
1808 case '=': /* Assertions can't be repeated */
1809 bravalue
= OP_ASSERT
;
1815 bravalue
= OP_ASSERT_NOT
;
1824 /* (?P<groupname>...) */
1826 PyObject
*string
, *intobj
;
1829 idlen
= get_group_id(ptr
, '>', errorptr
);
1833 string
= PyString_FromStringAndSize((char*)ptr
, idlen
);
1834 intobj
= PyInt_FromLong( brackets
[0] + 1 );
1835 if (intobj
== NULL
|| string
== NULL
)
1839 *errorptr
= "exception raised";
1842 PyDict_SetItem(dictionary
, string
, intobj
);
1843 Py_DECREF(string
); Py_DECREF(intobj
); /* XXX DECREF commented out! */
1844 ptr
+= idlen
+1; /* Point to rest of expression */
1845 goto do_grouping_bracket
;
1849 /* (?P=groupname) */
1851 PyObject
*string
, *intobj
;
1854 idlen
= get_group_id(ptr
, ')', errorptr
);
1858 string
= PyString_FromStringAndSize((char *)ptr
, idlen
);
1860 *errorptr
= "exception raised";
1863 intobj
= PyDict_GetItem(dictionary
, string
);
1866 *errorptr
= "?P= group identifier isn't defined";
1870 refnum
= PyInt_AsLong(intobj
);
1872 /* The caller doesn't own the reference to the value
1873 returned from PyDict_GetItem, so intobj is not
1878 /* The continue will cause the top-level for() loop to
1879 be resumed, so ptr will be immediately incremented.
1880 Therefore, the following line adds just idlen, not
1885 /* The character after ?P is neither < nor =, so
1886 report an error. Add more Python-extensions here. */
1887 *errorptr
="unknown after (?P";
1890 case '>': /* "Match once" brackets */
1891 if ((options
& PCRE_EXTRA
) != 0) /* Not yet standard */
1898 /* Else fall through */
1906 /* Else we have a referencing group */
1910 do_grouping_bracket
:
1911 if (++(*brackets
) > EXTRACT_MAX
)
1916 bravalue
= OP_BRA
+ *brackets
;
1919 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1920 code into a non-register variable in order to be able to pass its address
1921 because some compilers complain otherwise. */
1925 uschar
*mcode
= code
;
1926 if (!compile_regex(options
, brackets
, &mcode
, &ptr
, errorptr
, dictionary
))
1938 /* Check \ for being a real metacharacter; if not, fall through and handle
1939 it as a data character at the start of a string. Escape items are checked
1940 for validity in the pre-compiling pass. */
1944 c
= check_escape(&ptr
, errorptr
, *brackets
, options
, FALSE
);
1946 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1947 are arranged to be the negation of the corresponding OP_values. For the
1948 back references, the values are ESC_REF plus the reference number. Only
1949 back references and those types that consume a character may be repeated.
1950 We can test for values between ESC_b and ESC_X for the latter; this may
1951 have to change if any new ones are ever created. */
1957 int refnum
= -c
- ESC_REF
;
1958 if (*brackets
< refnum
)
1969 previous
= (-c
> ESC_b
&& -c
< ESC_X
)? code
: NULL
;
1970 if ( (options
& PCRE_LOCALE
) != 0)
1974 case (-ESC_b
): c
= -OP_WORD_BOUNDARY_L
; break;
1975 case (-ESC_B
): c
= -OP_NOT_WORD_BOUNDARY_L
; break;
1976 case (-ESC_w
): c
= -OP_WORDCHAR_L
; break;
1977 case (-ESC_W
): c
= -OP_NOT_WORDCHAR_L
; break;
1985 /* Data character: Reset and fall through */
1990 /* Handle a run of data characters until a metacharacter is encountered.
1991 The first character is guaranteed not to be whitespace or # when the
1992 extended flag is set. */
2003 if ((options
& PCRE_EXTENDED
) != 0)
2005 if ((pcre_ctypes
[c
] & ctype_space
) != 0) continue;
2008 while ((c
= *(++ptr
)) != 0 && c
!= '\n');
2014 /* Backslash may introduce a data char or a metacharacter. Escaped items
2015 are checked for validity in the pre-compiling pass. Stop the string
2016 before a metaitem. */
2021 c
= check_escape(&ptr
, errorptr
, *brackets
, options
, FALSE
);
2022 if (c
< 0) { ptr
= oldptr
; break; }
2025 /* Ordinary character or single-char escape */
2031 /* This "while" is the end of the "do" above. */
2033 while (length
< 255 && (pcre_ctypes
[c
= *(++ptr
)] & ctype_meta
) == 0);
2035 /* Compute the length and set it in the data vector, and advance to
2038 previous
[1] = length
;
2039 if (length
< 255) ptr
--;
2042 } /* end of big loop */
2044 /* Control never reaches here by falling through, only by a goto for all the
2045 error states. Pass back the position in the pattern so that it can be displayed
2046 to the user for diagnosing the error. */
2056 /*************************************************
2057 * Compile sequence of alternatives *
2058 *************************************************/
2060 /* On entry, ptr is pointing past the bracket character, but on return
2061 it points to the closing bracket, or vertical bar, or end of string.
2062 The code variable is pointing at the byte into which the BRA operator has been
2066 options the option bits
2067 brackets -> int containing the number of extracting brackets used
2068 codeptr -> the address of the current code pointer
2069 ptrptr -> the address of the current pattern pointer
2070 errorptr -> pointer to error message
2072 Returns: TRUE on success
2076 compile_regex(int options
, int *brackets
, uschar
**codeptr
,
2077 const uschar
**ptrptr
, const char **errorptr
, PyObject
*dictionary
)
2079 const uschar
*ptr
= *ptrptr
;
2080 uschar
*code
= *codeptr
;
2081 uschar
*start_bracket
= code
;
2086 uschar
*last_branch
= code
;
2089 if (!compile_branch(options
, brackets
, &code
, &ptr
, errorptr
, dictionary
))
2095 /* Fill in the length of the last branch */
2097 length
= code
- last_branch
;
2098 last_branch
[1] = length
>> 8;
2099 last_branch
[2] = length
& 255;
2101 /* Reached end of expression, either ')' or end of pattern. Insert a
2102 terminating ket and the length of the whole bracketed item, and return,
2103 leaving the pointer at the terminating char. */
2107 length
= code
- start_bracket
;
2109 *code
++ = length
>> 8;
2110 *code
++ = length
& 255;
2116 /* Another branch follows; insert an "or" node and advance the pointer. */
2121 /* Control never reaches here */
2126 /*************************************************
2127 * Check for anchored expression *
2128 *************************************************/
2130 /* Try to find out if this is an anchored regular expression. Consider each
2131 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2132 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2133 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2134 counts, since OP_CIRC can match in the middle.
2136 A branch is also implicitly anchored if it starts with .* because that will try
2137 the rest of the pattern at all possible matching points, so there is no point
2140 Argument: points to start of expression (the bracket)
2141 Returns: TRUE or FALSE
2145 is_anchored(register const uschar
*code
, BOOL multiline
)
2148 int op
= (int)code
[3];
2149 if (op
>= OP_BRA
|| op
== OP_ASSERT
|| op
== OP_ONCE
)
2150 { if (!is_anchored(code
+3, multiline
)) return FALSE
; }
2151 else if (op
== OP_TYPESTAR
|| op
== OP_TYPEMINSTAR
)
2152 { if (code
[4] != OP_ANY
) return FALSE
; }
2153 else if (op
!= OP_SOD
&& (multiline
|| op
!= OP_CIRC
)) return FALSE
;
2154 code
+= (code
[1] << 8) + code
[2];
2156 while (*code
== OP_ALT
);
2162 /*************************************************
2163 * Check for start with \n line expression *
2164 *************************************************/
2166 /* This is called for multiline expressions to try to find out if every branch
2167 starts with ^ so that "first char" processing can be done to speed things up.
2169 Argument: points to start of expression (the bracket)
2170 Returns: TRUE or FALSE
2174 is_startline(const uschar
*code
)
2177 if ((int)code
[3] >= OP_BRA
|| code
[3] == OP_ASSERT
)
2178 { if (!is_startline(code
+3)) return FALSE
; }
2179 else if (code
[3] != OP_CIRC
) return FALSE
;
2180 code
+= (code
[1] << 8) + code
[2];
2182 while (*code
== OP_ALT
);
2188 /*************************************************
2189 * Check for fixed first char *
2190 *************************************************/
2192 /* Try to find out if there is a fixed first character. This is called for
2193 unanchored expressions, as it speeds up their processing quite considerably.
2194 Consider each alternative branch. If they all start with the same char, or with
2195 a bracket all of whose alternatives start with the same char (recurse ad lib),
2196 then we return that char, otherwise -1.
2198 Argument: points to start of expression (the bracket)
2199 Returns: -1 or the fixed first char
2203 find_firstchar(uschar
*code
)
2205 register int c
= -1;
2208 register int charoffset
= 4;
2210 if ((int)code
[3] >= OP_BRA
|| code
[3] == OP_ASSERT
)
2213 if ((d
= find_firstchar(code
+3)) < 0) return -1;
2214 if (c
< 0) c
= d
; else if (c
!= d
) return -1;
2217 else switch(code
[3])
2222 case OP_EXACT
: /* Fall through */
2225 case OP_CHARS
: /* Fall through */
2230 if (c
< 0) c
= code
[charoffset
]; else if (c
!= code
[charoffset
]) return -1;
2233 code
+= (code
[1] << 8) + code
[2];
2235 while (*code
== OP_ALT
);
2241 /*************************************************
2242 * Compile a Regular Expression *
2243 *************************************************/
2245 /* This function takes a string and returns a pointer to a block of store
2246 holding a compiled version of the expression.
2249 pattern the regular expression
2250 options various option bits
2251 errorptr pointer to pointer to error text
2252 erroroffset ptr offset in pattern where error was detected
2254 Returns: pointer to compiled data block, or NULL on error,
2255 with errorptr and erroroffset set
2259 pcre_compile(const char *pattern
, int options
, const char **errorptr
,
2260 int *erroroffset
, PyObject
*dictionary
)
2264 int length
= 3; /* For initial BRA plus length */
2269 int top_backref
= 0;
2270 unsigned int brastackptr
= 0;
2275 uschar
*code_base
, *code_end
;
2278 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2279 can do is just return NULL. */
2281 if (errorptr
== NULL
) return NULL
;
2284 /* However, we can give a message for this error */
2286 if (erroroffset
== NULL
)
2293 if ((options
& ~PUBLIC_OPTIONS
) != 0)
2299 DPRINTF(("------------------------------------------------------------------\n"));
2300 DPRINTF(("%s\n", pattern
));
2302 /* The first thing to do is to make a pass over the pattern to compute the
2303 amount of store required to hold the compiled code. This does not have to be
2304 perfect as long as errors are overestimates. At the same time we can detect any
2305 internal flag settings. Make an attempt to correct for any counted white space
2306 if an "extended" flag setting appears late in the pattern. We can't be so
2307 clever for #-comments. */
2309 ptr
= (const uschar
*)(pattern
- 1);
2310 while ((c
= *(++ptr
)) != 0)
2313 int class_charcount
;
2315 if ((pcre_ctypes
[c
] & ctype_space
) != 0)
2317 if ((options
& PCRE_EXTENDED
) != 0) continue;
2321 if (c
== '#' && (options
& PCRE_EXTENDED
) != 0)
2323 while ((c
= *(++ptr
)) != 0 && c
!= '\n');
2329 /* A backslashed item may be an escaped "normal" character or a
2330 character type. For a "normal" character, put the pointers and
2331 character back so that tests for whitespace etc. in the input
2332 are done correctly. */
2336 const uschar
*save_ptr
= ptr
;
2337 c
= check_escape(&ptr
, errorptr
, bracount
, options
, FALSE
);
2338 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2348 /* A back reference needs an additional char, plus either one or 5
2349 bytes for a repeat. We also need to keep the value of the highest
2354 int refnum
= -c
- ESC_REF
;
2355 if (refnum
> top_backref
) top_backref
= refnum
;
2356 length
++; /* For single back reference */
2357 if (ptr
[1] == '{' && is_counted_repeat(ptr
+2))
2359 ptr
= read_repeat_counts(ptr
+2, &min
, &max
, errorptr
);
2360 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2361 if ((min
== 0 && (max
== 1 || max
== -1)) ||
2362 (min
== 1 && max
== -1))
2365 if (ptr
[1] == '?') ptr
++;
2373 case '*': /* These repeats won't be after brackets; */
2374 case '+': /* those are handled separately */
2379 /* This covers the cases of repeats after a single char, metachar, class,
2380 or back reference. */
2383 if (!is_counted_repeat(ptr
+1)) goto NORMAL_CHAR
;
2384 ptr
= read_repeat_counts(ptr
+1, &min
, &max
, errorptr
);
2385 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2386 if ((min
== 0 && (max
== 1 || max
== -1)) ||
2387 (min
== 1 && max
== -1))
2391 length
--; /* Uncount the original char or metachar */
2392 if (min
== 1) length
++; else if (min
> 0) length
+= 4;
2393 if (max
> 0) length
+= 4; else length
+= 2;
2395 if (ptr
[1] == '?') ptr
++;
2398 /* An alternation contains an offset to the next branch or ket. */
2403 /* A character class uses 33 characters. Don't worry about character types
2404 that aren't allowed in classes - they'll get picked up during the compile.
2405 A character class that contains only one character uses 2 or 3 bytes,
2406 depending on whether it is negated or not. Notice this where we can. */
2409 class_charcount
= 0;
2410 if (*(++ptr
) == '^') ptr
++;
2415 int ch
= check_escape(&ptr
, errorptr
, bracount
, options
, TRUE
);
2416 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2417 if (-ch
== ESC_b
) class_charcount
++; else class_charcount
= 10;
2419 else class_charcount
++;
2422 while (*ptr
!= 0 && *ptr
!= ']');
2424 /* Repeats for negated single chars are handled by the general code */
2426 if (class_charcount
== 1) length
+= 3; else
2429 if (options
& PCRE_LOCALE
) length
++; /* Add a byte for the localization flag */
2431 /* A repeat needs either 1 or 5 bytes. */
2433 if (*ptr
!= 0 && ptr
[1] == '{' && is_counted_repeat(ptr
+2))
2435 ptr
= read_repeat_counts(ptr
+2, &min
, &max
, errorptr
);
2436 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2437 if ((min
== 0 && (max
== 1 || max
== -1)) ||
2438 (min
== 1 && max
== -1))
2441 if (ptr
[1] == '?') ptr
++;
2446 /* Brackets may be genuine groups or special things */
2450 /* Handle special forms of bracket, which all start (? */
2452 if (ptr
[1] == '?') switch (c
= ptr
[2])
2454 /* Skip over comments entirely */
2457 while (*ptr
!= 0 && *ptr
!= ')') ptr
++;
2461 goto PCRE_ERROR_RETURN
;
2465 /* Non-referencing groups and lookaheads just move the pointer on, and
2466 then behave like a non-special bracket, except that they don't increment
2467 the count of extracting brackets. */
2480 idlen
= get_group_id(ptr
++, '>', errorptr
);
2481 if (*errorptr
) goto PCRE_ERROR_RETURN
;
2485 idlen
= get_group_id(ptr
++, ')', errorptr
);
2486 if (*errorptr
) goto PCRE_ERROR_RETURN
;
2494 /* Ditto for the "once only" bracket, allowed only if the extra bit
2498 if ((options
& PCRE_EXTRA
) != 0)
2503 /* Else fall through */
2505 /* Else loop setting valid options until ) is met. Anything else is an
2512 if ((c
= *ptr
) == 'i')
2514 options
|= PCRE_CASELESS
;
2517 else if ((c
= *ptr
) == 'L')
2519 options
|= PCRE_LOCALE
;
2522 else if ((c
= *ptr
) == 'm')
2524 options
|= PCRE_MULTILINE
;
2529 options
|= PCRE_DOTALL
;
2534 options
|= PCRE_EXTENDED
;
2535 length
-= spaces
; /* Already counted spaces */
2538 else if (c
== ')') break;
2541 goto PCRE_ERROR_RETURN
;
2543 continue; /* End of this bracket handling */
2546 /* Extracting brackets must be counted so we can process escapes in a
2551 /* Non-special forms of bracket. Save length for computing whole length
2552 at end if there's a repeat that requires duplication of the group. */
2554 if (brastackptr
>= sizeof(brastack
)/sizeof(int))
2557 goto PCRE_ERROR_RETURN
;
2560 brastack
[brastackptr
++] = length
;
2564 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2565 have to replicate this bracket up to that many times. If brastackptr is
2566 0 this is an unmatched bracket which will generate an error, but take care
2567 not to try to access brastack[-1]. */
2574 int duplength
= (brastackptr
> 0)? length
- brastack
[--brastackptr
] : 0;
2576 /* Leave ptr at the final char; for read_repeat_counts this happens
2577 automatically; for the others we need an increment. */
2579 if ((c
= ptr
[1]) == '{' && is_counted_repeat(ptr
+2))
2581 ptr
= read_repeat_counts(ptr
+2, &minval
, &maxval
, errorptr
);
2582 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2584 else if (c
== '*') { minval
= 0; maxval
= -1; ptr
++; }
2585 else if (c
== '+') { maxval
= -1; ptr
++; }
2586 else if (c
== '?') { minval
= 0; ptr
++; }
2588 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2589 if there is a limited maximum we have to replicate up to maxval-1 times
2590 and allow for a BRAZERO item before each optional copy, as we also have
2591 to do before the first copy if the minimum is zero. */
2593 if (minval
== 0) length
++;
2594 else if (minval
> 1) length
+= (minval
- 1) * duplength
;
2595 if (maxval
> minval
) length
+= (maxval
- minval
) * (duplength
+ 1);
2599 /* Non-special character. For a run of such characters the length required
2600 is the number of characters + 2, except that the maximum run length is 255.
2601 We won't get a skipped space or a non-data escape or the start of a #
2602 comment as the first character, so the length can't be zero. */
2610 if ((pcre_ctypes
[c
] & ctype_space
) != 0)
2612 if ((options
& PCRE_EXTENDED
) != 0) continue;
2616 if (c
== '#' && (options
& PCRE_EXTENDED
) != 0)
2618 while ((c
= *(++ptr
)) != 0 && c
!= '\n');
2622 /* Backslash may introduce a data char or a metacharacter; stop the
2623 string before the latter. */
2627 const uschar
*saveptr
= ptr
;
2628 c
= check_escape(&ptr
, errorptr
, bracount
, options
, FALSE
);
2629 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2630 if (c
< 0) { ptr
= saveptr
; break; }
2633 /* Ordinary character or single-char escape */
2638 /* This "while" is the end of the "do" above. */
2640 while (runlength
< 255 && (pcre_ctypes
[c
= *(++ptr
)] & ctype_meta
) == 0);
2643 length
+= runlength
;
2648 length
+= 4; /* For final KET and END */
2656 /* Compute the size of data block needed and get it, either from malloc or
2657 externally provided function. We specify "code[0]" in the offsetof() expression
2658 rather than just "code", because it has been reported that one broken compiler
2659 fails on "code" because it is also an independent variable. It should make no
2660 difference to the value of the offsetof(). */
2662 size
= length
+ offsetof(real_pcre
, code
[0]);
2663 re
= (real_pcre
*)(pcre_malloc
)(size
+50);
2671 /* Put in the magic number and the options. */
2673 re
->magic_number
= MAGIC_NUMBER
;
2674 re
->options
= options
;
2676 /* Set up a starting, non-extracting bracket, then compile the expression. On
2677 error, *errorptr will be set non-NULL, so we don't need to look at the result
2678 of the function here. */
2680 ptr
= (const uschar
*)pattern
;
2684 (void)compile_regex(options
, &bracount
, &code
, &ptr
, errorptr
, dictionary
);
2685 re
->top_bracket
= bracount
;
2686 re
->top_backref
= top_backref
;
2688 /* If not reached end of pattern on success, there's an excess bracket. */
2690 if (*errorptr
== NULL
&& *ptr
!= 0) *errorptr
= ERR22
;
2692 /* Fill in the terminating state and check for disastrous overflow, but
2693 if debugging, leave the test till after things are printed out. */
2699 if (code
- re
->code
> length
) *errorptr
= ERR23
;
2702 /* Failed to compile */
2704 if (*errorptr
!= NULL
)
2708 *erroroffset
= ptr
- (const uschar
*)pattern
;
2712 /* If the anchored option was not passed, set flag if we can determine that it
2713 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2714 we can determine what the first character has to be, because that speeds up
2715 unanchored matches no end. In the case of multiline matches, an alternative is
2716 to set the PCRE_STARTLINE flag if all branches start with ^. */
2718 if ((options
& PCRE_ANCHORED
) == 0)
2720 if (is_anchored(re
->code
, (options
& PCRE_MULTILINE
) != 0))
2721 re
->options
|= PCRE_ANCHORED
;
2724 int ch
= find_firstchar(re
->code
);
2727 re
->first_char
= ch
;
2728 re
->options
|= PCRE_FIRSTSET
;
2730 else if (is_startline(re
->code
))
2731 re
->options
|= PCRE_STARTLINE
;
2735 /* Print out the compiled data for debugging */
2739 printf("Length = %d top_bracket = %d top_backref=%d\n",
2740 length
, re
->top_bracket
, re
->top_backref
);
2742 if (re
->options
!= 0)
2744 printf("%s%s%s%s%s%s%s%s\n",
2745 ((re
->options
& PCRE_ANCHORED
) != 0)? "anchored " : "",
2746 ((re
->options
& PCRE_CASELESS
) != 0)? "caseless " : "",
2747 ((re
->options
& PCRE_EXTENDED
) != 0)? "extended " : "",
2748 ((re
->options
& PCRE_MULTILINE
) != 0)? "multiline " : "",
2749 ((re
->options
& PCRE_DOTALL
) != 0)? "dotall " : "",
2750 ((re
->options
& PCRE_DOLLAR_ENDONLY
) != 0)? "endonly " : "",
2751 ((re
->options
& PCRE_EXTRA
) != 0)? "extra " : "",
2752 ((re
->options
& PCRE_UNGREEDY
) != 0)? "ungreedy " : "");
2755 if ((re
->options
& PCRE_FIRSTSET
) != 0)
2757 if (isprint(re
->first_char
)) printf("First char = %c\n", re
->first_char
);
2758 else printf("First char = \\x%02x\n", re
->first_char
);
2762 code_base
= code
= re
->code
;
2764 while (code
< code_end
)
2768 printf("%3d ", code
- code_base
);
2770 if (*code
>= OP_BRA
)
2772 printf("%3d Bra %d", (code
[1] << 8) + code
[2], *code
- OP_BRA
);
2779 charlength
= *(++code
);
2780 printf("%3d ", charlength
);
2781 while (charlength
-- > 0)
2782 if (isprint(c
= *(++code
))) printf("%c", c
); else printf("\\x%02x", c
);
2792 printf("%3d %s", (code
[1] << 8) + code
[2], OP_names
[*code
]);
2803 case OP_TYPEMINSTAR
:
2805 case OP_TYPEMINPLUS
:
2807 case OP_TYPEMINQUERY
:
2808 if (*code
>= OP_TYPESTAR
)
2809 printf(" %s", OP_names
[code
[1]]);
2810 else if (isprint(c
= code
[1])) printf(" %c", c
);
2811 else printf(" \\x%02x", c
);
2812 printf("%s", OP_names
[*code
++]);
2818 if (isprint(c
= code
[3])) printf(" %c{", c
);
2819 else printf(" \\x%02x{", c
);
2820 if (*code
!= OP_EXACT
) printf("0,");
2821 printf("%d}", (code
[1] << 8) + code
[2]);
2822 if (*code
== OP_MINUPTO
) printf("?");
2828 case OP_TYPEMINUPTO
:
2829 printf(" %s{", OP_names
[code
[3]]);
2830 if (*code
!= OP_TYPEEXACT
) printf(",");
2831 printf("%d}", (code
[1] << 8) + code
[2]);
2832 if (*code
== OP_TYPEMINUPTO
) printf("?");
2837 if (isprint(c
= *(++code
))) printf(" [^%c]", c
);
2838 else printf(" [^\\x%02x]", c
);
2846 case OP_NOTMINQUERY
:
2847 if (isprint(c
= code
[1])) printf(" [^%c]", c
);
2848 else printf(" [^\\x%02x]", c
);
2849 printf("%s", OP_names
[*code
++]);
2855 if (isprint(c
= code
[3])) printf(" [^%c]{", c
);
2856 else printf(" [^\\x%02x]{", c
);
2857 if (*code
!= OP_NOTEXACT
) printf(",");
2858 printf("%d}", (code
[1] << 8) + code
[2]);
2859 if (*code
== OP_NOTMINUPTO
) printf("?");
2864 printf(" \\%d", *(++code
));
2866 goto CLASS_REF_REPEAT
;
2874 if (*code
==OP_CLASS_L
)
2877 printf("Locflag = %i ", *code
++);
2882 if (*code
++ == OP_CLASS
) printf(" [");
2887 for (i
= 0; i
< 256; i
++)
2889 if ((code
[i
/8] & (1 << (i
&7))) != 0)
2892 for (j
= i
+1; j
< 256; j
++)
2893 if ((code
[j
/8] & (1 << (j
&7))) == 0) break;
2894 if (i
== '-' || i
== ']') printf("\\");
2895 if (isprint(i
)) printf("%c", i
); else printf("\\x%02x", i
);
2899 if (j
== '-' || j
== ']') printf("\\");
2900 if (isprint(j
)) printf("%c", j
); else printf("\\x%02x", j
);
2919 printf("%s", OP_names
[*code
]);
2924 min
= (code
[1] << 8) + code
[2];
2925 max
= (code
[3] << 8) + code
[4];
2926 if (max
== 0) printf("{%d,}", min
);
2927 else printf("{%d,%d}", min
, max
);
2928 if (*code
== OP_CRMINRANGE
) printf("?");
2938 /* Anything else is just a one-node item */
2941 printf(" %s", OP_names
[*code
]);
2948 printf("------------------------------------------------------------------\n");
2950 /* This check is done here in the debugging case so that the code that
2951 was compiled can be seen. */
2953 if (code
- re
->code
> length
)
2955 printf("length=%i, code length=%i\n", length
, code
-re
->code
);
2958 *erroroffset
= ptr
- (uschar
*)pattern
;
2968 /*************************************************
2969 * Match a character type *
2970 *************************************************/
2972 /* Not used in all the places it might be as it's sometimes faster
2973 to put the code inline.

Arguments:
2976 type the character type
c the character
2978 dotall the dotall flag

2980 Returns: TRUE if character is of the type
*/
/* match_type: report whether character c belongs to the character class
   selected by the opcode in `type`.
     type    one of the OP_* character-type opcodes handled by the cases below
     c       the character to classify
     dotall  when true, OP_ANY also accepts '\n'
   Each visible case returns the result of its test directly.
   NOTE(review): this listing omits some original lines (the return type,
   braces, the switch header and any fall-back return are not visible here). */
2984 match_type(int type
, int c
, BOOL dotall
)
/* Trace output: prints the subject character (as \xNN when isprint() rejects
   it) followed by the opcode name looked up in OP_names.
   NOTE(review): presumably compiled only in debug builds; the guarding
   preprocessor lines are not visible in this listing -- confirm. */
2988 if (isprint(c
)) printf("matching subject %c against ", c
);
2989 else printf("matching subject \\x%02x against ", c
);
2990 printf("%s\n", OP_names
[type
]);
/* OP_ANY: every character matches, except that '\n' is accepted only when
   dotall is set. */
2995 case OP_ANY
: return dotall
|| c
!= '\n';
/* Digit, whitespace and word classes and their negations: each tests one
   flag bit (ctype_digit, ctype_space, ctype_word) in the pcre_ctypes entry
   indexed by c.  Assumes c is a valid index into pcre_ctypes -- TODO confirm
   that callers only pass unsigned character values. */
2996 case OP_NOT_DIGIT
: return (pcre_ctypes
[c
] & ctype_digit
) == 0;
2997 case OP_DIGIT
: return (pcre_ctypes
[c
] & ctype_digit
) != 0;
2998 case OP_NOT_WHITESPACE
: return (pcre_ctypes
[c
] & ctype_space
) == 0;
2999 case OP_WHITESPACE
: return (pcre_ctypes
[c
] & ctype_space
) != 0;
3000 case OP_NOT_WORDCHAR
: return (pcre_ctypes
[c
] & ctype_word
) == 0;
3001 case OP_WORDCHAR
: return (pcre_ctypes
[c
] & ctype_word
) != 0;
/* _L variants: a word character is '_' or anything isalnum() accepts, so the
   answer comes from the C library's classification (which follows the
   current locale) instead of the pcre_ctypes table. */
3002 case OP_NOT_WORDCHAR_L
: return (c
!='_' && !isalnum(c
));
3003 case OP_WORDCHAR_L
: return (c
=='_' || isalnum(c
));
3010 /*************************************************
3011 * Match a back-reference *
3012 *************************************************/
3014 /* If a back reference hasn't been set, the match fails.

Arguments:
3017 number reference number
3018 eptr points into the subject
3019 length length to be matched
3020 md points to match data block

3022 Returns: TRUE if matched
*/
/* match_ref: compare `length` characters of the subject at eptr with the
   text previously captured for a back reference.
     number  index into md->offset_vector of the capture's start offset
     eptr    current position in the subject
     length  number of characters to compare
     md      match data block (subject bounds, offset vector)
   Returns FALSE as soon as the subject is too short or a character differs.
   NOTE(review): this listing omits some original lines (return type, braces,
   the caseless/caseful selector and the final return are not visible). */
3026 match_ref(int number
, register const uschar
*eptr
, int length
, match_data
*md
)
/* p is the start of the captured text inside the subject. */
3028 const uschar
*p
= md
->start_subject
+ md
->offset_vector
[number
];
/* Trace output of both strings being compared.
   NOTE(review): presumably compiled only in debug builds; the guarding
   preprocessor lines are not visible in this listing -- confirm. */
3031 if (eptr
>= md
->end_subject
)
3032 printf("matching subject <null>");
3035 printf("matching subject ");
3036 pchars(eptr
, length
, TRUE
, md
);
3038 printf(" against backref ");
3039 pchars(p
, length
, FALSE
, md
);
3043 /* Always fail if not enough characters left */
/* The bound is taken from eptr, the position the loops below read from.
   Measuring from p would not protect those reads: p is the start of text
   already captured from the subject, so `length` characters are expected to
   be available there whenever the caller derives `length` from the same
   capture, while eptr may be closer to end_subject than that. */
3045 if (length
> md
->end_subject
- eptr
) return FALSE
;
3047 /* Separate the caseless case for speed */
/* First loop compares through the pcre_lcc table, second compares raw
   characters; the condition choosing between them is not visible here. */
3050 { while (length
-- > 0) if (pcre_lcc
[*p
++] != pcre_lcc
[*eptr
++]) return FALSE
; }
3052 { while (length
-- > 0) if (*p
++ != *eptr
++) return FALSE
; }
/* free_stack: release the six arrays attached to md (off_num, offset_top,
   r1, r2, eptr, ecode).  Each one is passed to PyMem_DEL only when its
   pointer is non-NULL; the visible lines do not reset the pointers afterwards.
   NOTE(review): the function's braces and its return statement are not
   visible in this listing, so the returned value cannot be stated here. */
3057 static int free_stack(match_data
*md
)
3059 /* Free any stack space that was allocated by the call to match(). */
3060 if (md
->off_num
) PyMem_DEL(md
->off_num
);
3061 if (md
->offset_top
) PyMem_DEL(md
->offset_top
);
3062 if (md
->r1
) PyMem_DEL(md
->r1
);
3063 if (md
->r2
) PyMem_DEL(md
->r2
);
/* eptr and ecode are cast to char * before being freed; grow_stack's
   "pointer-to-const" remark suggests the cast is there to drop a const
   qualifier -- TODO confirm against the match_data declaration. */
3064 if (md
->eptr
) PyMem_DEL((char *)md
->eptr
);
3065 if (md
->ecode
) PyMem_DEL((char *)md
->ecode
);
/* grow_stack: choose a new entry count in md->length and resize the six
   arrays attached to md (offset_top, eptr, ecode, off_num, r1, r2) to that
   many entries.  If any resize yields NULL, control leaves through
   longjmp(md->error_env, 1) instead of returning.
   NOTE(review): md->length appears to be the capacity that md->point is
   compared against elsewhere in this file -- confirm.  Some original lines
   (braces, an else, the return statement) are not visible in this listing. */
3069 static int grow_stack(match_data
*md
)
/* Already sized: grow by half (length + length/2). */
3071 if (md
->length
!= 0)
3073 md
->length
= md
->length
+ md
->length
/2;
/* Initial sizing: subject length plus one, capped at 80 entries.
   NOTE(review): the else that presumably separates this from the branch
   above is not visible in this listing. */
3077 int string_len
= md
->end_subject
- md
->start_subject
+ 1;
3078 if (string_len
< 80) {md
->length
= string_len
; }
3079 else {md
->length
= 80;}
/* Resize every array to md->length entries.  The int arrays go through
   PyMem_RESIZE; the two pointer arrays call PyMem_Realloc directly. */
3081 PyMem_RESIZE(md
->offset_top
, int, md
->length
);
3082 /* Can't realloc a pointer-to-const; cast const away. */
3083 md
->eptr
= (const uschar
**)PyMem_Realloc((void *)md
->eptr
,
3084 sizeof(uschar
*) * md
->length
);
3085 md
->ecode
= (const uschar
**)PyMem_Realloc((void *)md
->ecode
,
3086 sizeof(uschar
*) * md
->length
);
3087 PyMem_RESIZE(md
->off_num
, int, md
->length
);
3088 PyMem_RESIZE(md
->r1
, int, md
->length
);
3089 PyMem_RESIZE(md
->r2
, int, md
->length
);
/* Any NULL pointer is treated as allocation failure.
   NOTE(review): each pointer is overwritten with the resize result, so after
   a failed resize the previous block is no longer reachable through md and
   cannot be released later -- confirm whether that leak matters on this
   out-of-memory path. */
3090 if (md
->offset_top
== NULL
|| md
->eptr
== NULL
|| md
->ecode
== NULL
||
3091 md
->off_num
== NULL
|| md
->r1
== NULL
|| md
->r2
== NULL
)
/* Non-local exit to the handler registered in md->error_env. */
3094 longjmp(md
->error_env
, 1);
3100 /*************************************************
3101 * Match from current position *
3102 *************************************************/
3104 /* On entry ecode points to the first opcode, and eptr to the first character.

Arguments:
3107 eptr pointer in subject
3108 ecode position in code
3109 offset_top current top pointer
3110 md pointer to "static" info for the match

3112 Returns: TRUE if matched
*/
3116 match(register const uschar
*eptr
, register const uschar
*ecode
, int offset_top
,
3119 int save_stack_position
= md
->point
;
3122 #define SUCCEED goto succeed
3123 #define FAIL goto fail
3127 int min
, max
, ctype
;
3130 BOOL minimize
= FALSE
;
3132 /* Opening bracket. Check the alternative branches in turn, failing if none
3133 match. We have to set the start offset if required and there is space
3134 in the offset vector so that it is available for subsequent back references
3135 if the bracket matches. However, if the bracket fails, we must put back the
3136 previous value of both offsets in case they were set by a previous copy of
3137 the same bracket. Don't worry about setting the flag for the error case here;
3138 that is handled in the code for KET. */
3140 if ((int)*ecode
>= OP_BRA
)
3142 int number
= (*ecode
- OP_BRA
) << 1;
3143 int save_offset1
= 0, save_offset2
= 0;
3145 DPRINTF(("start bracket %d\n", number
/2));
3147 if (number
> 0 && number
< md
->offset_end
)
3149 save_offset1
= md
->offset_vector
[number
];
3150 save_offset2
= md
->offset_vector
[number
+1];
3151 md
->offset_vector
[number
] = eptr
- md
->start_subject
;
3153 DPRINTF(("saving %d %d\n", save_offset1
, save_offset2
));
3156 /* Recurse for all the alternatives. */
3160 if (match(eptr
, ecode
+3, offset_top
, md
)) SUCCEED
;
3161 ecode
+= (ecode
[1] << 8) + ecode
[2];
3163 while (*ecode
== OP_ALT
);
3165 DPRINTF(("bracket %d failed\n", number
/2));
3167 if (number
> 0 && number
< md
->offset_end
)
3169 md
->offset_vector
[number
] = save_offset1
;
3170 md
->offset_vector
[number
+1] = save_offset2
;
3176 /* Other types of node can be handled by a switch */
3181 md
->end_match_ptr
= eptr
; /* Record where we ended */
3182 md
->end_offset_top
= offset_top
; /* and how many extracts were taken */
3185 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3186 whole thing doesn't match, so we have to get out via a longjmp(). */
3189 if (match(eptr
, ecode
+1, offset_top
, md
)) SUCCEED
;
3190 longjmp(md
->fail_env
, 1);
3192 /* Assertion brackets. Check the alternative branches in turn - the
3193 matching won't pass the KET for an assertion. If any one branch matches,
3194 the assertion is true. */
3199 if (match(eptr
, ecode
+3, offset_top
, md
)) break;
3200 ecode
+= (ecode
[1] << 8) + ecode
[2];
3202 while (*ecode
== OP_ALT
);
3203 if (*ecode
== OP_KET
) FAIL
;
3205 /* Continue from after the assertion, updating the offsets high water
3206 mark, since extracts may have been taken during the assertion. */
3208 do ecode
+= (ecode
[1] << 8) + ecode
[2]; while (*ecode
== OP_ALT
);
3210 offset_top
= md
->end_offset_top
;
3213 /* Negative assertion: all branches must fail to match */
3218 if (match(eptr
, ecode
+3, offset_top
, md
)) FAIL
;
3219 ecode
+= (ecode
[1] << 8) + ecode
[2];
3221 while (*ecode
== OP_ALT
);
3225 /* "Once" brackets are like assertion brackets except that after a match,
3226 the point in the subject string is not moved back. Thus there can never be
3227 a move back into the brackets. Check the alternative branches in turn - the
3228 matching won't pass the KET for this kind of subpattern. If any one branch
3229 matches, we carry on, leaving the subject pointer. */
3234 if (match(eptr
, ecode
+3, offset_top
, md
)) break;
3235 ecode
+= (ecode
[1] << 8) + ecode
[2];
3237 while (*ecode
== OP_ALT
);
3238 if (*ecode
== OP_KET
) FAIL
;
3240 /* Continue as from after the assertion, updating the offsets high water
3241 mark, since extracts may have been taken. */
3243 do ecode
+= (ecode
[1] << 8) + ecode
[2]; while (*ecode
== OP_ALT
);
3245 offset_top
= md
->end_offset_top
;
3246 eptr
= md
->end_match_ptr
;
3249 /* An alternation is the end of a branch; scan along to find the end of the
3250 bracketed group and go to there. */
3253 do ecode
+= (ecode
[1] << 8) + ecode
[2]; while (*ecode
== OP_ALT
);
3256 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3257 that it may occur zero times. It may repeat infinitely, or not at all -
3258 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3259 repeat limits are compiled as a number of copies, with the optional ones
3260 preceded by BRAZERO or BRAMINZERO. */
3264 const uschar
*next
= ecode
+1;
3265 if (match(eptr
, next
, offset_top
, md
)) SUCCEED
;
3266 do next
+= (next
[1] << 8) + next
[2]; while (*next
== OP_ALT
);
3273 const uschar
*next
= ecode
+1;
3274 do next
+= (next
[1] << 8) + next
[2]; while (*next
== OP_ALT
);
3275 if (match(eptr
, next
+3, offset_top
, md
)) SUCCEED
;
3280 /* End of a group, repeated or non-repeating. If we are at the end of
3281 an assertion "group", stop matching and SUCCEED, but record the
3282 current high water mark for use by positive assertions. */
3289 const uschar
*prev
= ecode
- (ecode
[1] << 8) - ecode
[2];
3291 if (*prev
== OP_ASSERT
|| *prev
== OP_ASSERT_NOT
|| *prev
== OP_ONCE
)
3293 md
->end_match_ptr
= eptr
; /* For ONCE */
3294 md
->end_offset_top
= offset_top
;
3298 /* In all other cases we have to check the group number back at the
3299 start and if necessary complete handling an extraction by setting the
3300 final offset and bumping the high water mark. */
3302 number
= (*prev
- OP_BRA
) << 1;
3304 DPRINTF(("end bracket %d\n", number
/2));
3308 if (number
>= md
->offset_end
) md
->offset_overflow
= TRUE
; else
3310 md
->offset_vector
[number
+1] = eptr
- md
->start_subject
;
3311 if (offset_top
<= number
) offset_top
= number
+ 2;
3315 /* For a non-repeating ket, just advance to the next node and continue at
3318 if (*ecode
== OP_KET
)
3324 /* The repeating kets try the rest of the pattern or restart from the
3325 preceding bracket, in the appropriate order. */
3327 if (*ecode
== OP_KETRMIN
)
3330 if (match(eptr
, ecode
+3, offset_top
, md
)) goto succeed
;
3331 /* Handle alternation inside the BRA...KET; push the additional
3332 alternatives onto the stack */
3335 ptr
+= (ptr
[1]<<8)+ ptr
[2];
3338 if (md
->length
== md
->point
)
3342 md
->offset_top
[md
->point
] = offset_top
;
3343 md
->eptr
[md
->point
] = eptr
;
3344 md
->ecode
[md
->point
] = ptr
+3;
3345 md
->r1
[md
->point
] = 0;
3346 md
->r2
[md
->point
] = 0;
3347 md
->off_num
[md
->point
] = 0;
3350 } while (*ptr
==OP_ALT
);
3351 ecode
=prev
+3; goto match_loop
;
3353 else /* OP_KETRMAX */
3356 /*int points_pushed=0;*/
3358 /* Push one failure point, that will resume matching at the code after
3359 the KETRMAX opcode. */
3360 if (md
->length
== md
->point
)
3364 md
->offset_top
[md
->point
] = offset_top
;
3365 md
->eptr
[md
->point
] = eptr
;
3366 md
->ecode
[md
->point
] = ecode
+3;
3367 md
->r1
[md
->point
] = md
->offset_vector
[number
];
3368 md
->r2
[md
->point
] = md
->offset_vector
[number
+1];
3369 md
->off_num
[md
->point
] = number
;
3372 md
->offset_vector
[number
] = eptr
- md
->start_subject
;
3373 /* Handle alternation inside the BRA...KET; push each of the
3374 additional alternatives onto the stack */
3377 ptr
+= (ptr
[1]<<8)+ ptr
[2];
3380 if (md
->length
== md
->point
)
3381 if (md
->length
== md
->point
)
3385 md
->offset_top
[md
->point
] = offset_top
;
3386 md
->eptr
[md
->point
] = eptr
;
3387 md
->ecode
[md
->point
] = ptr
+3;
3388 md
->r1
[md
->point
] = 0;
3389 md
->r2
[md
->point
] = 0;
3390 md
->off_num
[md
->point
] = 0;
3392 /*points_pushed++;*/
3394 } while (*ptr
==OP_ALT
);
3395 /* Jump to the first (or only) alternative and resume trying to match */
3396 ecode
=prev
+3; goto match_loop
;
3400 /* Start of subject unless notbol, or after internal newline if multiline */
3403 if (md
->notbol
&& eptr
== md
->start_subject
) FAIL
;
3406 if (eptr
!= md
->start_subject
&& eptr
[-1] != '\n') FAIL
;
3410 /* ... else fall through */
3412 /* Start of subject assertion */
3415 if (eptr
!= md
->start_subject
) FAIL
;
3419 /* Assert before internal newline if multiline, or before
3420 a terminating newline unless endonly is set, else end of subject unless
3424 if (md
->noteol
&& eptr
>= md
->end_subject
) FAIL
;
3427 if (eptr
< md
->end_subject
&& *eptr
!= '\n') FAIL
;
3431 else if (!md
->endonly
)
3433 if (eptr
< md
->end_subject
- 1 ||
3434 (eptr
== md
->end_subject
- 1 && *eptr
!= '\n')) FAIL
;
3438 /* ... else fall through */
3440 /* End of subject assertion */
3443 if (eptr
< md
->end_subject
) FAIL
;
3447 /* Word boundary assertions */
3449 case OP_NOT_WORD_BOUNDARY
:
3450 case OP_WORD_BOUNDARY
:
3452 BOOL prev_is_word
= (eptr
!= md
->start_subject
) &&
3453 ((pcre_ctypes
[eptr
[-1]] & ctype_word
) != 0);
3454 BOOL cur_is_word
= (eptr
< md
->end_subject
) &&
3455 ((pcre_ctypes
[*eptr
] & ctype_word
) != 0);
3456 if ((*ecode
++ == OP_WORD_BOUNDARY
)?
3457 cur_is_word
== prev_is_word
: cur_is_word
!= prev_is_word
)
3462 case OP_NOT_WORD_BOUNDARY_L
:
3463 case OP_WORD_BOUNDARY_L
:
3465 BOOL prev_is_word
= (eptr
!= md
->start_subject
) &&
3466 (isalnum(eptr
[-1]) || eptr
[-1]=='_');
3467 BOOL cur_is_word
= (eptr
< md
->end_subject
) &&
3468 (isalnum(*eptr
) || *eptr
=='_');
3469 if ((*ecode
++ == OP_WORD_BOUNDARY_L
)?
3470 cur_is_word
== prev_is_word
: cur_is_word
!= prev_is_word
)
3476 /* Match a single character type; inline for speed */
3479 if (!md
->dotall
&& eptr
< md
->end_subject
&& *eptr
== '\n') FAIL
;
3480 if (eptr
++ >= md
->end_subject
) FAIL
;
3485 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_digit
) != 0)
3491 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_digit
) == 0)
3496 case OP_NOT_WHITESPACE
:
3497 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_space
) != 0)
3503 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_space
) == 0)
3508 case OP_NOT_WORDCHAR
:
3509 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_word
) != 0)
3515 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_word
) == 0)
3520 case OP_NOT_WORDCHAR_L
:
3521 if (eptr
>= md
->end_subject
|| (*eptr
=='_' || isalnum(*eptr
) ))
3528 if (eptr
>= md
->end_subject
|| (*eptr
!='_' && !isalnum(*eptr
) ))
3534 /* Match a back reference, possibly repeatedly. Look past the end of the
3535 item to see if there is repeat information following. The code is similar
3536 to that for character classes, but repeated for efficiency. Then obey
3537 similar code to character type repeats - written out again for speed.
3538 However, if the referenced string is the empty string, always treat
3539 it as matched, any number of times (otherwise there could be infinite
3545 int number
= ecode
[1] << 1; /* Doubled reference number */
3546 ecode
+= 2; /* Advance past the item */
3548 if (number
>= offset_top
|| md
->offset_vector
[number
] < 0)
3550 md
->errorcode
= PCRE_ERROR_BADREF
;
3554 length
= md
->offset_vector
[number
+1] - md
->offset_vector
[number
];
3564 c
= *ecode
++ - OP_CRSTAR
;
3565 minimize
= (c
& 1) != 0;
3566 min
= rep_min
[c
]; /* Pick up values from tables; */
3567 max
= rep_max
[c
]; /* zero for max => infinity */
3568 if (max
== 0) max
= INT_MAX
;
3573 minimize
= (*ecode
== OP_CRMINRANGE
);
3574 min
= (ecode
[1] << 8) + ecode
[2];
3575 max
= (ecode
[3] << 8) + ecode
[4];
3576 if (max
== 0) max
= INT_MAX
;
3580 default: /* No repeat follows */
3581 if (!match_ref(number
, eptr
, length
, md
)) FAIL
;
3583 continue; /* With the main loop */
3586 /* If the length of the reference is zero, just continue with the
3589 if (length
== 0) continue;
3591 /* First, ensure the minimum number of matches are present. We get back
3592 the length of the reference string explicitly rather than passing the
3593 address of eptr, so that eptr can be a register variable. */
3595 for (i
= 1; i
<= min
; i
++)
3597 if (!match_ref(number
, eptr
, length
, md
)) FAIL
;
3601 /* If min = max, continue at the same level without recursion.
3602 They are not both allowed to be zero. */
3604 if (min
== max
) continue;
3606 /* If minimizing, keep trying and advancing the pointer */
3612 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
3613 if (i
>= max
|| !match_ref(number
, eptr
, length
, md
))
3617 /* Control never gets here */
3620 /* If maximizing, find the longest string and work backwards */
3624 const uschar
*pp
= eptr
;
3625 for (i
= min
; i
< max
; i
++)
3627 if (!match_ref(number
, eptr
, length
, md
)) break;
3632 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
3638 /* Control never gets here */
3640 /* Match a character class, possibly repeatedly. Look past the end of the
3641 item to see if there is repeat information following. Then obey similar
3642 code to character type repeats - written out again for speed. If caseless
3643 matching was set at runtime but not at compile time, we have to check both
3644 versions of a character, and we have to behave differently for positive and
3645 negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
3646 treated differently. */
3651 BOOL nasty_case
= *ecode
== OP_NEGCLASS
&& md
->runtime_caseless
;
3652 const uschar
*data
= ecode
+ 1; /* Save for matching */
3653 ecode
+= 33; /* Advance past the item */
3663 c
= *ecode
++ - OP_CRSTAR
;
3664 minimize
= (c
& 1) != 0;
3665 min
= rep_min
[c
]; /* Pick up values from tables; */
3666 max
= rep_max
[c
]; /* zero for max => infinity */
3667 if (max
== 0) max
= INT_MAX
;
3672 minimize
= (*ecode
== OP_CRMINRANGE
);
3673 min
= (ecode
[1] << 8) + ecode
[2];
3674 max
= (ecode
[3] << 8) + ecode
[4];
3675 if (max
== 0) max
= INT_MAX
;
3679 default: /* No repeat follows */
3684 /* First, ensure the minimum number of matches are present. */
3686 for (i
= 1; i
<= min
; i
++)
3688 if (eptr
>= md
->end_subject
) FAIL
;
3691 /* Either not runtime caseless, or it was a positive class. For
3692 runtime caseless, continue if either case is in the map. */
3696 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3697 if (md
->runtime_caseless
)
3700 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3704 /* Runtime caseless and it was a negative class. Continue only if
3705 both cases are in the map. */
3709 if ((data
[c
/8] & (1 << (c
&7))) == 0) FAIL
;
3711 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3717 /* If max == min we can continue with the main loop without the
3720 if (min
== max
) continue;
3722 /* If minimizing, keep testing the rest of the expression and advancing
3723 the pointer while it matches the class. */
3729 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
3730 if (i
>= max
|| eptr
>= md
->end_subject
) FAIL
;
3733 /* Either not runtime caseless, or it was a positive class. For
3734 runtime caseless, continue if either case is in the map. */
3738 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3739 if (md
->runtime_caseless
)
3742 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3746 /* Runtime caseless and it was a negative class. Continue only if
3747 both cases are in the map. */
3751 if ((data
[c
/8] & (1 << (c
&7))) == 0) return FALSE
;
3753 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3758 /* Control never gets here */
3761 /* If maximizing, find the longest possible run, then work backwards. */
3765 const uschar
*pp
= eptr
;
3766 for (i
= min
; i
< max
; eptr
++, i
++)
3768 if (eptr
>= md
->end_subject
) break;
3771 /* Either not runtime caseless, or it was a positive class. For
3772 runtime caseless, continue if either case is in the map. */
3776 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3777 if (md
->runtime_caseless
)
3780 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3784 /* Runtime caseless and it was a negative class. Continue only if
3785 both cases are in the map. */
3789 if ((data
[c
/8] & (1 << (c
&7))) == 0) break;
3791 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3798 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
3802 /* Control never gets here */
3804 /* OP_CLASS_L opcode: handles localized character classes */
3808 const uschar
*data
= ecode
+ 1; /* Save for matching */
3809 const uschar locale_flag
= *data
;
3810 ecode
++; data
++; /* The localization support adds an extra byte */
3812 ecode
+= 33; /* Advance past the item */
3822 c
= *ecode
++ - OP_CRSTAR
;
3823 minimize
= (c
& 1) != 0;
3824 min
= rep_min
[c
]; /* Pick up values from tables; */
3825 max
= rep_max
[c
]; /* zero for max => infinity */
3826 if (max
== 0) max
= INT_MAX
;
3831 minimize
= (*ecode
== OP_CRMINRANGE
);
3832 min
= (ecode
[1] << 8) + ecode
[2];
3833 max
= (ecode
[3] << 8) + ecode
[4];
3834 if (max
== 0) max
= INT_MAX
;
3838 default: /* No repeat follows */
3839 if (eptr
>= md
->end_subject
) FAIL
;
3841 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue; /* With main loop */
3842 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3843 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3845 if ( (locale_flag
& 4) && isdigit(c
) ) continue; /* Locale \d */
3846 if ( (locale_flag
& 8) && !isdigit(c
) ) continue; /* Locale \D */
3847 if ( (locale_flag
& 16) && isspace(c
) ) continue; /* Locale \s */
3848 if ( (locale_flag
& 32) && !isspace(c
) ) continue; /* Locale \S */
3851 if (md
->runtime_caseless
)
3854 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue; /* With main loop */
3856 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3857 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3862 /* First, ensure the minimum number of matches are present. */
3864 for (i
= 1; i
<= min
; i
++)
3866 if (eptr
>= md
->end_subject
) FAIL
;
3868 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3869 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3870 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3872 if (md
->runtime_caseless
)
3875 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3876 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3877 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3882 /* If max == min we can continue with the main loop without the
3885 if (min
== max
) continue;
3887 /* If minimizing, keep testing the rest of the expression and advancing
3888 the pointer while it matches the class. */
3894 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
3895 if (i
>= max
|| eptr
>= md
->end_subject
) FAIL
;
3897 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3898 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3899 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3901 if (md
->runtime_caseless
)
3904 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3905 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3906 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3910 /* Control never gets here */
3913 /* If maximizing, find the longest possible run, then work backwards. */
3917 const uschar
*pp
= eptr
;
3918 for (i
= min
; i
< max
; eptr
++, i
++)
3920 if (eptr
>= md
->end_subject
) break;
3922 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3923 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3924 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3925 if (md
->runtime_caseless
)
3928 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3929 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3930 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3936 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
3940 /* Control never gets here */
3942 /* Match a run of characters */
3946 register int length
= ecode
[1];
3949 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3950 if (eptr
>= md
->end_subject
)
3951 printf("matching subject <null> against pattern ");
3954 printf("matching subject ");
3955 pchars(eptr
, length
, TRUE
, md
);
3956 printf(" against pattern ");
3958 pchars(ecode
, length
, FALSE
, md
);
3962 if (length
> md
->end_subject
- eptr
) FAIL
;
3965 while (length
-- > 0) if (pcre_lcc
[*ecode
++] != pcre_lcc
[*eptr
++]) FAIL
;
3969 while (length
-- > 0) if (*ecode
++ != *eptr
++) FAIL
;
3974 /* Match a single character repeatedly; different opcodes share code. */
3977 min
= max
= (ecode
[1] << 8) + ecode
[2];
3984 max
= (ecode
[1] << 8) + ecode
[2];
3985 minimize
= *ecode
== OP_MINUPTO
;
3995 c
= *ecode
++ - OP_STAR
;
3996 minimize
= (c
& 1) != 0;
3997 min
= rep_min
[c
]; /* Pick up values from tables; */
3998 max
= rep_max
[c
]; /* zero for max => infinity */
3999 if (max
== 0) max
= INT_MAX
;
4001 /* Common code for all repeated single-character matches. We can give
4002 up quickly if there are fewer than the minimum number of characters left in
4006 if (min
> md
->end_subject
- eptr
) FAIL
;
4009 /* The code is duplicated for the caseless and caseful cases, for speed,
4010 since matching characters is likely to be quite common. First, ensure the
4011 minimum number of matches are present. If min = max, continue at the same
4012 level without recursing. Otherwise, if minimizing, keep trying the rest of
4013 the expression and advancing one matching character if failing, up to the
4014 maximum. Alternatively, if maximizing, find the maximum number of
4015 characters and work backwards. */
4017 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c
, min
, max
,
4023 for (i
= 1; i
<= min
; i
++) if (c
!= pcre_lcc
[*eptr
++]) FAIL
;
4024 if (min
== max
) continue;
4029 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
4030 if (i
>= max
|| eptr
>= md
->end_subject
|| c
!= pcre_lcc
[*eptr
++])
4033 /* Control never gets here */
4037 const uschar
*pp
= eptr
;
4038 for (i
= min
; i
< max
; i
++)
4040 if (eptr
>= md
->end_subject
|| c
!= pcre_lcc
[*eptr
]) break;
4044 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
4047 /* Control never gets here */
4050 /* Caseful comparisons */
4054 for (i
= 1; i
<= min
; i
++) if (c
!= *eptr
++) FAIL
;
4055 if (min
== max
) continue;
4060 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
4061 if (i
>= max
|| eptr
>= md
->end_subject
|| c
!= *eptr
++) FAIL
;
4063 /* Control never gets here */
4067 const uschar
*pp
= eptr
;
4068 for (i
= min
; i
< max
; i
++)
4070 if (eptr
>= md
->end_subject
|| c
!= *eptr
) break;
4074 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
4078 /* Control never gets here */
4080 /* Match a negated single character */
4083 if (eptr
>= md
->end_subject
) FAIL
;
4087 if (pcre_lcc
[*ecode
++] == pcre_lcc
[*eptr
++]) FAIL
;
4091 if (*ecode
++ == *eptr
++) FAIL
;
4095 /* Match a negated single character repeatedly. This is almost a repeat of
4096 the code for a repeated single character, but I haven't found a nice way of
4097 commoning these up that doesn't require a test of the positive/negative
4098 option for each character match. Maybe that wouldn't add very much to the
4099 time taken, but character matching *is* what this is all about... */
4102 min
= max
= (ecode
[1] << 8) + ecode
[2];
4109 max
= (ecode
[1] << 8) + ecode
[2];
4110 minimize
= *ecode
== OP_NOTMINUPTO
;
4119 case OP_NOTMINQUERY
:
4120 c
= *ecode
++ - OP_NOTSTAR
;
4121 minimize
= (c
& 1) != 0;
4122 min
= rep_min
[c
]; /* Pick up values from tables; */
4123 max
= rep_max
[c
]; /* zero for max => infinity */
4124 if (max
== 0) max
= INT_MAX
;
4126 /* Common code for all repeated single-character matches. We can give
4127 up quickly if there are fewer than the minimum number of characters left in
4131 if (min
> md
->end_subject
- eptr
) FAIL
;
4134 /* The code is duplicated for the caseless and caseful cases, for speed,
4135 since matching characters is likely to be quite common. First, ensure the
4136 minimum number of matches are present. If min = max, continue at the same
4137 level without recursing. Otherwise, if minimizing, keep trying the rest of
4138 the expression and advancing one matching character if failing, up to the
4139 maximum. Alternatively, if maximizing, find the maximum number of
4140 characters and work backwards. */
4142 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c
, min
, max
,
4148 for (i
= 1; i
<= min
; i
++) if (c
== pcre_lcc
[*eptr
++]) FAIL
;
4149 if (min
== max
) continue;
4154 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
4155 if (i
>= max
|| eptr
>= md
->end_subject
|| c
== pcre_lcc
[*eptr
++])
4158 /* Control never gets here */
4162 const uschar
*pp
= eptr
;
4163 for (i
= min
; i
< max
; i
++)
4165 if (eptr
>= md
->end_subject
|| c
== pcre_lcc
[*eptr
]) break;
4169 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
4172 /* Control never gets here */
4175 /* Caseful comparisons */
4179 for (i
= 1; i
<= min
; i
++) if (c
== *eptr
++) FAIL
;
4180 if (min
== max
) continue;
4185 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
4186 if (i
>= max
|| eptr
>= md
->end_subject
|| c
== *eptr
++) FAIL
;
4188 /* Control never gets here */
4192 const uschar
*pp
= eptr
;
4193 for (i
= min
; i
< max
; i
++)
4195 if (eptr
>= md
->end_subject
|| c
== *eptr
) break;
4199 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
4203 /* Control never gets here */
4205 /* Match a single character type repeatedly; several different opcodes
4206 share code. This is very similar to the code for single characters, but we
4207 repeat it in the interests of efficiency. */
4210 min
= max
= (ecode
[1] << 8) + ecode
[2];
4216 case OP_TYPEMINUPTO
:
4218 max
= (ecode
[1] << 8) + ecode
[2];
4219 minimize
= *ecode
== OP_TYPEMINUPTO
;
4224 case OP_TYPEMINSTAR
:
4226 case OP_TYPEMINPLUS
:
4228 case OP_TYPEMINQUERY
:
4229 c
= *ecode
++ - OP_TYPESTAR
;
4230 minimize
= (c
& 1) != 0;
4231 min
= rep_min
[c
]; /* Pick up values from tables; */
4232 max
= rep_max
[c
]; /* zero for max => infinity */
4233 if (max
== 0) max
= INT_MAX
;
4235 /* Common code for all repeated single character type matches */
4238 ctype
= *ecode
++; /* Code for the character type */
4240 /* First, ensure the minimum number of matches are present. Use inline
4241 code for maximizing the speed, and do the type test once at the start
4242 (i.e. keep it out of the loop). Also test that there are at least the
4243 minimum number of characters before we start. */
4245 if (min
> md
->end_subject
- eptr
) FAIL
;
4246 if (min
> 0) switch(ctype
)
4250 { for (i
= 1; i
<= min
; i
++) if (*eptr
++ == '\n') FAIL
; }
4255 for (i
= 1; i
<= min
; i
++)
4256 if ((pcre_ctypes
[*eptr
++] & ctype_digit
) != 0) FAIL
;
4260 for (i
= 1; i
<= min
; i
++)
4261 if ((pcre_ctypes
[*eptr
++] & ctype_digit
) == 0) FAIL
;
4264 case OP_NOT_WHITESPACE
:
4265 for (i
= 1; i
<= min
; i
++)
4266 if ((pcre_ctypes
[*eptr
++] & ctype_space
) != 0) FAIL
;
4270 for (i
= 1; i
<= min
; i
++)
4271 if ((pcre_ctypes
[*eptr
++] & ctype_space
) == 0) FAIL
;
4274 case OP_NOT_WORDCHAR
:
4275 for (i
= 1; i
<= min
; i
++) if ((pcre_ctypes
[*eptr
++] & ctype_word
) != 0)
4280 for (i
= 1; i
<= min
; i
++) if ((pcre_ctypes
[*eptr
++] & ctype_word
) == 0)
4284 case OP_NOT_WORDCHAR_L
:
4285 for (i
= 1; i
<= min
; i
++, eptr
++) if (*eptr
=='_' || isalnum(*eptr
))
4290 for (i
= 1; i
<= min
; i
++, eptr
++) if (*eptr
!='_' && !isalnum(*eptr
))
4295 /* If min = max, continue at the same level without recursing */
4297 if (min
== max
) continue;
4299 /* If minimizing, we have to test the rest of the pattern before each
4300 subsequent match, so inlining isn't much help; just use the function. */
4306 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
4307 if (i
>= max
|| eptr
>= md
->end_subject
||
4308 !match_type(ctype
, *eptr
++, md
->dotall
))
4311 /* Control never gets here */
4314 /* If maximizing it is worth using inline code for speed, doing the type
4315 test once at the start (i.e. keep it out of the loop). */
4319 const uschar
*pp
= eptr
;
4325 for (i
= min
; i
< max
; i
++)
4327 if (eptr
>= md
->end_subject
|| *eptr
== '\n') break;
4334 if (c
> md
->end_subject
- eptr
) c
= md
->end_subject
- eptr
;
4340 for (i
= min
; i
< max
; i
++)
4342 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_digit
) != 0)
4349 for (i
= min
; i
< max
; i
++)
4351 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_digit
) == 0)
4357 case OP_NOT_WHITESPACE
:
4358 for (i
= min
; i
< max
; i
++)
4360 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_space
) != 0)
4367 for (i
= min
; i
< max
; i
++)
4369 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_space
) == 0)
4375 case OP_NOT_WORDCHAR
:
4376 for (i
= min
; i
< max
; i
++)
4378 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_word
) != 0)
4385 for (i
= min
; i
< max
; i
++)
4387 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_word
) == 0)
4392 case OP_NOT_WORDCHAR_L
:
4393 for (i
= min
; i
< max
; i
++)
4395 if (eptr
>= md
->end_subject
|| (*eptr
=='_' || isalnum(*eptr
) ) )
4402 for (i
= min
; i
< max
; i
++)
4404 if (eptr
>= md
->end_subject
|| (*eptr
!='_' && !isalnum(*eptr
) ) )
4412 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
4415 /* Control never gets here */
4417 /* There's been some horrible disaster. */
4420 DPRINTF(("Unknown opcode %d\n", *ecode
));
4421 md
->errorcode
= PCRE_ERROR_UNKNOWN_NODE
;
4425 /* Do not stick any code in here without much thought; it is assumed
4426 that "continue" in the code above comes out to here to repeat the main
4429 } /* End of main loop */
4430 /* Control never reaches here */
4433 if (md
->point
> save_stack_position
)
4435 /* If there are still points remaining on the stack, pop the next one off */
4439 offset_top
= md
->offset_top
[md
->point
];
4440 eptr
= md
->eptr
[md
->point
];
4441 ecode
= md
->ecode
[md
->point
];
4442 off_num
= md
->off_num
[md
->point
];
4443 md
->offset_vector
[off_num
] = md
->r1
[md
->point
];
4444 md
->offset_vector
[off_num
+1] = md
->r2
[md
->point
];
4447 /* Failure, and nothing left on the stack, so end this function call */
4449 /* Restore the top of the stack to where it was before this function
4450 call. This lets us use one stack for everything; recursive calls
4451 can push and pop information, and may increase the stack. When
4452 the call returns, the parent function can resume pushing and
4453 popping wherever it was. */
4455 md
->point
= save_stack_position
;
4464 /*************************************************
4465 * Segregate setjmp() *
4466 *************************************************/
4468 /* The -Wall option of gcc gives warnings for all local variables when setjmp()
4469 is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
4470 hide it in a separate function. This is called only when PCRE_EXTRA is set,
4471 since it's needed only for the extension \X option, and with any luck, a good
4472 compiler will spot the tail recursion and compile it efficiently.
Arguments:
4475 eptr pointer in subject
4476 ecode position in code
4477 offset_top current top pointer
4478 match_block pointer to "static" info for the match
4480 Returns: TRUE if matched
*/
/* Arm a setjmp() target and then run the matcher.
setjmp() is called on match_block->fail_env. When it returns 0 (the direct
call), match() is invoked with the same eptr, ecode, offset_top and
match_block, and its result becomes the result of this function. When
setjmp() returns non-zero, i.e. control arrived via a longjmp() to fail_env,
the && short-circuits: match() is not called and the result is 0. */
4484 match_with_setjmp(const uschar
*eptr
, const uschar
*ecode
, int offset_top
,
4485 match_data
*match_block
)
/* setjmp() == 0 only on the initial call; a longjmp() landing makes the
whole expression false without re-entering match(). */
4487 return setjmp(match_block
->fail_env
) == 0 &&
4488 match(eptr
, ecode
, offset_top
, match_block
);
4493 /*************************************************
4494 * Execute a Regular Expression *
4495 *************************************************/
4497 /* This function applies a compiled re to a subject string and picks out
4498 portions of the string if it matches. Two elements in the vector are set for
4499 each substring: the offsets to the start and end of the substring.
Arguments:
4502 external_re points to the compiled expression
4503 external_extra points to "hints" from pcre_study() or is NULL
4504 subject points to the subject string
4505 length length of subject string (may contain binary zeros)
start_pos offset into subject at which the first match attempt starts
options option bits; any bit outside PUBLIC_EXEC_OPTIONS is rejected
4507 offsets points to a vector of ints to be filled in with offsets
4508 offsetcount the number of elements in the vector
4510 Returns: > 0 => success; value is the number of elements filled in
4511 = 0 => success, but offsets is not big enough
4512 -1 => failed to match
4513 < -1 => some kind of unexpected problem
4517 pcre_exec(const pcre
*external_re
, const pcre_extra
*external_extra
,
4518 const char *subject
, int length
, int start_pos
, int options
,
4519 int *offsets
, int offsetcount
)
4521 /* The "volatile" directives are to make gcc -Wall stop complaining
4522 that these variables can be clobbered by the longjmp. Hopefully
4523 they won't cost too much performance. */
4524 volatile int resetcount
, ocount
;
4525 volatile int first_char
= -1;
4526 const uschar
* volatile start_bits
= NULL
;
4527 const uschar
* volatile start_match
= (const uschar
*)subject
+ start_pos
;
4528 match_data match_block
;
4529 const uschar
*end_subject
;
4530 const real_pcre
*re
= (const real_pcre
*)external_re
;
4531 const real_pcre_extra
*extra
= (const real_pcre_extra
*)external_extra
;
4532 volatile BOOL using_temporary_offsets
= FALSE
;
4533 volatile BOOL anchored
= ((re
->options
| options
) & PCRE_ANCHORED
) != 0;
4534 volatile BOOL startline
= (re
->options
& PCRE_STARTLINE
) != 0;
4536 if ((options
& ~PUBLIC_EXEC_OPTIONS
) != 0) return PCRE_ERROR_BADOPTION
;
4538 if (re
== NULL
|| subject
== NULL
||
4539 (offsets
== NULL
&& offsetcount
> 0)) return PCRE_ERROR_NULL
;
4540 if (re
->magic_number
!= MAGIC_NUMBER
) return PCRE_ERROR_BADMAGIC
;
4542 match_block
.start_subject
= (const uschar
*)subject
;
4543 match_block
.end_subject
= match_block
.start_subject
+ length
;
4544 end_subject
= match_block
.end_subject
;
4546 match_block
.caseless
= ((re
->options
| options
) & PCRE_CASELESS
) != 0;
4547 match_block
.runtime_caseless
= match_block
.caseless
&&
4548 (re
->options
& PCRE_CASELESS
) == 0;
4550 match_block
.multiline
= ((re
->options
| options
) & PCRE_MULTILINE
) != 0;
4551 match_block
.dotall
= ((re
->options
| options
) & PCRE_DOTALL
) != 0;
4552 match_block
.endonly
= ((re
->options
| options
) & PCRE_DOLLAR_ENDONLY
) != 0;
4554 match_block
.notbol
= (options
& PCRE_NOTBOL
) != 0;
4555 match_block
.noteol
= (options
& PCRE_NOTEOL
) != 0;
4557 match_block
.errorcode
= PCRE_ERROR_NOMATCH
; /* Default error */
4559 /* Set the stack state to empty */
4560 match_block
.off_num
= match_block
.offset_top
= NULL
;
4561 match_block
.r1
= match_block
.r2
= NULL
;
4562 match_block
.eptr
= match_block
.ecode
= NULL
;
4563 match_block
.point
= match_block
.length
= 0;
4565 /* If the expression has got more back references than the offsets supplied can
4566 hold, we get a temporary bit of working store to use during the matching.
4567 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4570 ocount
= offsetcount
& (-2);
4571 if (re
->top_backref
> 0 && re
->top_backref
>= ocount
/2)
4573 ocount
= re
->top_backref
* 2 + 2;
4574 match_block
.offset_vector
= (int *)(pcre_malloc
)(ocount
* sizeof(int));
4575 if (match_block
.offset_vector
== NULL
) return PCRE_ERROR_NOMEMORY
;
4576 using_temporary_offsets
= TRUE
;
4577 DPRINTF(("Got memory to hold back references\n"));
4579 else match_block
.offset_vector
= offsets
;
4581 match_block
.offset_end
= ocount
;
4582 match_block
.offset_overflow
= FALSE
;
4584 /* Compute the minimum number of offsets that we need to reset each time. Doing
4585 this makes a huge difference to execution time when there aren't many brackets
4588 resetcount
= 2 + re
->top_bracket
* 2;
4589 if (resetcount
> offsetcount
) resetcount
= ocount
;
4591 /* If MULTILINE is set at exec time but was not set at compile time, and the
4592 anchored flag is set, we must re-check because a setting provoked by ^ in the
4593 pattern is not right in multi-line mode. Calling is_anchored() again here does
4594 the right check, because multiline is now set. If it now yields FALSE, the
4595 expression must have had ^ starting some of its branches. Check to see if
4596 that is true for *all* branches, and if so, set the startline flag. */
4598 if (match_block
.multiline
&& anchored
&& (re
->options
& PCRE_MULTILINE
) == 0 &&
4599 !is_anchored(re
->code
, match_block
.multiline
))
4602 if (is_startline(re
->code
)) startline
= TRUE
;
4605 /* Set up the first character to match, if available. The first_char value is
4606 never set for an anchored regular expression, but the anchoring may be forced
4607 at run time, so we have to test for anchoring. The first char may be unset for
4608 an unanchored pattern, of course. If there's no first char and the pattern was
4609 studied, there may be a bitmap of possible first characters. However, we can
4610 use this only if the caseless state of the studying was correct. */
4614 if ((re
->options
& PCRE_FIRSTSET
) != 0)
4616 first_char
= re
->first_char
;
4617 if (match_block
.caseless
) first_char
= pcre_lcc
[first_char
];
4620 if (!startline
&& extra
!= NULL
&&
4621 (extra
->options
& PCRE_STUDY_MAPPED
) != 0 &&
4622 ((extra
->options
& PCRE_STUDY_CASELESS
) != 0) == match_block
.caseless
)
4623 start_bits
= extra
->start_bits
;
4626 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4631 register int *iptr
= match_block
.offset_vector
;
4632 register int *iend
= iptr
+ resetcount
;
4634 /* Reset the maximum number of extractions we might see. */
4636 while (iptr
< iend
) *iptr
++ = -1;
4638 /* Advance to a unique first char if possible */
4640 if (first_char
>= 0)
4642 if (match_block
.caseless
)
4643 while (start_match
< end_subject
&& pcre_lcc
[*start_match
] != first_char
)
4646 while (start_match
< end_subject
&& *start_match
!= first_char
)
4650 /* Or to just after \n for a multiline match if possible */
4654 if (start_match
> match_block
.start_subject
)
4656 while (start_match
< end_subject
&& start_match
[-1] != '\n')
4661 /* Or to a non-unique first char */
4663 else if (start_bits
!= NULL
)
4665 while (start_match
< end_subject
)
4667 register int c
= *start_match
;
4668 if ((start_bits
[c
/8] & (1 << (c
&7))) == 0) start_match
++; else break;
4672 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4673 printf(">>>> Match against: ");
4674 pchars(start_match
, end_subject
- start_match
, TRUE
, &match_block
);
4678 /* When a match occurs, substrings will be set for all internal extractions;
4679 we just need to set up the whole thing as substring 0 before returning. If
4680 there were too many extractions, set the return code to zero. In the case
4681 where we had to get some local store to hold offsets for backreferences, copy
4682 those back references that we can. In this case there need not be overflow
4683 if certain parts of the pattern were not used.
4685 Before starting the match, we have to set up a longjmp() target to enable
4686 the "cut" operation to fail a match completely without backtracking. This
4687 is done in a separate function to avoid compiler warnings. We need not do
4688 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
4691 /* To handle errors such as running out of memory for the failure
4692 stack, we need to save this location via setjmp(), so
4693 error-handling code can call longjmp() to jump out of deeply-nested code. */
4694 if (setjmp(match_block
.error_env
)==0)
4697 if ((re
->options
& PCRE_EXTRA
) != 0)
4699 if (!match_with_setjmp(start_match
, re
->code
, 2, &match_block
))
4702 else if (!match(start_match
, re
->code
, 2, &match_block
)) continue;
4704 /* Copy the offset information from temporary store if necessary */
4706 if (using_temporary_offsets
)
4708 if (offsetcount
>= 4)
4710 memcpy(offsets
+ 2, match_block
.offset_vector
+ 2,
4711 (offsetcount
- 2) * sizeof(int));
4712 DPRINTF(("Copied offsets from temporary memory\n"));
4714 if (match_block
.end_offset_top
> offsetcount
)
4715 match_block
.offset_overflow
= TRUE
;
4717 DPRINTF(("Freeing temporary memory\n"));
4718 (pcre_free
)(match_block
.offset_vector
);
4721 rc
= match_block
.offset_overflow
? 0 : match_block
.end_offset_top
/2;
4723 if (match_block
.offset_end
< 2) rc
= 0; else
4725 offsets
[0] = start_match
- match_block
.start_subject
;
4726 offsets
[1] = match_block
.end_match_ptr
- match_block
.start_subject
;
4729 DPRINTF((">>>> returning %d\n", rc
));
4730 free_stack(&match_block
);
4732 } /* End of (if setjmp(match_block.error_env)...) */
4733 free_stack(&match_block
);
4735 /* Return an error code; pcremodule.c will preserve the exception */
4736 if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY
;
4739 match_block
.errorcode
== PCRE_ERROR_NOMATCH
&&
4740 start_match
++ < end_subject
);
4742 if (using_temporary_offsets
)
4744 DPRINTF(("Freeing temporary memory\n"));
4745 (pcre_free
)(match_block
.offset_vector
);
4749 printf(">>>> returning %d\n", match_block
.errorcode
);
4752 free_stack(&match_block
);
4753 return match_block
.errorcode
;