2 /*************************************************
3 * Perl-Compatible Regular Expressions *
4 *************************************************/
6 /* DO NOT EDIT THIS FILE! */
8 /* This file is automatically written by the merge-files.py script
9 included with the PCRE distribution for Python; it's produced from
10 several C files, and code is removed in the process. If you want to
11 modify the code or track down bugs, it will be much easier to work
12 with the code in its original, multiple-file form. Don't edit this
13 file by hand, or submit patches to it.
15 The Python-specific PCRE distribution can be retrieved from
16 http://starship.skyport.net/crew/amk/regex/
18 The unmodified original PCRE distribution is available at
19 ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
20 written by: Philip Hazel <ph10@cam.ac.uk>
22 Extensively modified by the Python String-SIG: <string-sig@python.org>
23 Send bug reports to: <string-sig@python.org>
24 (They'll figure out if it's a bug in PCRE or in the Python-specific
27 Copyright (c) 1997 University of Cambridge
29 -----------------------------------------------------------------------------
30 Permission is granted to anyone to use this software for any purpose on any
31 computer system, and to redistribute it freely, subject to the following
34 1. This software is distributed in the hope that it will be useful,
35 but WITHOUT ANY WARRANTY; without even the implied warranty of
36 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
38 2. The origin of this software must not be misrepresented, either by
39 explicit claim or by omission.
41 3. Altered versions must be plainly marked as such, and must not be
42 misrepresented as being the original software.
43 -----------------------------------------------------------------------------
54 /*************************************************
55 * Perl-Compatible Regular Expressions *
56 *************************************************/
58 /* This file is automatically written by the makechartables auxiliary
59 program. If you edit it by hand, you might like to edit the Makefile to
60 prevent its ever being regenerated. */
/* Lower-casing table. Indexed by byte value (0-255): the upper-case ASCII
letters 'A'-'Z' (65-90) map to 'a'-'z' (97-122); every other value maps to
itself. */

unsigned char pcre_lcc[] = {
    0,  1,  2,  3,  4,  5,  6,  7,
    8,  9, 10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23,
   24, 25, 26, 27, 28, 29, 30, 31,
   32, 33, 34, 35, 36, 37, 38, 39,
   40, 41, 42, 43, 44, 45, 46, 47,
   48, 49, 50, 51, 52, 53, 54, 55,
   56, 57, 58, 59, 60, 61, 62, 63,
   64, 97, 98, 99,100,101,102,103,
  104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,
  120,121,122, 91, 92, 93, 94, 95,
   96, 97, 98, 99,100,101,102,103,
  104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,
  120,121,122,123,124,125,126,127,
  128,129,130,131,132,133,134,135,
  136,137,138,139,140,141,142,143,
  144,145,146,147,148,149,150,151,
  152,153,154,155,156,157,158,159,
  160,161,162,163,164,165,166,167,
  168,169,170,171,172,173,174,175,
  176,177,178,179,180,181,182,183,
  184,185,186,187,188,189,190,191,
  192,193,194,195,196,197,198,199,
  200,201,202,203,204,205,206,207,
  208,209,210,211,212,213,214,215,
  216,217,218,219,220,221,222,223,
  224,225,226,227,228,229,230,231,
  232,233,234,235,236,237,238,239,
  240,241,242,243,244,245,246,247,
  248,249,250,251,252,253,254,255 };
/* Case-flipping table. Indexed by byte value (0-255): 'A'-'Z' map to
'a'-'z' and 'a'-'z' map to 'A'-'Z'; every other value maps to itself. */

unsigned char pcre_fcc[] = {
    0,  1,  2,  3,  4,  5,  6,  7,
    8,  9, 10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23,
   24, 25, 26, 27, 28, 29, 30, 31,
   32, 33, 34, 35, 36, 37, 38, 39,
   40, 41, 42, 43, 44, 45, 46, 47,
   48, 49, 50, 51, 52, 53, 54, 55,
   56, 57, 58, 59, 60, 61, 62, 63,
   64, 97, 98, 99,100,101,102,103,
  104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,
  120,121,122, 91, 92, 93, 94, 95,
   96, 65, 66, 67, 68, 69, 70, 71,
   72, 73, 74, 75, 76, 77, 78, 79,
   80, 81, 82, 83, 84, 85, 86, 87,
   88, 89, 90,123,124,125,126,127,
  128,129,130,131,132,133,134,135,
  136,137,138,139,140,141,142,143,
  144,145,146,147,148,149,150,151,
  152,153,154,155,156,157,158,159,
  160,161,162,163,164,165,166,167,
  168,169,170,171,172,173,174,175,
  176,177,178,179,180,181,182,183,
  184,185,186,187,188,189,190,191,
  192,193,194,195,196,197,198,199,
  200,201,202,203,204,205,206,207,
  208,209,210,211,212,213,214,215,
  216,217,218,219,220,221,222,223,
  224,225,226,227,228,229,230,231,
  232,233,234,235,236,237,238,239,
  240,241,242,243,244,245,246,247,
  248,249,250,251,252,253,254,255 };
/* This table contains bit maps for digits, letters, 'word' chars, and
white space, in that order. Each map is 32 bytes long (one bit per byte
value 0-255) and the bits run from the least significant end of each byte,
so character c is in a map when (map[c/8] & (1 << (c&7))) is non-zero. */

unsigned char pcre_cbits[] = {
  /* Digits: '0'-'9' */
  0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,

  /* Letters: 'A'-'Z' and 'a'-'z' */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,

  /* 'Word' characters: digits, letters and '_' */
  0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
  0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,

  /* White space: values 9-13 and 32 (space) */
  0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
/* This table identifies various classes of character by individual bits.
It is indexed by byte value (0-255); the bits used in the entries are:

  0x01   white space character
  0x02   letter
  0x04   decimal digit
  0x08   hexadecimal digit
  0x10   alphanumeric or '_'
  0x20   octal digit
  0x80   regular expression metacharacter or binary zero
*/

unsigned char pcre_ctypes[] = {
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /*    - '  */
  0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /*  ( - /  */
  0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /*  8 - ?  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  @ - G  */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  H - O  */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  P - W  */
  0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /*  X - _  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  ` - g  */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  h - o  */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  p - w  */
  0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
202 /* End of chartables.c */
203 /*************************************************
204 * Perl-Compatible Regular Expressions *
205 *************************************************/
208 This is a library of functions to support regular expressions whose syntax
209 and semantics are as close as possible to those of the Perl 5 language. See
210 the file Tech.Notes for some information on the internals.
212 Written by: Philip Hazel <ph10@cam.ac.uk>
214 Copyright (c) 1998 University of Cambridge
216 -----------------------------------------------------------------------------
217 Permission is granted to anyone to use this software for any purpose on any
218 computer system, and to redistribute it freely, subject to the following
221 1. This software is distributed in the hope that it will be useful,
222 but WITHOUT ANY WARRANTY; without even the implied warranty of
223 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
225 2. The origin of this software must not be misrepresented, either by
226 explicit claim or by omission.
228 3. Altered versions must be plainly marked as such, and must not be
229 misrepresented as being the original software.
230 -----------------------------------------------------------------------------
234 /* Include the internals header, which itself includes Standard C headers plus
235 the external pcre header. */
240 /*************************************************
241 * Create bitmap of starting chars *
242 *************************************************/
244 /* This function scans a compiled unanchored expression and attempts to build a
245 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
246 goes by, we may be able to get more clever at doing this.
249 code points to an expression
250 start_bits points to a 32-byte table, initialized to 0
252 Returns: TRUE if table built, FALSE otherwise
256 set_start_bits(const uschar
*code
, uschar
*start_bits
)
262 const uschar
*tcode
= code
+ 3;
263 BOOL try_next
= TRUE
;
269 if ((int)*tcode
>= OP_BRA
|| *tcode
== OP_ASSERT
)
271 if (!set_start_bits(tcode
, start_bits
)) return FALSE
;
279 /* BRAZERO does the bracket, but carries on. */
283 if (!set_start_bits(++tcode
, start_bits
)) return FALSE
;
284 do tcode
+= (tcode
[1] << 8) + tcode
[2]; while (*tcode
== OP_ALT
);
289 /* Single-char * or ? sets the bit and tries the next item */
295 start_bits
[tcode
[1]/8] |= (1 << (tcode
[1]&7));
300 /* Single-char upto sets the bit and tries the next */
304 start_bits
[tcode
[3]/8] |= (1 << (tcode
[3]&7));
309 /* At least one single char sets the bit and stops */
311 case OP_EXACT
: /* Fall through */
314 case OP_CHARS
: /* Fall through */
319 start_bits
[tcode
[1]/8] |= (1 << (tcode
[1]&7));
322 /* Single character type sets the bits and stops */
325 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= ~pcre_cbits
[c
+cbit_digit
];
329 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= pcre_cbits
[c
+cbit_digit
];
332 case OP_NOT_WHITESPACE
:
333 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= ~pcre_cbits
[c
+cbit_space
];
337 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= pcre_cbits
[c
+cbit_space
];
340 case OP_NOT_WORDCHAR
:
341 for (c
= 0; c
< 32; c
++)
342 start_bits
[c
] |= ~(pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
346 for (c
= 0; c
< 32; c
++)
347 start_bits
[c
] |= (pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
350 /* One or more character type fudges the pointer and restarts, knowing
351 it will hit a single character type and stop there. */
364 /* Zero or more repeats of character types set the bits and then
369 tcode
+= 2; /* Fall through */
374 case OP_TYPEMINQUERY
:
378 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= ~pcre_cbits
[c
+cbit_digit
];
382 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= pcre_cbits
[c
+cbit_digit
];
385 case OP_NOT_WHITESPACE
:
386 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= ~pcre_cbits
[c
+cbit_space
];
390 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= pcre_cbits
[c
+cbit_space
];
393 case OP_NOT_WORDCHAR
:
394 for (c
= 0; c
< 32; c
++)
395 start_bits
[c
] |= ~(pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
399 for (c
= 0; c
< 32; c
++)
400 start_bits
[c
] |= (pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
408 /* Character class: set the bits and either carry on or not,
409 according to the repeat count. */
415 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= tcode
[c
];
429 if (((tcode
[1] << 8) + tcode
[2]) == 0)
437 break; /* End of class handling */
439 } /* End of switch */
440 } /* End of try_next loop */
442 code
+= (code
[1] << 8) + code
[2]; /* Advance to next branch */
444 while (*code
== OP_ALT
);
450 /*************************************************
451 * Study a compiled expression *
452 *************************************************/
454 /* This function is handed a compiled expression that it must study to produce
455 information that will speed up the matching. It returns a pcre_extra block
456 which then gets handed back to pcre_exec().
459 re points to the compiled expression
460 options contains option bits
461 errorptr points to where to place error messages;
462 set NULL unless error
464 Returns: pointer to a pcre_extra block,
465 NULL on error or if no optimization possible
469 pcre_study(const pcre
*external_re
, int options
, const char **errorptr
)
472 uschar start_bits
[32];
473 real_pcre_extra
*extra
;
474 const real_pcre
*re
= (const real_pcre
*)external_re
;
478 if (re
== NULL
|| re
->magic_number
!= MAGIC_NUMBER
)
480 *errorptr
= "argument is not a compiled regular expression";
484 if ((options
& ~PUBLIC_STUDY_OPTIONS
) != 0)
486 *errorptr
= "unknown or incorrect option bit(s) set";
490 /* Caseless can either be from the compiled regex or from options. */
492 caseless
= ((re
->options
| options
) & PCRE_CASELESS
) != 0;
494 /* For an anchored pattern, or an unanchored pattern that has a first char, or a
495 multiline pattern that matches only at "line starts", no further processing at
498 if ((re
->options
& (PCRE_ANCHORED
|PCRE_FIRSTSET
|PCRE_STARTLINE
)) != 0)
501 /* See if we can find a fixed set of initial characters for the pattern. */
503 memset(start_bits
, 0, 32 * sizeof(uschar
));
504 if (!set_start_bits(re
->code
, start_bits
)) return NULL
;
506 /* If this studying is caseless, scan the created bit map and duplicate the
507 bits for any letters. */
512 for (c
= 0; c
< 256; c
++)
514 if ((start_bits
[c
/8] & (1 << (c
&7))) != 0 &&
515 (pcre_ctypes
[c
] & ctype_letter
) != 0)
518 start_bits
[d
/8] |= (1 << (d
&7));
523 /* Get an "extra" block and put the information therein. */
525 extra
= (real_pcre_extra
*)(pcre_malloc
)(sizeof(real_pcre_extra
));
529 *errorptr
= "failed to get memory";
533 extra
->options
= PCRE_STUDY_MAPPED
| (caseless
? PCRE_STUDY_CASELESS
: 0);
534 memcpy(extra
->start_bits
, start_bits
, sizeof(start_bits
));
536 return (pcre_extra
*)extra
;
540 /*************************************************
541 * Perl-Compatible Regular Expressions *
542 *************************************************/
545 This is a library of functions to support regular expressions whose syntax
546 and semantics are as close as possible to those of the Perl 5 language. See
547 the file Tech.Notes for some information on the internals.
549 Written by: Philip Hazel <ph10@cam.ac.uk>
551 Copyright (c) 1998 University of Cambridge
553 -----------------------------------------------------------------------------
554 Permission is granted to anyone to use this software for any purpose on any
555 computer system, and to redistribute it freely, subject to the following
558 1. This software is distributed in the hope that it will be useful,
559 but WITHOUT ANY WARRANTY; without even the implied warranty of
560 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
562 2. The origin of this software must not be misrepresented, either by
563 explicit claim or by omission.
565 3. Altered versions must be plainly marked as such, and must not be
566 misrepresented as being the original software.
567 -----------------------------------------------------------------------------
571 /* Define DEBUG to get debugging output on stdout. */
575 /* Use a macro for debugging printing, 'cause that eliminates the use
576 of #ifdef inline, and there are *still* stupid compilers about that don't like
577 indented pre-processor statements. I suppose it's only been 10 years... */
580 #define DPRINTF(p) printf p
582 #define DPRINTF(p) /*nothing*/
585 /* Include the internals header, which itself includes Standard C headers plus
586 the external pcre header. */
591 #ifndef Py_eval_input
592 /* For Python 1.4, graminit.h has to be explicitly included */
593 #define Py_eval_input eval_input
595 #endif /* FOR_PYTHON */
597 /* Allow compilation as C++ source code, should anybody want to do that. */
600 #define class pcre_class
/* Min and max values for the common repeats; for the maxima, 0 => infinity.
Both tables have six entries and are indexed in parallel (presumably in the
opcode order *, *?, +, +?, ?, ?? -- confirm against the OP_ definitions). */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
609 /* Text forms of OP_ values and things, for debugging (not all used) */
612 static const char *OP_names
[] = {
613 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
614 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
615 "localized \\B", "localized \\b", "localized \\W", "localized \\w",
616 "^", "$", "Any", "chars",
618 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
619 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
620 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
621 "*", "*?", "+", "+?", "?", "??", "{", "{",
622 "class", "negclass", "classL", "Ref",
623 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
624 "Brazero", "Braminzero", "Bra"
628 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
629 are simple data values; negative values are for special things like \d and so
630 on. Zero means further processing is needed (for things like \x), or the escape
633 static const short int escapes
[] = {
634 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
635 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
636 '@', -ESC_A
, -ESC_B
, 0, -ESC_D
, 0, 0, 0, /* @ - G */
637 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
638 0, 0, 0, -ESC_S
, 0, 0, 0, -ESC_W
, /* P - W */
639 0, 0, -ESC_Z
, '[', '\\', ']', '^', '_', /* X - _ */
640 '`', 7, -ESC_b
, 0, -ESC_d
, 0, '\f', 0, /* ` - g */
641 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
642 0, 0, '\r', -ESC_s
, '\t', 0, '\v', -ESC_w
, /* p - w */
646 /* Definition to allow mutual recursion */
649 compile_regex(int, int *, uschar
**, const uschar
**, const char **,
652 /* Structure for passing "static" information around between the functions
653 doing the matching, so that they are thread-safe. */
655 typedef struct match_data
{
656 int errorcode
; /* As it says */
657 int *offset_vector
; /* Offset vector */
658 int offset_end
; /* One past the end */
659 BOOL offset_overflow
; /* Set if too many extractions */
660 BOOL caseless
; /* Case-independent flag */
661 BOOL runtime_caseless
; /* Caseless forced at run time */
662 BOOL multiline
; /* Multiline flag */
663 BOOL notbol
; /* NOTBOL flag */
664 BOOL noteol
; /* NOTEOL flag */
665 BOOL dotall
; /* Dot matches any char */
666 BOOL endonly
; /* Dollar not before final \n */
667 const uschar
*start_subject
; /* Start of the subject string */
668 const uschar
*end_subject
; /* End of the subject string */
669 jmp_buf fail_env
; /* Environment for longjump() break out */
670 const uschar
*end_match_ptr
; /* Subject position at end match */
671 int end_offset_top
; /* Highwater mark at end of match */
672 jmp_buf error_env
; /* For longjmp() if an error occurs deep inside a
673 matching operation */
674 int length
; /* Length of the allocated stacks */
675 int point
; /* Point to add next item pushed onto stacks */
676 /* Pointers to the 6 stacks */
677 int *off_num
, *offset_top
, *r1
, *r2
;
678 const uschar
**eptr
, **ecode
;
683 /*************************************************
685 *************************************************/
/* PCRE is thread-clean and doesn't use any global variables in the normal
sense. However, it calls memory allocation and free functions via the two
indirections below, which can be changed by the caller, but are shared
between all threads. They are initialized to the standard malloc() and
free(). */

void *(*pcre_malloc)(size_t) = malloc;
void  (*pcre_free)(void *) = free;
698 /*************************************************
699 * Return version string *
700 *************************************************/
711 /*************************************************
712 * Return info about a compiled pattern *
713 *************************************************/
715 /* This function picks potentially useful data out of the private
719 external_re points to compiled code
720 optptr where to pass back the options
721 first_char where to pass back the first character,
722 or -1 if multiline and all branches start ^,
725 Returns: number of identifying extraction brackets
726 or negative values on error
730 pcre_info(const pcre
*external_re
, int *optptr
, int *first_char
)
732 const real_pcre
*re
= (real_pcre
*)external_re
;
733 if (re
== NULL
) return PCRE_ERROR_NULL
;
734 if (re
->magic_number
!= MAGIC_NUMBER
) return PCRE_ERROR_BADMAGIC
;
735 if (optptr
!= NULL
) *optptr
= (re
->options
& PUBLIC_OPTIONS
);
736 if (first_char
!= NULL
)
737 *first_char
= ((re
->options
& PCRE_FIRSTSET
) != 0)? re
->first_char
:
738 ((re
->options
& PCRE_STARTLINE
) != 0)? -1 : -2;
739 return re
->top_bracket
;
746 /*************************************************
747 * Debugging function to print chars *
748 *************************************************/
750 /* Print a sequence of chars in printable format, stopping at the end of the
751 subject if requested.
754 p points to characters
755 length number to print
756 is_subject TRUE if printing from within md->start_subject
757 md pointer to matching data block, if is_subject is TRUE
763 pchars(const uschar
*p
, int length
, BOOL is_subject
, match_data
*md
)
766 if (is_subject
&& length
> md
->end_subject
- p
) length
= md
->end_subject
- p
;
768 if (isprint(c
= *(p
++))) printf("%c", c
); else printf("\\x%02x", c
);
775 /*************************************************
776 * Check subpattern for empty operand *
777 *************************************************/
779 /* This function checks a bracketed subpattern to see if any of the paths
780 through it could match an empty string. This is used to diagnose an error if
781 such a subpattern is followed by a quantifier with an unlimited upper bound.
784 code points to the opening bracket
786 Returns: TRUE or FALSE
790 could_be_empty(uschar
*code
)
793 uschar
*cc
= code
+ 3;
795 /* Scan along the opcodes for this branch; as soon as we find something
796 that matches a non-empty string, break out and advance to test the next
797 branch. If we get to the end of the branch, return TRUE for the whole
802 /* Test an embedded subpattern; if it could not be empty, break the
803 loop. Otherwise carry on in the branch. */
805 if ((int)(*cc
) >= OP_BRA
|| (int)(*cc
) == OP_ONCE
)
807 if (!could_be_empty(cc
)) break;
808 do cc
+= (cc
[1] << 8) + cc
[2]; while (*cc
== OP_ALT
);
814 /* Reached end of a branch: the subpattern may match the empty string */
822 /* Skip over entire bracket groups with zero lower bound */
829 /* Skip over assertive subpatterns */
833 do cc
+= (cc
[1] << 8) + cc
[2]; while (*cc
== OP_ALT
);
837 /* Skip over things that don't match chars */
843 case OP_NOT_WORD_BOUNDARY
:
844 case OP_WORD_BOUNDARY
:
845 case OP_NOT_WORD_BOUNDARY_L
:
846 case OP_WORD_BOUNDARY_L
:
850 /* Skip over simple repeats with zero lower bound */
863 case OP_TYPEMINQUERY
:
867 /* Skip over UPTOs (lower bound is zero) */
876 /* Check a class or a back reference for a zero minimum */
884 case (OP_REF
): cc
+= 2; break;
885 case (OP_CLASS
): case (OP_NEGCLASS
): cc
+= 1+32; break;
886 case (OP_CLASS_L
): cc
+= 1+1+32; break;
900 if ((cc
[1] << 8) + cc
[2] != 0) goto NEXT_BRANCH
;
909 /* Anything else matches at least one character */
917 code
+= (code
[1] << 8) + code
[2];
919 while (*code
== OP_ALT
);
921 /* No branches match the empty string */
926 /* Determine the length of a group ID in an expression like
929 ptr pattern position pointer (say that 3 times fast)
930 finalchar the character that will mark the end of the ID
931 errorptr points to the pointer to the error message
935 get_group_id(const uschar
*ptr
, char finalchar
, const char **errorptr
)
937 const uschar
*start
= ptr
;
939 /* If the first character is not in \w, or is in \w but is a digit,
941 if (!(pcre_ctypes
[*ptr
] & ctype_word
) ||
942 (pcre_ctypes
[*ptr
++] & ctype_digit
))
944 *errorptr
= "(?P identifier must start with a letter or underscore";
948 /* Increment ptr until we either hit a null byte, the desired
949 final character, or a non-word character */
950 for(; (*ptr
!= 0) && (*ptr
!= finalchar
) &&
951 (pcre_ctypes
[*ptr
] & ctype_word
); ptr
++)
953 /* Empty loop body */
959 *errorptr
= "unterminated (?P identifier";
962 *errorptr
= "illegal character in (?P identifier";
966 /*************************************************
968 *************************************************/
970 /* This function is called when a \ has been encountered. It either returns a
971 positive value for a simple escape such as \n, or a negative value which
972 encodes one of the more complicated things such as \d. On entry, ptr is
973 pointing at the \. On exit, it is on the final character of the escape
977 ptrptr points to the pattern position pointer
978 errorptr points to the pointer to the error message
979 bracount number of previous extracting brackets
980 options the options bits
981 isclass TRUE if inside a character class
983 Returns: zero or positive => a data character
984 negative => a special escape sequence
985 on error, errorptr is set
989 check_escape(const uschar
**ptrptr
, const char **errorptr
, int bracount
,
990 int options
, BOOL isclass
)
992 const uschar
*ptr
= *ptrptr
;
993 int c
= *(++ptr
) & 255; /* Ensure > 0 on signed-char systems */
996 if (c
== 0) *errorptr
= ERR1
;
998 /* Digits or letters may have special meaning; all others are literals. */
1000 else if (c
< '0' || c
> 'z') {}
1002 /* Do an initial lookup in a table. A non-zero result is something that can be
1003 returned immediately. Otherwise further processing may be required. */
1005 else if ((i
= escapes
[c
- '0']) != 0) c
= i
;
1007 /* Escapes that need further processing, or are illegal. */
1014 /* The handling of escape sequences consisting of a string of digits
1015 starting with one that is not zero is not straightforward. By experiment,
1016 the way Perl works seems to be as follows:
1018 Outside a character class, the digits are read as a decimal number. If the
1019 number is less than 10, or if there are that many previous extracting
1020 left brackets, then it is a back reference. Otherwise, up to three octal
1021 digits are read to form an escaped byte. Thus \123 is likely to be octal
1022 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
1023 value is greater than 377, the least significant 8 bits are taken. Inside a
1024 character class, \ followed by a digit is always an octal number. */
1026 case '1': case '2': case '3': case '4': case '5':
1027 case '6': case '7': case '8': case '9':
1030 /* PYTHON: Try to compute an octal value for a character */
1031 for(c
=0, i
=0; ptr
[i
]!=0 && i
<3; i
++)
1033 if (( pcre_ctypes
[ ptr
[i
] ] & ctype_odigit
) != 0)
1034 c
= c
* 8 + ptr
[i
]-'0';
1036 break; /* Non-octal character--break out of the loop */
1038 /* It's a character if there were exactly 3 octal digits, or if
1039 we're inside a character class and there was at least one
1041 if ( (i
== 3) || (isclass
&& i
!=0) )
1046 c
= ptr
[0]; /* Restore the first character after the \ */
1048 while (i
<2 && (pcre_ctypes
[ptr
[1]] & ctype_digit
) != 0)
1050 c
= c
* 10 + ptr
[1] - '0';
1053 if (c
> 255 - ESC_REF
) *errorptr
= "back reference too big";
1058 /* \0 always starts an octal number, but we may drop through to here with a
1059 larger first octal digit */
1063 while(i
++ < 2 && (pcre_ctypes
[ptr
[1]] & ctype_digit
) != 0 &&
1064 ptr
[1] != '8' && ptr
[1] != '9')
1065 c
= c
* 8 + *(++ptr
) - '0';
1068 /* Special escapes not starting with a digit are straightforward */
1072 while ( (pcre_ctypes
[ptr
[1]] & ctype_xdigit
) != 0)
1075 c
= c
* 16 + pcre_lcc
[*ptr
] -
1076 (((pcre_ctypes
[*ptr
] & ctype_digit
) != 0)? '0' : 'W');
1082 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1083 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1084 for Perl compatibility, it is a literal. */
1087 if ((options
& PCRE_EXTRA
) != 0) switch(c
)
1090 c
= -ESC_X
; /* This could be a lookup if it ever got into Perl */
1107 /*************************************************
1108 * Check for counted repeat *
1109 *************************************************/
1111 /* This function is called when a '{' is encountered in a place where it might
1112 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1113 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1114 where the ddds are digits.
1117 p pointer to the first char after '{'
1119 Returns: TRUE or FALSE
1123 is_counted_repeat(const uschar
*p
)
1125 if ((pcre_ctypes
[*p
++] & ctype_digit
) == 0) return FALSE
;
1126 while ((pcre_ctypes
[*p
] & ctype_digit
) != 0) p
++;
1127 if (*p
== '}') return TRUE
;
1129 if (*p
++ != ',') return FALSE
;
1130 if (*p
== '}') return TRUE
;
1132 if ((pcre_ctypes
[*p
++] & ctype_digit
) == 0) return FALSE
;
1133 while ((pcre_ctypes
[*p
] & ctype_digit
) != 0) p
++;
1139 /*************************************************
1140 * Read repeat counts *
1141 *************************************************/
1143 /* Read an item of the form {n,m} and return the values. This is called only
1144 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1145 so the syntax is guaranteed to be correct, but we need to check the values.
1148 p pointer to first char after '{'
1149 minp pointer to int for min
1150 maxp pointer to int for max
1151 returned as -1 if no max
1152 errorptr points to pointer to error message
1154 Returns: pointer to '}' on success;
1155 current ptr on error, with errorptr set
1158 static const uschar
*
1159 read_repeat_counts(const uschar
*p
, int *minp
, int *maxp
, const char **errorptr
)
1164 while ((pcre_ctypes
[*p
] & ctype_digit
) != 0) min
= min
* 10 + *p
++ - '0';
1166 if (*p
== '}') max
= min
; else
1171 while((pcre_ctypes
[*p
] & ctype_digit
) != 0) max
= max
* 10 + *p
++ - '0';
1180 /* Do paranoid checks, then fill in the required variables, and pass back the
1181 pointer to the terminating '}'. */
1183 if (min
> 65535 || max
> 65535)
1195 /*************************************************
1196 * Compile one branch *
1197 *************************************************/
1199 /* Scan the pattern, compiling it into the code vector.
1202 options the option bits
1203 bracket points to number of brackets used
1204 code points to the pointer to the current code point
1205 ptrptr points to the current pattern pointer
1206 errorptr points to pointer to error message
1208 Returns: TRUE on success
1209 FALSE, with *errorptr set on error
1213 compile_branch(int options
, int *brackets
, uschar
**codeptr
,
1214 const uschar
**ptrptr
, const char **errorptr
, PyObject
*dictionary
)
1216 int repeat_type
, op_type
;
1217 int repeat_min
, repeat_max
;
1218 int bravalue
, length
;
1219 int greedy_default
, greedy_non_default
;
1221 register uschar
*code
= *codeptr
;
1222 const uschar
*ptr
= *ptrptr
;
1223 const uschar
*oldptr
;
1224 uschar
*previous
= NULL
;
1226 uschar
*class_flag
; /* Pointer to the single-byte flag for OP_CLASS_L */
1228 /* Set up the default and non-default settings for greediness */
1230 greedy_default
= ((options
& PCRE_UNGREEDY
) != 0);
1231 greedy_non_default
= greedy_default
^ 1;
1233 /* Switch on next character until the end of the branch */
1238 int class_charcount
;
1242 if ((options
& PCRE_EXTENDED
) != 0)
1244 if ((pcre_ctypes
[c
] & ctype_space
) != 0) continue;
1247 while ((c
= *(++ptr
)) != 0 && c
!= '\n');
1254 /* The branch terminates at end of string, |, or ). */
1263 /* Handle single-character metacharacters */
1280 /* Character classes. These always build a 32-byte bitmap of the permitted
1281 characters, except in the special case where there is only one character.
1282 For negated classes, we build the map as usual, then invert it at the end.
1287 if (options
& PCRE_LOCALE
)
1289 *code
++ = OP_CLASS_L
;
1290 /* Set the flag for localized classes (like \w) to 0 */
1300 /* If the first character is '^', set the negation flag, and use a
1301 different opcode. This only matters if caseless matching is specified at
1304 if ((c
= *(++ptr
)) == '^')
1306 negate_class
= TRUE
;
1307 if (*(code
-1)==OP_CLASS
) *(code
-1) = OP_NEGCLASS
;
1310 else negate_class
= FALSE
;
1312 /* Keep a count of chars so that we can optimize the case of just a single
1315 class_charcount
= 0;
1316 class_lastchar
= -1;
1318 /* Initialize the 32-char bit map to all zeros. We have to build the
1319 map in a temporary bit of store, in case the class contains only 1
1320 character, because in that case the compiled code doesn't use the
1323 memset(class, 0, 32 * sizeof(uschar
));
1325 /* Process characters until ] is reached. By writing this as a "do" it
1326 means that an initial ] is taken as a data character. */
1336 /* Backslash may introduce a single character, or it may introduce one
1337 of the specials, which just set a flag. Escaped items are checked for
1338 validity in the pre-compiling pass. The sequence \b is a special case.
1339 Inside a class (and only there) it is treated as backspace. Elsewhere
1340 it marks a word boundary. Other escapes have preset maps ready to
1341 or into the one we are building. We assume they have more than one
1342 character in them, so set class_charcount bigger than one. */
1346 c
= check_escape(&ptr
, errorptr
, *brackets
, options
, TRUE
);
1347 if (-c
== ESC_b
) c
= '\b';
1350 class_charcount
= 10;
1355 for (c
= 0; c
< 32; c
++) class[c
] |= pcre_cbits
[c
+cbit_digit
];
1361 for (c
= 0; c
< 32; c
++) class[c
] |= ~pcre_cbits
[c
+cbit_digit
];
1366 if (options
& PCRE_LOCALE
)
1372 for (c
= 0; c
< 32; c
++)
1373 class[c
] |= (pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
1378 if (options
& PCRE_LOCALE
)
1384 for (c
= 0; c
< 32; c
++)
1385 class[c
] |= ~(pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
1391 for (c
= 0; c
< 32; c
++) class[c
] |= pcre_cbits
[c
+cbit_space
];
1397 for (c
= 0; c
< 32; c
++) class[c
] |= ~pcre_cbits
[c
+cbit_space
];
1406 /* Fall through if single character */
1409 /* A single character may be followed by '-' to form a range. However,
1410 Perl does not permit ']' to be the end of the range. A '-' character
1411 here is treated as a literal. */
1413 if (ptr
[1] == '-' && ptr
[2] != ']')
1425 /* The second part of a range can be a single-character escape, but
1426 not any of the other escapes. */
1430 d
= check_escape(&ptr
, errorptr
, *brackets
, options
, TRUE
);
1433 if (d
== -ESC_b
) d
= '\b'; else
1449 class[c
/8] |= (1 << (c
&7));
1450 if ((options
& PCRE_CASELESS
) != 0)
1452 int uc
= pcre_fcc
[c
]; /* flip case */
1453 class[uc
/8] |= (1 << (uc
&7));
1455 class_charcount
++; /* in case a one-char range */
1458 continue; /* Go get the next char in the class */
1461 /* Handle a lone single character - we can get here for a normal
1462 non-escape char, or after \ that introduces a single character. */
1464 class [c
/8] |= (1 << (c
&7));
1465 if ((options
& PCRE_CASELESS
) != 0)
1467 c
= pcre_fcc
[c
]; /* flip case */
1468 class[c
/8] |= (1 << (c
&7));
1474 /* Loop until ']' reached; the check for end of string happens inside the
1475 loop. This "while" is the end of the "do" above. */
1477 while ((c
= *(++ptr
)) != ']');
1479 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1480 precisely one character. This doesn't need the whole 32-byte bit map.
1481 We turn it into a 1-character OP_CHARS if it's positive, or OP_NOT if
1484 if (class_charcount
== 1 && class_lastchar
>= 0)
1492 code
[-1] = OP_CHARS
;
1495 *code
++ = class_lastchar
;
1498 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1503 /* If this is a localized opcode, bump the code pointer up */
1504 if (class_flag
) code
++;
1507 if (class_flag
) *class_flag
= (*class_flag
) ^ 63;
1508 for (c
= 0; c
< 32; c
++) code
[c
] = ~class[c
];
1511 memcpy(code
, class, 32);
1516 /* Various kinds of repeat */
1519 if (!is_counted_repeat(ptr
+1)) goto NORMAL_CHAR
;
1520 ptr
= read_repeat_counts(ptr
+1, &repeat_min
, &repeat_max
, errorptr
);
1521 if (*errorptr
!= NULL
) goto FAILED
;
1539 if (previous
== NULL
)
1545 /* If the next character is '?' this is a minimizing repeat, by default,
1546 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1550 { repeat_type
= greedy_non_default
; ptr
++; }
1551 else repeat_type
= greedy_default
;
1553 /* If the maximum is zero then the minimum must also be zero; Perl allows
1554 this case, so we do too - by simply omitting the item altogether. */
1556 if (repeat_max
== 0) code
= previous
;
1558 /* If previous was a string of characters, chop off the last one and use it
1559 as the subject of the repeat. If there was only one character, we can
1560 abolish the previous item altogether. */
1562 else if (*previous
== OP_CHARS
)
1564 int len
= previous
[1];
1572 c
= previous
[len
+1];
1576 op_type
= 0; /* Use single-char op codes */
1577 goto OUTPUT_SINGLE_REPEAT
; /* Code shared with single character types */
1580 /* If previous was a single negated character ([^a] or similar), we use
1581 one of the special opcodes, replacing it. The code is shared with single-
1582 character repeats by adding a suitable offset into repeat_type. */
1584 else if ((int)*previous
== OP_NOT
)
1586 op_type
= OP_NOTSTAR
- OP_STAR
; /* Use "not" opcodes */
1589 goto OUTPUT_SINGLE_REPEAT
;
1592 /* If previous was a character type match (\d or similar), abolish it and
1593 create a suitable repeat item. The code is shared with single-character
1594 repeats by adding a suitable offset into repeat_type. */
1596 else if ((int)*previous
< OP_CIRC
|| *previous
== OP_ANY
)
1598 op_type
= OP_TYPESTAR
- OP_STAR
; /* Use type opcodes */
1602 OUTPUT_SINGLE_REPEAT
:
1603 repeat_type
+= op_type
; /* Combine both values for many cases */
1605 /* A minimum of zero is handled either as the special case * or ?, or as
1606 an UPTO, with the maximum given. */
1608 if (repeat_min
== 0)
1610 if (repeat_max
== -1) *code
++ = OP_STAR
+ repeat_type
;
1611 else if (repeat_max
== 1) *code
++ = OP_QUERY
+ repeat_type
;
1614 *code
++ = OP_UPTO
+ repeat_type
;
1615 *code
++ = repeat_max
>> 8;
1616 *code
++ = (repeat_max
& 255);
1620 /* The case {1,} is handled as the special case + */
1622 else if (repeat_min
== 1 && repeat_max
== -1)
1623 *code
++ = OP_PLUS
+ repeat_type
;
1625 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1626 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1630 if (repeat_min
!= 1)
1632 *code
++ = OP_EXACT
+ op_type
; /* NB EXACT doesn't have repeat_type */
1633 *code
++ = repeat_min
>> 8;
1634 *code
++ = (repeat_min
& 255);
1637 /* If the minimum is 1 and the previous item was a character string,
1638 we either have to put back the item that got cancelled if the string
1639 length was 1, or add the character back onto the end of a longer
1640 string. For a character type nothing need be done; it will just get
1641 put back naturally. Note that the final character is always going to
1644 else if (*previous
== OP_CHARS
)
1646 if (code
== previous
) code
+= 2; else previous
[1]++;
1649 /* For a single negated character we also have to put back the
1650 item that got cancelled. */
1652 else if (*previous
== OP_NOT
) code
++;
1654 /* If the maximum is unlimited, insert an OP_STAR. */
1659 *code
++ = OP_STAR
+ repeat_type
;
1662 /* Else insert an UPTO if the max is greater than the min. */
1664 else if (repeat_max
!= repeat_min
)
1667 repeat_max
-= repeat_min
;
1668 *code
++ = OP_UPTO
+ repeat_type
;
1669 *code
++ = repeat_max
>> 8;
1670 *code
++ = (repeat_max
& 255);
1674 /* The character or character type itself comes last in all cases. */
1679 /* If previous was a character class or a back reference, we put the repeat
1682 else if (*previous
== OP_CLASS
|| *previous
== OP_NEGCLASS
||
1683 *previous
==OP_CLASS_L
|| *previous
== OP_REF
)
1685 if (repeat_min
== 0 && repeat_max
== -1)
1686 *code
++ = OP_CRSTAR
+ repeat_type
;
1687 else if (repeat_min
== 1 && repeat_max
== -1)
1688 *code
++ = OP_CRPLUS
+ repeat_type
;
1689 else if (repeat_min
== 0 && repeat_max
== 1)
1690 *code
++ = OP_CRQUERY
+ repeat_type
;
1693 *code
++ = OP_CRRANGE
+ repeat_type
;
1694 *code
++ = repeat_min
>> 8;
1695 *code
++ = repeat_min
& 255;
1696 if (repeat_max
== -1) repeat_max
= 0; /* 2-byte encoding for max */
1697 *code
++ = repeat_max
>> 8;
1698 *code
++ = repeat_max
& 255;
1702 /* If previous was a bracket group, we may have to replicate it in certain
1703 cases. If the maximum repeat count is unlimited, check that the bracket
1704 group cannot match the empty string, and diagnose an error if it can. */
1706 else if ((int)*previous
>= OP_BRA
)
1709 int len
= code
- previous
;
1711 if (repeat_max
== -1 && could_be_empty(previous
))
1717 /* If the minimum is greater than zero, and the maximum is unlimited or
1718 equal to the minimum, the first copy remains where it is, and is
1719 replicated up to the minimum number of times. This case includes the +
1720 repeat, but of course no replication is needed in that case. */
1722 if (repeat_min
> 0 && (repeat_max
== -1 || repeat_max
== repeat_min
))
1724 for (i
= 1; i
< repeat_min
; i
++)
1726 memcpy(code
, previous
, len
);
1731 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1732 Then, if there is a fixed upper limit, replicate it up to that many times,
1733 sticking BRAZERO in front of all the optional ones. */
1737 if (repeat_min
== 0)
1739 memmove(previous
+1, previous
, len
);
1741 *previous
++ = OP_BRAZERO
+ repeat_type
;
1744 for (i
= 1; i
< repeat_min
; i
++)
1746 memcpy(code
, previous
, len
);
1750 for (i
= (repeat_min
> 0)? repeat_min
: 1; i
< repeat_max
; i
++)
1752 *code
++ = OP_BRAZERO
+ repeat_type
;
1753 memcpy(code
, previous
, len
);
1758 /* If the maximum is unlimited, set a repeater in the final copy. */
1760 if (repeat_max
== -1) code
[-3] = OP_KETRMAX
+ repeat_type
;
1763 /* Else there's some kind of shambles */
1771 /* In all cases we no longer have a previous item. */
1777 /* Start of nested bracket sub-expression, or comment or lookahead.
1778 First deal with special things that can come after a bracket; all are
1779 introduced by ?, and the appearance of any of them means that this is not a
1780 referencing group. They were checked for validity in the first pass over
1781 the string, so we don't have to check for syntax errors here. */
1784 previous
= code
; /* Only real brackets can be repeated */
1785 if (*(++ptr
) == '?')
1798 while (*ptr
!= ')') ptr
++;
1802 case ':': /* Non-extracting bracket */
1806 case '=': /* Assertions can't be repeated */
1807 bravalue
= OP_ASSERT
;
1813 bravalue
= OP_ASSERT_NOT
;
1822 /* (?P<groupname>...) */
1824 PyObject
*string
, *intobj
;
1827 idlen
= get_group_id(ptr
, '>', errorptr
);
1831 string
= PyString_FromStringAndSize((char*)ptr
, idlen
);
1832 intobj
= PyInt_FromLong( brackets
[0] + 1 );
1833 if (intobj
== NULL
|| string
== NULL
)
1837 *errorptr
= "exception raised";
1840 PyDict_SetItem(dictionary
, string
, intobj
);
1841 Py_DECREF(string
); Py_DECREF(intobj
); /* XXX DECREF commented out! */
1842 ptr
+= idlen
+1; /* Point to rest of expression */
1843 goto do_grouping_bracket
;
1847 /* (?P=groupname) */
1849 PyObject
*string
, *intobj
;
1852 idlen
= get_group_id(ptr
, ')', errorptr
);
1856 string
= PyString_FromStringAndSize((char *)ptr
, idlen
);
1858 *errorptr
= "exception raised";
1861 intobj
= PyDict_GetItem(dictionary
, string
);
1864 *errorptr
= "?P= group identifier isn't defined";
1868 refnum
= PyInt_AsLong(intobj
);
1870 /* The caller doesn't own the reference to the value
1871 returned from PyDict_GetItem, so intobj is not
1876 /* The continue will cause the top-level for() loop to
1877 be resumed, so ptr will be immediately incremented.
1878 Therefore, the following line adds just idlen, not
1883 /* The character after ?P is neither < nor =, so
1884 report an error. Add more Python-extensions here. */
1885 *errorptr
="unknown after (?P";
1888 case '>': /* "Match once" brackets */
1889 if ((options
& PCRE_EXTRA
) != 0) /* Not yet standard */
1896 /* Else fall through */
1904 /* Else we have a referencing group */
1908 do_grouping_bracket
:
1909 if (++(*brackets
) > EXTRACT_MAX
)
1914 bravalue
= OP_BRA
+ *brackets
;
1917 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1918 code into a non-register variable in order to be able to pass its address
1919 because some compilers complain otherwise. */
1923 uschar
*mcode
= code
;
1924 if (!compile_regex(options
, brackets
, &mcode
, &ptr
, errorptr
, dictionary
))
1936 /* Check \ for being a real metacharacter; if not, fall through and handle
1937 it as a data character at the start of a string. Escape items are checked
1938 for validity in the pre-compiling pass. */
1942 c
= check_escape(&ptr
, errorptr
, *brackets
, options
, FALSE
);
1944 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1945 are arranged to be the negation of the corresponding OP_values. For the
1946 back references, the values are ESC_REF plus the reference number. Only
1947 back references and those types that consume a character may be repeated.
1948 We can test for values between ESC_b and ESC_Z for the latter; this may
1949 have to change if any new ones are ever created. */
1955 int refnum
= -c
- ESC_REF
;
1956 if (*brackets
< refnum
)
1967 previous
= (-c
> ESC_b
&& -c
< ESC_X
)? code
: NULL
;
1968 if ( (options
& PCRE_LOCALE
) != 0)
1972 case (-ESC_b
): c
= -OP_WORD_BOUNDARY_L
; break;
1973 case (-ESC_B
): c
= -OP_NOT_WORD_BOUNDARY_L
; break;
1974 case (-ESC_w
): c
= -OP_WORDCHAR_L
; break;
1975 case (-ESC_W
): c
= -OP_NOT_WORDCHAR_L
; break;
1983 /* Data character: Reset and fall through */
1988 /* Handle a run of data characters until a metacharacter is encountered.
1989 The first character is guaranteed not to be whitespace or # when the
1990 extended flag is set. */
2001 if ((options
& PCRE_EXTENDED
) != 0)
2003 if ((pcre_ctypes
[c
] & ctype_space
) != 0) continue;
2006 while ((c
= *(++ptr
)) != 0 && c
!= '\n');
2012 /* Backslash may introduce a data char or a metacharacter. Escaped items
2013 are checked for validity in the pre-compiling pass. Stop the string
2014 before a metaitem. */
2019 c
= check_escape(&ptr
, errorptr
, *brackets
, options
, FALSE
);
2020 if (c
< 0) { ptr
= oldptr
; break; }
2023 /* Ordinary character or single-char escape */
2029 /* This "while" is the end of the "do" above. */
2031 while (length
< 255 && (pcre_ctypes
[c
= *(++ptr
)] & ctype_meta
) == 0);
2033 /* Compute the length and set it in the data vector, and advance to
2036 previous
[1] = length
;
2037 if (length
< 255) ptr
--;
2040 } /* end of big loop */
2042 /* Control never reaches here by falling through, only by a goto for all the
2043 error states. Pass back the position in the pattern so that it can be displayed
2044 to the user for diagnosing the error. */
2054 /*************************************************
2055 * Compile sequence of alternatives *
2056 *************************************************/
2058 /* On entry, ptr is pointing past the bracket character, but on return
2059 it points to the closing bracket, or vertical bar, or end of string.
2060 The code variable is pointing at the byte into which the BRA operator has been
2064 options the option bits
2065 brackets -> int containing the number of extracting brackets used
2066 codeptr -> the address of the current code pointer
2067 ptrptr -> the address of the current pattern pointer
2068 errorptr -> pointer to error message
2070 Returns: TRUE on success
/* compile_regex: compile a bracketed sequence of alternatives.
   Works on local copies of *ptrptr and *codeptr, calls compile_branch for a
   branch, and writes branch and group lengths back into the code vector as
   two bytes each, high byte first.
   NOTE(review): the loop skeleton, the declaration of length, the return
   statements and the opcode stores are elided in this listing, so only the
   visible statements are described here. */
2074 compile_regex(int options
, int *brackets
, uschar
**codeptr
,
2075 const uschar
**ptrptr
, const char **errorptr
, PyObject
*dictionary
)
/* Local working copies of the pattern pointer and the code pointer. */
2077 const uschar
*ptr
= *ptrptr
;
2078 uschar
*code
= *codeptr
;
/* Remember where this group starts so its total length can be computed. */
2079 uschar
*start_bracket
= code
;
/* Start of the branch currently being compiled. */
2084 uschar
*last_branch
= code
;
/* Compile one alternative; code and ptr are passed by address so that
   compile_branch can advance them. */
2087 if (!compile_branch(options
, brackets
, &code
, &ptr
, errorptr
, dictionary
))
2093 /* Fill in the length of the last branch */
/* The length goes into bytes 1 and 2 of the branch, high byte first. */
2095 length
= code
- last_branch
;
2096 last_branch
[1] = length
>> 8;
2097 last_branch
[2] = length
& 255;
2099 /* Reached end of expression, either ')' or end of pattern. Insert a
2100 terminating ket and the length of the whole bracketed item, and return,
2101 leaving the pointer at the terminating char. */
/* Distance from the start of the group, emitted as two bytes at the
   current code position. */
2105 length
= code
- start_bracket
;
2107 *code
++ = length
>> 8;
2108 *code
++ = length
& 255;
2114 /* Another branch follows; insert an "or" node and advance the pointer. */
2119 /* Control never reaches here */
2124 /*************************************************
2125 * Check for anchored expression *
2126 *************************************************/
2128 /* Try to find out if this is an anchored regular expression. Consider each
2129 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2130 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2131 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2132 counts, since OP_CIRC can match in the middle.
2134 A branch is also implicitly anchored if it starts with .* because that will try
2135 the rest of the pattern at all possible matching points, so there is no point
2138 Argument: points to start of expression (the bracket)
2139 Returns: TRUE or FALSE
/* is_anchored: walk the alternatives of a compiled group and reject the
   group as soon as one alternative is not anchored.
   code      points at the bracket opcode of the group
   multiline when set, OP_CIRC no longer counts as an anchor
   NOTE(review): the return type line, the do-loop opening and the final
   return are elided in this listing. */
2143 is_anchored(register const uschar
*code
, BOOL multiline
)
/* The first opcode of the alternative is read from offset 3. */
2146 int op
= (int)code
[3];
/* A nested bracket, assertion or once-only group: recurse into it. */
2147 if (op
>= OP_BRA
|| op
== OP_ASSERT
|| op
== OP_ONCE
)
2148 { if (!is_anchored(code
+3, multiline
)) return FALSE
; }
/* A type-star repeat only qualifies when the repeated type is OP_ANY. */
2149 else if (op
== OP_TYPESTAR
|| op
== OP_TYPEMINSTAR
)
2150 { if (code
[4] != OP_ANY
) return FALSE
; }
/* Anything else must be OP_SOD, or OP_CIRC when multiline is not set. */
2151 else if (op
!= OP_SOD
&& (multiline
|| op
!= OP_CIRC
)) return FALSE
;
/* Advance by the two-byte offset stored in bytes 1 and 2. */
2152 code
+= (code
[1] << 8) + code
[2];
/* Keep going while another alternative follows. */
2154 while (*code
== OP_ALT
);
2160 /*************************************************
2161 * Check for start with \n line expression *
2162 *************************************************/
2164 /* This is called for multiline expressions to try to find out if every branch
2165 starts with ^ so that "first char" processing can be done to speed things up.
2167 Argument: points to start of expression (the bracket)
2168 Returns: TRUE or FALSE
/* is_startline: walk the alternatives of a compiled group and reject the
   group as soon as one alternative does not begin with OP_CIRC.
   code points at the bracket opcode of the group
   NOTE(review): the return type line, the do-loop opening and the final
   return are elided in this listing. */
2172 is_startline(const uschar
*code
)
/* A nested bracket or assertion at offset 3: recurse into it. */
2175 if ((int)code
[3] >= OP_BRA
|| code
[3] == OP_ASSERT
)
2176 { if (!is_startline(code
+3)) return FALSE
; }
/* Otherwise the first opcode of the alternative must be OP_CIRC. */
2177 else if (code
[3] != OP_CIRC
) return FALSE
;
/* Advance by the two-byte offset stored in bytes 1 and 2. */
2178 code
+= (code
[1] << 8) + code
[2];
/* Keep going while another alternative follows. */
2180 while (*code
== OP_ALT
);
2186 /*************************************************
2187 * Check for fixed first char *
2188 *************************************************/
2190 /* Try to find out if there is a fixed first character. This is called for
2191 unanchored expressions, as it speeds up their processing quite considerably.
2192 Consider each alternative branch. If they all start with the same char, or with
2193 a bracket all of whose alternatives start with the same char (recurse ad lib),
2194 then we return that char, otherwise -1.
2196 Argument: points to start of expression (the bracket)
2197 Returns: -1 or the fixed first char
/* find_firstchar: look for a single character that every alternative of a
   compiled group must start with.
   code points at the bracket opcode of the group
   Returns -1 from every visible early exit when alternatives disagree or a
   nested group has no fixed first character.
   NOTE(review): the return type line, the loop opening, the declaration of
   d, the bodies of the switch cases and the final return are elided in this
   listing. */
2201 find_firstchar(uschar
*code
)
/* c holds the candidate character; a negative value means none seen yet. */
2203 register int c
= -1;
/* Offset of the character to compare; presumably adjusted by the elided
   case bodies below - TODO confirm against the full file. */
2206 register int charoffset
= 4;
/* A nested bracket or assertion at offset 3: take its first character. */
2208 if ((int)code
[3] >= OP_BRA
|| code
[3] == OP_ASSERT
)
2211 if ((d
= find_firstchar(code
+3)) < 0) return -1;
/* Adopt the first result, then require every later one to match it. */
2212 if (c
< 0) c
= d
; else if (c
!= d
) return -1;
/* Otherwise dispatch on the first opcode of the alternative. */
2215 else switch(code
[3])
2220 case OP_EXACT
: /* Fall through */
2223 case OP_CHARS
: /* Fall through */
/* Same adopt-or-compare rule, applied to the byte at charoffset. */
2228 if (c
< 0) c
= code
[charoffset
]; else if (c
!= code
[charoffset
]) return -1;
/* Advance by the two-byte offset stored in bytes 1 and 2. */
2231 code
+= (code
[1] << 8) + code
[2];
/* Keep going while another alternative follows. */
2233 while (*code
== OP_ALT
);
2239 /*************************************************
2240 * Compile a Regular Expression *
2241 *************************************************/
2243 /* This function takes a string and returns a pointer to a block of store
2244 holding a compiled version of the expression.
2247 pattern the regular expression
2248 options various option bits
2249 errorptr pointer to pointer to error text
2250 erroroffset ptr offset in pattern where error was detected
2252 Returns: pointer to compiled data block, or NULL on error,
2253 with errorptr and erroroffset set
2257 pcre_compile(const char *pattern
, int options
, const char **errorptr
,
2258 int *erroroffset
, PyObject
*dictionary
)
2262 int length
= 3; /* For initial BRA plus length */
2267 int top_backref
= 0;
2268 unsigned int brastackptr
= 0;
2273 uschar
*code_base
, *code_end
;
2276 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2277 can do is just return NULL. */
2279 if (errorptr
== NULL
) return NULL
;
2282 /* However, we can give a message for this error */
2284 if (erroroffset
== NULL
)
2291 if ((options
& ~PUBLIC_OPTIONS
) != 0)
2297 DPRINTF(("------------------------------------------------------------------\n"));
2298 DPRINTF(("%s\n", pattern
));
2300 /* The first thing to do is to make a pass over the pattern to compute the
2301 amount of store required to hold the compiled code. This does not have to be
2302 perfect as long as errors are overestimates. At the same time we can detect any
2303 internal flag settings. Make an attempt to correct for any counted white space
2304 if an "extended" flag setting appears late in the pattern. We can't be so
2305 clever for #-comments. */
2307 ptr
= (const uschar
*)(pattern
- 1);
2308 while ((c
= *(++ptr
)) != 0)
2311 int class_charcount
;
2313 if ((pcre_ctypes
[c
] & ctype_space
) != 0)
2315 if ((options
& PCRE_EXTENDED
) != 0) continue;
2319 if (c
== '#' && (options
& PCRE_EXTENDED
) != 0)
2321 while ((c
= *(++ptr
)) != 0 && c
!= '\n');
2327 /* A backslashed item may be an escaped "normal" character or a
2328 character type. For a "normal" character, put the pointers and
2329 character back so that tests for whitespace etc. in the input
2330 are done correctly. */
2334 const uschar
*save_ptr
= ptr
;
2335 c
= check_escape(&ptr
, errorptr
, bracount
, options
, FALSE
);
2336 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2346 /* A back reference needs an additional char, plus either one or 5
2347 bytes for a repeat. We also need to keep the value of the highest
2352 int refnum
= -c
- ESC_REF
;
2353 if (refnum
> top_backref
) top_backref
= refnum
;
2354 length
++; /* For single back reference */
2355 if (ptr
[1] == '{' && is_counted_repeat(ptr
+2))
2357 ptr
= read_repeat_counts(ptr
+2, &min
, &max
, errorptr
);
2358 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2359 if ((min
== 0 && (max
== 1 || max
== -1)) ||
2360 (min
== 1 && max
== -1))
2363 if (ptr
[1] == '?') ptr
++;
2371 case '*': /* These repeats won't be after brackets; */
2372 case '+': /* those are handled separately */
2377 /* This covers the cases of repeats after a single char, metachar, class,
2378 or back reference. */
2381 if (!is_counted_repeat(ptr
+1)) goto NORMAL_CHAR
;
2382 ptr
= read_repeat_counts(ptr
+1, &min
, &max
, errorptr
);
2383 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2384 if ((min
== 0 && (max
== 1 || max
== -1)) ||
2385 (min
== 1 && max
== -1))
2389 length
--; /* Uncount the original char or metachar */
2390 if (min
== 1) length
++; else if (min
> 0) length
+= 4;
2391 if (max
> 0) length
+= 4; else length
+= 2;
2393 if (ptr
[1] == '?') ptr
++;
2396 /* An alternation contains an offset to the next branch or ket. */
2401 /* A character class uses 33 characters. Don't worry about character types
2402 that aren't allowed in classes - they'll get picked up during the compile.
2403 A character class that contains only one character uses 2 or 3 bytes,
2404 depending on whether it is negated or not. Notice this where we can. */
2407 class_charcount
= 0;
2408 if (*(++ptr
) == '^') ptr
++;
2413 int ch
= check_escape(&ptr
, errorptr
, bracount
, options
, TRUE
);
2414 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2415 if (-ch
== ESC_b
) class_charcount
++; else class_charcount
= 10;
2417 else class_charcount
++;
2420 while (*ptr
!= 0 && *ptr
!= ']');
2422 /* Repeats for negated single chars are handled by the general code */
2424 if (class_charcount
== 1) length
+= 3; else
2427 if (options
& PCRE_LOCALE
) length
++; /* Add a byte for the localization flag */
2429 /* A repeat needs either 1 or 5 bytes. */
2431 if (*ptr
!= 0 && ptr
[1] == '{' && is_counted_repeat(ptr
+2))
2433 ptr
= read_repeat_counts(ptr
+2, &min
, &max
, errorptr
);
2434 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2435 if ((min
== 0 && (max
== 1 || max
== -1)) ||
2436 (min
== 1 && max
== -1))
2439 if (ptr
[1] == '?') ptr
++;
2444 /* Brackets may be genuine groups or special things */
2448 /* Handle special forms of bracket, which all start (? */
2450 if (ptr
[1] == '?') switch (c
= ptr
[2])
2452 /* Skip over comments entirely */
2455 while (*ptr
!= 0 && *ptr
!= ')') ptr
++;
2459 goto PCRE_ERROR_RETURN
;
2463 /* Non-referencing groups and lookaheads just move the pointer on, and
2464 then behave like a non-special bracket, except that they don't increment
2465 the count of extracting brackets. */
2478 idlen
= get_group_id(ptr
++, '>', errorptr
);
2479 if (*errorptr
) goto PCRE_ERROR_RETURN
;
2483 idlen
= get_group_id(ptr
++, ')', errorptr
);
2484 if (*errorptr
) goto PCRE_ERROR_RETURN
;
2492 /* Ditto for the "once only" bracket, allowed only if the extra bit
2496 if ((options
& PCRE_EXTRA
) != 0)
2501 /* Else fall through */
2503 /* Else loop setting valid options until ) is met. Anything else is an
2510 if ((c
= *ptr
) == 'i')
2512 options
|= PCRE_CASELESS
;
2515 else if ((c
= *ptr
) == 'L')
2517 options
|= PCRE_LOCALE
;
2520 else if ((c
= *ptr
) == 'm')
2522 options
|= PCRE_MULTILINE
;
2527 options
|= PCRE_DOTALL
;
2532 options
|= PCRE_EXTENDED
;
2533 length
-= spaces
; /* Already counted spaces */
2536 else if (c
== ')') break;
2539 goto PCRE_ERROR_RETURN
;
2541 continue; /* End of this bracket handling */
2544 /* Extracting brackets must be counted so we can process escapes in a
2549 /* Non-special forms of bracket. Save length for computing whole length
2550 at end if there's a repeat that requires duplication of the group. */
2552 if (brastackptr
>= sizeof(brastack
)/sizeof(int))
2555 goto PCRE_ERROR_RETURN
;
2558 brastack
[brastackptr
++] = length
;
2562 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2563 have to replicate this bracket up to that many times. If brastackptr is
2564 0 this is an unmatched bracket which will generate an error, but take care
2565 not to try to access brastack[-1]. */
2572 int duplength
= (brastackptr
> 0)? length
- brastack
[--brastackptr
] : 0;
2574 /* Leave ptr at the final char; for read_repeat_counts this happens
2575 automatically; for the others we need an increment. */
2577 if ((c
= ptr
[1]) == '{' && is_counted_repeat(ptr
+2))
2579 ptr
= read_repeat_counts(ptr
+2, &minval
, &maxval
, errorptr
);
2580 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2582 else if (c
== '*') { minval
= 0; maxval
= -1; ptr
++; }
2583 else if (c
== '+') { maxval
= -1; ptr
++; }
2584 else if (c
== '?') { minval
= 0; ptr
++; }
2586 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2587 if there is a limited maximum we have to replicate up to maxval-1 times
2588 and allow for a BRAZERO item before each optional copy, as we also have
2589 to do before the first copy if the minimum is zero. */
2591 if (minval
== 0) length
++;
2592 else if (minval
> 1) length
+= (minval
- 1) * duplength
;
2593 if (maxval
> minval
) length
+= (maxval
- minval
) * (duplength
+ 1);
2597 /* Non-special character. For a run of such characters the length required
2598 is the number of characters + 2, except that the maximum run length is 255.
2599 We won't get a skipped space or a non-data escape or the start of a #
2600 comment as the first character, so the length can't be zero. */
2608 if ((pcre_ctypes
[c
] & ctype_space
) != 0)
2610 if ((options
& PCRE_EXTENDED
) != 0) continue;
2614 if (c
== '#' && (options
& PCRE_EXTENDED
) != 0)
2616 while ((c
= *(++ptr
)) != 0 && c
!= '\n');
2620 /* Backslash may introduce a data char or a metacharacter; stop the
2621 string before the latter. */
2625 const uschar
*saveptr
= ptr
;
2626 c
= check_escape(&ptr
, errorptr
, bracount
, options
, FALSE
);
2627 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2628 if (c
< 0) { ptr
= saveptr
; break; }
2631 /* Ordinary character or single-char escape */
2636 /* This "while" is the end of the "do" above. */
2638 while (runlength
< 255 && (pcre_ctypes
[c
= *(++ptr
)] & ctype_meta
) == 0);
2641 length
+= runlength
;
2646 length
+= 4; /* For final KET and END */
2654 /* Compute the size of data block needed and get it, either from malloc or
2655 externally provided function. We specify "code[0]" in the offsetof() expression
2656 rather than just "code", because it has been reported that one broken compiler
2657 fails on "code" because it is also an independent variable. It should make no
2658 difference to the value of the offsetof(). */
2660 size
= length
+ offsetof(real_pcre
, code
[0]);
2661 re
= (real_pcre
*)(pcre_malloc
)(size
+50);
2669 /* Put in the magic number and the options. */
2671 re
->magic_number
= MAGIC_NUMBER
;
2672 re
->options
= options
;
2674 /* Set up a starting, non-extracting bracket, then compile the expression. On
2675 error, *errorptr will be set non-NULL, so we don't need to look at the result
2676 of the function here. */
2678 ptr
= (const uschar
*)pattern
;
2682 (void)compile_regex(options
, &bracount
, &code
, &ptr
, errorptr
, dictionary
);
2683 re
->top_bracket
= bracount
;
2684 re
->top_backref
= top_backref
;
2686 /* If not reached end of pattern on success, there's an excess bracket. */
2688 if (*errorptr
== NULL
&& *ptr
!= 0) *errorptr
= ERR22
;
2690 /* Fill in the terminating state and check for disastrous overflow, but
2691 if debugging, leave the test till after things are printed out. */
2697 if (code
- re
->code
> length
) *errorptr
= ERR23
;
2700 /* Failed to compile */
2702 if (*errorptr
!= NULL
)
2706 *erroroffset
= ptr
- (const uschar
*)pattern
;
2710 /* If the anchored option was not passed, set flag if we can determine that it
2711 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2712 we can determine what the first character has to be, because that speeds up
2713 unanchored matches no end. In the case of multiline matches, an alternative is
2714 to set the PCRE_STARTLINE flag if all branches start with ^. */
2716 if ((options
& PCRE_ANCHORED
) == 0)
2718 if (is_anchored(re
->code
, (options
& PCRE_MULTILINE
) != 0))
2719 re
->options
|= PCRE_ANCHORED
;
2722 int ch
= find_firstchar(re
->code
);
2725 re
->first_char
= ch
;
2726 re
->options
|= PCRE_FIRSTSET
;
2728 else if (is_startline(re
->code
))
2729 re
->options
|= PCRE_STARTLINE
;
2733 /* Print out the compiled data for debugging */
2737 printf("Length = %d top_bracket = %d top_backref=%d\n",
2738 length
, re
->top_bracket
, re
->top_backref
);
2740 if (re
->options
!= 0)
2742 printf("%s%s%s%s%s%s%s%s\n",
2743 ((re
->options
& PCRE_ANCHORED
) != 0)? "anchored " : "",
2744 ((re
->options
& PCRE_CASELESS
) != 0)? "caseless " : "",
2745 ((re
->options
& PCRE_EXTENDED
) != 0)? "extended " : "",
2746 ((re
->options
& PCRE_MULTILINE
) != 0)? "multiline " : "",
2747 ((re
->options
& PCRE_DOTALL
) != 0)? "dotall " : "",
2748 ((re
->options
& PCRE_DOLLAR_ENDONLY
) != 0)? "endonly " : "",
2749 ((re
->options
& PCRE_EXTRA
) != 0)? "extra " : "",
2750 ((re
->options
& PCRE_UNGREEDY
) != 0)? "ungreedy " : "");
2753 if ((re
->options
& PCRE_FIRSTSET
) != 0)
2755 if (isprint(re
->first_char
)) printf("First char = %c\n", re
->first_char
);
2756 else printf("First char = \\x%02x\n", re
->first_char
);
2760 code_base
= code
= re
->code
;
2762 while (code
< code_end
)
2766 printf("%3d ", code
- code_base
);
2768 if (*code
>= OP_BRA
)
2770 printf("%3d Bra %d", (code
[1] << 8) + code
[2], *code
- OP_BRA
);
2777 charlength
= *(++code
);
2778 printf("%3d ", charlength
);
2779 while (charlength
-- > 0)
2780 if (isprint(c
= *(++code
))) printf("%c", c
); else printf("\\x%02x", c
);
2790 printf("%3d %s", (code
[1] << 8) + code
[2], OP_names
[*code
]);
2801 case OP_TYPEMINSTAR
:
2803 case OP_TYPEMINPLUS
:
2805 case OP_TYPEMINQUERY
:
2806 if (*code
>= OP_TYPESTAR
)
2807 printf(" %s", OP_names
[code
[1]]);
2808 else if (isprint(c
= code
[1])) printf(" %c", c
);
2809 else printf(" \\x%02x", c
);
2810 printf("%s", OP_names
[*code
++]);
2816 if (isprint(c
= code
[3])) printf(" %c{", c
);
2817 else printf(" \\x%02x{", c
);
2818 if (*code
!= OP_EXACT
) printf("0,");
2819 printf("%d}", (code
[1] << 8) + code
[2]);
2820 if (*code
== OP_MINUPTO
) printf("?");
2826 case OP_TYPEMINUPTO
:
2827 printf(" %s{", OP_names
[code
[3]]);
2828 if (*code
!= OP_TYPEEXACT
) printf(",");
2829 printf("%d}", (code
[1] << 8) + code
[2]);
2830 if (*code
== OP_TYPEMINUPTO
) printf("?");
2835 if (isprint(c
= *(++code
))) printf(" [^%c]", c
);
2836 else printf(" [^\\x%02x]", c
);
2844 case OP_NOTMINQUERY
:
2845 if (isprint(c
= code
[1])) printf(" [^%c]", c
);
2846 else printf(" [^\\x%02x]", c
);
2847 printf("%s", OP_names
[*code
++]);
2853 if (isprint(c
= code
[3])) printf(" [^%c]{", c
);
2854 else printf(" [^\\x%02x]{", c
);
2855 if (*code
!= OP_NOTEXACT
) printf(",");
2856 printf("%d}", (code
[1] << 8) + code
[2]);
2857 if (*code
== OP_NOTMINUPTO
) printf("?");
2862 printf(" \\%d", *(++code
));
2864 goto CLASS_REF_REPEAT
;
2872 if (*code
==OP_CLASS_L
)
2875 printf("Locflag = %i ", *code
++);
2880 if (*code
++ == OP_CLASS
) printf(" [");
2885 for (i
= 0; i
< 256; i
++)
2887 if ((code
[i
/8] & (1 << (i
&7))) != 0)
2890 for (j
= i
+1; j
< 256; j
++)
2891 if ((code
[j
/8] & (1 << (j
&7))) == 0) break;
2892 if (i
== '-' || i
== ']') printf("\\");
2893 if (isprint(i
)) printf("%c", i
); else printf("\\x%02x", i
);
2897 if (j
== '-' || j
== ']') printf("\\");
2898 if (isprint(j
)) printf("%c", j
); else printf("\\x%02x", j
);
2917 printf("%s", OP_names
[*code
]);
2922 min
= (code
[1] << 8) + code
[2];
2923 max
= (code
[3] << 8) + code
[4];
2924 if (max
== 0) printf("{%d,}", min
);
2925 else printf("{%d,%d}", min
, max
);
2926 if (*code
== OP_CRMINRANGE
) printf("?");
2936 /* Anything else is just a one-node item */
2939 printf(" %s", OP_names
[*code
]);
2946 printf("------------------------------------------------------------------\n");
2948 /* This check is done here in the debugging case so that the code that
2949 was compiled can be seen. */
2951 if (code
- re
->code
> length
)
2953 printf("length=%i, code length=%i\n", length
, code
-re
->code
);
2956 *erroroffset
= ptr
- (uschar
*)pattern
;
2966 /*************************************************
2967 * Match a character type *
2968 *************************************************/
2970 /* Not used in all the places it might be as it's sometimes faster
2971 to put the code inline.
2973 Arguments:
2974 type the character type
2975 c the character to test
2976 dotall the dotall flag
2978 Returns: TRUE if character is of the type
2979 */
/* match_type: report whether character c satisfies the character-type opcode
   given in `type`. The visible cases are OP_ANY, the table-driven digit /
   whitespace / word tests, and the locale-aware word tests.
   NOTE(review): this view of the file has lost lines - the return type, the
   switch header and any default result are not visible here; confirm them
   against the original multi-file PCRE sources. */
2982 match_type(int type
, int c
, BOOL dotall
)
/* Trace output: the subject character (hex-escaped when isprint() rejects it)
   followed by the opcode name looked up in OP_names[].
   NOTE(review): presumably compiled only in debugging builds - the guard is
   not visible in this view. */
2986 if (isprint(c
)) printf("matching subject %c against ", c
);
2987 else printf("matching subject \\x%02x against ", c
);
2988 printf("%s\n", OP_names
[type
]);
/* OP_ANY accepts every character, except that a newline is accepted only
   when dotall is set. */
2993 case OP_ANY
: return dotall
|| c
!= '\n';
/* Table-driven classes: test the matching ctype_* bit of pcre_ctypes[c]; the
   NOT_ variants succeed when the bit is clear.
   NOTE(review): c indexes pcre_ctypes[] directly, so it is assumed to lie
   within the table's range - confirm at the call sites. */
2994 case OP_NOT_DIGIT
: return (pcre_ctypes
[c
] & ctype_digit
) == 0;
2995 case OP_DIGIT
: return (pcre_ctypes
[c
] & ctype_digit
) != 0;
2996 case OP_NOT_WHITESPACE
: return (pcre_ctypes
[c
] & ctype_space
) == 0;
2997 case OP_WHITESPACE
: return (pcre_ctypes
[c
] & ctype_space
) != 0;
2998 case OP_NOT_WORDCHAR
: return (pcre_ctypes
[c
] & ctype_word
) == 0;
2999 case OP_WORDCHAR
: return (pcre_ctypes
[c
] & ctype_word
) != 0;
/* Locale-aware word tests: a word character is '_' or anything isalnum()
   accepts, so the result follows the current C locale rather than the
   pcre_ctypes[] table. */
3000 case OP_NOT_WORDCHAR_L
: return (c
!='_' && !isalnum(c
));
3001 case OP_WORDCHAR_L
: return (c
=='_' || isalnum(c
));
3008 /*************************************************
3009 * Match a back-reference *
3010 *************************************************/
3012 /* If a back reference hasn't been set, the match fails.
3014 Arguments:
3015 number reference number (used directly as the index into offset_vector)
3016 eptr points into the subject
3017 length length to be matched
3018 md points to match data block
3020 Returns: TRUE if matched
3021 */
/* match_ref: compare `length` bytes of the subject at eptr with the text
   previously captured for a back reference.
   NOTE(review): this view of the file has lost lines - the return type, the
   braces, the test that selects between the two comparison loops and the
   final success return are not visible here. */
3024 match_ref(int number
, register const uschar
*eptr
, int length
, match_data
*md
)
/* p is the start of the captured text: offset_vector[number] is taken as an
   offset from start_subject. */
3026 const uschar
*p
= md
->start_subject
+ md
->offset_vector
[number
];
/* Trace output: show the subject text (or <null> once eptr has reached
   end_subject) and the captured text, both printed through pchars().
   NOTE(review): presumably compiled only in debugging builds - the guard is
   not visible in this view. */
3029 if (eptr
>= md
->end_subject
)
3030 printf("matching subject <null>");
3033 printf("matching subject ");
3034 pchars(eptr
, length
, TRUE
, md
);
3036 printf(" against backref ");
3037 pchars(p
, length
, FALSE
, md
);
3041 /* Always fail if not enough characters left */
/* NOTE(review): the distance tested here runs from p (the captured text) to
   end_subject, not from eptr, yet the loops below advance eptr as well.
   Confirm that eptr cannot be read past end_subject. */
3043 if (length
> md
->end_subject
- p
) return FALSE
;
3045 /* Separate the caseless case for speed */
/* First loop: both bytes are folded through pcre_lcc[] before comparing.
   Second loop: raw byte comparison. Either one returns FALSE at the first
   mismatch. NOTE(review): the condition choosing between them is missing
   from this view. */
3048 { while (length
-- > 0) if (pcre_lcc
[*p
++] != pcre_lcc
[*eptr
++]) return FALSE
; }
3050 { while (length
-- > 0) if (*p
++ != *eptr
++) return FALSE
; }
3055 static int free_stack(match_data
*md
)
3057 /* Free any stack space that was allocated by the call to match(). */
3058 if (md
->off_num
) free(md
->off_num
);
3059 if (md
->offset_top
) free(md
->offset_top
);
3060 if (md
->r1
) free(md
->r1
);
3061 if (md
->r2
) free(md
->r2
);
3062 if (md
->eptr
) free((char *)md
->eptr
);
3063 if (md
->ecode
) free((char *)md
->ecode
);
/* grow_stack: enlarge the six parallel arrays (offset_top, eptr, ecode,
   off_num, r1, r2) that hold saved matching state, indexed by md->point
   elsewhere in this file, and record the new capacity in md->length.
   NOTE(review): this view of the file has lost lines - the braces, the
   `else` joining the two sizing branches, anything between the NULL test and
   the longjmp, and the final return are not visible here. */
3067 static int grow_stack(match_data
*md
)
/* Already allocated: grow the capacity by half (integer division).
   NOTE(review): when md->length is 1 the increment md->length/2 is 0, so the
   capacity does not change; the first-time sizing below yields 1 for an
   empty subject. Confirm this cannot leave the stack unable to grow. */
3069 if (md
->length
!= 0)
3071 md
->length
= md
->length
+ md
->length
/2;
/* First allocation: one slot per subject byte plus one, capped at 80. */
3075 int string_len
= md
->end_subject
- md
->start_subject
+ 1;
3076 if (string_len
< 80) {md
->length
= string_len
; }
3077 else {md
->length
= 80;}
/* Resize all six arrays to md->length elements.
   NOTE(review): PyMem_RESIZE is defined outside this file; presumably it
   reassigns its first argument realloc-style, in which case a failed resize
   overwrites the only pointer to the old buffer - confirm. */
3079 PyMem_RESIZE(md
->offset_top
, int, md
->length
);
3080 PyMem_RESIZE(md
->eptr
, const uschar
*, md
->length
);
3081 PyMem_RESIZE(md
->ecode
, const uschar
*, md
->length
);
3082 PyMem_RESIZE(md
->off_num
, int, md
->length
);
3083 PyMem_RESIZE(md
->r1
, int, md
->length
);
3084 PyMem_RESIZE(md
->r2
, int, md
->length
);
/* If any of the six pointers is NULL afterwards, abandon the match with a
   non-local jump to md->error_env (value 1). */
3085 if (md
->offset_top
== NULL
|| md
->eptr
== NULL
|| md
->ecode
== NULL
||
3086 md
->off_num
== NULL
|| md
->r1
== NULL
|| md
->r2
== NULL
)
3089 longjmp(md
->error_env
, 1);
3095 /*************************************************
3096 * Match from current position *
3097 *************************************************/
3099 /* On entry ecode points to the first opcode, and eptr to the first character.
3101 Arguments:
3102 eptr pointer in subject
3103 ecode position in code
3104 offset_top current top pointer
3105 md pointer to "static" info for the match
3107 Returns: TRUE if matched
3108 */
3111 match(register const uschar
*eptr
, register const uschar
*ecode
, int offset_top
,
3114 int save_stack_position
= md
->point
;
3117 #define SUCCEED goto succeed
3118 #define FAIL goto fail
3122 int min
, max
, ctype
;
3125 BOOL minimize
= FALSE
;
3127 /* Opening bracket. Check the alternative branches in turn, failing if none
3128 match. We have to set the start offset if required and there is space
3129 in the offset vector so that it is available for subsequent back references
3130 if the bracket matches. However, if the bracket fails, we must put back the
3131 previous value of both offsets in case they were set by a previous copy of
3132 the same bracket. Don't worry about setting the flag for the error case here;
3133 that is handled in the code for KET. */
3135 if ((int)*ecode
>= OP_BRA
)
3137 int number
= (*ecode
- OP_BRA
) << 1;
3138 int save_offset1
= 0, save_offset2
= 0;
3140 DPRINTF(("start bracket %d\n", number
/2));
3142 if (number
> 0 && number
< md
->offset_end
)
3144 save_offset1
= md
->offset_vector
[number
];
3145 save_offset2
= md
->offset_vector
[number
+1];
3146 md
->offset_vector
[number
] = eptr
- md
->start_subject
;
3148 DPRINTF(("saving %d %d\n", save_offset1
, save_offset2
));
3151 /* Recurse for all the alternatives. */
3155 if (match(eptr
, ecode
+3, offset_top
, md
)) SUCCEED
;
3156 ecode
+= (ecode
[1] << 8) + ecode
[2];
3158 while (*ecode
== OP_ALT
);
3160 DPRINTF(("bracket %d failed\n", number
/2));
3162 if (number
> 0 && number
< md
->offset_end
)
3164 md
->offset_vector
[number
] = save_offset1
;
3165 md
->offset_vector
[number
+1] = save_offset2
;
3171 /* Other types of node can be handled by a switch */
3176 md
->end_match_ptr
= eptr
; /* Record where we ended */
3177 md
->end_offset_top
= offset_top
; /* and how many extracts were taken */
3180 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3181 whole thing doesn't match, so we have to get out via a longjmp(). */
3184 if (match(eptr
, ecode
+1, offset_top
, md
)) SUCCEED
;
3185 longjmp(md
->fail_env
, 1);
3187 /* Assertion brackets. Check the alternative branches in turn - the
3188 matching won't pass the KET for an assertion. If any one branch matches,
3189 the assertion is true. */
3194 if (match(eptr
, ecode
+3, offset_top
, md
)) break;
3195 ecode
+= (ecode
[1] << 8) + ecode
[2];
3197 while (*ecode
== OP_ALT
);
3198 if (*ecode
== OP_KET
) FAIL
;
3200 /* Continue from after the assertion, updating the offsets high water
3201 mark, since extracts may have been taken during the assertion. */
3203 do ecode
+= (ecode
[1] << 8) + ecode
[2]; while (*ecode
== OP_ALT
);
3205 offset_top
= md
->end_offset_top
;
3208 /* Negative assertion: all branches must fail to match */
3213 if (match(eptr
, ecode
+3, offset_top
, md
)) FAIL
;
3214 ecode
+= (ecode
[1] << 8) + ecode
[2];
3216 while (*ecode
== OP_ALT
);
3220 /* "Once" brackets are like assertion brackets except that after a match,
3221 the point in the subject string is not moved back. Thus there can never be
3222 a move back into the brackets. Check the alternative branches in turn - the
3223 matching won't pass the KET for this kind of subpattern. If any one branch
3224 matches, we carry on, leaving the subject pointer. */
3229 if (match(eptr
, ecode
+3, offset_top
, md
)) break;
3230 ecode
+= (ecode
[1] << 8) + ecode
[2];
3232 while (*ecode
== OP_ALT
);
3233 if (*ecode
== OP_KET
) FAIL
;
3235 /* Continue as from after the assertion, updating the offsets high water
3236 mark, since extracts may have been taken. */
3238 do ecode
+= (ecode
[1] << 8) + ecode
[2]; while (*ecode
== OP_ALT
);
3240 offset_top
= md
->end_offset_top
;
3241 eptr
= md
->end_match_ptr
;
3244 /* An alternation is the end of a branch; scan along to find the end of the
3245 bracketed group and go to there. */
3248 do ecode
+= (ecode
[1] << 8) + ecode
[2]; while (*ecode
== OP_ALT
);
3251 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3252 that it may occur zero times. It may repeat infinitely, or not at all -
3253 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3254 repeat limits are compiled as a number of copies, with the optional ones
3255 preceded by BRAZERO or BRAMINZERO. */
3259 const uschar
*next
= ecode
+1;
3260 if (match(eptr
, next
, offset_top
, md
)) SUCCEED
;
3261 do next
+= (next
[1] << 8) + next
[2]; while (*next
== OP_ALT
);
3268 const uschar
*next
= ecode
+1;
3269 do next
+= (next
[1] << 8) + next
[2]; while (*next
== OP_ALT
);
3270 if (match(eptr
, next
+3, offset_top
, md
)) SUCCEED
;
3275 /* End of a group, repeated or non-repeating. If we are at the end of
3276 an assertion "group", stop matching and SUCCEED, but record the
3277 current high water mark for use by positive assertions. */
3284 const uschar
*prev
= ecode
- (ecode
[1] << 8) - ecode
[2];
3286 if (*prev
== OP_ASSERT
|| *prev
== OP_ASSERT_NOT
|| *prev
== OP_ONCE
)
3288 md
->end_match_ptr
= eptr
; /* For ONCE */
3289 md
->end_offset_top
= offset_top
;
3293 /* In all other cases we have to check the group number back at the
3294 start and if necessary complete handling an extraction by setting the
3295 final offset and bumping the high water mark. */
3297 number
= (*prev
- OP_BRA
) << 1;
3299 DPRINTF(("end bracket %d\n", number
/2));
3303 if (number
>= md
->offset_end
) md
->offset_overflow
= TRUE
; else
3305 md
->offset_vector
[number
+1] = eptr
- md
->start_subject
;
3306 if (offset_top
<= number
) offset_top
= number
+ 2;
3310 /* For a non-repeating ket, just advance to the next node and continue at
3311 this level. */
3313 if (*ecode
== OP_KET
)
3319 /* The repeating kets try the rest of the pattern or restart from the
3320 preceding bracket, in the appropriate order. */
3322 if (*ecode
== OP_KETRMIN
)
3325 if (match(eptr
, ecode
+3, offset_top
, md
)) goto succeed
;
3326 /* Handle alternation inside the BRA...KET; push the additional
3327 alternatives onto the stack */
3330 ptr
+= (ptr
[1]<<8)+ ptr
[2];
3333 if (md
->length
== md
->point
)
3337 md
->offset_top
[md
->point
] = offset_top
;
3338 md
->eptr
[md
->point
] = eptr
;
3339 md
->ecode
[md
->point
] = ptr
+3;
3340 md
->r1
[md
->point
] = 0;
3341 md
->r2
[md
->point
] = 0;
3342 md
->off_num
[md
->point
] = 0;
3345 } while (*ptr
==OP_ALT
);
3346 ecode
=prev
+3; goto match_loop
;
3348 else /* OP_KETRMAX */
3351 /*int points_pushed=0;*/
3353 /* Push one failure point, that will resume matching at the code after
3354 the KETRMAX opcode. */
3355 if (md
->length
== md
->point
)
3359 md
->offset_top
[md
->point
] = offset_top
;
3360 md
->eptr
[md
->point
] = eptr
;
3361 md
->ecode
[md
->point
] = ecode
+3;
3362 md
->r1
[md
->point
] = md
->offset_vector
[number
];
3363 md
->r2
[md
->point
] = md
->offset_vector
[number
+1];
3364 md
->off_num
[md
->point
] = number
;
3367 md
->offset_vector
[number
] = eptr
- md
->start_subject
;
3368 /* Handle alternation inside the BRA...KET; push each of the
3369 additional alternatives onto the stack */
3372 ptr
+= (ptr
[1]<<8)+ ptr
[2];
3375 if (md
->length
== md
->point
)
3376 if (md
->length
== md
->point
)
3380 md
->offset_top
[md
->point
] = offset_top
;
3381 md
->eptr
[md
->point
] = eptr
;
3382 md
->ecode
[md
->point
] = ptr
+3;
3383 md
->r1
[md
->point
] = 0;
3384 md
->r2
[md
->point
] = 0;
3385 md
->off_num
[md
->point
] = 0;
3387 /*points_pushed++;*/
3389 } while (*ptr
==OP_ALT
);
3390 /* Jump to the first (or only) alternative and resume trying to match */
3391 ecode
=prev
+3; goto match_loop
;
3395 /* Start of subject unless notbol, or after internal newline if multiline */
3398 if (md
->notbol
&& eptr
== md
->start_subject
) FAIL
;
3401 if (eptr
!= md
->start_subject
&& eptr
[-1] != '\n') FAIL
;
3405 /* ... else fall through */
3407 /* Start of subject assertion */
3410 if (eptr
!= md
->start_subject
) FAIL
;
3414 /* Assert before internal newline if multiline, or before
3415 a terminating newline unless endonly is set, else end of subject unless
3416 noteol is set. */
3419 if (md
->noteol
&& eptr
>= md
->end_subject
) FAIL
;
3422 if (eptr
< md
->end_subject
&& *eptr
!= '\n') FAIL
;
3426 else if (!md
->endonly
)
3428 if (eptr
< md
->end_subject
- 1 ||
3429 (eptr
== md
->end_subject
- 1 && *eptr
!= '\n')) FAIL
;
3433 /* ... else fall through */
3435 /* End of subject assertion */
3438 if (eptr
< md
->end_subject
) FAIL
;
3442 /* Word boundary assertions */
3444 case OP_NOT_WORD_BOUNDARY
:
3445 case OP_WORD_BOUNDARY
:
3447 BOOL prev_is_word
= (eptr
!= md
->start_subject
) &&
3448 ((pcre_ctypes
[eptr
[-1]] & ctype_word
) != 0);
3449 BOOL cur_is_word
= (eptr
< md
->end_subject
) &&
3450 ((pcre_ctypes
[*eptr
] & ctype_word
) != 0);
3451 if ((*ecode
++ == OP_WORD_BOUNDARY
)?
3452 cur_is_word
== prev_is_word
: cur_is_word
!= prev_is_word
)
3457 case OP_NOT_WORD_BOUNDARY_L
:
3458 case OP_WORD_BOUNDARY_L
:
3460 BOOL prev_is_word
= (eptr
!= md
->start_subject
) &&
3461 (isalnum(eptr
[-1]) || eptr
[-1]=='_');
3462 BOOL cur_is_word
= (eptr
< md
->end_subject
) &&
3463 (isalnum(*eptr
) || *eptr
=='_');
3464 if ((*ecode
++ == OP_WORD_BOUNDARY_L
)?
3465 cur_is_word
== prev_is_word
: cur_is_word
!= prev_is_word
)
3471 /* Match a single character type; inline for speed */
3474 if (!md
->dotall
&& eptr
< md
->end_subject
&& *eptr
== '\n') FAIL
;
3475 if (eptr
++ >= md
->end_subject
) FAIL
;
3480 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_digit
) != 0)
3486 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_digit
) == 0)
3491 case OP_NOT_WHITESPACE
:
3492 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_space
) != 0)
3498 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_space
) == 0)
3503 case OP_NOT_WORDCHAR
:
3504 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_word
) != 0)
3510 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_word
) == 0)
3515 case OP_NOT_WORDCHAR_L
:
3516 if (eptr
>= md
->end_subject
|| (*eptr
=='_' || isalnum(*eptr
) ))
3523 if (eptr
>= md
->end_subject
|| (*eptr
!='_' && !isalnum(*eptr
) ))
3529 /* Match a back reference, possibly repeatedly. Look past the end of the
3530 item to see if there is repeat information following. The code is similar
3531 to that for character classes, but repeated for efficiency. Then obey
3532 similar code to character type repeats - written out again for speed.
3533 However, if the referenced string is the empty string, always treat
3534 it as matched, any number of times (otherwise there could be infinite
3535 loops). */
3540 int number
= ecode
[1] << 1; /* Doubled reference number */
3541 ecode
+= 2; /* Advance past the item */
3543 if (number
>= offset_top
|| md
->offset_vector
[number
] < 0)
3545 md
->errorcode
= PCRE_ERROR_BADREF
;
3549 length
= md
->offset_vector
[number
+1] - md
->offset_vector
[number
];
3559 c
= *ecode
++ - OP_CRSTAR
;
3560 minimize
= (c
& 1) != 0;
3561 min
= rep_min
[c
]; /* Pick up values from tables; */
3562 max
= rep_max
[c
]; /* zero for max => infinity */
3563 if (max
== 0) max
= INT_MAX
;
3568 minimize
= (*ecode
== OP_CRMINRANGE
);
3569 min
= (ecode
[1] << 8) + ecode
[2];
3570 max
= (ecode
[3] << 8) + ecode
[4];
3571 if (max
== 0) max
= INT_MAX
;
3575 default: /* No repeat follows */
3576 if (!match_ref(number
, eptr
, length
, md
)) FAIL
;
3578 continue; /* With the main loop */
3581 /* If the length of the reference is zero, just continue with the
3582 main loop. */
3584 if (length
== 0) continue;
3586 /* First, ensure the minimum number of matches are present. We get back
3587 the length of the reference string explicitly rather than passing the
3588 address of eptr, so that eptr can be a register variable. */
3590 for (i
= 1; i
<= min
; i
++)
3592 if (!match_ref(number
, eptr
, length
, md
)) FAIL
;
3596 /* If min = max, continue at the same level without recursion.
3597 They are not both allowed to be zero. */
3599 if (min
== max
) continue;
3601 /* If minimizing, keep trying and advancing the pointer */
3607 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
3608 if (i
>= max
|| !match_ref(number
, eptr
, length
, md
))
3612 /* Control never gets here */
3615 /* If maximizing, find the longest string and work backwards */
3619 const uschar
*pp
= eptr
;
3620 for (i
= min
; i
< max
; i
++)
3622 if (!match_ref(number
, eptr
, length
, md
)) break;
3627 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
3633 /* Control never gets here */
3635 /* Match a character class, possibly repeatedly. Look past the end of the
3636 item to see if there is repeat information following. Then obey similar
3637 code to character type repeats - written out again for speed. If caseless
3638 matching was set at runtime but not at compile time, we have to check both
3639 versions of a character, and we have to behave differently for positive and
3640 negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
3641 treated differently. */
3646 BOOL nasty_case
= *ecode
== OP_NEGCLASS
&& md
->runtime_caseless
;
3647 const uschar
*data
= ecode
+ 1; /* Save for matching */
3648 ecode
+= 33; /* Advance past the item */
3658 c
= *ecode
++ - OP_CRSTAR
;
3659 minimize
= (c
& 1) != 0;
3660 min
= rep_min
[c
]; /* Pick up values from tables; */
3661 max
= rep_max
[c
]; /* zero for max => infinity */
3662 if (max
== 0) max
= INT_MAX
;
3667 minimize
= (*ecode
== OP_CRMINRANGE
);
3668 min
= (ecode
[1] << 8) + ecode
[2];
3669 max
= (ecode
[3] << 8) + ecode
[4];
3670 if (max
== 0) max
= INT_MAX
;
3674 default: /* No repeat follows */
3679 /* First, ensure the minimum number of matches are present. */
3681 for (i
= 1; i
<= min
; i
++)
3683 if (eptr
>= md
->end_subject
) FAIL
;
3686 /* Either not runtime caseless, or it was a positive class. For
3687 runtime caseless, continue if either case is in the map. */
3691 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3692 if (md
->runtime_caseless
)
3695 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3699 /* Runtime caseless and it was a negative class. Continue only if
3700 both cases are in the map. */
3704 if ((data
[c
/8] & (1 << (c
&7))) == 0) FAIL
;
3706 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3712 /* If max == min we can continue with the main loop without the
3713 need to recurse. */
3715 if (min
== max
) continue;
3717 /* If minimizing, keep testing the rest of the expression and advancing
3718 the pointer while it matches the class. */
3724 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
3725 if (i
>= max
|| eptr
>= md
->end_subject
) FAIL
;
3728 /* Either not runtime caseless, or it was a positive class. For
3729 runtime caseless, continue if either case is in the map. */
3733 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3734 if (md
->runtime_caseless
)
3737 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3741 /* Runtime caseless and it was a negative class. Continue only if
3742 both cases are in the map. */
3746 if ((data
[c
/8] & (1 << (c
&7))) == 0) return FALSE
;
3748 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3753 /* Control never gets here */
3756 /* If maximizing, find the longest possible run, then work backwards. */
3760 const uschar
*pp
= eptr
;
3761 for (i
= min
; i
< max
; eptr
++, i
++)
3763 if (eptr
>= md
->end_subject
) break;
3766 /* Either not runtime caseless, or it was a positive class. For
3767 runtime caseless, continue if either case is in the map. */
3771 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3772 if (md
->runtime_caseless
)
3775 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3779 /* Runtime caseless and it was a negative class. Continue only if
3780 both cases are in the map. */
3784 if ((data
[c
/8] & (1 << (c
&7))) == 0) break;
3786 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3793 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
3797 /* Control never gets here */
3799 /* OP_CLASS_L opcode: handles localized character classes */
3803 const uschar
*data
= ecode
+ 1; /* Save for matching */
3804 const uschar locale_flag
= *data
;
3805 ecode
++; data
++; /* The localization support adds an extra byte */
3807 ecode
+= 33; /* Advance past the item */
3817 c
= *ecode
++ - OP_CRSTAR
;
3818 minimize
= (c
& 1) != 0;
3819 min
= rep_min
[c
]; /* Pick up values from tables; */
3820 max
= rep_max
[c
]; /* zero for max => infinity */
3821 if (max
== 0) max
= INT_MAX
;
3826 minimize
= (*ecode
== OP_CRMINRANGE
);
3827 min
= (ecode
[1] << 8) + ecode
[2];
3828 max
= (ecode
[3] << 8) + ecode
[4];
3829 if (max
== 0) max
= INT_MAX
;
3833 default: /* No repeat follows */
3834 if (eptr
>= md
->end_subject
) FAIL
;
3836 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue; /* With main loop */
3837 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3838 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3840 if ( (locale_flag
& 4) && isdigit(c
) ) continue; /* Locale \d */
3841 if ( (locale_flag
& 8) && !isdigit(c
) ) continue; /* Locale \D */
3842 if ( (locale_flag
& 16) && isspace(c
) ) continue; /* Locale \s */
3843 if ( (locale_flag
& 32) && !isspace(c
) ) continue; /* Locale \S */
3846 if (md
->runtime_caseless
)
3849 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue; /* With main loop */
3851 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3852 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3857 /* First, ensure the minimum number of matches are present. */
3859 for (i
= 1; i
<= min
; i
++)
3861 if (eptr
>= md
->end_subject
) FAIL
;
3863 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3864 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3865 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3867 if (md
->runtime_caseless
)
3870 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3871 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3872 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3877 /* If max == min we can continue with the main loop without the
3878 need to recurse. */
3880 if (min
== max
) continue;
3882 /* If minimizing, keep testing the rest of the expression and advancing
3883 the pointer while it matches the class. */
3889 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
3890 if (i
>= max
|| eptr
>= md
->end_subject
) FAIL
;
3892 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3893 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3894 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3896 if (md
->runtime_caseless
)
3899 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3900 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3901 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3905 /* Control never gets here */
3908 /* If maximizing, find the longest possible run, then work backwards. */
3912 const uschar
*pp
= eptr
;
3913 for (i
= min
; i
< max
; eptr
++, i
++)
3915 if (eptr
>= md
->end_subject
) break;
3917 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3918 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3919 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3920 if (md
->runtime_caseless
)
3923 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3924 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3925 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3931 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
3935 /* Control never gets here */
3937 /* Match a run of characters */
3941 register int length
= ecode
[1];
3944 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3945 if (eptr
>= md
->end_subject
)
3946 printf("matching subject <null> against pattern ");
3949 printf("matching subject ");
3950 pchars(eptr
, length
, TRUE
, md
);
3951 printf(" against pattern ");
3953 pchars(ecode
, length
, FALSE
, md
);
3957 if (length
> md
->end_subject
- eptr
) FAIL
;
3960 while (length
-- > 0) if (pcre_lcc
[*ecode
++] != pcre_lcc
[*eptr
++]) FAIL
;
3964 while (length
-- > 0) if (*ecode
++ != *eptr
++) FAIL
;
3969 /* Match a single character repeatedly; different opcodes share code. */
3972 min
= max
= (ecode
[1] << 8) + ecode
[2];
3979 max
= (ecode
[1] << 8) + ecode
[2];
3980 minimize
= *ecode
== OP_MINUPTO
;
3990 c
= *ecode
++ - OP_STAR
;
3991 minimize
= (c
& 1) != 0;
3992 min
= rep_min
[c
]; /* Pick up values from tables; */
3993 max
= rep_max
[c
]; /* zero for max => infinity */
3994 if (max
== 0) max
= INT_MAX
;
3996 /* Common code for all repeated single-character matches. We can give
3997 up quickly if there are fewer than the minimum number of characters left in
3998 the subject. */
4001 if (min
> md
->end_subject
- eptr
) FAIL
;
4004 /* The code is duplicated for the caseless and caseful cases, for speed,
4005 since matching characters is likely to be quite common. First, ensure the
4006 minimum number of matches are present. If min = max, continue at the same
4007 level without recursing. Otherwise, if minimizing, keep trying the rest of
4008 the expression and advancing one matching character if failing, up to the
4009 maximum. Alternatively, if maximizing, find the maximum number of
4010 characters and work backwards. */
4012 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c
, min
, max
,
4018 for (i
= 1; i
<= min
; i
++) if (c
!= pcre_lcc
[*eptr
++]) FAIL
;
4019 if (min
== max
) continue;
4024 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
4025 if (i
>= max
|| eptr
>= md
->end_subject
|| c
!= pcre_lcc
[*eptr
++])
4028 /* Control never gets here */
4032 const uschar
*pp
= eptr
;
4033 for (i
= min
; i
< max
; i
++)
4035 if (eptr
>= md
->end_subject
|| c
!= pcre_lcc
[*eptr
]) break;
4039 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
4042 /* Control never gets here */
4045 /* Caseful comparisons */
4049 for (i
= 1; i
<= min
; i
++) if (c
!= *eptr
++) FAIL
;
4050 if (min
== max
) continue;
4055 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
4056 if (i
>= max
|| eptr
>= md
->end_subject
|| c
!= *eptr
++) FAIL
;
4058 /* Control never gets here */
4062 const uschar
*pp
= eptr
;
4063 for (i
= min
; i
< max
; i
++)
4065 if (eptr
>= md
->end_subject
|| c
!= *eptr
) break;
4069 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
4073 /* Control never gets here */
4075 /* Match a negated single character */
4078 if (eptr
>= md
->end_subject
) FAIL
;
4082 if (pcre_lcc
[*ecode
++] == pcre_lcc
[*eptr
++]) FAIL
;
4086 if (*ecode
++ == *eptr
++) FAIL
;
4090 /* Match a negated single character repeatedly. This is almost a repeat of
4091 the code for a repeated single character, but I haven't found a nice way of
4092 commoning these up that doesn't require a test of the positive/negative
4093 option for each character match. Maybe that wouldn't add very much to the
4094 time taken, but character matching *is* what this is all about... */
4097 min
= max
= (ecode
[1] << 8) + ecode
[2];
4104 max
= (ecode
[1] << 8) + ecode
[2];
4105 minimize
= *ecode
== OP_NOTMINUPTO
;
4114 case OP_NOTMINQUERY
:
4115 c
= *ecode
++ - OP_NOTSTAR
;
4116 minimize
= (c
& 1) != 0;
4117 min
= rep_min
[c
]; /* Pick up values from tables; */
4118 max
= rep_max
[c
]; /* zero for max => infinity */
4119 if (max
== 0) max
= INT_MAX
;
4121 /* Common code for all repeated single-character matches. We can give
4122 up quickly if there are fewer than the minimum number of characters left in
4123 the subject. */
4126 if (min
> md
->end_subject
- eptr
) FAIL
;
4129 /* The code is duplicated for the caseless and caseful cases, for speed,
4130 since matching characters is likely to be quite common. First, ensure the
4131 minimum number of matches are present. If min = max, continue at the same
4132 level without recursing. Otherwise, if minimizing, keep trying the rest of
4133 the expression and advancing one matching character if failing, up to the
4134 maximum. Alternatively, if maximizing, find the maximum number of
4135 characters and work backwards. */
4137 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c
, min
, max
,
4143 for (i
= 1; i
<= min
; i
++) if (c
== pcre_lcc
[*eptr
++]) FAIL
;
4144 if (min
== max
) continue;
4149 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
4150 if (i
>= max
|| eptr
>= md
->end_subject
|| c
== pcre_lcc
[*eptr
++])
4153 /* Control never gets here */
4157 const uschar
*pp
= eptr
;
4158 for (i
= min
; i
< max
; i
++)
4160 if (eptr
>= md
->end_subject
|| c
== pcre_lcc
[*eptr
]) break;
4164 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
4167 /* Control never gets here */
4170 /* Caseful comparisons */
4174 for (i
= 1; i
<= min
; i
++) if (c
== *eptr
++) FAIL
;
4175 if (min
== max
) continue;
4180 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
4181 if (i
>= max
|| eptr
>= md
->end_subject
|| c
== *eptr
++) FAIL
;
4183 /* Control never gets here */
4187 const uschar
*pp
= eptr
;
4188 for (i
= min
; i
< max
; i
++)
4190 if (eptr
>= md
->end_subject
|| c
== *eptr
) break;
4194 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
4198 /* Control never gets here */
4200 /* Match a single character type repeatedly; several different opcodes
4201 share code. This is very similar to the code for single characters, but we
4202 repeat it in the interests of efficiency. */
4205 min
= max
= (ecode
[1] << 8) + ecode
[2];
4211 case OP_TYPEMINUPTO
:
4213 max
= (ecode
[1] << 8) + ecode
[2];
4214 minimize
= *ecode
== OP_TYPEMINUPTO
;
4219 case OP_TYPEMINSTAR
:
4221 case OP_TYPEMINPLUS
:
4223 case OP_TYPEMINQUERY
:
4224 c
= *ecode
++ - OP_TYPESTAR
;
4225 minimize
= (c
& 1) != 0;
4226 min
= rep_min
[c
]; /* Pick up values from tables; */
4227 max
= rep_max
[c
]; /* zero for max => infinity */
4228 if (max
== 0) max
= INT_MAX
;
4230 /* Common code for all repeated single character type matches */
4233 ctype
= *ecode
++; /* Code for the character type */
4235 /* First, ensure the minimum number of matches are present. Use inline
4236 code for maximizing the speed, and do the type test once at the start
4237 (i.e. keep it out of the loop). Also test that there are at least the
4238 minimum number of characters before we start. */
4240 if (min
> md
->end_subject
- eptr
) FAIL
;
4241 if (min
> 0) switch(ctype
)
4245 { for (i
= 1; i
<= min
; i
++) if (*eptr
++ == '\n') FAIL
; }
4250 for (i
= 1; i
<= min
; i
++)
4251 if ((pcre_ctypes
[*eptr
++] & ctype_digit
) != 0) FAIL
;
4255 for (i
= 1; i
<= min
; i
++)
4256 if ((pcre_ctypes
[*eptr
++] & ctype_digit
) == 0) FAIL
;
4259 case OP_NOT_WHITESPACE
:
4260 for (i
= 1; i
<= min
; i
++)
4261 if ((pcre_ctypes
[*eptr
++] & ctype_space
) != 0) FAIL
;
4265 for (i
= 1; i
<= min
; i
++)
4266 if ((pcre_ctypes
[*eptr
++] & ctype_space
) == 0) FAIL
;
4269 case OP_NOT_WORDCHAR
:
4270 for (i
= 1; i
<= min
; i
++) if ((pcre_ctypes
[*eptr
++] & ctype_word
) != 0)
4275 for (i
= 1; i
<= min
; i
++) if ((pcre_ctypes
[*eptr
++] & ctype_word
) == 0)
4279 case OP_NOT_WORDCHAR_L
:
4280 for (i
= 1; i
<= min
; i
++, eptr
++) if (*eptr
=='_' || isalnum(*eptr
))
4285 for (i
= 1; i
<= min
; i
++, eptr
++) if (*eptr
!='_' && !isalnum(*eptr
))
4290 /* If min = max, continue at the same level without recursing */
4292 if (min
== max
) continue;
4294 /* If minimizing, we have to test the rest of the pattern before each
4295 subsequent match, so inlining isn't much help; just use the function. */
4301 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
4302 if (i
>= max
|| eptr
>= md
->end_subject
||
4303 !match_type(ctype
, *eptr
++, md
->dotall
))
4306 /* Control never gets here */
4309 /* If maximizing it is worth using inline code for speed, doing the type
4310 test once at the start (i.e. keep it out of the loop). */
4314 const uschar
*pp
= eptr
;
4320 for (i
= min
; i
< max
; i
++)
4322 if (eptr
>= md
->end_subject
|| *eptr
== '\n') break;
4329 if (c
> md
->end_subject
- eptr
) c
= md
->end_subject
- eptr
;
4335 for (i
= min
; i
< max
; i
++)
4337 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_digit
) != 0)
4344 for (i
= min
; i
< max
; i
++)
4346 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_digit
) == 0)
4352 case OP_NOT_WHITESPACE
:
4353 for (i
= min
; i
< max
; i
++)
4355 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_space
) != 0)
4362 for (i
= min
; i
< max
; i
++)
4364 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_space
) == 0)
4370 case OP_NOT_WORDCHAR
:
4371 for (i
= min
; i
< max
; i
++)
4373 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_word
) != 0)
4380 for (i
= min
; i
< max
; i
++)
4382 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_word
) == 0)
4387 case OP_NOT_WORDCHAR_L
:
4388 for (i
= min
; i
< max
; i
++)
4390 if (eptr
>= md
->end_subject
|| (*eptr
=='_' || isalnum(*eptr
) ) )
4397 for (i
= min
; i
< max
; i
++)
4399 if (eptr
>= md
->end_subject
|| (*eptr
!='_' && !isalnum(*eptr
) ) )
4407 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
4410 /* Control never gets here */
4412 /* There's been some horrible disaster. */
4415 DPRINTF(("Unknown opcode %d\n", *ecode
));
4416 md
->errorcode
= PCRE_ERROR_UNKNOWN_NODE
;
4420 /* Do not stick any code in here without much thought; it is assumed that
4421 "continue" in the code above comes out to here to repeat the main loop. */
4424 } /* End of main loop */
4425 /* Control never reaches here */
4428 if (md
->point
> save_stack_position
)
4430 /* If there are still points remaining on the stack, pop the next one off */
4434 offset_top
= md
->offset_top
[md
->point
];
4435 eptr
= md
->eptr
[md
->point
];
4436 ecode
= md
->ecode
[md
->point
];
4437 off_num
= md
->off_num
[md
->point
];
4438 md
->offset_vector
[off_num
] = md
->r1
[md
->point
];
4439 md
->offset_vector
[off_num
+1] = md
->r2
[md
->point
];
4442 /* Failure, and nothing left on the stack, so end this function call */
4444 /* Restore the top of the stack to where it was before this function
4445 call. This lets us use one stack for everything; recursive calls
4446 can push and pop information, and may increase the stack. When
4447 the call returns, the parent function can resume pushing and
4448 popping wherever it was. */
4450 md
->point
= save_stack_position
;
4459 /*************************************************
4460 * Segregate setjmp() *
4461 *************************************************/
4463 /* The -Wall option of gcc gives warnings for all local variables when setjmp()
4464 is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
4465 hide it in a separate function. This is called only when PCRE_EXTRA is set,
4466 since it's needed only for the extension \X option, and with any luck, a good
4467 compiler will spot the tail recursion and compile it efficiently.
4470 eptr pointer in subject
4471 ecode position in code
4472 offset_top current top pointer
4473 md pointer to "static" info for the match
4475 Returns: TRUE if matched
4479 match_with_setjmp(const uschar
*eptr
, const uschar
*ecode
, int offset_top
,
4480 match_data
*match_block
)
4482 return setjmp(match_block
->fail_env
) == 0 &&
4483 match(eptr
, ecode
, offset_top
, match_block
);
4488 /*************************************************
4489 * Execute a Regular Expression *
4490 *************************************************/
4492 /* This function applies a compiled re to a subject string and picks out
4493 portions of the string if it matches. Two elements in the vector are set for
4494 each substring: the offsets to the start and end of the substring.

Arguments:
4497 external_re points to the compiled expression
4498 external_extra points to "hints" from pcre_study() or is NULL
4499 subject points to the subject string
4500 length length of subject string (may contain binary zeros)
start_pos offset into subject at which the first match attempt starts
options option bits
4502 offsets points to a vector of ints to be filled in with offsets
4503 offsetcount the number of elements in the vector

4505 Returns: > 0 => success; value is the number of elements filled in
4506 = 0 => success, but offsets is not big enough
4507 -1 => failed to match
4508 < -1 => some kind of unexpected problem
*/
4512 pcre_exec(const pcre
*external_re
, const pcre_extra
*external_extra
,
4513 const char *subject
, int length
, int start_pos
, int options
,
4514 int *offsets
, int offsetcount
)
4516 /* The "volatile" directives are to make gcc -Wall stop complaining
4517 that these variables can be clobbered by the longjmp. Hopefully
4518 they won't cost too much performance. */
4519 volatile int resetcount
, ocount
;
4520 volatile int first_char
= -1;
4521 match_data match_block
;
4522 const uschar
*start_bits
= NULL
;
4523 const uschar
*start_match
= (const uschar
*)subject
+ start_pos
;
4524 const uschar
*end_subject
;
4525 const real_pcre
*re
= (const real_pcre
*)external_re
;
4526 const real_pcre_extra
*extra
= (const real_pcre_extra
*)external_extra
;
4527 volatile BOOL using_temporary_offsets
= FALSE
;
4528 volatile BOOL anchored
= ((re
->options
| options
) & PCRE_ANCHORED
) != 0;
4529 volatile BOOL startline
= (re
->options
& PCRE_STARTLINE
) != 0;
4531 if ((options
& ~PUBLIC_EXEC_OPTIONS
) != 0) return PCRE_ERROR_BADOPTION
;
4533 if (re
== NULL
|| subject
== NULL
||
4534 (offsets
== NULL
&& offsetcount
> 0)) return PCRE_ERROR_NULL
;
4535 if (re
->magic_number
!= MAGIC_NUMBER
) return PCRE_ERROR_BADMAGIC
;
4537 match_block
.start_subject
= (const uschar
*)subject
;
4538 match_block
.end_subject
= match_block
.start_subject
+ length
;
4539 end_subject
= match_block
.end_subject
;
4541 match_block
.caseless
= ((re
->options
| options
) & PCRE_CASELESS
) != 0;
4542 match_block
.runtime_caseless
= match_block
.caseless
&&
4543 (re
->options
& PCRE_CASELESS
) == 0;
4545 match_block
.multiline
= ((re
->options
| options
) & PCRE_MULTILINE
) != 0;
4546 match_block
.dotall
= ((re
->options
| options
) & PCRE_DOTALL
) != 0;
4547 match_block
.endonly
= ((re
->options
| options
) & PCRE_DOLLAR_ENDONLY
) != 0;
4549 match_block
.notbol
= (options
& PCRE_NOTBOL
) != 0;
4550 match_block
.noteol
= (options
& PCRE_NOTEOL
) != 0;
4552 match_block
.errorcode
= PCRE_ERROR_NOMATCH
; /* Default error */
4554 /* Set the stack state to empty */
4555 match_block
.off_num
= match_block
.offset_top
= NULL
;
4556 match_block
.r1
= match_block
.r2
= NULL
;
4557 match_block
.eptr
= match_block
.ecode
= NULL
;
4558 match_block
.point
= match_block
.length
= 0;
4560 /* If the expression has more back references than the offsets supplied can
4561 hold, we get a temporary bit of working store to use during the matching.
4562 Otherwise, we can use the vector supplied, rounding its size down to a multiple of 2. */
4565 ocount
= offsetcount
& (-2);
4566 if (re
->top_backref
> 0 && re
->top_backref
>= ocount
/2)
4568 ocount
= re
->top_backref
* 2 + 2;
4569 match_block
.offset_vector
= (int *)(pcre_malloc
)(ocount
* sizeof(int));
4570 if (match_block
.offset_vector
== NULL
) return PCRE_ERROR_NOMEMORY
;
4571 using_temporary_offsets
= TRUE
;
4572 DPRINTF(("Got memory to hold back references\n"));
4574 else match_block
.offset_vector
= offsets
;
4576 match_block
.offset_end
= ocount
;
4577 match_block
.offset_overflow
= FALSE
;
4579 /* Compute the minimum number of offsets that we need to reset each time. Doing this
4580 makes a huge difference to execution time when there aren't many brackets in the pattern. */
4583 resetcount
= 2 + re
->top_bracket
* 2;
4584 if (resetcount
> offsetcount
) resetcount
= ocount
;
4586 /* If MULTILINE is set at exec time but was not set at compile time, and the
4587 anchored flag is set, we must re-check because a setting provoked by ^ in the
4588 pattern is not right in multi-line mode. Calling is_anchored() again here does
4589 the right check, because multiline is now set. If it now yields FALSE, the
4590 expression must have had ^ starting some of its branches. Check to see if
4591 that is true for *all* branches, and if so, set the startline flag. */
4593 if (match_block
.multiline
&& anchored
&& (re
->options
& PCRE_MULTILINE
) == 0 &&
4594 !is_anchored(re
->code
, match_block
.multiline
))
4597 if (is_startline(re
->code
)) startline
= TRUE
;
4600 /* Set up the first character to match, if available. The first_char value is
4601 never set for an anchored regular expression, but the anchoring may be forced
4602 at run time, so we have to test for anchoring. The first char may be unset for
4603 an unanchored pattern, of course. If there's no first char and the pattern was
4604 studied, there may be a bitmap of possible first characters. However, we can
4605 use this only if the caseless state of the studying was correct. */
4609 if ((re
->options
& PCRE_FIRSTSET
) != 0)
4611 first_char
= re
->first_char
;
4612 if (match_block
.caseless
) first_char
= pcre_lcc
[first_char
];
4615 if (!startline
&& extra
!= NULL
&&
4616 (extra
->options
& PCRE_STUDY_MAPPED
) != 0 &&
4617 ((extra
->options
& PCRE_STUDY_CASELESS
) != 0) == match_block
.caseless
)
4618 start_bits
= extra
->start_bits
;
4621 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4626 register int *iptr
= match_block
.offset_vector
;
4627 register int *iend
= iptr
+ resetcount
;
4629 /* Reset the maximum number of extractions we might see. */
4631 while (iptr
< iend
) *iptr
++ = -1;
4633 /* Advance to a unique first char if possible */
4635 if (first_char
>= 0)
4637 if (match_block
.caseless
)
4638 while (start_match
< end_subject
&& pcre_lcc
[*start_match
] != first_char
)
4641 while (start_match
< end_subject
&& *start_match
!= first_char
)
4645 /* Or to just after \n for a multiline match if possible */
4649 if (start_match
> match_block
.start_subject
)
4651 while (start_match
< end_subject
&& start_match
[-1] != '\n')
4656 /* Or to a non-unique first char */
4658 else if (start_bits
!= NULL
)
4660 while (start_match
< end_subject
)
4662 register int c
= *start_match
;
4663 if ((start_bits
[c
/8] & (1 << (c
&7))) == 0) start_match
++; else break;
4667 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4668 printf(">>>> Match against: ");
4669 pchars(start_match
, end_subject
- start_match
, TRUE
, &match_block
);
4673 /* When a match occurs, substrings will be set for all internal extractions;
4674 we just need to set up the whole thing as substring 0 before returning. If
4675 there were too many extractions, set the return code to zero. In the case
4676 where we had to get some local store to hold offsets for backreferences, copy
4677 those back references that we can. In this case there need not be overflow
4678 if certain parts of the pattern were not used.
4680 Before starting the match, we have to set up a longjmp() target to enable
4681 the "cut" operation to fail a match completely without backtracking. This
4682 is done in a separate function to avoid compiler warnings. We need not do
4683 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation supported. */
4686 /* To handle errors such as running out of memory for the failure
4687 stack, we need to save this location via setjmp(), so
4688 error-handling code can call longjmp() to jump out of deeply-nested code. */
4689 if (setjmp(match_block
.error_env
)==0)
4692 if ((re
->options
& PCRE_EXTRA
) != 0)
4694 if (!match_with_setjmp(start_match
, re
->code
, 2, &match_block
))
4697 else if (!match(start_match
, re
->code
, 2, &match_block
)) continue;
4699 /* Copy the offset information from temporary store if necessary */
4701 if (using_temporary_offsets
)
4703 if (offsetcount
>= 4)
4705 memcpy(offsets
+ 2, match_block
.offset_vector
+ 2,
4706 (offsetcount
- 2) * sizeof(int));
4707 DPRINTF(("Copied offsets from temporary memory\n"));
4709 if (match_block
.end_offset_top
> offsetcount
)
4710 match_block
.offset_overflow
= TRUE
;
4712 DPRINTF(("Freeing temporary memory\n"));
4713 (pcre_free
)(match_block
.offset_vector
);
4716 rc
= match_block
.offset_overflow
? 0 : match_block
.end_offset_top
/2;
4718 if (match_block
.offset_end
< 2) rc
= 0; else
4720 offsets
[0] = start_match
- match_block
.start_subject
;
4721 offsets
[1] = match_block
.end_match_ptr
- match_block
.start_subject
;
4724 DPRINTF((">>>> returning %d\n", rc
));
4725 free_stack(&match_block
);
4727 } /* End of (if setjmp(match_block.error_env)...) */
4728 free_stack(&match_block
);
4730 /* Return an error code; pcremodule.c will preserve the exception */
4731 if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY
;
4734 match_block
.errorcode
== PCRE_ERROR_NOMATCH
&&
4735 start_match
++ < end_subject
);
4737 if (using_temporary_offsets
)
4739 DPRINTF(("Freeing temporary memory\n"));
4740 (pcre_free
)(match_block
.offset_vector
);
4744 printf(">>>> returning %d\n", match_block
.errorcode
);
4747 return match_block
.errorcode
;