2 /*************************************************
3 * Perl-Compatible Regular Expressions *
4 *************************************************/
6 /* DO NOT EDIT THIS FILE! */
8 /* This file is automatically written by the merge-files.py script
9 included with the PCRE distribution for Python; it's produced from
10 several C files, and code is removed in the process. If you want to
11 modify the code or track down bugs, it will be much easier to work
12 with the code in its original, multiple-file form. Don't edit this
13 file by hand, or submit patches to it.
15 The Python-specific PCRE distribution can be retrieved from
16 http://starship.skyport.net/crew/amk/regex/
18 The unmodified original PCRE distribution is available at
19 ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
20 written by: Philip Hazel <ph10@cam.ac.uk>
22 Extensively modified by the Python String-SIG: <string-sig@python.org>
23 Send bug reports to: <string-sig@python.org>
24 (They'll figure out if it's a bug in PCRE or in the Python-specific
27 Copyright (c) 1997 University of Cambridge
29 -----------------------------------------------------------------------------
30 Permission is granted to anyone to use this software for any purpose on any
31 computer system, and to redistribute it freely, subject to the following
34 1. This software is distributed in the hope that it will be useful,
35 but WITHOUT ANY WARRANTY; without even the implied warranty of
36 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
38 2. The origin of this software must not be misrepresented, either by
39 explicit claim or by omission.
41 3. Altered versions must be plainly marked as such, and must not be
42 misrepresented as being the original software.
43 -----------------------------------------------------------------------------
54 /*************************************************
55 * Perl-Compatible Regular Expressions *
56 *************************************************/
58 /* This file is automatically written by the makechartables auxiliary
59 program. If you edit it by hand, you might like to edit the Makefile to
60 prevent its ever being regenerated. */
/* Lower-casing table, indexed by an unsigned byte value (0-255).
   pcre_lcc[c] is c itself for every byte except 'A'-'Z' (65-90), which map
   to 'a'-'z' (97-122). Bytes above 127 are left unchanged. */

unsigned char pcre_lcc[] = {
    0,  1,  2,  3,  4,  5,  6,  7,
    8,  9, 10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23,
   24, 25, 26, 27, 28, 29, 30, 31,
   32, 33, 34, 35, 36, 37, 38, 39,
   40, 41, 42, 43, 44, 45, 46, 47,
   48, 49, 50, 51, 52, 53, 54, 55,
   56, 57, 58, 59, 60, 61, 62, 63,
   64, 97, 98, 99,100,101,102,103,   /* '@', then 'A'-'G' -> 'a'-'g' */
  104,105,106,107,108,109,110,111,   /* 'H'-'O' -> 'h'-'o' */
  112,113,114,115,116,117,118,119,   /* 'P'-'W' -> 'p'-'w' */
  120,121,122, 91, 92, 93, 94, 95,   /* 'X'-'Z' -> 'x'-'z', then '[' - '_' */
   96, 97, 98, 99,100,101,102,103,
  104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,
  120,121,122,123,124,125,126,127,
  128,129,130,131,132,133,134,135,
  136,137,138,139,140,141,142,143,
  144,145,146,147,148,149,150,151,
  152,153,154,155,156,157,158,159,
  160,161,162,163,164,165,166,167,
  168,169,170,171,172,173,174,175,
  176,177,178,179,180,181,182,183,
  184,185,186,187,188,189,190,191,
  192,193,194,195,196,197,198,199,
  200,201,202,203,204,205,206,207,
  208,209,210,211,212,213,214,215,
  216,217,218,219,220,221,222,223,
  224,225,226,227,228,229,230,231,
  232,233,234,235,236,237,238,239,
  240,241,242,243,244,245,246,247,
  248,249,250,251,252,253,254,255 };
/* Case-flipping table, indexed by an unsigned byte value (0-255).
   pcre_fcc[c] swaps the case of ASCII letters: 'A'-'Z' map to 'a'-'z' and
   'a'-'z' map to 'A'-'Z'. Every other byte maps to itself. */

unsigned char pcre_fcc[] = {
    0,  1,  2,  3,  4,  5,  6,  7,
    8,  9, 10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23,
   24, 25, 26, 27, 28, 29, 30, 31,
   32, 33, 34, 35, 36, 37, 38, 39,
   40, 41, 42, 43, 44, 45, 46, 47,
   48, 49, 50, 51, 52, 53, 54, 55,
   56, 57, 58, 59, 60, 61, 62, 63,
   64, 97, 98, 99,100,101,102,103,   /* '@', then 'A'-'G' -> 'a'-'g' */
  104,105,106,107,108,109,110,111,   /* 'H'-'O' -> 'h'-'o' */
  112,113,114,115,116,117,118,119,   /* 'P'-'W' -> 'p'-'w' */
  120,121,122, 91, 92, 93, 94, 95,   /* 'X'-'Z' -> 'x'-'z', then '[' - '_' */
   96, 65, 66, 67, 68, 69, 70, 71,   /* '`', then 'a'-'g' -> 'A'-'G' */
   72, 73, 74, 75, 76, 77, 78, 79,   /* 'h'-'o' -> 'H'-'O' */
   80, 81, 82, 83, 84, 85, 86, 87,   /* 'p'-'w' -> 'P'-'W' */
   88, 89, 90,123,124,125,126,127,   /* 'x'-'z' -> 'X'-'Z', then '{' - 127 */
  128,129,130,131,132,133,134,135,
  136,137,138,139,140,141,142,143,
  144,145,146,147,148,149,150,151,
  152,153,154,155,156,157,158,159,
  160,161,162,163,164,165,166,167,
  168,169,170,171,172,173,174,175,
  176,177,178,179,180,181,182,183,
  184,185,186,187,188,189,190,191,
  192,193,194,195,196,197,198,199,
  200,201,202,203,204,205,206,207,
  208,209,210,211,212,213,214,215,
  216,217,218,219,220,221,222,223,
  224,225,226,227,228,229,230,231,
  232,233,234,235,236,237,238,239,
  240,241,242,243,244,245,246,247,
  248,249,250,251,252,253,254,255 };
/* This table contains bit maps for digits, letters, 'word' chars, and
   white space. Each map is 32 bytes long (one bit per byte value 0-255) and
   the bits run from the least significant end of each byte, so character c
   is tested with  map[c/8] & (1 << (c&7)).  The four maps are stored back
   to back in the order listed above, giving 128 bytes in total. */

unsigned char pcre_cbits[] = {
  /* Digits: '0'-'9' */
  0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,

  /* Letters: 'A'-'Z' and 'a'-'z' */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,

  /* 'Word' characters: digits, letters and '_' */
  0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
  0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,

  /* White space: bytes 9-13 and ' ' */
  0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
/* This table identifies various classes of character by individual bits.
   It is indexed by an unsigned byte value (0-255). As the entries below
   show, the bits are used as follows:

     0x01   white space character (bytes 9-13 and ' ')
     0x02   set for 'A'-'Z' and 'a'-'z'
     0x04   set for '0'-'9'
     0x08   hexadecimal digit
     0x10   alphanumeric or '_'
     0x20   set for '0'-'7'
     0x80   regular expression metacharacter or binary zero

   All entries for bytes 128-255 are zero. */

unsigned char pcre_ctypes[] = {
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* ' ' - ' */
  0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /*  (  - / */
  0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /*  0  - 7 */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /*  8  - ? */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  @  - G */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  H  - O */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  P  - W */
  0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /*  X  - _ */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  `  - g */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  h  - o */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  p  - w */
  0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /*  x  -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
202 /* End of chartables.c */
203 /*************************************************
204 * Perl-Compatible Regular Expressions *
205 *************************************************/
208 This is a library of functions to support regular expressions whose syntax
209 and semantics are as close as possible to those of the Perl 5 language. See
210 the file Tech.Notes for some information on the internals.
212 Written by: Philip Hazel <ph10@cam.ac.uk>
214 Copyright (c) 1998 University of Cambridge
216 -----------------------------------------------------------------------------
217 Permission is granted to anyone to use this software for any purpose on any
218 computer system, and to redistribute it freely, subject to the following
221 1. This software is distributed in the hope that it will be useful,
222 but WITHOUT ANY WARRANTY; without even the implied warranty of
223 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
225 2. The origin of this software must not be misrepresented, either by
226 explicit claim or by omission.
228 3. Altered versions must be plainly marked as such, and must not be
229 misrepresented as being the original software.
230 -----------------------------------------------------------------------------
234 /* Include the internals header, which itself includes Standard C headers plus
235 the external pcre header. */
240 /*************************************************
241 * Create bitmap of starting chars *
242 *************************************************/
244 /* This function scans a compiled unanchored expression and attempts to build a
245 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
246 goes by, we may be able to get more clever at doing this.
249 code points to an expression
250 start_bits points to a 32-byte table, initialized to 0
252 Returns: TRUE if table built, FALSE otherwise
256 set_start_bits(const uschar
*code
, uschar
*start_bits
)
263 const uschar
*tcode
= code
+ 3;
264 BOOL try_next
= TRUE
;
270 if ((int)*tcode
>= OP_BRA
|| *tcode
== OP_ASSERT
)
272 if (!set_start_bits(tcode
, start_bits
)) return FALSE
;
280 /* BRAZERO does the bracket, but carries on. */
284 if (!set_start_bits(++tcode
, start_bits
)) return FALSE
;
286 do tcode
+= (tcode
[1] << 8) + tcode
[2]; while (*tcode
== OP_ALT
);
291 /* Single-char * or ? sets the bit and tries the next item */
297 start_bits
[tcode
[1]/8] |= (1 << (tcode
[1]&7));
302 /* Single-char upto sets the bit and tries the next */
306 start_bits
[tcode
[3]/8] |= (1 << (tcode
[3]&7));
311 /* At least one single char sets the bit and stops */
313 case OP_EXACT
: /* Fall through */
316 case OP_CHARS
: /* Fall through */
321 start_bits
[tcode
[1]/8] |= (1 << (tcode
[1]&7));
324 /* Single character type sets the bits and stops */
327 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= ~pcre_cbits
[c
+cbit_digit
];
331 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= pcre_cbits
[c
+cbit_digit
];
334 case OP_NOT_WHITESPACE
:
335 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= ~pcre_cbits
[c
+cbit_space
];
339 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= pcre_cbits
[c
+cbit_space
];
342 case OP_NOT_WORDCHAR
:
343 for (c
= 0; c
< 32; c
++)
344 start_bits
[c
] |= ~(pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
348 for (c
= 0; c
< 32; c
++)
349 start_bits
[c
] |= (pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
352 /* One or more character type fudges the pointer and restarts, knowing
353 it will hit a single character type and stop there. */
366 /* Zero or more repeats of character types set the bits and then
371 tcode
+= 2; /* Fall through */
376 case OP_TYPEMINQUERY
:
380 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= ~pcre_cbits
[c
+cbit_digit
];
384 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= pcre_cbits
[c
+cbit_digit
];
387 case OP_NOT_WHITESPACE
:
388 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= ~pcre_cbits
[c
+cbit_space
];
392 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= pcre_cbits
[c
+cbit_space
];
395 case OP_NOT_WORDCHAR
:
396 for (c
= 0; c
< 32; c
++)
397 start_bits
[c
] |= ~(pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
401 for (c
= 0; c
< 32; c
++)
402 start_bits
[c
] |= (pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
410 /* Character class: set the bits and either carry on or not,
411 according to the repeat count. */
417 for (c
= 0; c
< 32; c
++) start_bits
[c
] |= tcode
[c
];
431 if (((tcode
[1] << 8) + tcode
[2]) == 0)
439 break; /* End of class handling */
441 } /* End of switch */
442 } /* End of try_next loop */
444 code
+= (code
[1] << 8) + code
[2]; /* Advance to next branch */
446 while (*code
== OP_ALT
);
452 /*************************************************
453 * Study a compiled expression *
454 *************************************************/
456 /* This function is handed a compiled expression that it must study to produce
457 information that will speed up the matching. It returns a pcre_extra block
458 which then gets handed back to pcre_exec().
461 re points to the compiled expression
462 options contains option bits
463 errorptr points to where to place error messages;
464 set NULL unless error
466 Returns: pointer to a pcre_extra block,
467 NULL on error or if no optimization possible
471 pcre_study(const pcre
*external_re
, int options
, const char **errorptr
)
474 uschar start_bits
[32];
475 real_pcre_extra
*extra
;
476 const real_pcre
*re
= (const real_pcre
*)external_re
;
480 if (re
== NULL
|| re
->magic_number
!= MAGIC_NUMBER
)
482 *errorptr
= "argument is not a compiled regular expression";
486 if ((options
& ~PUBLIC_STUDY_OPTIONS
) != 0)
488 *errorptr
= "unknown or incorrect option bit(s) set";
492 /* Caseless can either be from the compiled regex or from options. */
494 caseless
= ((re
->options
| options
) & PCRE_CASELESS
) != 0;
496 /* For an anchored pattern, or an unanchored pattern that has a first char, or a
497 multiline pattern that matches only at "line starts", no further processing at
500 if ((re
->options
& (PCRE_ANCHORED
|PCRE_FIRSTSET
|PCRE_STARTLINE
)) != 0)
503 /* See if we can find a fixed set of initial characters for the pattern. */
505 memset(start_bits
, 0, 32 * sizeof(uschar
));
506 if (!set_start_bits(re
->code
, start_bits
)) return NULL
;
508 /* If this studying is caseless, scan the created bit map and duplicate the
509 bits for any letters. */
514 for (c
= 0; c
< 256; c
++)
516 if ((start_bits
[c
/8] & (1 << (c
&7))) != 0 &&
517 (pcre_ctypes
[c
] & ctype_letter
) != 0)
520 start_bits
[d
/8] |= (1 << (d
&7));
525 /* Get an "extra" block and put the information therein. */
527 extra
= (real_pcre_extra
*)(pcre_malloc
)(sizeof(real_pcre_extra
));
531 *errorptr
= "failed to get memory";
535 extra
->options
= PCRE_STUDY_MAPPED
| (caseless
? PCRE_STUDY_CASELESS
: 0);
536 memcpy(extra
->start_bits
, start_bits
, sizeof(start_bits
));
538 return (pcre_extra
*)extra
;
542 /*************************************************
543 * Perl-Compatible Regular Expressions *
544 *************************************************/
547 This is a library of functions to support regular expressions whose syntax
548 and semantics are as close as possible to those of the Perl 5 language. See
549 the file Tech.Notes for some information on the internals.
551 Written by: Philip Hazel <ph10@cam.ac.uk>
553 Copyright (c) 1998 University of Cambridge
555 -----------------------------------------------------------------------------
556 Permission is granted to anyone to use this software for any purpose on any
557 computer system, and to redistribute it freely, subject to the following
560 1. This software is distributed in the hope that it will be useful,
561 but WITHOUT ANY WARRANTY; without even the implied warranty of
562 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
564 2. The origin of this software must not be misrepresented, either by
565 explicit claim or by omission.
567 3. Altered versions must be plainly marked as such, and must not be
568 misrepresented as being the original software.
569 -----------------------------------------------------------------------------
573 /* Define DEBUG to get debugging output on stdout. */
577 /* Use a macro for debugging printing, 'cause that eliminates the use
578 of #ifdef inline, and there are *still* stupid compilers about that don't like
579 indented pre-processor statements. I suppose it's only been 10 years... */
582 #define DPRINTF(p) printf p
584 #define DPRINTF(p) /*nothing*/
587 /* Include the internals header, which itself includes Standard C headers plus
588 the external pcre header. */
593 #ifndef Py_eval_input
594 /* For Python 1.4, graminit.h has to be explicitly included */
595 #define Py_eval_input eval_input
597 #endif /* FOR_PYTHON */
599 /* Allow compilation as C++ source code, should anybody want to do that. */
602 #define class pcre_class
/* Min and max values for the common repeats; for the maxima, 0 => infinity.
   Both tables have six entries and are indexed in parallel.
   NOTE(review): the index order presumably follows the "*", "*?", "+", "+?",
   "?", "??" sequence used for the repeat opcodes - confirm against the
   opcode definitions. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
611 /* Text forms of OP_ values and things, for debugging (not all used) */
614 static const char *OP_names
[] = {
615 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
616 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
617 "localized \\B", "localized \\b", "localized \\W", "localized \\w",
618 "^", "$", "Any", "chars",
620 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
621 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
622 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
623 "*", "*?", "+", "+?", "?", "??", "{", "{",
624 "class", "negclass", "classL", "Ref",
625 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
626 "Brazero", "Braminzero", "Bra"
630 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
631 are simple data values; negative values are for special things like \d and so
632 on. Zero means further processing is needed (for things like \x), or the escape
635 static const short int escapes
[] = {
636 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
637 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
638 '@', -ESC_A
, -ESC_B
, 0, -ESC_D
, 0, 0, 0, /* @ - G */
639 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
640 0, 0, 0, -ESC_S
, 0, 0, 0, -ESC_W
, /* P - W */
641 0, 0, -ESC_Z
, '[', '\\', ']', '^', '_', /* X - _ */
642 '`', 7, -ESC_b
, 0, -ESC_d
, 0, '\f', 0, /* ` - g */
643 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
644 0, 0, '\r', -ESC_s
, '\t', 0, '\v', -ESC_w
, /* p - w */
648 /* Definition to allow mutual recursion */
651 compile_regex(int, int *, uschar
**, const uschar
**, const char **,
654 /* Structure for passing "static" information around between the functions
655 doing the matching, so that they are thread-safe. */
657 typedef struct match_data
{
658 int errorcode
; /* As it says */
659 int *offset_vector
; /* Offset vector */
660 int offset_end
; /* One past the end */
661 BOOL offset_overflow
; /* Set if too many extractions */
662 BOOL caseless
; /* Case-independent flag */
663 BOOL runtime_caseless
; /* Caseless forced at run time */
664 BOOL multiline
; /* Multiline flag */
665 BOOL notbol
; /* NOTBOL flag */
666 BOOL noteol
; /* NOTEOL flag */
667 BOOL dotall
; /* Dot matches any char */
668 BOOL endonly
; /* Dollar not before final \n */
669 const uschar
*start_subject
; /* Start of the subject string */
670 const uschar
*end_subject
; /* End of the subject string */
671 jmp_buf fail_env
; /* Environment for longjump() break out */
672 const uschar
*end_match_ptr
; /* Subject position at end match */
673 int end_offset_top
; /* Highwater mark at end of match */
674 jmp_buf error_env
; /* For longjmp() if an error occurs deep inside a
675 matching operation */
676 int length
; /* Length of the allocated stacks */
677 int point
; /* Point to add next item pushed onto stacks */
678 /* Pointers to the 6 stacks */
679 int *off_num
, *offset_top
, *r1
, *r2
;
680 const uschar
**eptr
, **ecode
;
685 /*************************************************
687 *************************************************/
/* PCRE is thread-clean and doesn't use any global variables in the normal
sense. However, it calls memory allocation and free functions via the two
indirections below, which can be changed by the caller, but are shared
between all threads. They default to the standard library's malloc() and
free(). */

void *(*pcre_malloc)(size_t) = malloc;
void  (*pcre_free)(void *) = free;
700 /*************************************************
701 * Return version string *
702 *************************************************/
713 /*************************************************
714 * Return info about a compiled pattern *
715 *************************************************/
717 /* This function picks potentially useful data out of the private
721 external_re points to compiled code
722 optptr where to pass back the options
723 first_char where to pass back the first character,
724 or -1 if multiline and all branches start ^,
727 Returns: number of identifying extraction brackets
728 or negative values on error
732 pcre_info(const pcre
*external_re
, int *optptr
, int *first_char
)
734 const real_pcre
*re
= (real_pcre
*)external_re
;
735 if (re
== NULL
) return PCRE_ERROR_NULL
;
736 if (re
->magic_number
!= MAGIC_NUMBER
) return PCRE_ERROR_BADMAGIC
;
737 if (optptr
!= NULL
) *optptr
= (re
->options
& PUBLIC_OPTIONS
);
738 if (first_char
!= NULL
)
739 *first_char
= ((re
->options
& PCRE_FIRSTSET
) != 0)? re
->first_char
:
740 ((re
->options
& PCRE_STARTLINE
) != 0)? -1 : -2;
741 return re
->top_bracket
;
748 /*************************************************
749 * Debugging function to print chars *
750 *************************************************/
752 /* Print a sequence of chars in printable format, stopping at the end of the
753 subject if requested.
756 p points to characters
757 length number to print
758 is_subject TRUE if printing from within md->start_subject
759 md pointer to matching data block, if is_subject is TRUE
765 pchars(const uschar
*p
, int length
, BOOL is_subject
, match_data
*md
)
768 if (is_subject
&& length
> md
->end_subject
- p
) length
= md
->end_subject
- p
;
770 if (isprint(c
= *(p
++))) printf("%c", c
); else printf("\\x%02x", c
);
777 /*************************************************
778 * Check subpattern for empty operand *
779 *************************************************/
781 /* This function checks a bracketed subpattern to see if any of the paths
782 through it could match an empty string. This is used to diagnose an error if
783 such a subpattern is followed by a quantifier with an unlimited upper bound.
786 code points to the opening bracket
788 Returns: TRUE or FALSE
792 could_be_empty(uschar
*code
)
795 uschar
*cc
= code
+ 3;
797 /* Scan along the opcodes for this branch; as soon as we find something
798 that matches a non-empty string, break out and advance to test the next
799 branch. If we get to the end of the branch, return TRUE for the whole
804 /* Test an embedded subpattern; if it could not be empty, break the
805 loop. Otherwise carry on in the branch. */
807 if ((int)(*cc
) >= OP_BRA
|| (int)(*cc
) == OP_ONCE
)
809 if (!could_be_empty(cc
)) break;
810 do cc
+= (cc
[1] << 8) + cc
[2]; while (*cc
== OP_ALT
);
816 /* Reached end of a branch: the subpattern may match the empty string */
824 /* Skip over entire bracket groups with zero lower bound */
831 /* Skip over assertive subpatterns */
835 do cc
+= (cc
[1] << 8) + cc
[2]; while (*cc
== OP_ALT
);
839 /* Skip over things that don't match chars */
845 case OP_NOT_WORD_BOUNDARY
:
846 case OP_WORD_BOUNDARY
:
847 case OP_NOT_WORD_BOUNDARY_L
:
848 case OP_WORD_BOUNDARY_L
:
852 /* Skip over simple repeats with zero lower bound */
865 case OP_TYPEMINQUERY
:
869 /* Skip over UPTOs (lower bound is zero) */
878 /* Check a class or a back reference for a zero minimum */
886 case (OP_REF
): cc
+= 2; break;
887 case (OP_CLASS
): case (OP_NEGCLASS
): cc
+= 1+32; break;
888 case (OP_CLASS_L
): cc
+= 1+1+32; break;
902 if ((cc
[1] << 8) + cc
[2] != 0) goto NEXT_BRANCH
;
911 /* Anything else matches at least one character */
919 code
+= (code
[1] << 8) + code
[2];
921 while (*code
== OP_ALT
);
923 /* No branches match the empty string */
928 /* Determine the length of a group ID in an expression like
931 ptr pattern position pointer (say that 3 times fast)
932 finalchar the character that will mark the end of the ID
933 errorptr points to the pointer to the error message
937 get_group_id(const uschar
*ptr
, char finalchar
, const char **errorptr
)
939 const uschar
*start
= ptr
;
941 /* If the first character is not in \w, or is in \w but is a digit,
943 if (!(pcre_ctypes
[*ptr
] & ctype_word
) ||
944 (pcre_ctypes
[*ptr
++] & ctype_digit
))
946 *errorptr
= "(?P identifier must start with a letter or underscore";
950 /* Increment ptr until we either hit a null byte, the desired
951 final character, or a non-word character */
952 for(; (*ptr
!= 0) && (*ptr
!= finalchar
) &&
953 (pcre_ctypes
[*ptr
] & ctype_word
); ptr
++)
955 /* Empty loop body */
961 *errorptr
= "unterminated (?P identifier";
964 *errorptr
= "illegal character in (?P identifier";
968 /*************************************************
970 *************************************************/
972 /* This function is called when a \ has been encountered. It either returns a
973 positive value for a simple escape such as \n, or a negative value which
974 encodes one of the more complicated things such as \d. On entry, ptr is
975 pointing at the \. On exit, it is on the final character of the escape
979 ptrptr points to the pattern position pointer
980 errorptr points to the pointer to the error message
981 bracount number of previous extracting brackets
982 options the options bits
983 isclass TRUE if inside a character class
985 Returns: zero or positive => a data character
986 negative => a special escape sequence
987 on error, errorptr is set
991 check_escape(const uschar
**ptrptr
, const char **errorptr
, int bracount
,
992 int options
, BOOL isclass
)
994 const uschar
*ptr
= *ptrptr
;
995 int c
= *(++ptr
) & 255; /* Ensure > 0 on signed-char systems */
998 if (c
== 0) *errorptr
= ERR1
;
1000 /* Digits or letters may have special meaning; all others are literals. */
1002 else if (c
< '0' || c
> 'z') {}
1004 /* Do an initial lookup in a table. A non-zero result is something that can be
1005 returned immediately. Otherwise further processing may be required. */
1007 else if ((i
= escapes
[c
- '0']) != 0) c
= i
;
1009 /* Escapes that need further processing, or are illegal. */
1016 /* The handling of escape sequences consisting of a string of digits
1017 starting with one that is not zero is not straightforward. By experiment,
1018 the way Perl works seems to be as follows:
1020 Outside a character class, the digits are read as a decimal number. If the
1021 number is less than 10, or if there are that many previous extracting
1022 left brackets, then it is a back reference. Otherwise, up to three octal
1023 digits are read to form an escaped byte. Thus \123 is likely to be octal
1024 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
1025 value is greater than 377, the least significant 8 bits are taken. Inside a
1026 character class, \ followed by a digit is always an octal number. */
1028 case '1': case '2': case '3': case '4': case '5':
1029 case '6': case '7': case '8': case '9':
1032 /* PYTHON: Try to compute an octal value for a character */
1033 for(c
=0, i
=0; ptr
[i
]!=0 && i
<3; i
++)
1035 if (( pcre_ctypes
[ ptr
[i
] ] & ctype_odigit
) != 0)
1036 c
= c
* 8 + ptr
[i
]-'0';
1038 break; /* Non-octal character--break out of the loop */
1040 /* It's a character if there were exactly 3 octal digits, or if
1041 we're inside a character class and there was at least one
1043 if ( (i
== 3) || (isclass
&& i
!=0) )
1048 c
= ptr
[0]; /* Restore the first character after the \ */
1050 while (i
<2 && (pcre_ctypes
[ptr
[1]] & ctype_digit
) != 0)
1052 c
= c
* 10 + ptr
[1] - '0';
1055 if (c
> 255 - ESC_REF
) *errorptr
= "back reference too big";
1060 /* \0 always starts an octal number, but we may drop through to here with a
1061 larger first octal digit */
1065 while(i
++ < 2 && (pcre_ctypes
[ptr
[1]] & ctype_digit
) != 0 &&
1066 ptr
[1] != '8' && ptr
[1] != '9')
1067 c
= c
* 8 + *(++ptr
) - '0';
1070 /* Special escapes not starting with a digit are straightforward */
1074 while ( (pcre_ctypes
[ptr
[1]] & ctype_xdigit
) != 0)
1077 c
= c
* 16 + pcre_lcc
[*ptr
] -
1078 (((pcre_ctypes
[*ptr
] & ctype_digit
) != 0)? '0' : 'W');
1084 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1085 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1086 for Perl compatibility, it is a literal. */
1089 if ((options
& PCRE_EXTRA
) != 0) switch(c
)
1092 c
= -ESC_X
; /* This could be a lookup if it ever got into Perl */
1109 /*************************************************
1110 * Check for counted repeat *
1111 *************************************************/
1113 /* This function is called when a '{' is encountered in a place where it might
1114 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1115 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1116 where the ddds are digits.
1119 p pointer to the first char after '{'
1121 Returns: TRUE or FALSE
1125 is_counted_repeat(const uschar
*p
)
1127 if ((pcre_ctypes
[*p
++] & ctype_digit
) == 0) return FALSE
;
1128 while ((pcre_ctypes
[*p
] & ctype_digit
) != 0) p
++;
1129 if (*p
== '}') return TRUE
;
1131 if (*p
++ != ',') return FALSE
;
1132 if (*p
== '}') return TRUE
;
1134 if ((pcre_ctypes
[*p
++] & ctype_digit
) == 0) return FALSE
;
1135 while ((pcre_ctypes
[*p
] & ctype_digit
) != 0) p
++;
1141 /*************************************************
1142 * Read repeat counts *
1143 *************************************************/
1145 /* Read an item of the form {n,m} and return the values. This is called only
1146 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1147 so the syntax is guaranteed to be correct, but we need to check the values.
1150 p pointer to first char after '{'
1151 minp pointer to int for min
1152 maxp pointer to int for max
1153 returned as -1 if no max
1154 errorptr points to pointer to error message
1156 Returns: pointer to '}' on success;
1157 current ptr on error, with errorptr set
1160 static const uschar
*
1161 read_repeat_counts(const uschar
*p
, int *minp
, int *maxp
, const char **errorptr
)
1166 while ((pcre_ctypes
[*p
] & ctype_digit
) != 0) min
= min
* 10 + *p
++ - '0';
1168 if (*p
== '}') max
= min
; else
1173 while((pcre_ctypes
[*p
] & ctype_digit
) != 0) max
= max
* 10 + *p
++ - '0';
1182 /* Do paranoid checks, then fill in the required variables, and pass back the
1183 pointer to the terminating '}'. */
1185 if (min
> 65535 || max
> 65535)
1197 /*************************************************
1198 * Compile one branch *
1199 *************************************************/
1201 /* Scan the pattern, compiling it into the code vector.
1204 options the option bits
1205 bracket points to number of brackets used
1206 code points to the pointer to the current code point
1207 ptrptr points to the current pattern pointer
1208 errorptr points to pointer to error message
1210 Returns: TRUE on success
1211 FALSE, with *errorptr set on error
/* Compile a single branch of the pattern into the code vector.
   options     option bits; this routine tests PCRE_UNGREEDY, PCRE_EXTENDED,
               PCRE_LOCALE, PCRE_CASELESS and PCRE_EXTRA
   brackets    running count of extracting brackets; it is incremented at
               do_grouping_bracket and compared against EXTRACT_MAX
   codeptr     address of the output pointer; copied into the local code
   ptrptr      address of the pattern pointer; copied into the local ptr
   errorptr    slot that receives an error message
   dictionary  Python dict used for named groups: the (?P<name> form stores
               name -> bracket number with PyDict_SetItem, and the (?P=name
               form looks the name up with PyDict_GetItem

   The visible sections are, in order: skipping of white space and comments
   under PCRE_EXTENDED, character classes (built as a 32-byte bit map),
   the repeat operators, bracketed groups including the Python-specific
   (?P forms, backslash escapes, and runs of ordinary data characters.

   NOTE(review): in the part of the (?P<name> handling that is visible here,
   the result of PyDict_SetItem is not examined - confirm whether a failure
   there needs to be reported through errorptr. */
1215 compile_branch(int options
, int *brackets
, uschar
**codeptr
,
1216 const uschar
**ptrptr
, const char **errorptr
, PyObject
*dictionary
)
1218 int repeat_type
, op_type
;
1219 int repeat_min
, repeat_max
;
1220 int bravalue
, length
;
1221 int greedy_default
, greedy_non_default
;
1223 register uschar
*code
= *codeptr
;
1224 const uschar
*ptr
= *ptrptr
;
1225 const uschar
*oldptr
;
1226 uschar
*previous
= NULL
;
1228 uschar
*class_flag
; /* Pointer to the single-byte flag for OP_CLASS_L */
1230 /* Set up the default and non-default settings for greediness */
1232 greedy_default
= ((options
& PCRE_UNGREEDY
) != 0);
1233 greedy_non_default
= greedy_default
^ 1;
1235 /* Switch on next character until the end of the branch */
1240 int class_charcount
;
1244 if ((options
& PCRE_EXTENDED
) != 0)
1246 if ((pcre_ctypes
[c
] & ctype_space
) != 0) continue;
1249 while ((c
= *(++ptr
)) != 0 && c
!= '\n');
1256 /* The branch terminates at end of string, |, or ). */
1265 /* Handle single-character metacharacters */
1282 /* Character classes. These always build a 32-byte bitmap of the permitted
1283 characters, except in the special case where there is only one character.
1284 For negated classes, we build the map as usual, then invert it at the end.
1289 if (options
& PCRE_LOCALE
)
1291 *code
++ = OP_CLASS_L
;
1292 /* Set the flag for localized classes (like \w) to 0 */
1302 /* If the first character is '^', set the negation flag, and use a
1303 different opcode. This only matters if caseless matching is specified at
1306 if ((c
= *(++ptr
)) == '^')
1308 negate_class
= TRUE
;
1309 if (*(code
-1)==OP_CLASS
) *(code
-1) = OP_NEGCLASS
;
1312 else negate_class
= FALSE
;
1314 /* Keep a count of chars so that we can optimize the case of just a single
1317 class_charcount
= 0;
1318 class_lastchar
= -1;
1320 /* Initialize the 32-char bit map to all zeros. We have to build the
1321 map in a temporary bit of store, in case the class contains only 1
1322 character, because in that case the compiled code doesn't use the
1325 memset(class, 0, 32 * sizeof(uschar
));
1327 /* Process characters until ] is reached. By writing this as a "do" it
1328 means that an initial ] is taken as a data character. */
1338 /* Backslash may introduce a single character, or it may introduce one
1339 of the specials, which just set a flag. Escaped items are checked for
1340 validity in the pre-compiling pass. The sequence \b is a special case.
1341 Inside a class (and only there) it is treated as backspace. Elsewhere
1342 it marks a word boundary. Other escapes have preset maps ready to
1343 or into the one we are building. We assume they have more than one
1344 character in them, so set class_charcount bigger than one. */
1348 c
= check_escape(&ptr
, errorptr
, *brackets
, options
, TRUE
);
1349 if (-c
== ESC_b
) c
= '\b';
1352 class_charcount
= 10;
1357 for (c
= 0; c
< 32; c
++) class[c
] |= pcre_cbits
[c
+cbit_digit
];
1363 for (c
= 0; c
< 32; c
++) class[c
] |= ~pcre_cbits
[c
+cbit_digit
];
1368 if (options
& PCRE_LOCALE
)
1374 for (c
= 0; c
< 32; c
++)
1375 class[c
] |= (pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
1380 if (options
& PCRE_LOCALE
)
1386 for (c
= 0; c
< 32; c
++)
1387 class[c
] |= ~(pcre_cbits
[c
] | pcre_cbits
[c
+cbit_word
]);
1393 for (c
= 0; c
< 32; c
++) class[c
] |= pcre_cbits
[c
+cbit_space
];
1399 for (c
= 0; c
< 32; c
++) class[c
] |= ~pcre_cbits
[c
+cbit_space
];
1408 /* Fall through if single character */
1411 /* A single character may be followed by '-' to form a range. However,
1412 Perl does not permit ']' to be the end of the range. A '-' character
1413 here is treated as a literal. */
1415 if (ptr
[1] == '-' && ptr
[2] != ']')
1427 /* The second part of a range can be a single-character escape, but
1428 not any of the other escapes. */
1432 d
= check_escape(&ptr
, errorptr
, *brackets
, options
, TRUE
);
1435 if (d
== -ESC_b
) d
= '\b'; else
1451 class[c
/8] |= (1 << (c
&7));
1452 if ((options
& PCRE_CASELESS
) != 0)
1454 int uc
= pcre_fcc
[c
]; /* flip case */
1455 class[uc
/8] |= (1 << (uc
&7));
1457 class_charcount
++; /* in case a one-char range */
1460 continue; /* Go get the next char in the class */
1463 /* Handle a lone single character - we can get here for a normal
1464 non-escape char, or after \ that introduces a single character. */
1466 class [c
/8] |= (1 << (c
&7));
1467 if ((options
& PCRE_CASELESS
) != 0)
1469 c
= pcre_fcc
[c
]; /* flip case */
1470 class[c
/8] |= (1 << (c
&7));
1476 /* Loop until ']' reached; the check for end of string happens inside the
1477 loop. This "while" is the end of the "do" above. */
1479 while ((c
= *(++ptr
)) != ']');
1481 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1482 precisely one character. This doesn't need the whole 32-byte bit map.
1483 We turn it into a 1-character OP_CHARS if it's positive, or OP_NOT if
1486 if (class_charcount
== 1 && class_lastchar
>= 0)
1494 code
[-1] = OP_CHARS
;
1497 *code
++ = class_lastchar
;
1500 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1505 /* If this is a localized opcode, bump the code pointer up */
1506 if (class_flag
) code
++;
1509 if (class_flag
) *class_flag
= (*class_flag
) ^ 63;
1510 for (c
= 0; c
< 32; c
++) code
[c
] = ~class[c
];
1513 memcpy(code
, class, 32);
1518 /* Various kinds of repeat */
1521 if (!is_counted_repeat(ptr
+1)) goto NORMAL_CHAR
;
1522 ptr
= read_repeat_counts(ptr
+1, &repeat_min
, &repeat_max
, errorptr
);
1523 if (*errorptr
!= NULL
) goto FAILED
;
1541 if (previous
== NULL
)
1547 /* If the next character is '?' this is a minimizing repeat, by default,
1548 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1552 { repeat_type
= greedy_non_default
; ptr
++; }
1553 else repeat_type
= greedy_default
;
1555 /* If the maximum is zero then the minimum must also be zero; Perl allows
1556 this case, so we do too - by simply omitting the item altogether. */
1558 if (repeat_max
== 0) code
= previous
;
1560 /* If previous was a string of characters, chop off the last one and use it
1561 as the subject of the repeat. If there was only one character, we can
1562 abolish the previous item altogether. */
1564 else if (*previous
== OP_CHARS
)
1566 int len
= previous
[1];
1574 c
= previous
[len
+1];
1578 op_type
= 0; /* Use single-char op codes */
1579 goto OUTPUT_SINGLE_REPEAT
; /* Code shared with single character types */
1582 /* If previous was a single negated character ([^a] or similar), we use
1583 one of the special opcodes, replacing it. The code is shared with single-
1584 character repeats by adding a suitable offset into repeat_type. */
1586 else if ((int)*previous
== OP_NOT
)
1588 op_type
= OP_NOTSTAR
- OP_STAR
; /* Use "not" opcodes */
1591 goto OUTPUT_SINGLE_REPEAT
;
1594 /* If previous was a character type match (\d or similar), abolish it and
1595 create a suitable repeat item. The code is shared with single-character
1596 repeats by adding a suitable offset into repeat_type. */
1598 else if ((int)*previous
< OP_CIRC
|| *previous
== OP_ANY
)
1600 op_type
= OP_TYPESTAR
- OP_STAR
; /* Use type opcodes */
1604 OUTPUT_SINGLE_REPEAT
:
1605 repeat_type
+= op_type
; /* Combine both values for many cases */
1607 /* A minimum of zero is handled either as the special case * or ?, or as
1608 an UPTO, with the maximum given. */
1610 if (repeat_min
== 0)
1612 if (repeat_max
== -1) *code
++ = OP_STAR
+ repeat_type
;
1613 else if (repeat_max
== 1) *code
++ = OP_QUERY
+ repeat_type
;
1616 *code
++ = OP_UPTO
+ repeat_type
;
1617 *code
++ = repeat_max
>> 8;
1618 *code
++ = (repeat_max
& 255);
1622 /* The case {1,} is handled as the special case + */
1624 else if (repeat_min
== 1 && repeat_max
== -1)
1625 *code
++ = OP_PLUS
+ repeat_type
;
1627 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1628 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1632 if (repeat_min
!= 1)
1634 *code
++ = OP_EXACT
+ op_type
; /* NB EXACT doesn't have repeat_type */
1635 *code
++ = repeat_min
>> 8;
1636 *code
++ = (repeat_min
& 255);
1639 /* If the minimum is 1 and the previous item was a character string,
1640 we either have to put back the item that got cancelled if the string
1641 length was 1, or add the character back onto the end of a longer
1642 string. For a character type nothing need be done; it will just get
1643 put back naturally. Note that the final character is always going to
1646 else if (*previous
== OP_CHARS
)
1648 if (code
== previous
) code
+= 2; else previous
[1]++;
1651 /* For a single negated character we also have to put back the
1652 item that got cancelled. */
1654 else if (*previous
== OP_NOT
) code
++;
1656 /* If the maximum is unlimited, insert an OP_STAR. */
1661 *code
++ = OP_STAR
+ repeat_type
;
1664 /* Else insert an UPTO if the max is greater than the min. */
1666 else if (repeat_max
!= repeat_min
)
1669 repeat_max
-= repeat_min
;
1670 *code
++ = OP_UPTO
+ repeat_type
;
1671 *code
++ = repeat_max
>> 8;
1672 *code
++ = (repeat_max
& 255);
1676 /* The character or character type itself comes last in all cases. */
1681 /* If previous was a character class or a back reference, we put the repeat
1684 else if (*previous
== OP_CLASS
|| *previous
== OP_NEGCLASS
||
1685 *previous
==OP_CLASS_L
|| *previous
== OP_REF
)
1687 if (repeat_min
== 0 && repeat_max
== -1)
1688 *code
++ = OP_CRSTAR
+ repeat_type
;
1689 else if (repeat_min
== 1 && repeat_max
== -1)
1690 *code
++ = OP_CRPLUS
+ repeat_type
;
1691 else if (repeat_min
== 0 && repeat_max
== 1)
1692 *code
++ = OP_CRQUERY
+ repeat_type
;
1695 *code
++ = OP_CRRANGE
+ repeat_type
;
1696 *code
++ = repeat_min
>> 8;
1697 *code
++ = repeat_min
& 255;
1698 if (repeat_max
== -1) repeat_max
= 0; /* 2-byte encoding for max */
1699 *code
++ = repeat_max
>> 8;
1700 *code
++ = repeat_max
& 255;
1704 /* If previous was a bracket group, we may have to replicate it in certain
1705 cases. If the maximum repeat count is unlimited, check that the bracket
1706 group cannot match the empty string, and diagnose an error if it can. */
1708 else if ((int)*previous
>= OP_BRA
)
1711 int len
= code
- previous
;
1713 if (repeat_max
== -1 && could_be_empty(previous
))
1719 /* If the minimum is greater than zero, and the maximum is unlimited or
1720 equal to the minimum, the first copy remains where it is, and is
1721 replicated up to the minimum number of times. This case includes the +
1722 repeat, but of course no replication is needed in that case. */
1724 if (repeat_min
> 0 && (repeat_max
== -1 || repeat_max
== repeat_min
))
1726 for (i
= 1; i
< repeat_min
; i
++)
1728 memcpy(code
, previous
, len
);
1733 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1734 Then, if there is a fixed upper limit, replicated up to that many times,
1735 sticking BRAZERO in front of all the optional ones. */
1739 if (repeat_min
== 0)
1741 memmove(previous
+1, previous
, len
);
1743 *previous
++ = OP_BRAZERO
+ repeat_type
;
1746 for (i
= 1; i
< repeat_min
; i
++)
1748 memcpy(code
, previous
, len
);
1752 for (i
= (repeat_min
> 0)? repeat_min
: 1; i
< repeat_max
; i
++)
1754 *code
++ = OP_BRAZERO
+ repeat_type
;
1755 memcpy(code
, previous
, len
);
1760 /* If the maximum is unlimited, set a repeater in the final copy. */
1762 if (repeat_max
== -1) code
[-3] = OP_KETRMAX
+ repeat_type
;
1765 /* Else there's some kind of shambles */
1773 /* In all cases we no longer have a previous item. */
1779 /* Start of nested bracket sub-expression, or comment or lookahead.
1780 First deal with special things that can come after a bracket; all are
1781 introduced by ?, and the appearance of any of them means that this is not a
1782 referencing group. They were checked for validity in the first pass over
1783 the string, so we don't have to check for syntax errors here. */
1786 previous
= code
; /* Only real brackets can be repeated */
1787 if (*(++ptr
) == '?')
1800 while (*ptr
!= ')') ptr
++;
1804 case ':': /* Non-extracting bracket */
1808 case '=': /* Assertions can't be repeated */
1809 bravalue
= OP_ASSERT
;
1815 bravalue
= OP_ASSERT_NOT
;
1824 /* (?P<groupname>...) */
1826 PyObject
*string
, *intobj
;
1829 idlen
= get_group_id(ptr
, '>', errorptr
);
1833 string
= PyString_FromStringAndSize((char*)ptr
, idlen
);
1834 intobj
= PyInt_FromLong( brackets
[0] + 1 );
1835 if (intobj
== NULL
|| string
== NULL
)
1839 *errorptr
= "exception raised";
1842 PyDict_SetItem(dictionary
, string
, intobj
);
1843 Py_DECREF(string
); Py_DECREF(intobj
); /* XXX DECREF commented out! */
1844 ptr
+= idlen
+1; /* Point to rest of expression */
1845 goto do_grouping_bracket
;
1849 /* (?P=groupname) */
1851 PyObject
*string
, *intobj
;
1854 idlen
= get_group_id(ptr
, ')', errorptr
);
1858 string
= PyString_FromStringAndSize((char *)ptr
, idlen
);
1860 *errorptr
= "exception raised";
1863 intobj
= PyDict_GetItem(dictionary
, string
);
1866 *errorptr
= "?P= group identifier isn't defined";
1870 refnum
= PyInt_AsLong(intobj
);
1872 /* The caller doesn't own the reference to the value
1873 returned from PyDict_GetItem, so intobj is not
1878 /* The continue will cause the top-level for() loop to
1879 be resumed, so ptr will be immediately incremented.
1880 Therefore, the following line adds just idlen, not
1885 /* The character after ?P is neither < nor =, so
1886 report an error. Add more Python-extensions here. */
1887 *errorptr
="unknown after (?P";
1890 case '>': /* "Match once" brackets */
1891 if ((options
& PCRE_EXTRA
) != 0) /* Not yet standard */
1898 /* Else fall through */
1906 /* Else we have a referencing group */
1910 do_grouping_bracket
:
1911 if (++(*brackets
) > EXTRACT_MAX
)
1916 bravalue
= OP_BRA
+ *brackets
;
1919 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1920 code into a non-register variable in order to be able to pass its address
1921 because some compilers complain otherwise. */
1925 uschar
*mcode
= code
;
1926 if (!compile_regex(options
, brackets
, &mcode
, &ptr
, errorptr
, dictionary
))
1938 /* Check \ for being a real metacharacter; if not, fall through and handle
1939 it as a data character at the start of a string. Escape items are checked
1940 for validity in the pre-compiling pass. */
1944 c
= check_escape(&ptr
, errorptr
, *brackets
, options
, FALSE
);
1946 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1947 are arranged to be the negation of the corresponding OP_values. For the
1948 back references, the values are ESC_REF plus the reference number. Only
1949 back references and those types that consume a character may be repeated.
1950 We can test for values between ESC_b and ESC_Z for the latter; this may
1951 have to change if any new ones are ever created. */
1957 int refnum
= -c
- ESC_REF
;
1958 if (*brackets
< refnum
)
1969 previous
= (-c
> ESC_b
&& -c
< ESC_X
)? code
: NULL
;
1970 if ( (options
& PCRE_LOCALE
) != 0)
1974 case (-ESC_b
): c
= -OP_WORD_BOUNDARY_L
; break;
1975 case (-ESC_B
): c
= -OP_NOT_WORD_BOUNDARY_L
; break;
1976 case (-ESC_w
): c
= -OP_WORDCHAR_L
; break;
1977 case (-ESC_W
): c
= -OP_NOT_WORDCHAR_L
; break;
1985 /* Data character: Reset and fall through */
1990 /* Handle a run of data characters until a metacharacter is encountered.
1991 The first character is guaranteed not to be whitespace or # when the
1992 extended flag is set. */
2003 if ((options
& PCRE_EXTENDED
) != 0)
2005 if ((pcre_ctypes
[c
] & ctype_space
) != 0) continue;
2008 while ((c
= *(++ptr
)) != 0 && c
!= '\n');
2014 /* Backslash may introduce a data char or a metacharacter. Escaped items
2015 are checked for validity in the pre-compiling pass. Stop the string
2016 before a metaitem. */
2021 c
= check_escape(&ptr
, errorptr
, *brackets
, options
, FALSE
);
2022 if (c
< 0) { ptr
= oldptr
; break; }
2025 /* Ordinary character or single-char escape */
2031 /* This "while" is the end of the "do" above. */
2033 while (length
< 255 && (pcre_ctypes
[c
= *(++ptr
)] & ctype_meta
) == 0);
2035 /* Compute the length and set it in the data vector, and advance to
2038 previous
[1] = length
;
2039 if (length
< 255) ptr
--;
2042 } /* end of big loop */
2044 /* Control never reaches here by falling through, only by a goto for all the
2045 error states. Pass back the position in the pattern so that it can be displayed
2046 to the user for diagnosing the error. */
2056 /*************************************************
2057 * Compile sequence of alternatives *
2058 *************************************************/
2060 /* On entry, ptr is pointing past the bracket character, but on return
2061 it points to the closing bracket, or vertical bar, or end of string.
2062 The code variable is pointing at the byte into which the BRA operator has been
2066 options the option bits
2067 brackets -> int containing the number of extracting brackets used
2068 codeptr -> the address of the current code pointer
2069 ptrptr -> the address of the current pattern pointer
2070 errorptr -> pointer to error message
2072 Returns: TRUE on success
/* Compile a sequence of alternatives.  Each alternative is handed to
   compile_branch with the same six arguments this routine receives; the
   visible statements then write two-byte, high-byte-first lengths into the
   code vector.  The loop structure and the return statements are not
   visible in this listing. */
2076 compile_regex(int options
, int *brackets
, uschar
**codeptr
,
2077 const uschar
**ptrptr
, const char **errorptr
, PyObject
*dictionary
)
/* Work on local copies of the caller's pattern and code pointers. */
2079 const uschar
*ptr
= *ptrptr
;
2080 uschar
*code
= *codeptr
;
/* start_bracket remembers where this whole bracketed item begins;
   last_branch starts at the same position and marks the branch whose length
   bytes are filled in below. */
2081 uschar
*start_bracket
= code
;
2086 uschar
*last_branch
= code
;
2089 if (!compile_branch(options
, brackets
, &code
, &ptr
, errorptr
, dictionary
))
2095 /* Fill in the length of the last branch */
2097 length
= code
- last_branch
;
2098 last_branch
[1] = length
>> 8;
2099 last_branch
[2] = length
& 255;
2101 /* Reached end of expression, either ')' or end of pattern. Insert a
2102 terminating ket and the length of the whole bracketed item, and return,
2103 leaving the pointer at the terminating char. */
2107 length
= code
- start_bracket
;
2109 *code
++ = length
>> 8;
2110 *code
++ = length
& 255;
2116 /* Another branch follows; insert an "or" node and advance the pointer. */
2121 /* Control never reaches here */
2126 /*************************************************
2127 * Check for anchored expression *
2128 *************************************************/
2130 /* Try to find out if this is an anchored regular expression. Consider each
2131 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2132 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2133 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2134 counts, since OP_CIRC can match in the middle.
2136 A branch is also implicitly anchored if it starts with .* because that will try
2137 the rest of the pattern at all possible matching points, so there is no point
2140 Argument: points to start of expression (the bracket)
2141 Returns: TRUE or FALSE
/* Test each alternative of the bracket at code for an anchoring first item.
   code       points at a bracket or alternative opcode; bytes 1 and 2 hold a
              high-byte-first offset to the next alternative, and byte 3 is
              the first opcode of the branch
   multiline  when TRUE, OP_CIRC is not accepted as an anchor
   FALSE is returned as soon as one alternative fails the test. */
2145 is_anchored(register const uschar
*code
, BOOL multiline
)
2148 int op
= (int)code
[3];
/* A nested bracket, assertion or once-only group is checked recursively,
   starting at its own opcode. */
2149 if (op
>= OP_BRA
|| op
== OP_ASSERT
|| op
== OP_ONCE
)
2150 { if (!is_anchored(code
+3, multiline
)) return FALSE
; }
/* A type-star item only qualifies when the repeated type is OP_ANY. */
2151 else if (op
== OP_TYPESTAR
|| op
== OP_TYPEMINSTAR
)
2152 { if (code
[4] != OP_ANY
) return FALSE
; }
/* Anything else must be OP_SOD, or OP_CIRC when multiline is not set. */
2153 else if (op
!= OP_SOD
&& (multiline
|| op
!= OP_CIRC
)) return FALSE
;
/* Step to the next alternative using the two-byte offset. */
2154 code
+= (code
[1] << 8) + code
[2];
2156 while (*code
== OP_ALT
);
2162 /*************************************************
2163 * Check for start with \n line expression *
2164 *************************************************/
2166 /* This is called for multiline expressions to try to find out if every branch
2167 starts with ^ so that "first char" processing can be done to speed things up.
2169 Argument: points to start of expression (the bracket)
2170 Returns: TRUE or FALSE
/* Test whether every alternative of the bracket at code begins with OP_CIRC.
   code  points at a bracket or alternative opcode; bytes 1 and 2 hold a
         high-byte-first offset to the next alternative, and byte 3 is the
         first opcode of the branch
   FALSE is returned as soon as one alternative fails the test. */
2174 is_startline(const uschar
*code
)
/* A nested bracket or assertion is checked recursively, starting at its own
   opcode; any other first item must be OP_CIRC. */
2177 if ((int)code
[3] >= OP_BRA
|| code
[3] == OP_ASSERT
)
2178 { if (!is_startline(code
+3)) return FALSE
; }
2179 else if (code
[3] != OP_CIRC
) return FALSE
;
/* Step to the next alternative using the two-byte offset. */
2180 code
+= (code
[1] << 8) + code
[2];
2182 while (*code
== OP_ALT
);
2188 /*************************************************
2189 * Check for fixed first char *
2190 *************************************************/
2192 /* Try to find out if there is a fixed first character. This is called for
2193 unanchored expressions, as it speeds up their processing quite considerably.
2194 Consider each alternative branch. If they all start with the same char, or with
2195 a bracket all of whose alternatives start with the same char (recurse ad lib),
2196 then we return that char, otherwise -1.
2198 Argument: points to start of expression (the bracket)
2199 Returns: -1 or the fixed first char
/* Look for a single character with which every alternative of the bracket
   at code must start.
   code  points at a bracket or alternative opcode; bytes 1 and 2 hold a
         high-byte-first offset to the next alternative, and byte 3 is the
         first opcode of the branch
   -1 is returned as soon as two alternatives disagree or a nested group has
   no fixed first character. */
2203 find_firstchar(uschar
*code
)
/* c holds the candidate character; -1 means none has been seen yet. */
2205 register int c
= -1;
/* charoffset is the index, relative to code, of the character to compare.
   NOTE(review): the statements that adjust it for the individual opcodes are
   not visible in this listing - confirm against the full source. */
2208 register int charoffset
= 4;
/* A nested bracket or assertion is examined recursively; its result must
   agree with the candidate found so far. */
2210 if ((int)code
[3] >= OP_BRA
|| code
[3] == OP_ASSERT
)
2213 if ((d
= find_firstchar(code
+3)) < 0) return -1;
2214 if (c
< 0) c
= d
; else if (c
!= d
) return -1;
2217 else switch(code
[3])
2222 case OP_EXACT
: /* Fall through */
2225 case OP_CHARS
: /* Fall through */
/* The first alternative sets the candidate; later ones must match it. */
2230 if (c
< 0) c
= code
[charoffset
]; else if (c
!= code
[charoffset
]) return -1;
/* Step to the next alternative using the two-byte offset. */
2233 code
+= (code
[1] << 8) + code
[2];
2235 while (*code
== OP_ALT
);
2241 /*************************************************
2242 * Compile a Regular Expression *
2243 *************************************************/
2245 /* This function takes a string and returns a pointer to a block of store
2246 holding a compiled version of the expression.
2249 pattern the regular expression
2250 options various option bits
2251 errorptr pointer to pointer to error text
2252 erroroffset ptr offset in pattern where error was detected
2254 Returns: pointer to compiled data block, or NULL on error,
2255 with errorptr and erroroffset set
2259 pcre_compile(const char *pattern
, int options
, const char **errorptr
,
2260 int *erroroffset
, PyObject
*dictionary
)
2264 int length
= 3; /* For initial BRA plus length */
2269 int top_backref
= 0;
2270 unsigned int brastackptr
= 0;
2275 uschar
*code_base
, *code_end
;
2278 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2279 can do is just return NULL. */
2281 if (errorptr
== NULL
) return NULL
;
2284 /* However, we can give a message for this error */
2286 if (erroroffset
== NULL
)
2293 if ((options
& ~PUBLIC_OPTIONS
) != 0)
2299 DPRINTF(("------------------------------------------------------------------\n"));
2300 DPRINTF(("%s\n", pattern
));
2302 /* The first thing to do is to make a pass over the pattern to compute the
2303 amount of store required to hold the compiled code. This does not have to be
2304 perfect as long as errors are overestimates. At the same time we can detect any
2305 internal flag settings. Make an attempt to correct for any counted white space
2306 if an "extended" flag setting appears late in the pattern. We can't be so
2307 clever for #-comments. */
2309 ptr
= (const uschar
*)(pattern
- 1);
2310 while ((c
= *(++ptr
)) != 0)
2313 int class_charcount
;
2315 if ((pcre_ctypes
[c
] & ctype_space
) != 0)
2317 if ((options
& PCRE_EXTENDED
) != 0) continue;
2321 if (c
== '#' && (options
& PCRE_EXTENDED
) != 0)
2323 while ((c
= *(++ptr
)) != 0 && c
!= '\n');
2329 /* A backslashed item may be an escaped "normal" character or a
2330 character type. For a "normal" character, put the pointers and
2331 character back so that tests for whitespace etc. in the input
2332 are done correctly. */
2336 const uschar
*save_ptr
= ptr
;
2337 c
= check_escape(&ptr
, errorptr
, bracount
, options
, FALSE
);
2338 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2348 /* A back reference needs an additional char, plus either one or 5
2349 bytes for a repeat. We also need to keep the value of the highest
2354 int refnum
= -c
- ESC_REF
;
2355 if (refnum
> top_backref
) top_backref
= refnum
;
2356 length
++; /* For single back reference */
2357 if (ptr
[1] == '{' && is_counted_repeat(ptr
+2))
2359 ptr
= read_repeat_counts(ptr
+2, &min
, &max
, errorptr
);
2360 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2361 if ((min
== 0 && (max
== 1 || max
== -1)) ||
2362 (min
== 1 && max
== -1))
2365 if (ptr
[1] == '?') ptr
++;
2373 case '*': /* These repeats won't be after brackets; */
2374 case '+': /* those are handled separately */
2379 /* This covers the cases of repeats after a single char, metachar, class,
2380 or back reference. */
2383 if (!is_counted_repeat(ptr
+1)) goto NORMAL_CHAR
;
2384 ptr
= read_repeat_counts(ptr
+1, &min
, &max
, errorptr
);
2385 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2386 if ((min
== 0 && (max
== 1 || max
== -1)) ||
2387 (min
== 1 && max
== -1))
2391 length
--; /* Uncount the original char or metachar */
2392 if (min
== 1) length
++; else if (min
> 0) length
+= 4;
2393 if (max
> 0) length
+= 4; else length
+= 2;
2395 if (ptr
[1] == '?') ptr
++;
2398 /* An alternation contains an offset to the next branch or ket. */
2403 /* A character class uses 33 characters. Don't worry about character types
2404 that aren't allowed in classes - they'll get picked up during the compile.
2405 A character class that contains only one character uses 2 or 3 bytes,
2406 depending on whether it is negated or not. Notice this where we can. */
2409 class_charcount
= 0;
2410 if (*(++ptr
) == '^') ptr
++;
2415 int ch
= check_escape(&ptr
, errorptr
, bracount
, options
, TRUE
);
2416 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2417 if (-ch
== ESC_b
) class_charcount
++; else class_charcount
= 10;
2419 else class_charcount
++;
2422 while (*ptr
!= 0 && *ptr
!= ']');
2424 /* Repeats for negated single chars are handled by the general code */
2426 if (class_charcount
== 1) length
+= 3; else
2429 if (options
& PCRE_LOCALE
) length
++; /* Add a byte for the localization flag */
2431 /* A repeat needs either 1 or 5 bytes. */
2433 if (*ptr
!= 0 && ptr
[1] == '{' && is_counted_repeat(ptr
+2))
2435 ptr
= read_repeat_counts(ptr
+2, &min
, &max
, errorptr
);
2436 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2437 if ((min
== 0 && (max
== 1 || max
== -1)) ||
2438 (min
== 1 && max
== -1))
2441 if (ptr
[1] == '?') ptr
++;
2446 /* Brackets may be genuine groups or special things */
2450 /* Handle special forms of bracket, which all start (? */
2452 if (ptr
[1] == '?') switch (c
= ptr
[2])
2454 /* Skip over comments entirely */
2457 while (*ptr
!= 0 && *ptr
!= ')') ptr
++;
2461 goto PCRE_ERROR_RETURN
;
2465 /* Non-referencing groups and lookaheads just move the pointer on, and
2466 then behave like a non-special bracket, except that they don't increment
2467 the count of extracting brackets. */
2480 idlen
= get_group_id(ptr
++, '>', errorptr
);
2481 if (*errorptr
) goto PCRE_ERROR_RETURN
;
2485 idlen
= get_group_id(ptr
++, ')', errorptr
);
2486 if (*errorptr
) goto PCRE_ERROR_RETURN
;
2494 /* Ditto for the "once only" bracket, allowed only if the extra bit
2498 if ((options
& PCRE_EXTRA
) != 0)
2503 /* Else fall through */
2505 /* Else loop setting valid options until ) is met. Anything else is an
2512 if ((c
= *ptr
) == 'i')
2514 options
|= PCRE_CASELESS
;
2517 else if ((c
= *ptr
) == 'L')
2519 options
|= PCRE_LOCALE
;
2522 else if ((c
= *ptr
) == 'm')
2524 options
|= PCRE_MULTILINE
;
2529 options
|= PCRE_DOTALL
;
2534 options
|= PCRE_EXTENDED
;
2535 length
-= spaces
; /* Already counted spaces */
2538 else if (c
== ')') break;
2541 goto PCRE_ERROR_RETURN
;
2543 continue; /* End of this bracket handling */
2546 /* Extracting brackets must be counted so we can process escapes in a
2551 /* Non-special forms of bracket. Save length for computing whole length
2552 at end if there's a repeat that requires duplication of the group. */
2554 if (brastackptr
>= sizeof(brastack
)/sizeof(int))
2557 goto PCRE_ERROR_RETURN
;
2560 brastack
[brastackptr
++] = length
;
2564 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2565 have to replicate this bracket up to that many times. If brastackptr is
2566 0 this is an unmatched bracket which will generate an error, but take care
2567 not to try to access brastack[-1]. */
2574 int duplength
= (brastackptr
> 0)? length
- brastack
[--brastackptr
] : 0;
2576 /* Leave ptr at the final char; for read_repeat_counts this happens
2577 automatically; for the others we need an increment. */
2579 if ((c
= ptr
[1]) == '{' && is_counted_repeat(ptr
+2))
2581 ptr
= read_repeat_counts(ptr
+2, &minval
, &maxval
, errorptr
);
2582 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2584 else if (c
== '*') { minval
= 0; maxval
= -1; ptr
++; }
2585 else if (c
== '+') { maxval
= -1; ptr
++; }
2586 else if (c
== '?') { minval
= 0; ptr
++; }
2588 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2589 if there is a limited maximum we have to replicate up to maxval-1 times
2590 and allow for a BRAZERO item before each optional copy, as we also have
2591 to do before the first copy if the minimum is zero. */
2593 if (minval
== 0) length
++;
2594 else if (minval
> 1) length
+= (minval
- 1) * duplength
;
2595 if (maxval
> minval
) length
+= (maxval
- minval
) * (duplength
+ 1);
2599 /* Non-special character. For a run of such characters the length required
2600 is the number of characters + 2, except that the maximum run length is 255.
2601 We won't get a skipped space or a non-data escape or the start of a #
2602 comment as the first character, so the length can't be zero. */
2610 if ((pcre_ctypes
[c
] & ctype_space
) != 0)
2612 if ((options
& PCRE_EXTENDED
) != 0) continue;
2616 if (c
== '#' && (options
& PCRE_EXTENDED
) != 0)
2618 while ((c
= *(++ptr
)) != 0 && c
!= '\n');
2622 /* Backslash may introduce a data char or a metacharacter; stop the
2623 string before the latter. */
2627 const uschar
*saveptr
= ptr
;
2628 c
= check_escape(&ptr
, errorptr
, bracount
, options
, FALSE
);
2629 if (*errorptr
!= NULL
) goto PCRE_ERROR_RETURN
;
2630 if (c
< 0) { ptr
= saveptr
; break; }
2633 /* Ordinary character or single-char escape */
2638 /* This "while" is the end of the "do" above. */
2640 while (runlength
< 255 && (pcre_ctypes
[c
= *(++ptr
)] & ctype_meta
) == 0);
2643 length
+= runlength
;
2648 length
+= 4; /* For final KET and END */
2656 /* Compute the size of data block needed and get it, either from malloc or
2657 externally provided function. We specify "code[0]" in the offsetof() expression
2658 rather than just "code", because it has been reported that one broken compiler
2659 fails on "code" because it is also an independent variable. It should make no
2660 difference to the value of the offsetof(). */
2662 size
= length
+ offsetof(real_pcre
, code
[0]);
2663 re
= (real_pcre
*)(pcre_malloc
)(size
+50);
2671 /* Put in the magic number and the options. */
2673 re
->magic_number
= MAGIC_NUMBER
;
2674 re
->options
= options
;
2676 /* Set up a starting, non-extracting bracket, then compile the expression. On
2677 error, *errorptr will be set non-NULL, so we don't need to look at the result
2678 of the function here. */
2680 ptr
= (const uschar
*)pattern
;
2684 (void)compile_regex(options
, &bracount
, &code
, &ptr
, errorptr
, dictionary
);
2685 re
->top_bracket
= bracount
;
2686 re
->top_backref
= top_backref
;
2688 /* If not reached end of pattern on success, there's an excess bracket. */
2690 if (*errorptr
== NULL
&& *ptr
!= 0) *errorptr
= ERR22
;
2692 /* Fill in the terminating state and check for disastrous overflow, but
2693 if debugging, leave the test till after things are printed out. */
2699 if (code
- re
->code
> length
) *errorptr
= ERR23
;
2702 /* Failed to compile */
2704 if (*errorptr
!= NULL
)
2708 *erroroffset
= ptr
- (const uschar
*)pattern
;
2712 /* If the anchored option was not passed, set flag if we can determine that it
2713 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2714 we can determine what the first character has to be, because that speeds up
2715 unanchored matches no end. In the case of multiline matches, an alternative is
2716 to set the PCRE_STARTLINE flag if all branches start with ^. */
2718 if ((options
& PCRE_ANCHORED
) == 0)
2720 if (is_anchored(re
->code
, (options
& PCRE_MULTILINE
) != 0))
2721 re
->options
|= PCRE_ANCHORED
;
2724 int ch
= find_firstchar(re
->code
);
2727 re
->first_char
= ch
;
2728 re
->options
|= PCRE_FIRSTSET
;
2730 else if (is_startline(re
->code
))
2731 re
->options
|= PCRE_STARTLINE
;
2735 /* Print out the compiled data for debugging */
2739 printf("Length = %d top_bracket = %d top_backref=%d\n",
2740 length
, re
->top_bracket
, re
->top_backref
);
2742 if (re
->options
!= 0)
2744 printf("%s%s%s%s%s%s%s%s\n",
2745 ((re
->options
& PCRE_ANCHORED
) != 0)? "anchored " : "",
2746 ((re
->options
& PCRE_CASELESS
) != 0)? "caseless " : "",
2747 ((re
->options
& PCRE_EXTENDED
) != 0)? "extended " : "",
2748 ((re
->options
& PCRE_MULTILINE
) != 0)? "multiline " : "",
2749 ((re
->options
& PCRE_DOTALL
) != 0)? "dotall " : "",
2750 ((re
->options
& PCRE_DOLLAR_ENDONLY
) != 0)? "endonly " : "",
2751 ((re
->options
& PCRE_EXTRA
) != 0)? "extra " : "",
2752 ((re
->options
& PCRE_UNGREEDY
) != 0)? "ungreedy " : "");
2755 if ((re
->options
& PCRE_FIRSTSET
) != 0)
2757 if (isprint(re
->first_char
)) printf("First char = %c\n", re
->first_char
);
2758 else printf("First char = \\x%02x\n", re
->first_char
);
2762 code_base
= code
= re
->code
;
2764 while (code
< code_end
)
2768 printf("%3d ", code
- code_base
);
2770 if (*code
>= OP_BRA
)
2772 printf("%3d Bra %d", (code
[1] << 8) + code
[2], *code
- OP_BRA
);
2779 charlength
= *(++code
);
2780 printf("%3d ", charlength
);
2781 while (charlength
-- > 0)
2782 if (isprint(c
= *(++code
))) printf("%c", c
); else printf("\\x%02x", c
);
2792 printf("%3d %s", (code
[1] << 8) + code
[2], OP_names
[*code
]);
2803 case OP_TYPEMINSTAR
:
2805 case OP_TYPEMINPLUS
:
2807 case OP_TYPEMINQUERY
:
2808 if (*code
>= OP_TYPESTAR
)
2809 printf(" %s", OP_names
[code
[1]]);
2810 else if (isprint(c
= code
[1])) printf(" %c", c
);
2811 else printf(" \\x%02x", c
);
2812 printf("%s", OP_names
[*code
++]);
2818 if (isprint(c
= code
[3])) printf(" %c{", c
);
2819 else printf(" \\x%02x{", c
);
2820 if (*code
!= OP_EXACT
) printf("0,");
2821 printf("%d}", (code
[1] << 8) + code
[2]);
2822 if (*code
== OP_MINUPTO
) printf("?");
2828 case OP_TYPEMINUPTO
:
2829 printf(" %s{", OP_names
[code
[3]]);
2830 if (*code
!= OP_TYPEEXACT
) printf(",");
2831 printf("%d}", (code
[1] << 8) + code
[2]);
2832 if (*code
== OP_TYPEMINUPTO
) printf("?");
2837 if (isprint(c
= *(++code
))) printf(" [^%c]", c
);
2838 else printf(" [^\\x%02x]", c
);
2846 case OP_NOTMINQUERY
:
2847 if (isprint(c
= code
[1])) printf(" [^%c]", c
);
2848 else printf(" [^\\x%02x]", c
);
2849 printf("%s", OP_names
[*code
++]);
2855 if (isprint(c
= code
[3])) printf(" [^%c]{", c
);
2856 else printf(" [^\\x%02x]{", c
);
2857 if (*code
!= OP_NOTEXACT
) printf(",");
2858 printf("%d}", (code
[1] << 8) + code
[2]);
2859 if (*code
== OP_NOTMINUPTO
) printf("?");
2864 printf(" \\%d", *(++code
));
2866 goto CLASS_REF_REPEAT
;
2874 if (*code
==OP_CLASS_L
)
2877 printf("Locflag = %i ", *code
++);
2882 if (*code
++ == OP_CLASS
) printf(" [");
2887 for (i
= 0; i
< 256; i
++)
2889 if ((code
[i
/8] & (1 << (i
&7))) != 0)
2892 for (j
= i
+1; j
< 256; j
++)
2893 if ((code
[j
/8] & (1 << (j
&7))) == 0) break;
2894 if (i
== '-' || i
== ']') printf("\\");
2895 if (isprint(i
)) printf("%c", i
); else printf("\\x%02x", i
);
2899 if (j
== '-' || j
== ']') printf("\\");
2900 if (isprint(j
)) printf("%c", j
); else printf("\\x%02x", j
);
2919 printf("%s", OP_names
[*code
]);
2924 min
= (code
[1] << 8) + code
[2];
2925 max
= (code
[3] << 8) + code
[4];
2926 if (max
== 0) printf("{%d,}", min
);
2927 else printf("{%d,%d}", min
, max
);
2928 if (*code
== OP_CRMINRANGE
) printf("?");
2938 /* Anything else is just a one-node item */
2941 printf(" %s", OP_names
[*code
]);
2948 printf("------------------------------------------------------------------\n");
2950 /* This check is done here in the debugging case so that the code that
2951 was compiled can be seen. */
2953 if (code
- re
->code
> length
)
2955 printf("length=%i, code length=%i\n", length
, code
-re
->code
);
2958 *erroroffset
= ptr
- (uschar
*)pattern
;
2968 /*************************************************
2969 * Match a character type *
2970 *************************************************/
2972 /* Not used in all the places it might be as it's sometimes faster
2973 to put the code inline.
2976 type the character type
2978 dotall the dotall flag
2980 Returns: TRUE if character is of the type
2984 match_type(int type
, int c
, BOOL dotall
)
2988 if (isprint(c
)) printf("matching subject %c against ", c
);
2989 else printf("matching subject \\x%02x against ", c
);
2990 printf("%s\n", OP_names
[type
]);
2995 case OP_ANY
: return dotall
|| c
!= '\n';
2996 case OP_NOT_DIGIT
: return (pcre_ctypes
[c
] & ctype_digit
) == 0;
2997 case OP_DIGIT
: return (pcre_ctypes
[c
] & ctype_digit
) != 0;
2998 case OP_NOT_WHITESPACE
: return (pcre_ctypes
[c
] & ctype_space
) == 0;
2999 case OP_WHITESPACE
: return (pcre_ctypes
[c
] & ctype_space
) != 0;
3000 case OP_NOT_WORDCHAR
: return (pcre_ctypes
[c
] & ctype_word
) == 0;
3001 case OP_WORDCHAR
: return (pcre_ctypes
[c
] & ctype_word
) != 0;
3002 case OP_NOT_WORDCHAR_L
: return (c
!='_' && !isalnum(c
));
3003 case OP_WORDCHAR_L
: return (c
=='_' || isalnum(c
));
3010 /*************************************************
3011 * Match a back-reference *
3012 *************************************************/
3014 /* If a back reference hasn't been set, the match fails.
3017 number reference number
3018 eptr points into the subject
3019 length length to be matched
3020 md points to match data block
3022 Returns: TRUE if matched
3026 match_ref(int number
, register const uschar
*eptr
, int length
, match_data
*md
)
3028 const uschar
*p
= md
->start_subject
+ md
->offset_vector
[number
];
3031 if (eptr
>= md
->end_subject
)
3032 printf("matching subject <null>");
3035 printf("matching subject ");
3036 pchars(eptr
, length
, TRUE
, md
);
3038 printf(" against backref ");
3039 pchars(p
, length
, FALSE
, md
);
3043 /* Always fail if not enough characters left */
3045 if (length
> md
->end_subject
- p
) return FALSE
;
3047 /* Separate the caseless case for speed */
3050 { while (length
-- > 0) if (pcre_lcc
[*p
++] != pcre_lcc
[*eptr
++]) return FALSE
; }
3052 { while (length
-- > 0) if (*p
++ != *eptr
++) return FALSE
; }
3057 static int free_stack(match_data
*md
)
3059 /* Free any stack space that was allocated by the call to match(). */
3060 if (md
->off_num
) free(md
->off_num
);
3061 if (md
->offset_top
) free(md
->offset_top
);
3062 if (md
->r1
) free(md
->r1
);
3063 if (md
->r2
) free(md
->r2
);
3064 if (md
->eptr
) free((char *)md
->eptr
);
3065 if (md
->ecode
) free((char *)md
->ecode
);
3069 static int grow_stack(match_data
*md
)
3071 if (md
->length
!= 0)
3073 md
->length
= md
->length
+ md
->length
/2;
3077 int string_len
= md
->end_subject
- md
->start_subject
+ 1;
3078 if (string_len
< 80) {md
->length
= string_len
; }
3079 else {md
->length
= 80;}
3081 PyMem_RESIZE(md
->offset_top
, int, md
->length
);
3082 PyMem_RESIZE(md
->eptr
, const uschar
*, md
->length
);
3083 PyMem_RESIZE(md
->ecode
, const uschar
*, md
->length
);
3084 PyMem_RESIZE(md
->off_num
, int, md
->length
);
3085 PyMem_RESIZE(md
->r1
, int, md
->length
);
3086 PyMem_RESIZE(md
->r2
, int, md
->length
);
3087 if (md
->offset_top
== NULL
|| md
->eptr
== NULL
|| md
->ecode
== NULL
||
3088 md
->off_num
== NULL
|| md
->r1
== NULL
|| md
->r2
== NULL
)
3091 longjmp(md
->error_env
, 1);
3097 /*************************************************
3098 * Match from current position *
3099 *************************************************/
3101 /* On entry ecode points to the first opcode, and eptr to the first character.
3104 eptr pointer in subject
3105 ecode position in code
3106 offset_top current top pointer
3107 md pointer to "static" info for the match
3109 Returns: TRUE if matched
3113 match(register const uschar
*eptr
, register const uschar
*ecode
, int offset_top
,
3116 int save_stack_position
= md
->point
;
3119 #define SUCCEED goto succeed
3120 #define FAIL goto fail
3124 int min
, max
, ctype
;
3127 BOOL minimize
= FALSE
;
3129 /* Opening bracket. Check the alternative branches in turn, failing if none
3130 match. We have to set the start offset if required and there is space
3131 in the offset vector so that it is available for subsequent back references
3132 if the bracket matches. However, if the bracket fails, we must put back the
3133 previous value of both offsets in case they were set by a previous copy of
3134 the same bracket. Don't worry about setting the flag for the error case here;
3135 that is handled in the code for KET. */
3137 if ((int)*ecode
>= OP_BRA
)
3139 int number
= (*ecode
- OP_BRA
) << 1;
3140 int save_offset1
= 0, save_offset2
= 0;
3142 DPRINTF(("start bracket %d\n", number
/2));
3144 if (number
> 0 && number
< md
->offset_end
)
3146 save_offset1
= md
->offset_vector
[number
];
3147 save_offset2
= md
->offset_vector
[number
+1];
3148 md
->offset_vector
[number
] = eptr
- md
->start_subject
;
3150 DPRINTF(("saving %d %d\n", save_offset1
, save_offset2
));
3153 /* Recurse for all the alternatives. */
3157 if (match(eptr
, ecode
+3, offset_top
, md
)) SUCCEED
;
3158 ecode
+= (ecode
[1] << 8) + ecode
[2];
3160 while (*ecode
== OP_ALT
);
3162 DPRINTF(("bracket %d failed\n", number
/2));
3164 if (number
> 0 && number
< md
->offset_end
)
3166 md
->offset_vector
[number
] = save_offset1
;
3167 md
->offset_vector
[number
+1] = save_offset2
;
3173 /* Other types of node can be handled by a switch */
3178 md
->end_match_ptr
= eptr
; /* Record where we ended */
3179 md
->end_offset_top
= offset_top
; /* and how many extracts were taken */
3182 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3183 whole thing doesn't match, so we have to get out via a longjmp(). */
3186 if (match(eptr
, ecode
+1, offset_top
, md
)) SUCCEED
;
3187 longjmp(md
->fail_env
, 1);
3189 /* Assertion brackets. Check the alternative branches in turn - the
3190 matching won't pass the KET for an assertion. If any one branch matches,
3191 the assertion is true. */
3196 if (match(eptr
, ecode
+3, offset_top
, md
)) break;
3197 ecode
+= (ecode
[1] << 8) + ecode
[2];
3199 while (*ecode
== OP_ALT
);
3200 if (*ecode
== OP_KET
) FAIL
;
3202 /* Continue from after the assertion, updating the offsets high water
3203 mark, since extracts may have been taken during the assertion. */
3205 do ecode
+= (ecode
[1] << 8) + ecode
[2]; while (*ecode
== OP_ALT
);
3207 offset_top
= md
->end_offset_top
;
3210 /* Negative assertion: all branches must fail to match */
3215 if (match(eptr
, ecode
+3, offset_top
, md
)) FAIL
;
3216 ecode
+= (ecode
[1] << 8) + ecode
[2];
3218 while (*ecode
== OP_ALT
);
3222 /* "Once" brackets are like assertion brackets except that after a match,
3223 the point in the subject string is not moved back. Thus there can never be
3224 a move back into the brackets. Check the alternative branches in turn - the
3225 matching won't pass the KET for this kind of subpattern. If any one branch
3226 matches, we carry on, leaving the subject pointer. */
3231 if (match(eptr
, ecode
+3, offset_top
, md
)) break;
3232 ecode
+= (ecode
[1] << 8) + ecode
[2];
3234 while (*ecode
== OP_ALT
);
3235 if (*ecode
== OP_KET
) FAIL
;
3237 /* Continue as from after the assertion, updating the offsets high water
3238 mark, since extracts may have been taken. */
3240 do ecode
+= (ecode
[1] << 8) + ecode
[2]; while (*ecode
== OP_ALT
);
3242 offset_top
= md
->end_offset_top
;
3243 eptr
= md
->end_match_ptr
;
3246 /* An alternation is the end of a branch; scan along to find the end of the
3247 bracketed group and go to there. */
3250 do ecode
+= (ecode
[1] << 8) + ecode
[2]; while (*ecode
== OP_ALT
);
3253 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3254 that it may occur zero times. It may repeat infinitely, or not at all -
3255 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3256 repeat limits are compiled as a number of copies, with the optional ones
3257 preceded by BRAZERO or BRAMINZERO. */
3261 const uschar
*next
= ecode
+1;
3262 if (match(eptr
, next
, offset_top
, md
)) SUCCEED
;
3263 do next
+= (next
[1] << 8) + next
[2]; while (*next
== OP_ALT
);
3270 const uschar
*next
= ecode
+1;
3271 do next
+= (next
[1] << 8) + next
[2]; while (*next
== OP_ALT
);
3272 if (match(eptr
, next
+3, offset_top
, md
)) SUCCEED
;
3277 /* End of a group, repeated or non-repeating. If we are at the end of
3278 an assertion "group", stop matching and SUCCEED, but record the
3279 current high water mark for use by positive assertions. */
3286 const uschar
*prev
= ecode
- (ecode
[1] << 8) - ecode
[2];
3288 if (*prev
== OP_ASSERT
|| *prev
== OP_ASSERT_NOT
|| *prev
== OP_ONCE
)
3290 md
->end_match_ptr
= eptr
; /* For ONCE */
3291 md
->end_offset_top
= offset_top
;
3295 /* In all other cases we have to check the group number back at the
3296 start and if necessary complete handling an extraction by setting the
3297 final offset and bumping the high water mark. */
3299 number
= (*prev
- OP_BRA
) << 1;
3301 DPRINTF(("end bracket %d\n", number
/2));
3305 if (number
>= md
->offset_end
) md
->offset_overflow
= TRUE
; else
3307 md
->offset_vector
[number
+1] = eptr
- md
->start_subject
;
3308 if (offset_top
<= number
) offset_top
= number
+ 2;
3312 /* For a non-repeating ket, just advance to the next node and continue at
3315 if (*ecode
== OP_KET
)
3321 /* The repeating kets try the rest of the pattern or restart from the
3322 preceding bracket, in the appropriate order. */
3324 if (*ecode
== OP_KETRMIN
)
3327 if (match(eptr
, ecode
+3, offset_top
, md
)) goto succeed
;
3328 /* Handle alternation inside the BRA...KET; push the additional
3329 alternatives onto the stack */
3332 ptr
+= (ptr
[1]<<8)+ ptr
[2];
3335 if (md
->length
== md
->point
)
3339 md
->offset_top
[md
->point
] = offset_top
;
3340 md
->eptr
[md
->point
] = eptr
;
3341 md
->ecode
[md
->point
] = ptr
+3;
3342 md
->r1
[md
->point
] = 0;
3343 md
->r2
[md
->point
] = 0;
3344 md
->off_num
[md
->point
] = 0;
3347 } while (*ptr
==OP_ALT
);
3348 ecode
=prev
+3; goto match_loop
;
3350 else /* OP_KETRMAX */
3353 /*int points_pushed=0;*/
3355 /* Push one failure point, that will resume matching at the code after
3356 the KETRMAX opcode. */
3357 if (md
->length
== md
->point
)
3361 md
->offset_top
[md
->point
] = offset_top
;
3362 md
->eptr
[md
->point
] = eptr
;
3363 md
->ecode
[md
->point
] = ecode
+3;
3364 md
->r1
[md
->point
] = md
->offset_vector
[number
];
3365 md
->r2
[md
->point
] = md
->offset_vector
[number
+1];
3366 md
->off_num
[md
->point
] = number
;
3369 md
->offset_vector
[number
] = eptr
- md
->start_subject
;
3370 /* Handle alternation inside the BRA...KET; push each of the
3371 additional alternatives onto the stack */
3374 ptr
+= (ptr
[1]<<8)+ ptr
[2];
3377 if (md
->length
== md
->point
)
3378 if (md
->length
== md
->point
)
3382 md
->offset_top
[md
->point
] = offset_top
;
3383 md
->eptr
[md
->point
] = eptr
;
3384 md
->ecode
[md
->point
] = ptr
+3;
3385 md
->r1
[md
->point
] = 0;
3386 md
->r2
[md
->point
] = 0;
3387 md
->off_num
[md
->point
] = 0;
3389 /*points_pushed++;*/
3391 } while (*ptr
==OP_ALT
);
3392 /* Jump to the first (or only) alternative and resume trying to match */
3393 ecode
=prev
+3; goto match_loop
;
3397 /* Start of subject unless notbol, or after internal newline if multiline */
3400 if (md
->notbol
&& eptr
== md
->start_subject
) FAIL
;
3403 if (eptr
!= md
->start_subject
&& eptr
[-1] != '\n') FAIL
;
3407 /* ... else fall through */
3409 /* Start of subject assertion */
3412 if (eptr
!= md
->start_subject
) FAIL
;
3416 /* Assert before internal newline if multiline, or before
3417 a terminating newline unless endonly is set, else end of subject unless
3421 if (md
->noteol
&& eptr
>= md
->end_subject
) FAIL
;
3424 if (eptr
< md
->end_subject
&& *eptr
!= '\n') FAIL
;
3428 else if (!md
->endonly
)
3430 if (eptr
< md
->end_subject
- 1 ||
3431 (eptr
== md
->end_subject
- 1 && *eptr
!= '\n')) FAIL
;
3435 /* ... else fall through */
3437 /* End of subject assertion */
3440 if (eptr
< md
->end_subject
) FAIL
;
3444 /* Word boundary assertions */
3446 case OP_NOT_WORD_BOUNDARY
:
3447 case OP_WORD_BOUNDARY
:
3449 BOOL prev_is_word
= (eptr
!= md
->start_subject
) &&
3450 ((pcre_ctypes
[eptr
[-1]] & ctype_word
) != 0);
3451 BOOL cur_is_word
= (eptr
< md
->end_subject
) &&
3452 ((pcre_ctypes
[*eptr
] & ctype_word
) != 0);
3453 if ((*ecode
++ == OP_WORD_BOUNDARY
)?
3454 cur_is_word
== prev_is_word
: cur_is_word
!= prev_is_word
)
3459 case OP_NOT_WORD_BOUNDARY_L
:
3460 case OP_WORD_BOUNDARY_L
:
3462 BOOL prev_is_word
= (eptr
!= md
->start_subject
) &&
3463 (isalnum(eptr
[-1]) || eptr
[-1]=='_');
3464 BOOL cur_is_word
= (eptr
< md
->end_subject
) &&
3465 (isalnum(*eptr
) || *eptr
=='_');
3466 if ((*ecode
++ == OP_WORD_BOUNDARY_L
)?
3467 cur_is_word
== prev_is_word
: cur_is_word
!= prev_is_word
)
3473 /* Match a single character type; inline for speed */
3476 if (!md
->dotall
&& eptr
< md
->end_subject
&& *eptr
== '\n') FAIL
;
3477 if (eptr
++ >= md
->end_subject
) FAIL
;
3482 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_digit
) != 0)
3488 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_digit
) == 0)
3493 case OP_NOT_WHITESPACE
:
3494 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_space
) != 0)
3500 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_space
) == 0)
3505 case OP_NOT_WORDCHAR
:
3506 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_word
) != 0)
3512 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
++] & ctype_word
) == 0)
3517 case OP_NOT_WORDCHAR_L
:
3518 if (eptr
>= md
->end_subject
|| (*eptr
=='_' || isalnum(*eptr
) ))
3525 if (eptr
>= md
->end_subject
|| (*eptr
!='_' && !isalnum(*eptr
) ))
3531 /* Match a back reference, possibly repeatedly. Look past the end of the
3532 item to see if there is repeat information following. The code is similar
3533 to that for character classes, but repeated for efficiency. Then obey
3534 similar code to character type repeats - written out again for speed.
3535 However, if the referenced string is the empty string, always treat
3536 it as matched, any number of times (otherwise there could be infinite
3542 int number
= ecode
[1] << 1; /* Doubled reference number */
3543 ecode
+= 2; /* Advance past the item */
3545 if (number
>= offset_top
|| md
->offset_vector
[number
] < 0)
3547 md
->errorcode
= PCRE_ERROR_BADREF
;
3551 length
= md
->offset_vector
[number
+1] - md
->offset_vector
[number
];
3561 c
= *ecode
++ - OP_CRSTAR
;
3562 minimize
= (c
& 1) != 0;
3563 min
= rep_min
[c
]; /* Pick up values from tables; */
3564 max
= rep_max
[c
]; /* zero for max => infinity */
3565 if (max
== 0) max
= INT_MAX
;
3570 minimize
= (*ecode
== OP_CRMINRANGE
);
3571 min
= (ecode
[1] << 8) + ecode
[2];
3572 max
= (ecode
[3] << 8) + ecode
[4];
3573 if (max
== 0) max
= INT_MAX
;
3577 default: /* No repeat follows */
3578 if (!match_ref(number
, eptr
, length
, md
)) FAIL
;
3580 continue; /* With the main loop */
3583 /* If the length of the reference is zero, just continue with the
3586 if (length
== 0) continue;
3588 /* First, ensure the minimum number of matches are present. We get back
3589 the length of the reference string explicitly rather than passing the
3590 address of eptr, so that eptr can be a register variable. */
3592 for (i
= 1; i
<= min
; i
++)
3594 if (!match_ref(number
, eptr
, length
, md
)) FAIL
;
3598 /* If min = max, continue at the same level without recursion.
3599 They are not both allowed to be zero. */
3601 if (min
== max
) continue;
3603 /* If minimizing, keep trying and advancing the pointer */
3609 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
3610 if (i
>= max
|| !match_ref(number
, eptr
, length
, md
))
3614 /* Control never gets here */
3617 /* If maximizing, find the longest string and work backwards */
3621 const uschar
*pp
= eptr
;
3622 for (i
= min
; i
< max
; i
++)
3624 if (!match_ref(number
, eptr
, length
, md
)) break;
3629 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
3635 /* Control never gets here */
3637 /* Match a character class, possibly repeatedly. Look past the end of the
3638 item to see if there is repeat information following. Then obey similar
3639 code to character type repeats - written out again for speed. If caseless
3640 matching was set at runtime but not at compile time, we have to check both
3641 versions of a character, and we have to behave differently for positive and
3642 negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
3643 treated differently. */
3648 BOOL nasty_case
= *ecode
== OP_NEGCLASS
&& md
->runtime_caseless
;
3649 const uschar
*data
= ecode
+ 1; /* Save for matching */
3650 ecode
+= 33; /* Advance past the item */
3660 c
= *ecode
++ - OP_CRSTAR
;
3661 minimize
= (c
& 1) != 0;
3662 min
= rep_min
[c
]; /* Pick up values from tables; */
3663 max
= rep_max
[c
]; /* zero for max => infinity */
3664 if (max
== 0) max
= INT_MAX
;
3669 minimize
= (*ecode
== OP_CRMINRANGE
);
3670 min
= (ecode
[1] << 8) + ecode
[2];
3671 max
= (ecode
[3] << 8) + ecode
[4];
3672 if (max
== 0) max
= INT_MAX
;
3676 default: /* No repeat follows */
3681 /* First, ensure the minimum number of matches are present. */
3683 for (i
= 1; i
<= min
; i
++)
3685 if (eptr
>= md
->end_subject
) FAIL
;
3688 /* Either not runtime caseless, or it was a positive class. For
3689 runtime caseless, continue if either case is in the map. */
3693 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3694 if (md
->runtime_caseless
)
3697 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3701 /* Runtime caseless and it was a negative class. Continue only if
3702 both cases are in the map. */
3706 if ((data
[c
/8] & (1 << (c
&7))) == 0) FAIL
;
3708 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3714 /* If max == min we can continue with the main loop without the
3717 if (min
== max
) continue;
3719 /* If minimizing, keep testing the rest of the expression and advancing
3720 the pointer while it matches the class. */
3726 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
3727 if (i
>= max
|| eptr
>= md
->end_subject
) FAIL
;
3730 /* Either not runtime caseless, or it was a positive class. For
3731 runtime caseless, continue if either case is in the map. */
3735 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3736 if (md
->runtime_caseless
)
3739 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3743 /* Runtime caseless and it was a negative class. Continue only if
3744 both cases are in the map. */
3748 if ((data
[c
/8] & (1 << (c
&7))) == 0) return FALSE
;
3750 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3755 /* Control never gets here */
3758 /* If maximizing, find the longest possible run, then work backwards. */
3762 const uschar
*pp
= eptr
;
3763 for (i
= min
; i
< max
; eptr
++, i
++)
3765 if (eptr
>= md
->end_subject
) break;
3768 /* Either not runtime caseless, or it was a positive class. For
3769 runtime caseless, continue if either case is in the map. */
3773 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3774 if (md
->runtime_caseless
)
3777 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3781 /* Runtime caseless and it was a negative class. Continue only if
3782 both cases are in the map. */
3786 if ((data
[c
/8] & (1 << (c
&7))) == 0) break;
3788 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3795 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
3799 /* Control never gets here */
3801 /* OP_CLASS_L opcode: handles localized character classes */
3805 const uschar
*data
= ecode
+ 1; /* Save for matching */
3806 const uschar locale_flag
= *data
;
3807 ecode
++; data
++; /* The localization support adds an extra byte */
3809 ecode
+= 33; /* Advance past the item */
3819 c
= *ecode
++ - OP_CRSTAR
;
3820 minimize
= (c
& 1) != 0;
3821 min
= rep_min
[c
]; /* Pick up values from tables; */
3822 max
= rep_max
[c
]; /* zero for max => infinity */
3823 if (max
== 0) max
= INT_MAX
;
3828 minimize
= (*ecode
== OP_CRMINRANGE
);
3829 min
= (ecode
[1] << 8) + ecode
[2];
3830 max
= (ecode
[3] << 8) + ecode
[4];
3831 if (max
== 0) max
= INT_MAX
;
3835 default: /* No repeat follows */
3836 if (eptr
>= md
->end_subject
) FAIL
;
3838 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue; /* With main loop */
3839 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3840 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3842 if ( (locale_flag
& 4) && isdigit(c
) ) continue; /* Locale \d */
3843 if ( (locale_flag
& 8) && !isdigit(c
) ) continue; /* Locale \D */
3844 if ( (locale_flag
& 16) && isspace(c
) ) continue; /* Locale \s */
3845 if ( (locale_flag
& 32) && !isspace(c
) ) continue; /* Locale \S */
3848 if (md
->runtime_caseless
)
3851 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue; /* With main loop */
3853 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3854 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3859 /* First, ensure the minimum number of matches are present. */
3861 for (i
= 1; i
<= min
; i
++)
3863 if (eptr
>= md
->end_subject
) FAIL
;
3865 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3866 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3867 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3869 if (md
->runtime_caseless
)
3872 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3873 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3874 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3879 /* If max == min we can continue with the main loop without the
3882 if (min
== max
) continue;
3884 /* If minimizing, keep testing the rest of the expression and advancing
3885 the pointer while it matches the class. */
3891 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
3892 if (i
>= max
|| eptr
>= md
->end_subject
) FAIL
;
3894 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3895 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3896 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3898 if (md
->runtime_caseless
)
3901 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3902 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3903 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3907 /* Control never gets here */
3910 /* If maximizing, find the longest possible run, then work backwards. */
3914 const uschar
*pp
= eptr
;
3915 for (i
= min
; i
< max
; eptr
++, i
++)
3917 if (eptr
>= md
->end_subject
) break;
3919 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3920 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3921 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3922 if (md
->runtime_caseless
)
3925 if ((data
[c
/8] & (1 << (c
&7))) != 0) continue;
3926 if ( (locale_flag
& 1) && (isalnum(c
) || c
=='_') ) continue; /* Locale \w */
3927 if ( (locale_flag
& 2) && (!isalnum(c
) && c
!='_') ) continue; /* Locale \W */
3933 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
3937 /* Control never gets here */
3939 /* Match a run of characters */
3943 register int length
= ecode
[1];
3946 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3947 if (eptr
>= md
->end_subject
)
3948 printf("matching subject <null> against pattern ");
3951 printf("matching subject ");
3952 pchars(eptr
, length
, TRUE
, md
);
3953 printf(" against pattern ");
3955 pchars(ecode
, length
, FALSE
, md
);
3959 if (length
> md
->end_subject
- eptr
) FAIL
;
3962 while (length
-- > 0) if (pcre_lcc
[*ecode
++] != pcre_lcc
[*eptr
++]) FAIL
;
3966 while (length
-- > 0) if (*ecode
++ != *eptr
++) FAIL
;
3971 /* Match a single character repeatedly; different opcodes share code. */
3974 min
= max
= (ecode
[1] << 8) + ecode
[2];
3981 max
= (ecode
[1] << 8) + ecode
[2];
3982 minimize
= *ecode
== OP_MINUPTO
;
3992 c
= *ecode
++ - OP_STAR
;
3993 minimize
= (c
& 1) != 0;
3994 min
= rep_min
[c
]; /* Pick up values from tables; */
3995 max
= rep_max
[c
]; /* zero for max => infinity */
3996 if (max
== 0) max
= INT_MAX
;
3998 /* Common code for all repeated single-character matches. We can give
3999 up quickly if there are fewer than the minimum number of characters left in
4003 if (min
> md
->end_subject
- eptr
) FAIL
;
4006 /* The code is duplicated for the caseless and caseful cases, for speed,
4007 since matching characters is likely to be quite common. First, ensure the
4008 minimum number of matches are present. If min = max, continue at the same
4009 level without recursing. Otherwise, if minimizing, keep trying the rest of
4010 the expression and advancing one matching character if failing, up to the
4011 maximum. Alternatively, if maximizing, find the maximum number of
4012 characters and work backwards. */
4014 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c
, min
, max
,
4020 for (i
= 1; i
<= min
; i
++) if (c
!= pcre_lcc
[*eptr
++]) FAIL
;
4021 if (min
== max
) continue;
4026 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
4027 if (i
>= max
|| eptr
>= md
->end_subject
|| c
!= pcre_lcc
[*eptr
++])
4030 /* Control never gets here */
4034 const uschar
*pp
= eptr
;
4035 for (i
= min
; i
< max
; i
++)
4037 if (eptr
>= md
->end_subject
|| c
!= pcre_lcc
[*eptr
]) break;
4041 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
4044 /* Control never gets here */
4047 /* Caseful comparisons */
4051 for (i
= 1; i
<= min
; i
++) if (c
!= *eptr
++) FAIL
;
4052 if (min
== max
) continue;
4057 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
4058 if (i
>= max
|| eptr
>= md
->end_subject
|| c
!= *eptr
++) FAIL
;
4060 /* Control never gets here */
4064 const uschar
*pp
= eptr
;
4065 for (i
= min
; i
< max
; i
++)
4067 if (eptr
>= md
->end_subject
|| c
!= *eptr
) break;
4071 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
4075 /* Control never gets here */
4077 /* Match a negated single character */
4080 if (eptr
>= md
->end_subject
) FAIL
;
4084 if (pcre_lcc
[*ecode
++] == pcre_lcc
[*eptr
++]) FAIL
;
4088 if (*ecode
++ == *eptr
++) FAIL
;
4092 /* Match a negated single character repeatedly. This is almost a repeat of
4093 the code for a repeated single character, but I haven't found a nice way of
4094 commoning these up that doesn't require a test of the positive/negative
4095 option for each character match. Maybe that wouldn't add very much to the
4096 time taken, but character matching *is* what this is all about... */
4099 min
= max
= (ecode
[1] << 8) + ecode
[2];
4106 max
= (ecode
[1] << 8) + ecode
[2];
4107 minimize
= *ecode
== OP_NOTMINUPTO
;
4116 case OP_NOTMINQUERY
:
4117 c
= *ecode
++ - OP_NOTSTAR
;
4118 minimize
= (c
& 1) != 0;
4119 min
= rep_min
[c
]; /* Pick up values from tables; */
4120 max
= rep_max
[c
]; /* zero for max => infinity */
4121 if (max
== 0) max
= INT_MAX
;
4123 /* Common code for all repeated single-character matches. We can give
4124 up quickly if there are fewer than the minimum number of characters left in
4128 if (min
> md
->end_subject
- eptr
) FAIL
;
4131 /* The code is duplicated for the caseless and caseful cases, for speed,
4132 since matching characters is likely to be quite common. First, ensure the
4133 minimum number of matches are present. If min = max, continue at the same
4134 level without recursing. Otherwise, if minimizing, keep trying the rest of
4135 the expression and advancing one matching character if failing, up to the
4136 maximum. Alternatively, if maximizing, find the maximum number of
4137 characters and work backwards. */
4139 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c
, min
, max
,
4145 for (i
= 1; i
<= min
; i
++) if (c
== pcre_lcc
[*eptr
++]) FAIL
;
4146 if (min
== max
) continue;
4151 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
4152 if (i
>= max
|| eptr
>= md
->end_subject
|| c
== pcre_lcc
[*eptr
++])
4155 /* Control never gets here */
4159 const uschar
*pp
= eptr
;
4160 for (i
= min
; i
< max
; i
++)
4162 if (eptr
>= md
->end_subject
|| c
== pcre_lcc
[*eptr
]) break;
4166 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
4169 /* Control never gets here */
4172 /* Caseful comparisons */
4176 for (i
= 1; i
<= min
; i
++) if (c
== *eptr
++) FAIL
;
4177 if (min
== max
) continue;
4182 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
4183 if (i
>= max
|| eptr
>= md
->end_subject
|| c
== *eptr
++) FAIL
;
4185 /* Control never gets here */
4189 const uschar
*pp
= eptr
;
4190 for (i
= min
; i
< max
; i
++)
4192 if (eptr
>= md
->end_subject
|| c
== *eptr
) break;
4196 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
4200 /* Control never gets here */
4202 /* Match a single character type repeatedly; several different opcodes
4203 share code. This is very similar to the code for single characters, but we
4204 repeat it in the interests of efficiency. */
4207 min
= max
= (ecode
[1] << 8) + ecode
[2];
4213 case OP_TYPEMINUPTO
:
4215 max
= (ecode
[1] << 8) + ecode
[2];
4216 minimize
= *ecode
== OP_TYPEMINUPTO
;
4221 case OP_TYPEMINSTAR
:
4223 case OP_TYPEMINPLUS
:
4225 case OP_TYPEMINQUERY
:
4226 c
= *ecode
++ - OP_TYPESTAR
;
4227 minimize
= (c
& 1) != 0;
4228 min
= rep_min
[c
]; /* Pick up values from tables; */
4229 max
= rep_max
[c
]; /* zero for max => infinity */
4230 if (max
== 0) max
= INT_MAX
;
4232 /* Common code for all repeated single character type matches */
4235 ctype
= *ecode
++; /* Code for the character type */
4237 /* First, ensure the minimum number of matches are present. Use inline
4238 code for maximizing the speed, and do the type test once at the start
4239 (i.e. keep it out of the loop). Also test that there are at least the
4240 minimum number of characters before we start. */
4242 if (min
> md
->end_subject
- eptr
) FAIL
;
4243 if (min
> 0) switch(ctype
)
4247 { for (i
= 1; i
<= min
; i
++) if (*eptr
++ == '\n') FAIL
; }
4252 for (i
= 1; i
<= min
; i
++)
4253 if ((pcre_ctypes
[*eptr
++] & ctype_digit
) != 0) FAIL
;
4257 for (i
= 1; i
<= min
; i
++)
4258 if ((pcre_ctypes
[*eptr
++] & ctype_digit
) == 0) FAIL
;
4261 case OP_NOT_WHITESPACE
:
4262 for (i
= 1; i
<= min
; i
++)
4263 if ((pcre_ctypes
[*eptr
++] & ctype_space
) != 0) FAIL
;
4267 for (i
= 1; i
<= min
; i
++)
4268 if ((pcre_ctypes
[*eptr
++] & ctype_space
) == 0) FAIL
;
4271 case OP_NOT_WORDCHAR
:
4272 for (i
= 1; i
<= min
; i
++) if ((pcre_ctypes
[*eptr
++] & ctype_word
) != 0)
4277 for (i
= 1; i
<= min
; i
++) if ((pcre_ctypes
[*eptr
++] & ctype_word
) == 0)
4281 case OP_NOT_WORDCHAR_L
:
4282 for (i
= 1; i
<= min
; i
++, eptr
++) if (*eptr
=='_' || isalnum(*eptr
))
4287 for (i
= 1; i
<= min
; i
++, eptr
++) if (*eptr
!='_' && !isalnum(*eptr
))
4292 /* If min = max, continue at the same level without recursing */
4294 if (min
== max
) continue;
4296 /* If minimizing, we have to test the rest of the pattern before each
4297 subsequent match, so inlining isn't much help; just use the function. */
4303 if (match(eptr
, ecode
, offset_top
, md
)) SUCCEED
;
4304 if (i
>= max
|| eptr
>= md
->end_subject
||
4305 !match_type(ctype
, *eptr
++, md
->dotall
))
4308 /* Control never gets here */
4311 /* If maximizing it is worth using inline code for speed, doing the type
4312 test once at the start (i.e. keep it out of the loop). */
4316 const uschar
*pp
= eptr
;
4322 for (i
= min
; i
< max
; i
++)
4324 if (eptr
>= md
->end_subject
|| *eptr
== '\n') break;
4331 if (c
> md
->end_subject
- eptr
) c
= md
->end_subject
- eptr
;
4337 for (i
= min
; i
< max
; i
++)
4339 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_digit
) != 0)
4346 for (i
= min
; i
< max
; i
++)
4348 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_digit
) == 0)
4354 case OP_NOT_WHITESPACE
:
4355 for (i
= min
; i
< max
; i
++)
4357 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_space
) != 0)
4364 for (i
= min
; i
< max
; i
++)
4366 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_space
) == 0)
4372 case OP_NOT_WORDCHAR
:
4373 for (i
= min
; i
< max
; i
++)
4375 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_word
) != 0)
4382 for (i
= min
; i
< max
; i
++)
4384 if (eptr
>= md
->end_subject
|| (pcre_ctypes
[*eptr
] & ctype_word
) == 0)
4389 case OP_NOT_WORDCHAR_L
:
4390 for (i
= min
; i
< max
; i
++)
4392 if (eptr
>= md
->end_subject
|| (*eptr
=='_' || isalnum(*eptr
) ) )
4399 for (i
= min
; i
< max
; i
++)
4401 if (eptr
>= md
->end_subject
|| (*eptr
!='_' && !isalnum(*eptr
) ) )
4409 if (match(eptr
--, ecode
, offset_top
, md
)) SUCCEED
;
4412 /* Control never gets here */
4414 /* There's been some horrible disaster. */
4417 DPRINTF(("Unknown opcode %d\n", *ecode
));
4418 md
->errorcode
= PCRE_ERROR_UNKNOWN_NODE
;
4422 /* Do not stick any code in here without much thought; it is assumed
4423 that "continue" in the code above comes out to here to repeat the main
4426 } /* End of main loop */
4427 /* Control never reaches here */
4430 if (md
->point
> save_stack_position
)
4432 /* If there are still points remaining on the stack, pop the next one off */
4436 offset_top
= md
->offset_top
[md
->point
];
4437 eptr
= md
->eptr
[md
->point
];
4438 ecode
= md
->ecode
[md
->point
];
4439 off_num
= md
->off_num
[md
->point
];
4440 md
->offset_vector
[off_num
] = md
->r1
[md
->point
];
4441 md
->offset_vector
[off_num
+1] = md
->r2
[md
->point
];
4444 /* Failure, and nothing left on the stack, so end this function call */
4446 /* Restore the top of the stack to where it was before this function
4447 call. This lets us use one stack for everything; recursive calls
4448 can push and pop information, and may increase the stack. When
4449 the call returns, the parent function can resume pushing and
4450 popping wherever it was. */
4452 md
->point
= save_stack_position
;
4461 /*************************************************
4462 * Segregate setjmp() *
4463 *************************************************/
4465 /* The -Wall option of gcc gives warnings for all local variables when setjmp()
4466 is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
4467 hide it in a separate function. This is called only when PCRE_EXTRA is set,
4468 since it's needed only for the extension \X option, and with any luck, a good
4469 compiler will spot the tail recursion and compile it efficiently.
4472 eptr pointer in subject
4473 ecode position in code
4474 offset_top current top pointer
4475 md pointer to "static" info for the match
4477 Returns: TRUE if matched
4481 match_with_setjmp(const uschar
*eptr
, const uschar
*ecode
, int offset_top
,
4482 match_data
*match_block
)
4484 return setjmp(match_block
->fail_env
) == 0 &&
4485 match(eptr
, ecode
, offset_top
, match_block
);
4490 /*************************************************
4491 * Execute a Regular Expression *
4492 *************************************************/
4494 /* This function applies a compiled re to a subject string and picks out
4495 portions of the string if it matches. Two elements in the vector are set for
4496 each substring: the offsets to the start and end of the substring.
4499 external_re points to the compiled expression
4500 external_extra points to "hints" from pcre_study() or is NULL
4501 subject points to the subject string
4502 length length of subject string (may contain binary zeros)
4504 offsets points to a vector of ints to be filled in with offsets
4505 offsetcount the number of elements in the vector
4507 Returns: > 0 => success; value is the number of elements filled in
4508 = 0 => success, but offsets is not big enough
4509 -1 => failed to match
4510 < -1 => some kind of unexpected problem
4514 pcre_exec(const pcre
*external_re
, const pcre_extra
*external_extra
,
4515 const char *subject
, int length
, int start_pos
, int options
,
4516 int *offsets
, int offsetcount
)
4518 /* The "volatile" directives are to make gcc -Wall stop complaining
4519 that these variables can be clobbered by the longjmp. Hopefully
4520 they won't cost too much performance. */
4521 volatile int resetcount
, ocount
;
4522 volatile int first_char
= -1;
4523 match_data match_block
;
4524 const uschar
*start_bits
= NULL
;
4525 const uschar
*start_match
= (const uschar
*)subject
+ start_pos
;
4526 const uschar
*end_subject
;
4527 const real_pcre
*re
= (const real_pcre
*)external_re
;
4528 const real_pcre_extra
*extra
= (const real_pcre_extra
*)external_extra
;
4529 volatile BOOL using_temporary_offsets
= FALSE
;
4530 volatile BOOL anchored
= ((re
->options
| options
) & PCRE_ANCHORED
) != 0;
4531 volatile BOOL startline
= (re
->options
& PCRE_STARTLINE
) != 0;
4533 if ((options
& ~PUBLIC_EXEC_OPTIONS
) != 0) return PCRE_ERROR_BADOPTION
;
4535 if (re
== NULL
|| subject
== NULL
||
4536 (offsets
== NULL
&& offsetcount
> 0)) return PCRE_ERROR_NULL
;
4537 if (re
->magic_number
!= MAGIC_NUMBER
) return PCRE_ERROR_BADMAGIC
;
4539 match_block
.start_subject
= (const uschar
*)subject
;
4540 match_block
.end_subject
= match_block
.start_subject
+ length
;
4541 end_subject
= match_block
.end_subject
;
4543 match_block
.caseless
= ((re
->options
| options
) & PCRE_CASELESS
) != 0;
4544 match_block
.runtime_caseless
= match_block
.caseless
&&
4545 (re
->options
& PCRE_CASELESS
) == 0;
4547 match_block
.multiline
= ((re
->options
| options
) & PCRE_MULTILINE
) != 0;
4548 match_block
.dotall
= ((re
->options
| options
) & PCRE_DOTALL
) != 0;
4549 match_block
.endonly
= ((re
->options
| options
) & PCRE_DOLLAR_ENDONLY
) != 0;
4551 match_block
.notbol
= (options
& PCRE_NOTBOL
) != 0;
4552 match_block
.noteol
= (options
& PCRE_NOTEOL
) != 0;
4554 match_block
.errorcode
= PCRE_ERROR_NOMATCH
; /* Default error */
4556 /* Set the stack state to empty */
4557 match_block
.off_num
= match_block
.offset_top
= NULL
;
4558 match_block
.r1
= match_block
.r2
= NULL
;
4559 match_block
.eptr
= match_block
.ecode
= NULL
;
4560 match_block
.point
= match_block
.length
= 0;
4562 /* If the expression has got more back references than the offsets supplied can
4563 hold, we get a temporary bit of working store to use during the matching.
4564 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4567 ocount
= offsetcount
& (-2);
4568 if (re
->top_backref
> 0 && re
->top_backref
>= ocount
/2)
4570 ocount
= re
->top_backref
* 2 + 2;
4571 match_block
.offset_vector
= (int *)(pcre_malloc
)(ocount
* sizeof(int));
4572 if (match_block
.offset_vector
== NULL
) return PCRE_ERROR_NOMEMORY
;
4573 using_temporary_offsets
= TRUE
;
4574 DPRINTF(("Got memory to hold back references\n"));
4576 else match_block
.offset_vector
= offsets
;
4578 match_block
.offset_end
= ocount
;
4579 match_block
.offset_overflow
= FALSE
;
4581 /* Compute the minimum number of offsets that we need to reset each time. Doing
4582 this makes a huge difference to execution time when there aren't many brackets
4585 resetcount
= 2 + re
->top_bracket
* 2;
4586 if (resetcount
> offsetcount
) resetcount
= ocount
;
4588 /* If MULTILINE is set at exec time but was not set at compile time, and the
4589 anchored flag is set, we must re-check because a setting provoked by ^ in the
4590 pattern is not right in multi-line mode. Calling is_anchored() again here does
4591 the right check, because multiline is now set. If it now yields FALSE, the
4592 expression must have had ^ starting some of its branches. Check to see if
4593 that is true for *all* branches, and if so, set the startline flag. */
4595 if (match_block
.multiline
&& anchored
&& (re
->options
& PCRE_MULTILINE
) == 0 &&
4596 !is_anchored(re
->code
, match_block
.multiline
))
4599 if (is_startline(re
->code
)) startline
= TRUE
;
4602 /* Set up the first character to match, if available. The first_char value is
4603 never set for an anchored regular expression, but the anchoring may be forced
4604 at run time, so we have to test for anchoring. The first char may be unset for
4605 an unanchored pattern, of course. If there's no first char and the pattern was
4606 studied, the may be a bitmap of possible first characters. However, we can
4607 use this only if the caseless state of the studying was correct. */
4611 if ((re
->options
& PCRE_FIRSTSET
) != 0)
4613 first_char
= re
->first_char
;
4614 if (match_block
.caseless
) first_char
= pcre_lcc
[first_char
];
4617 if (!startline
&& extra
!= NULL
&&
4618 (extra
->options
& PCRE_STUDY_MAPPED
) != 0 &&
4619 ((extra
->options
& PCRE_STUDY_CASELESS
) != 0) == match_block
.caseless
)
4620 start_bits
= extra
->start_bits
;
4623 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4628 register int *iptr
= match_block
.offset_vector
;
4629 register int *iend
= iptr
+ resetcount
;
4631 /* Reset the maximum number of extractions we might see. */
4633 while (iptr
< iend
) *iptr
++ = -1;
4635 /* Advance to a unique first char if possible */
4637 if (first_char
>= 0)
4639 if (match_block
.caseless
)
4640 while (start_match
< end_subject
&& pcre_lcc
[*start_match
] != first_char
)
4643 while (start_match
< end_subject
&& *start_match
!= first_char
)
4647 /* Or to just after \n for a multiline match if possible */
4651 if (start_match
> match_block
.start_subject
)
4653 while (start_match
< end_subject
&& start_match
[-1] != '\n')
4658 /* Or to a non-unique first char */
4660 else if (start_bits
!= NULL
)
4662 while (start_match
< end_subject
)
4664 register int c
= *start_match
;
4665 if ((start_bits
[c
/8] & (1 << (c
&7))) == 0) start_match
++; else break;
4669 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4670 printf(">>>> Match against: ");
4671 pchars(start_match
, end_subject
- start_match
, TRUE
, &match_block
);
4675 /* When a match occurs, substrings will be set for all internal extractions;
4676 we just need to set up the whole thing as substring 0 before returning. If
4677 there were too many extractions, set the return code to zero. In the case
4678 where we had to get some local store to hold offsets for backreferences, copy
4679 those back references that we can. In this case there need not be overflow
4680 if certain parts of the pattern were not used.
4682 Before starting the match, we have to set up a longjmp() target to enable
4683 the "cut" operation to fail a match completely without backtracking. This
4684 is done in a separate function to avoid compiler warnings. We need not do
4685 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
4688 /* To handle errors such as running out of memory for the failure
4689 stack, we need to save this location via setjmp(), so
4690 error-handling code can call longjmp() to jump out of deeply-nested code. */
4691 if (setjmp(match_block
.error_env
)==0)
4694 if ((re
->options
& PCRE_EXTRA
) != 0)
4696 if (!match_with_setjmp(start_match
, re
->code
, 2, &match_block
))
4699 else if (!match(start_match
, re
->code
, 2, &match_block
)) continue;
4701 /* Copy the offset information from temporary store if necessary */
4703 if (using_temporary_offsets
)
4705 if (offsetcount
>= 4)
4707 memcpy(offsets
+ 2, match_block
.offset_vector
+ 2,
4708 (offsetcount
- 2) * sizeof(int));
4709 DPRINTF(("Copied offsets from temporary memory\n"));
4711 if (match_block
.end_offset_top
> offsetcount
)
4712 match_block
.offset_overflow
= TRUE
;
4714 DPRINTF(("Freeing temporary memory\n"));
4715 (pcre_free
)(match_block
.offset_vector
);
4718 rc
= match_block
.offset_overflow
? 0 : match_block
.end_offset_top
/2;
4720 if (match_block
.offset_end
< 2) rc
= 0; else
4722 offsets
[0] = start_match
- match_block
.start_subject
;
4723 offsets
[1] = match_block
.end_match_ptr
- match_block
.start_subject
;
4726 DPRINTF((">>>> returning %d\n", rc
));
4727 free_stack(&match_block
);
4729 } /* End of (if setjmp(match_block.error_env)...) */
4730 free_stack(&match_block
);
4732 /* Return an error code; pcremodule.c will preserve the exception */
4733 if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY
;
4736 match_block
.errorcode
== PCRE_ERROR_NOMATCH
&&
4737 start_match
++ < end_subject
);
4739 if (using_temporary_offsets
)
4741 DPRINTF(("Freeing temporary memory\n"));
4742 (pcre_free
)(match_block
.offset_vector
);
4746 printf(">>>> returning %d\n", match_block
.errorcode
);
4749 free_stack(&match_block
);
4750 return match_block
.errorcode
;