Move setting of ioready 'wait' earlier in call chain, to
[python/dscho.git] / Modules / pypcre.c
bloba6ba17c756f55aa258523b916b746804b3f53eb0
2 /*************************************************
3 * Perl-Compatible Regular Expressions *
4 *************************************************/
6 /* DO NOT EDIT THIS FILE! */
8 /* This file is automatically written by the merge-files.py script
9 included with the PCRE distribution for Python; it's produced from
10 several C files, and code is removed in the process. If you want to
11 modify the code or track down bugs, it will be much easier to work
12 with the code in its original, multiple-file form. Don't edit this
13 file by hand, or submit patches to it.
15 The Python-specific PCRE distribution can be retrieved from
16 http://starship.skyport.net/crew/amk/regex/
18 The unmodified original PCRE distribution is available at
19 ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
20 written by: Philip Hazel <ph10@cam.ac.uk>
22 Extensively modified by the Python String-SIG: <string-sig@python.org>
23 Send bug reports to: <string-sig@python.org>
24 (They'll figure out if it's a bug in PCRE or in the Python-specific
25 changes.)
27 Copyright (c) 1997 University of Cambridge
29 -----------------------------------------------------------------------------
30 Permission is granted to anyone to use this software for any purpose on any
31 computer system, and to redistribute it freely, subject to the following
32 restrictions:
34 1. This software is distributed in the hope that it will be useful,
35 but WITHOUT ANY WARRANTY; without even the implied warranty of
36 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
38 2. The origin of this software must not be misrepresented, either by
39 explicit claim or by omission.
41 3. Altered versions must be plainly marked as such, and must not be
42 misrepresented as being the original software.
43 -----------------------------------------------------------------------------
47 #define FOR_PYTHON
48 #include "Python.h"
49 #include "pcre-int.h"
50 #include <ctype.h>
51 #include "graminit.h"
53 /*************************************************
54 * Perl-Compatible Regular Expressions *
55 *************************************************/
57 /* This file is automatically written by the makechartables auxiliary
58 program. If you edit it by hand, you might like to edit the Makefile to
59 prevent its ever being regenerated. */
61 /* This table is a lower casing table. */
/* pcre_lcc[c] is the lower-case form of character code c (256 entries).
   Entries 65-90 ('A'-'Z') hold 97-122 ('a'-'z'); every other entry holds
   its own index, so characters without an upper-case form are unchanged. */
63 unsigned char pcre_lcc[] = {
64 0, 1, 2, 3, 4, 5, 6, 7,
65 8, 9, 10, 11, 12, 13, 14, 15,
66 16, 17, 18, 19, 20, 21, 22, 23,
67 24, 25, 26, 27, 28, 29, 30, 31,
68 32, 33, 34, 35, 36, 37, 38, 39,
69 40, 41, 42, 43, 44, 45, 46, 47,
70 48, 49, 50, 51, 52, 53, 54, 55,
71 56, 57, 58, 59, 60, 61, 62, 63,
72 64, 97, 98, 99,100,101,102,103,
73 104,105,106,107,108,109,110,111,
74 112,113,114,115,116,117,118,119,
75 120,121,122, 91, 92, 93, 94, 95,
76 96, 97, 98, 99,100,101,102,103,
77 104,105,106,107,108,109,110,111,
78 112,113,114,115,116,117,118,119,
79 120,121,122,123,124,125,126,127,
80 128,129,130,131,132,133,134,135,
81 136,137,138,139,140,141,142,143,
82 144,145,146,147,148,149,150,151,
83 152,153,154,155,156,157,158,159,
84 160,161,162,163,164,165,166,167,
85 168,169,170,171,172,173,174,175,
86 176,177,178,179,180,181,182,183,
87 184,185,186,187,188,189,190,191,
88 192,193,194,195,196,197,198,199,
89 200,201,202,203,204,205,206,207,
90 208,209,210,211,212,213,214,215,
91 216,217,218,219,220,221,222,223,
92 224,225,226,227,228,229,230,231,
93 232,233,234,235,236,237,238,239,
94 240,241,242,243,244,245,246,247,
95 248,249,250,251,252,253,254,255 };
97 /* This table is a case flipping table. */
/* pcre_fcc[c] is character code c with its case flipped (256 entries).
   Entries 65-90 ('A'-'Z') hold 97-122 and entries 97-122 ('a'-'z') hold
   65-90; every other entry holds its own index. pcre_study() uses this to
   add the other-case partner of each letter to its start bitmap. */
99 unsigned char pcre_fcc[] = {
100 0, 1, 2, 3, 4, 5, 6, 7,
101 8, 9, 10, 11, 12, 13, 14, 15,
102 16, 17, 18, 19, 20, 21, 22, 23,
103 24, 25, 26, 27, 28, 29, 30, 31,
104 32, 33, 34, 35, 36, 37, 38, 39,
105 40, 41, 42, 43, 44, 45, 46, 47,
106 48, 49, 50, 51, 52, 53, 54, 55,
107 56, 57, 58, 59, 60, 61, 62, 63,
108 64, 97, 98, 99,100,101,102,103,
109 104,105,106,107,108,109,110,111,
110 112,113,114,115,116,117,118,119,
111 120,121,122, 91, 92, 93, 94, 95,
112 96, 65, 66, 67, 68, 69, 70, 71,
113 72, 73, 74, 75, 76, 77, 78, 79,
114 80, 81, 82, 83, 84, 85, 86, 87,
115 88, 89, 90,123,124,125,126,127,
116 128,129,130,131,132,133,134,135,
117 136,137,138,139,140,141,142,143,
118 144,145,146,147,148,149,150,151,
119 152,153,154,155,156,157,158,159,
120 160,161,162,163,164,165,166,167,
121 168,169,170,171,172,173,174,175,
122 176,177,178,179,180,181,182,183,
123 184,185,186,187,188,189,190,191,
124 192,193,194,195,196,197,198,199,
125 200,201,202,203,204,205,206,207,
126 208,209,210,211,212,213,214,215,
127 216,217,218,219,220,221,222,223,
128 224,225,226,227,228,229,230,231,
129 232,233,234,235,236,237,238,239,
130 240,241,242,243,244,245,246,247,
131 248,249,250,251,252,253,254,255 };
133 /* This table contains bit maps for digits, letters, 'word' chars, and
134 white space. Each map is 32 bytes long and the bits run from the least
135 significant end of each byte. */
/* Four consecutive 32-byte bitmaps (128 bytes in all). Within a map,
   character code c is represented by bit (c & 7) of byte (c / 8).
   NOTE(review): the cbit_digit / cbit_space / cbit_word offsets used by
   set_start_bits() are defined outside this part of the file -- confirm
   they match the map order shown here. */
137 unsigned char pcre_cbits[] = {
/* Map 1, digits: bits set for codes 48-57 ('0'-'9'). */
138 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
139 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
140 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
141 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
/* Map 2, letters: bits set for codes 65-90 and 97-122. */
143 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
144 0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
145 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
146 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
/* Map 3, 'word' characters: digits, letters and code 95 ('_', the 0x80
   bit of the 0x87 byte). */
148 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
149 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
150 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
151 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
/* Map 4, white space: bits set for codes 9-13 and 32. */
153 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
154 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
155 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
156 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
158 /* This table identifies various classes of character by individual bits:
159 0x01 white space character
160 0x02 letter
161 0x04 decimal digit
162 0x08 hexadecimal digit
163 0x10 alphanumeric or '_'
164 0x80 regular expression metacharacter or binary zero
/* pcre_ctypes[c] holds the class flag bits for character code c (256
   entries, all zero from 128 upwards). In addition to the flag values
   listed in the preceding comment, the entries for '0'-'7' are 0x3c while
   '8' and '9' are 0x1c, so bit 0x20 is set for octal digits only.
   NOTE(review): presumably 0x20 is the ctype_odigit flag tested by
   check_escape() -- confirm against pcre-int.h. */
167 unsigned char pcre_ctypes[] = {
168 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
169 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */
170 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
171 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
172 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
173 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
174 0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /* 0 - 7 */
175 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
176 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* @ - G */
177 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */
178 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */
179 0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /* X - _ */
180 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* ` - g */
181 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* h - o */
182 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* p - w */
183 0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /* x -127 */
184 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
185 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
186 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
187 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
188 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
189 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
190 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
191 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
192 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
193 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
194 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
195 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
196 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
197 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
198 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
199 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
201 /* End of chartables.c */
202 /*************************************************
203 * Perl-Compatible Regular Expressions *
204 *************************************************/
207 This is a library of functions to support regular expressions whose syntax
208 and semantics are as close as possible to those of the Perl 5 language. See
209 the file Tech.Notes for some information on the internals.
211 Written by: Philip Hazel <ph10@cam.ac.uk>
213 Copyright (c) 1998 University of Cambridge
215 -----------------------------------------------------------------------------
216 Permission is granted to anyone to use this software for any purpose on any
217 computer system, and to redistribute it freely, subject to the following
218 restrictions:
220 1. This software is distributed in the hope that it will be useful,
221 but WITHOUT ANY WARRANTY; without even the implied warranty of
222 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
224 2. The origin of this software must not be misrepresented, either by
225 explicit claim or by omission.
227 3. Altered versions must be plainly marked as such, and must not be
228 misrepresented as being the original software.
229 -----------------------------------------------------------------------------
233 /* Include the internals header, which itself includes Standard C headers plus
234 the external pcre header. */
239 /*************************************************
240 * Create bitmap of starting chars *
241 *************************************************/
243 /* This function scans a compiled unanchored expression and attempts to build a
244 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
245 goes by, we may be able to get more clever at doing this.
247 Arguments:
248 code points to an expression
249 start_bits points to a 32-byte table, initialized to 0
251 Returns: TRUE if table built, FALSE otherwise
/* set_start_bits: OR into start_bits[] (32 bytes, one bit per character
   code: bit (ch & 7) of byte ch / 8) every character that can begin a
   match of the bracket group at `code`. Each alternative branch is
   visited in turn; the distance to the next branch is the two-byte,
   high-byte-first value in code[1..2]. Nested groups (opcode >= OP_BRA)
   and OP_ASSERT groups are handled by recursion.
   Returns FALSE as soon as a branch starts with an opcode that has no
   case here (the default: label) or a recursive call fails; returns TRUE
   after every branch has been processed. */
254 static BOOL
255 set_start_bits(const uschar *code, uschar *start_bits)
257 register int c;
/* NOTE(review): dummy is assigned once below and never read in this
   function; presumably a compiler workaround -- confirm before removing. */
258 volatile int dummy;
/* tcode starts 3 bytes into the branch, just past the opcode and its
   two-byte link. */
262 const uschar *tcode = code + 3;
263 BOOL try_next = TRUE;
/* Keep examining items while the one just handled could match nothing,
   so that the following item can also supply a starting character. */
265 while (try_next)
267 try_next = FALSE;
269 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
271 if (!set_start_bits(tcode, start_bits)) return FALSE;
274 else switch(*tcode)
276 default:
277 return FALSE;
279 /* BRAZERO does the bracket, but carries on. */
281 case OP_BRAZERO:
282 case OP_BRAMINZERO:
283 if (!set_start_bits(++tcode, start_bits)) return FALSE;
284 dummy = 1;
/* Follow the branch links past every alternative of the optional group,
   then step 3 bytes past the item that ends it. */
285 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
286 tcode += 3;
287 try_next = TRUE;
288 break;
290 /* Single-char * or ? sets the bit and tries the next item */
292 case OP_STAR:
293 case OP_MINSTAR:
294 case OP_QUERY:
295 case OP_MINQUERY:
296 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
297 tcode += 2;
298 try_next = TRUE;
299 break;
301 /* Single-char upto sets the bit and tries the next */
/* The character is read from tcode[3]; presumably tcode[1..2] hold the
   repeat count -- confirm against the compiler. */
303 case OP_UPTO:
304 case OP_MINUPTO:
305 start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));
306 tcode += 4;
307 try_next = TRUE;
308 break;
310 /* At least one single char sets the bit and stops */
/* The fall-through chain advances tcode by 2 for OP_EXACT and by 1 for
   OP_CHARS, so that tcode[1] is the first literal character in each case. */
312 case OP_EXACT: /* Fall through */
313 tcode++;
315 case OP_CHARS: /* Fall through */
316 tcode++;
318 case OP_PLUS:
319 case OP_MINPLUS:
320 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
321 break;
323 /* Single character type sets the bits and stops */
325 case OP_NOT_DIGIT:
326 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
327 break;
329 case OP_DIGIT:
330 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
331 break;
333 case OP_NOT_WHITESPACE:
334 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
335 break;
337 case OP_WHITESPACE:
338 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
339 break;
341 case OP_NOT_WORDCHAR:
342 for (c = 0; c < 32; c++)
343 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
344 break;
346 case OP_WORDCHAR:
347 for (c = 0; c < 32; c++)
348 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
349 break;
351 /* One or more character type fudges the pointer and restarts, knowing
352 it will hit a single character type and stop there. */
354 case OP_TYPEPLUS:
355 case OP_TYPEMINPLUS:
356 tcode++;
357 try_next = TRUE;
358 break;
360 case OP_TYPEEXACT:
361 tcode += 3;
362 try_next = TRUE;
363 break;
365 /* Zero or more repeats of character types set the bits and then
366 try again. */
368 case OP_TYPEUPTO:
369 case OP_TYPEMINUPTO:
370 tcode += 2; /* Fall through */
372 case OP_TYPESTAR:
373 case OP_TYPEMINSTAR:
374 case OP_TYPEQUERY:
375 case OP_TYPEMINQUERY:
/* tcode[1] is the character-type opcode being repeated; a type with no
   case in this inner switch adds no bits. */
376 switch(tcode[1])
378 case OP_NOT_DIGIT:
379 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
380 break;
382 case OP_DIGIT:
383 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
384 break;
386 case OP_NOT_WHITESPACE:
387 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
388 break;
390 case OP_WHITESPACE:
391 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
392 break;
394 case OP_NOT_WORDCHAR:
395 for (c = 0; c < 32; c++)
396 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
397 break;
399 case OP_WORDCHAR:
400 for (c = 0; c < 32; c++)
401 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
402 break;
405 tcode += 2;
406 try_next = TRUE;
407 break;
409 /* Character class: set the bits and either carry on or not,
410 according to the repeat count. */
412 case OP_CLASS:
413 case OP_NEGCLASS:
/* The 32 bytes after the opcode are OR-ed straight into start_bits; the
   same is done for OP_NEGCLASS. */
415 tcode++;
416 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
417 tcode += 32;
418 switch (*tcode)
420 case OP_CRSTAR:
421 case OP_CRMINSTAR:
422 case OP_CRQUERY:
423 case OP_CRMINQUERY:
424 tcode++;
425 try_next = TRUE;
426 break;
428 case OP_CRRANGE:
429 case OP_CRMINRANGE:
/* A zero value in tcode[1..2] (presumably the minimum count) lets the
   class match nothing, so continue with the item 5 bytes further on. */
430 if (((tcode[1] << 8) + tcode[2]) == 0)
432 tcode += 5;
433 try_next = TRUE;
435 break;
438 break; /* End of class handling */
440 } /* End of switch */
441 } /* End of try_next loop */
443 code += (code[1] << 8) + code[2]; /* Advance to next branch */
445 while (*code == OP_ALT);
446 return TRUE;
451 /*************************************************
452 * Study a compiled expression *
453 *************************************************/
455 /* This function is handed a compiled expression that it must study to produce
456 information that will speed up the matching. It returns a pcre_extra block
457 which then gets handed back to pcre_exec().
459 Arguments:
460 re points to the compiled expression
461 options contains option bits
462 errorptr points to where to place error messages;
463 set NULL unless error
465 Returns: pointer to a pcre_extra block,
466 NULL on error or if no optimization possible
/* pcre_study: build a bitmap of possible first characters for a compiled
   pattern and return it in a newly allocated real_pcre_extra block.

   external_re  compiled pattern; NULL or a wrong magic number is an error
   options      study options; any bit outside PUBLIC_STUDY_OPTIONS is an
                error
   errorptr     written unconditionally, so it must not be NULL; set to
                NULL first, then to a message on error

   Returns NULL with *errorptr set for a bad argument, a bad option bit or
   an allocation failure. Returns NULL with *errorptr left NULL when the
   pattern is anchored, has a first character, is line-start only, or
   set_start_bits() cannot build a map. Otherwise returns the block, which
   comes from pcre_malloc and is not freed here (presumably the caller
   releases it -- confirm). */
469 pcre_extra *
470 pcre_study(const pcre *external_re, int options, const char **errorptr)
472 BOOL caseless;
473 uschar start_bits[32];
474 real_pcre_extra *extra;
475 const real_pcre *re = (const real_pcre *)external_re;
477 *errorptr = NULL;
479 if (re == NULL || re->magic_number != MAGIC_NUMBER)
481 *errorptr = "argument is not a compiled regular expression";
482 return NULL;
485 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
487 *errorptr = "unknown or incorrect option bit(s) set";
488 return NULL;
491 /* Caseless can either be from the compiled regex or from options. */
493 caseless = ((re->options | options) & PCRE_CASELESS) != 0;
495 /* For an anchored pattern, or an unanchored pattern that has a first char, or a
496 multiline pattern that matches only at "line starts", no further processing at
497 present. */
499 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
500 return NULL;
502 /* See if we can find a fixed set of initial characters for the pattern. */
504 memset(start_bits, 0, 32 * sizeof(uschar));
505 if (!set_start_bits(re->code, start_bits)) return NULL;
507 /* If this studying is caseless, scan the created bit map and duplicate the
508 bits for any letters. */
510 if (caseless)
512 register int c;
513 for (c = 0; c < 256; c++)
/* For each letter whose bit is set, also set the bit of its other-case
   partner taken from pcre_fcc. */
515 if ((start_bits[c/8] & (1 << (c&7))) != 0 &&
516 (pcre_ctypes[c] & ctype_letter) != 0)
518 int d = pcre_fcc[c];
519 start_bits[d/8] |= (1 << (d&7));
524 /* Get an "extra" block and put the information therein. */
526 extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
528 if (extra == NULL)
530 *errorptr = "failed to get memory";
531 return NULL;
534 extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
535 memcpy(extra->start_bits, start_bits, sizeof(start_bits));
537 return (pcre_extra *)extra;
540 /* End of study.c */
541 /*************************************************
542 * Perl-Compatible Regular Expressions *
543 *************************************************/
546 This is a library of functions to support regular expressions whose syntax
547 and semantics are as close as possible to those of the Perl 5 language. See
548 the file Tech.Notes for some information on the internals.
550 Written by: Philip Hazel <ph10@cam.ac.uk>
552 Copyright (c) 1998 University of Cambridge
554 -----------------------------------------------------------------------------
555 Permission is granted to anyone to use this software for any purpose on any
556 computer system, and to redistribute it freely, subject to the following
557 restrictions:
559 1. This software is distributed in the hope that it will be useful,
560 but WITHOUT ANY WARRANTY; without even the implied warranty of
561 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
563 2. The origin of this software must not be misrepresented, either by
564 explicit claim or by omission.
566 3. Altered versions must be plainly marked as such, and must not be
567 misrepresented as being the original software.
568 -----------------------------------------------------------------------------
572 /* Define DEBUG to get debugging output on stdout. */
574 /* #define DEBUG */
576 /* Use a macro for debugging printing, 'cause that eliminates the use
577 of #ifdef inline, and there are *still* stupid compilers about that don't like
578 indented pre-processor statements. I suppose it's only been 10 years... */
580 #undef DPRINTF
581 #ifdef DEBUG
582 #define DPRINTF(p) printf p
583 #else
584 #define DPRINTF(p) /*nothing*/
585 #endif
587 /* Include the internals header, which itself includes Standard C headers plus
588 the external pcre header. */
593 #ifndef Py_eval_input
594 /* For Python 1.4, graminit.h has to be explicitly included */
595 #define Py_eval_input eval_input
597 #endif /* FOR_PYTHON */
599 /* Allow compilation as C++ source code, should anybody want to do that. */
601 #ifdef __cplusplus
602 #define class pcre_class
603 #endif
606 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
/* Six (min, max) pairs: (0,inf) (0,inf) (1,inf) (1,inf) (0,1) (0,1).
   NOTE(review): presumably in the order *, *?, +, +?, ?, ?? used by the
   repeat opcodes -- the code that indexes these tables is not in this
   part of the file; confirm there. */
608 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
609 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
611 /* Text forms of OP_ values and things, for debugging (not all used) */
613 #ifdef DEBUG
/* Printable names for opcodes; compiled only when DEBUG is defined.
   NOTE(review): presumably indexed by opcode value, in which case the
   order must track the OP_ definitions in pcre-int.h -- confirm. */
614 static const char *OP_names[] = {
615 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
616 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
617 "localized \\B", "localized \\b", "localized \\W", "localized \\w",
618 "^", "$", "Any", "chars",
619 "not",
620 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
621 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
622 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
623 "*", "*?", "+", "+?", "?", "??", "{", "{",
624 "class", "negclass", "classL", "Ref",
625 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
626 "Brazero", "Braminzero", "Bra"
628 #endif
630 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
631 are simple data values; negative values are for special things like \d and so
632 on. Zero means further processing is needed (for things like \x), or the escape
633 is invalid. */
/* check_escape() indexes this table with (c - '0') after rejecting any c
   outside '0'..'z', so there is one entry per character in that range.
   All digit entries are 0, which sends digits on to check_escape()'s
   switch. In this table \a yields 7 and \f, \n, \r, \t, \v yield the
   corresponding C escape characters. */
635 static const short int escapes[] = {
636 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
637 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
638 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
639 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
640 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
641 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
642 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
643 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
644 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
645 0, 0, 0 /* x - z */
648 /* Definition to allow mutual recursion */
650 static BOOL
651 compile_regex(int, int *, uschar **, const uschar **, const char **,
652 PyObject *);
654 /* Structure for passing "static" information around between the functions
655 doing the matching, so that they are thread-safe. */
/* Per-match state block. It holds the option flags, the subject bounds,
   the capture offset vector, two jmp_buf targets for longjmp(), and six
   parallel stacks that share one allocated length and one push index.
   NOTE(review): the code that fills and consumes these fields is not in
   this part of the file. */
657 typedef struct match_data {
658 int errorcode; /* As it says */
659 int *offset_vector; /* Offset vector */
660 int offset_end; /* One past the end */
661 BOOL offset_overflow; /* Set if too many extractions */
662 BOOL caseless; /* Case-independent flag */
663 BOOL runtime_caseless; /* Caseless forced at run time */
664 BOOL multiline; /* Multiline flag */
665 BOOL notbol; /* NOTBOL flag */
666 BOOL noteol; /* NOTEOL flag */
667 BOOL dotall; /* Dot matches any char */
668 BOOL endonly; /* Dollar not before final \n */
669 const uschar *start_subject; /* Start of the subject string */
670 const uschar *end_subject; /* End of the subject string */
671 jmp_buf fail_env; /* Environment for longjump() break out */
672 const uschar *end_match_ptr; /* Subject position at end match */
673 int end_offset_top; /* Highwater mark at end of match */
674 jmp_buf error_env; /* For longjmp() if an error occurs deep inside a
675 matching operation */
676 int length; /* Length of the allocated stacks */
677 int point; /* Point to add next item pushed onto stacks */
678 /* Pointers to the 6 stacks */
679 int *off_num, *offset_top, *r1, *r2;
680 const uschar **eptr, **ecode;
681 } match_data;
685 /*************************************************
686 * Global variables *
687 *************************************************/
689 /* PCRE is thread-clean and doesn't use any global variables in the normal
690 sense. However, it calls memory allocation and free functions via the two
691 indirections below, which can be changed by the caller, but are shared
692 between all threads. */
/* Allocation hooks, defaulting to the C library's malloc and free.
   pcre_study() obtains its result block through pcre_malloc. */
694 void *(*pcre_malloc)(size_t) = malloc;
695 void (*pcre_free)(void *) = free;
700 /*************************************************
701 * Return version string *
702 *************************************************/
/* Returns the PCRE_VERSION macro, whose value is defined outside this
   part of the file. */
704 const char *
705 pcre_version(void)
707 return PCRE_VERSION;
713 /*************************************************
714 * Return info about a compiled pattern *
715 *************************************************/
717 /* This function picks potentially useful data out of the private
718 structure.
720 Arguments:
721 external_re points to compiled code
722 optptr where to pass back the options
723 first_char where to pass back the first character,
724 or -1 if multiline and all branches start ^,
725 or -2 otherwise
727 Returns: number of identifying extraction brackets
728 or negative values on error
/* pcre_info: report the option bits, first-character information and
   bracket count of a compiled pattern. optptr and first_char may each be
   NULL, in which case that item is not written. Only the option bits in
   PUBLIC_OPTIONS are reported. Returns PCRE_ERROR_NULL for a NULL
   pattern, PCRE_ERROR_BADMAGIC for a wrong magic number, otherwise
   re->top_bracket. */
732 pcre_info(const pcre *external_re, int *optptr, int *first_char)
734 const real_pcre *re = (real_pcre *)external_re;
735 if (re == NULL) return PCRE_ERROR_NULL;
736 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
737 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
738 if (first_char != NULL)
/* PCRE_FIRSTSET gives the stored first character; otherwise
   PCRE_STARTLINE gives -1; otherwise -2. */
739 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
740 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
741 return re->top_bracket;
747 #ifdef DEBUG
748 /*************************************************
749 * Debugging function to print chars *
750 *************************************************/
752 /* Print a sequence of chars in printable format, stopping at the end of the
753 subject if requested.
755 Arguments:
756 p points to characters
757 length number to print
758 is_subject TRUE if printing from within md->start_subject
759 md pointer to matching data block, if is_subject is TRUE
761 Returns: nothing
/* pchars: write `length` characters starting at p to stdout, printing
   each printable character as itself and any other as \x followed by two
   hex digits. When is_subject is TRUE, the count is first clamped so that
   printing stops at md->end_subject; md is not dereferenced otherwise. */
764 static void
765 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
767 int c;
768 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
769 while (length-- > 0)
770 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
772 #endif
777 /*************************************************
778 * Check subpattern for empty operand *
779 *************************************************/
781 /* This function checks a bracketed subpattern to see if any of the paths
782 through it could match an empty string. This is used to diagnose an error if
783 such a subpattern is followed by a quantifier with an unlimited upper bound.
785 Argument:
786 code points to the opening bracket
788 Returns: TRUE or FALSE
/* could_be_empty: decide whether any alternative of the bracket group at
   `code` can match the empty string. Each branch is scanned item by item:
   items that can match nothing are skipped, reaching the end of a branch
   returns TRUE, and the first item that must consume a character jumps to
   NEXT_BRANCH to try the next alternative. Returns FALSE when no
   alternative can be empty. Nested groups are checked by recursion. */
791 static BOOL
792 could_be_empty(uschar *code)
794 do {
/* cc starts 3 bytes into the branch, just past the opcode and its
   two-byte link. */
795 uschar *cc = code + 3;
797 /* Scan along the opcodes for this branch; as soon as we find something
798 that matches a non-empty string, break out and advance to test the next
799 branch. If we get to the end of the branch, return TRUE for the whole
800 sub-expression. */
802 for (;;)
804 /* Test an embedded subpattern; if it could not be empty, break the
805 loop. Otherwise carry on in the branch. */
807 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
809 if (!could_be_empty(cc)) break;
/* Follow the two-byte, high-byte-first links past every alternative of
   the nested group, then step 3 bytes past the item that ends it. */
810 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
811 cc += 3;
814 else switch (*cc)
816 /* Reached end of a branch: the subpattern may match the empty string */
818 case OP_ALT:
819 case OP_KET:
820 case OP_KETRMAX:
821 case OP_KETRMIN:
822 return TRUE;
824 /* Skip over entire bracket groups with zero lower bound */
826 case OP_BRAZERO:
827 case OP_BRAMINZERO:
828 cc++;
829 /* Fall through */
831 /* Skip over assertive subpatterns */
833 case OP_ASSERT:
834 case OP_ASSERT_NOT:
835 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
836 cc += 3;
837 break;
839 /* Skip over things that don't match chars */
841 case OP_SOD:
842 case OP_EOD:
843 case OP_CIRC:
844 case OP_DOLL:
845 case OP_NOT_WORD_BOUNDARY:
846 case OP_WORD_BOUNDARY:
847 case OP_NOT_WORD_BOUNDARY_L:
848 case OP_WORD_BOUNDARY_L:
849 cc++;
850 break;
852 /* Skip over simple repeats with zero lower bound */
854 case OP_STAR:
855 case OP_MINSTAR:
856 case OP_QUERY:
857 case OP_MINQUERY:
858 case OP_NOTSTAR:
859 case OP_NOTMINSTAR:
860 case OP_NOTQUERY:
861 case OP_NOTMINQUERY:
862 case OP_TYPESTAR:
863 case OP_TYPEMINSTAR:
864 case OP_TYPEQUERY:
865 case OP_TYPEMINQUERY:
866 cc += 2;
867 break;
869 /* Skip over UPTOs (lower bound is zero) */
871 case OP_UPTO:
872 case OP_MINUPTO:
873 case OP_TYPEUPTO:
874 case OP_TYPEMINUPTO:
875 cc += 4;
876 break;
878 /* Check a class or a back reference for a zero minimum */
880 case OP_CLASS:
881 case OP_NEGCLASS:
882 case OP_REF:
883 case OP_CLASS_L:
/* First step over the item itself: 2 bytes for a back reference, 1+32 for
   a class, 1+1+32 for OP_CLASS_L. */
884 switch(*cc)
886 case (OP_REF): cc += 2; break;
887 case (OP_CLASS): case (OP_NEGCLASS): cc += 1+32; break;
888 case (OP_CLASS_L): cc += 1+1+32; break;
/* Then look at the repeat that follows: the item is skipped only for a
   star, a query, or a range whose tested value in cc[1..2] (presumably
   the minimum) is zero. Anything else must match a character. */
891 switch (*cc)
893 case OP_CRSTAR:
894 case OP_CRMINSTAR:
895 case OP_CRQUERY:
896 case OP_CRMINQUERY:
897 cc++;
898 break;
900 case OP_CRRANGE:
901 case OP_CRMINRANGE:
902 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
903 cc += 3;
904 break;
906 default:
907 goto NEXT_BRANCH;
909 break;
911 /* Anything else matches at least one character */
913 default:
914 goto NEXT_BRANCH;
918 NEXT_BRANCH:
919 code += (code[1] << 8) + code[2];
921 while (*code == OP_ALT);
923 /* No branches match the empty string */
925 return FALSE;
928 /* Determine the length of a group ID in an expression like
929 (?P<foo_123>...)
930 Arguments:
931 ptr pattern position pointer (say that 3 times fast)
932 finalchar the character that will mark the end of the ID
933 errorptr points to the pointer to the error message
/* get_group_id: measure the identifier that starts at ptr and must be
   terminated by finalchar. Returns its length in characters when the
   terminator is found. Returns 0 with *errorptr set when the first
   character is not a word character or is a digit, when the string ends
   first, or when a non-word character appears before finalchar. */
936 static int
937 get_group_id(const uschar *ptr, char finalchar, const char **errorptr)
939 const uschar *start = ptr;
941 /* If the first character is not in \w, or is in \w but is a digit,
942 report an error */
/* The ptr++ in the second operand runs only when the first test passed,
   so on the non-error path ptr ends up on the second character. */
943 if (!(pcre_ctypes[*ptr] & ctype_word) ||
944 (pcre_ctypes[*ptr++] & ctype_digit))
946 *errorptr = "(?P identifier must start with a letter or underscore";
947 return 0;
950 /* Increment ptr until we either hit a null byte, the desired
951 final character, or a non-word character */
952 for(; (*ptr != 0) && (*ptr != finalchar) &&
953 (pcre_ctypes[*ptr] & ctype_word); ptr++)
955 /* Empty loop body */
957 if (*ptr==finalchar)
958 return ptr-start;
959 if (*ptr==0)
961 *errorptr = "unterminated (?P identifier";
962 return 0;
964 *errorptr = "illegal character in (?P identifier";
965 return 0;
968 /*************************************************
969 * Handle escapes *
970 *************************************************/
972 /* This function is called when a \ has been encountered. It either returns a
973 positive value for a simple escape such as \n, or a negative value which
974 encodes one of the more complicated things such as \d. On entry, ptr is
975 pointing at the \. On exit, it is on the final character of the escape
976 sequence.
978 Arguments:
979 ptrptr points to the pattern position pointer
980 errorptr points to the pointer to the error message
981 bracount number of previous extracting brackets
982 options the options bits
983 isclass TRUE if inside a character class
985 Returns: zero or positive => a data character
986 negative => a special escape sequence
987 on error, errorptr is set
/* check_escape: decode the escape sequence whose backslash is at *ptrptr.
   On return *ptrptr has been advanced to the last character consumed.
   Returns a data character value (zero or positive) or a negative code
   for a special escape; back references return -(ESC_REF + n). Errors
   are reported only by storing a message in *errorptr; a value is still
   returned. bracount is not used in the code shown here. */
990 static int
991 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
992 int options, BOOL isclass)
994 const uschar *ptr = *ptrptr;
995 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
996 int i;
998 if (c == 0) *errorptr = ERR1;
1000 /* Digits or letters may have special meaning; all others are literals. */
1002 else if (c < '0' || c > 'z') {}
1004 /* Do an initial lookup in a table. A non-zero result is something that can be
1005 returned immediately. Otherwise further processing may be required. */
1007 else if ((i = escapes[c - '0']) != 0) c = i;
1009 /* Escapes that need further processing, or are illegal. */
/* The failed table lookup above left i == 0; the '0' case below relies on
   that as its starting digit count. */
1011 else
1014 switch (c)
1016 /* The handling of escape sequences consisting of a string of digits
1017 starting with one that is not zero is not straightforward. By experiment,
1018 the way Perl works seems to be as follows:
1020 Outside a character class, the digits are read as a decimal number. If the
1021 number is less than 10, or if there are that many previous extracting
1022 left brackets, then it is a back reference. Otherwise, up to three octal
1023 digits are read to form an escaped byte. Thus \123 is likely to be octal
1024 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
1025 value is greater than 377, the least significant 8 bits are taken. Inside a
1026 character class, \ followed by a digit is always an octal number. */
1028 case '1': case '2': case '3': case '4': case '5':
1029 case '6': case '7': case '8': case '9':
1032 /* PYTHON: Try to compute an octal value for a character */
1033 for(c=0, i=0; ptr[i]!=0 && i<3; i++)
1035 if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
1036 c = (c * 8 + ptr[i]-'0') & 255;
1037 else
1038 break; /* Non-octal character--break out of the loop */
1040 /* It's a character if there were exactly 3 octal digits, or if
1041 we're inside a character class and there was at least one
1042 octal digit. */
1043 if ( (i == 3) || (isclass && i!=0) )
1045 ptr += i-1;
1046 break;
/* Otherwise treat it as a back reference of one or two decimal digits. */
1048 c = ptr[0]; /* Restore the first character after the \ */
1049 c -= '0'; i = 1;
1050 while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
1052 c = c * 10 + ptr[1] - '0';
1053 ptr++; i++;
1055 if (c > 255 - ESC_REF) *errorptr = "back reference too big";
1056 c = -(ESC_REF + c);
1058 break;
1060 /* \0 always starts an octal number, but we may drop through to here with a
1061 larger first octal digit */
1063 case '0':
1064 c -= '0';
/* Read at most two further octal digits, keeping the low 8 bits. */
1065 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
1066 ptr[1] != '8' && ptr[1] != '9')
1067 c = (c * 8 + *(++ptr) - '0') & 255;
1068 break;
1070 /* Special escapes not starting with a digit are straightforward */
1072 case 'x':
1073 c = 0;
/* Every following hex digit is consumed, with no length limit, and only
   the low 8 bits are kept. 'W' equals 'a' - 10 in ASCII, so lower-cased
   'a'..'f' yield 10..15. */
1074 while ( (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
1076 ptr++;
1077 c = c * 16 + pcre_lcc[*ptr] -
1078 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
1079 c &= 255;
1081 break;
1084 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1085 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1086 for Perl compatibility, it is a literal. */
1088 default:
1089 if ((options & PCRE_EXTRA) != 0) switch(c)
1091 case 'X':
1092 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
1093 break;
1095 default:
1096 *errorptr = ERR3;
1097 break;
1099 break;
1103 *ptrptr = ptr;
1104 return c;
1109 /*************************************************
1110 * Check for counted repeat *
1111 *************************************************/
1113 /* This function is called when a '{' is encountered in a place where it might
1114 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1115 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1116 where the ddds are digits.
1118 Arguments:
1119 p pointer to the first char after '{'
1121 Returns: TRUE or FALSE
1124 static BOOL
1125 is_counted_repeat(const uschar *p)
1127 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1128 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1129 if (*p == '}') return TRUE;
1131 if (*p++ != ',') return FALSE;
1132 if (*p == '}') return TRUE;
1134 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1135 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1136 return (*p == '}');
1141 /*************************************************
1142 * Read repeat counts *
1143 *************************************************/
1145 /* Read an item of the form {n,m} and return the values. This is called only
1146 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1147 so the syntax is guaranteed to be correct, but we need to check the values.
1149 Arguments:
1150 p pointer to first char after '{'
1151 minp pointer to int for min
1152 maxp pointer to int for max
1153 returned as -1 if no max
1154 errorptr points to pointer to error message
1156 Returns: pointer to '}' on success;
1157 current ptr on error, with errorptr set
1160 static const uschar *
1161 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1163 int min = 0;
1164 int max = -1;
1166 while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1168 if (*p == '}') max = min; else
1170 if (*(++p) != '}')
1172 max = 0;
1173 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1174 if (max < min)
1176 *errorptr = ERR4;
1177 return p;
1182 /* Do paranoid checks, then fill in the required variables, and pass back the
1183 pointer to the terminating '}'. */
1185 if (min > 65535 || max > 65535)
1186 *errorptr = ERR5;
1187 else
1189 *minp = min;
1190 *maxp = max;
1192 return p;
1197 /*************************************************
1198 * Compile one branch *
1199 *************************************************/
1201 /* Scan the pattern, compiling it into the code vector.
1203 Arguments:
1204 options the option bits
1205 bracket points to number of brackets used
1206 code points to the pointer to the current code point
1207 ptrptr points to the current pattern pointer
1208 errorptr points to pointer to error message
1210 Returns: TRUE on success
1211 FALSE, with *errorptr set on error
1214 static BOOL
1215 compile_branch(int options, int *brackets, uschar **codeptr,
1216 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
1218 int repeat_type, op_type;
1219 int repeat_min, repeat_max;
1220 int bravalue, length;
1221 int greedy_default, greedy_non_default;
1222 register int c;
1223 register uschar *code = *codeptr;
1224 const uschar *ptr = *ptrptr;
1225 const uschar *oldptr;
1226 uschar *previous = NULL;
1227 uschar class[32];
1228 uschar *class_flag; /* Pointer to the single-byte flag for OP_CLASS_L */
1230 /* Set up the default and non-default settings for greediness */
1232 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1233 greedy_non_default = greedy_default ^ 1;
1235 /* Switch on next character until the end of the branch */
1237 for (;; ptr++)
1239 BOOL negate_class;
1240 int class_charcount;
1241 int class_lastchar;
1243 c = *ptr;
1244 if ((options & PCRE_EXTENDED) != 0)
1246 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1247 if (c == '#')
1249 while ((c = *(++ptr)) != 0 && c != '\n');
1250 continue;
1254 switch(c)
1256 /* The branch terminates at end of string, |, or ). */
1258 case 0:
1259 case '|':
1260 case ')':
1261 *codeptr = code;
1262 *ptrptr = ptr;
1263 return TRUE;
1265 /* Handle single-character metacharacters */
1267 case '^':
1268 previous = NULL;
1269 *code++ = OP_CIRC;
1270 break;
1272 case '$':
1273 previous = NULL;
1274 *code++ = OP_DOLL;
1275 break;
1277 case '.':
1278 previous = code;
1279 *code++ = OP_ANY;
1280 break;
1282 /* Character classes. These always build a 32-byte bitmap of the permitted
1283 characters, except in the special case where there is only one character.
1284 For negated classes, we build the map as usual, then invert it at the end.
1287 case '[':
1288 previous = code;
1289 if (options & PCRE_LOCALE)
1291 *code++ = OP_CLASS_L;
1292 /* Set the flag for localized classes (like \w) to 0 */
1293 class_flag = code;
1294 *class_flag = 0;
1296 else
1298 *code++ = OP_CLASS;
1299 class_flag = NULL;
1302 /* If the first character is '^', set the negation flag, and use a
1303 different opcode. This only matters if caseless matching is specified at
1304 runtime. */
1306 if ((c = *(++ptr)) == '^')
1308 negate_class = TRUE;
1309 if (*(code-1)==OP_CLASS) *(code-1) = OP_NEGCLASS;
1310 c = *(++ptr);
1312 else negate_class = FALSE;
1314 /* Keep a count of chars so that we can optimize the case of just a single
1315 character. */
1317 class_charcount = 0;
1318 class_lastchar = -1;
1320 /* Initialize the 32-char bit map to all zeros. We have to build the
1321 map in a temporary bit of store, in case the class contains only 1
1322 character, because in that case the compiled code doesn't use the
1323 bit map. */
1325 memset(class, 0, 32 * sizeof(uschar));
1327 /* Process characters until ] is reached. By writing this as a "do" it
1328 means that an initial ] is taken as a data character. */
1332 if (c == 0)
1334 *errorptr = ERR6;
1335 goto FAILED;
1338 /* Backslash may introduce a single character, or it may introduce one
1339 of the specials, which just set a flag. Escaped items are checked for
1340 validity in the pre-compiling pass. The sequence \b is a special case.
1341 Inside a class (and only there) it is treated as backspace. Elsewhere
1342 it marks a word boundary. Other escapes have preset maps ready to
1343 or into the one we are building. We assume they have more than one
1344 character in them, so set class_count bigger than one. */
1346 if (c == '\\')
1348 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1349 if (-c == ESC_b) c = '\b';
1350 else if (c < 0)
1352 class_charcount = 10;
1353 switch (-c)
1355 case ESC_d:
1357 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
1359 continue;
1361 case ESC_D:
1363 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
1365 continue;
1367 case ESC_w:
1368 if (options & PCRE_LOCALE)
1370 *class_flag |= 1;
1372 else
1374 for (c = 0; c < 32; c++)
1375 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1377 continue;
1379 case ESC_W:
1380 if (options & PCRE_LOCALE)
1382 *class_flag |= 2;
1384 else
1386 for (c = 0; c < 32; c++)
1387 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1389 continue;
1391 case ESC_s:
1393 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
1395 continue;
1397 case ESC_S:
1399 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
1401 continue;
1403 default:
1404 *errorptr = ERR7;
1405 goto FAILED;
1408 /* Fall through if single character */
1411 /* A single character may be followed by '-' to form a range. However,
1412 Perl does not permit ']' to be the end of the range. A '-' character
1413 here is treated as a literal. */
1415 if (ptr[1] == '-' && ptr[2] != ']')
1417 int d;
1418 ptr += 2;
1419 d = *ptr;
1421 if (d == 0)
1423 *errorptr = ERR6;
1424 goto FAILED;
1427 /* The second part of a range can be a single-character escape, but
1428 not any of the other escapes. */
1430 if (d == '\\')
1432 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1433 if (d < 0)
1435 if (d == -ESC_b) d = '\b'; else
1437 *errorptr = ERR7;
1438 goto FAILED;
1443 if (d < c)
1445 *errorptr = ERR8;
1446 goto FAILED;
1449 for (; c <= d; c++)
1451 class[c/8] |= (1 << (c&7));
1452 if ((options & PCRE_CASELESS) != 0)
1454 int uc = pcre_fcc[c]; /* flip case */
1455 class[uc/8] |= (1 << (uc&7));
1457 class_charcount++; /* in case a one-char range */
1458 class_lastchar = c;
1460 continue; /* Go get the next char in the class */
1463 /* Handle a lone single character - we can get here for a normal
1464 non-escape char, or after \ that introduces a single character. */
1466 class [c/8] |= (1 << (c&7));
1467 if ((options & PCRE_CASELESS) != 0)
1469 c = pcre_fcc[c]; /* flip case */
1470 class[c/8] |= (1 << (c&7));
1472 class_charcount++;
1473 class_lastchar = c;
1476 /* Loop until ']' reached; the check for end of string happens inside the
1477 loop. This "while" is the end of the "do" above. */
1479 while ((c = *(++ptr)) != ']');
1481 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1482 precisely one character. This doesn't need the whole 32-byte bit map.
1483 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1484 it's negative. */
1486 if (class_charcount == 1 && class_lastchar >= 0)
1488 if (negate_class)
1490 code[-1] = OP_NOT;
1492 else
1494 code[-1] = OP_CHARS;
1495 *code++ = 1;
1497 *code++ = class_lastchar;
1500 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1501 the code vector. */
1503 else
1505 /* If this is a localized opcode, bump the code pointer up */
1506 if (class_flag) code++;
1507 if (negate_class)
1509 if (class_flag) *class_flag = (*class_flag) ^ 63;
1510 for (c = 0; c < 32; c++) code[c] = ~class[c];
1512 else
1513 memcpy(code, class, 32);
1514 code += 32;
1516 break;
1518 /* Various kinds of repeat */
1520 case '{':
1521 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1522 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
1523 if (*errorptr != NULL) goto FAILED;
1524 goto REPEAT;
1526 case '*':
1527 repeat_min = 0;
1528 repeat_max = -1;
1529 goto REPEAT;
1531 case '+':
1532 repeat_min = 1;
1533 repeat_max = -1;
1534 goto REPEAT;
1536 case '?':
1537 repeat_min = 0;
1538 repeat_max = 1;
1540 REPEAT:
1541 if (previous == NULL)
1543 *errorptr = ERR9;
1544 goto FAILED;
1547 /* If the next character is '?' this is a minimizing repeat, by default,
1548 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1549 next character. */
1551 if (ptr[1] == '?')
1552 { repeat_type = greedy_non_default; ptr++; }
1553 else repeat_type = greedy_default;
1555 /* If the maximum is zero then the minimum must also be zero; Perl allows
1556 this case, so we do too - by simply omitting the item altogether. */
1558 if (repeat_max == 0) code = previous;
1560 /* If previous was a string of characters, chop off the last one and use it
1561 as the subject of the repeat. If there was only one character, we can
1562 abolish the previous item altogether. */
1564 else if (*previous == OP_CHARS)
1566 int len = previous[1];
1567 if (len == 1)
1569 c = previous[2];
1570 code = previous;
1572 else
1574 c = previous[len+1];
1575 previous[1]--;
1576 code--;
1578 op_type = 0; /* Use single-char op codes */
1579 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1582 /* If previous was a single negated character ([^a] or similar), we use
1583 one of the special opcodes, replacing it. The code is shared with single-
1584 character repeats by adding a suitable offset into repeat_type. */
1586 else if ((int)*previous == OP_NOT)
1588 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1589 c = previous[1];
1590 code = previous;
1591 goto OUTPUT_SINGLE_REPEAT;
1594 /* If previous was a character type match (\d or similar), abolish it and
1595 create a suitable repeat item. The code is shared with single-character
1596 repeats by adding a suitable offset into repeat_type. */
1598 else if ((int)*previous < OP_CIRC || *previous == OP_ANY)
1600 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1601 c = *previous;
1602 code = previous;
1604 OUTPUT_SINGLE_REPEAT:
1605 repeat_type += op_type; /* Combine both values for many cases */
1607 /* A minimum of zero is handled either as the special case * or ?, or as
1608 an UPTO, with the maximum given. */
1610 if (repeat_min == 0)
1612 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1613 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1614 else
1616 *code++ = OP_UPTO + repeat_type;
1617 *code++ = repeat_max >> 8;
1618 *code++ = (repeat_max & 255);
1622 /* The case {1,} is handled as the special case + */
1624 else if (repeat_min == 1 && repeat_max == -1)
1625 *code++ = OP_PLUS + repeat_type;
1627 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1628 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1630 else
1632 if (repeat_min != 1)
1634 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1635 *code++ = repeat_min >> 8;
1636 *code++ = (repeat_min & 255);
1639 /* If the minimum is 1 and the previous item was a character string,
1640 we either have to put back the item that got canceled if the string
1641 length was 1, or add the character back onto the end of a longer
1642 string. For a character type nothing need be done; it will just get
1643 put back naturally. Note that the final character is always going to
1644 get added below. */
1646 else if (*previous == OP_CHARS)
1648 if (code == previous) code += 2; else previous[1]++;
1651 /* For a single negated character we also have to put back the
1652 item that got canceled. */
1654 else if (*previous == OP_NOT) code++;
1656 /* If the maximum is unlimited, insert an OP_STAR. */
1658 if (repeat_max < 0)
1660 *code++ = c;
1661 *code++ = OP_STAR + repeat_type;
1664 /* Else insert an UPTO if the max is greater than the min. */
1666 else if (repeat_max != repeat_min)
1668 *code++ = c;
1669 repeat_max -= repeat_min;
1670 *code++ = OP_UPTO + repeat_type;
1671 *code++ = repeat_max >> 8;
1672 *code++ = (repeat_max & 255);
1676 /* The character or character type itself comes last in all cases. */
1678 *code++ = c;
1681 /* If previous was a character class or a back reference, we put the repeat
1682 stuff after it. */
1684 else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1685 *previous==OP_CLASS_L || *previous == OP_REF)
1687 if (repeat_min == 0 && repeat_max == -1)
1688 *code++ = OP_CRSTAR + repeat_type;
1689 else if (repeat_min == 1 && repeat_max == -1)
1690 *code++ = OP_CRPLUS + repeat_type;
1691 else if (repeat_min == 0 && repeat_max == 1)
1692 *code++ = OP_CRQUERY + repeat_type;
1693 else
1695 *code++ = OP_CRRANGE + repeat_type;
1696 *code++ = repeat_min >> 8;
1697 *code++ = repeat_min & 255;
1698 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1699 *code++ = repeat_max >> 8;
1700 *code++ = repeat_max & 255;
1704 /* If previous was a bracket group, we may have to replicate it in certain
1705 cases. If the maximum repeat count is unlimited, check that the bracket
1706 group cannot match the empty string, and diagnose an error if it can. */
1708 else if ((int)*previous >= OP_BRA)
1710 int i;
1711 int len = code - previous;
1713 if (repeat_max == -1 && could_be_empty(previous))
1715 *errorptr = ERR10;
1716 goto FAILED;
1719 /* If the minimum is greater than zero, and the maximum is unlimited or
1720 equal to the minimum, the first copy remains where it is, and is
1721 replicated up to the minimum number of times. This case includes the +
1722 repeat, but of course no replication is needed in that case. */
1724 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1726 for (i = 1; i < repeat_min; i++)
1728 memcpy(code, previous, len);
1729 code += len;
1733 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1734 Then, if there is a fixed upper limit, replicated up to that many times,
1735 sticking BRAZERO in front of all the optional ones. */
1737 else
1739 if (repeat_min == 0)
1741 memmove(previous+1, previous, len);
1742 code++;
1743 *previous++ = OP_BRAZERO + repeat_type;
1746 for (i = 1; i < repeat_min; i++)
1748 memcpy(code, previous, len);
1749 code += len;
1752 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1754 *code++ = OP_BRAZERO + repeat_type;
1755 memcpy(code, previous, len);
1756 code += len;
1760 /* If the maximum is unlimited, set a repeater in the final copy. */
1762 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1765 /* Else there's some kind of shambles */
1767 else
1769 *errorptr = ERR11;
1770 goto FAILED;
1773 /* In all case we no longer have a previous item. */
1775 previous = NULL;
1776 break;
1779 /* Start of nested bracket sub-expression, or comment or lookahead.
1780 First deal with special things that can come after a bracket; all are
1781 introduced by ?, and the appearance of any of them means that this is not a
1782 referencing group. They were checked for validity in the first pass over
1783 the string, so we don't have to check for syntax errors here. */
1785 case '(':
1786 previous = code; /* Only real brackets can be repeated */
1787 if (*(++ptr) == '?')
1789 bravalue = OP_BRA;
1791 switch (*(++ptr))
1793 case '#':
1794 case 'i':
1795 case 'L':
1796 case 'm':
1797 case 's':
1798 case 'x':
1799 ptr++;
1800 while (*ptr != ')') ptr++;
1801 previous = NULL;
1802 continue;
1804 case ':': /* Non-extracting bracket */
1805 ptr++;
1806 break;
1808 case '=': /* Assertions can't be repeated */
1809 bravalue = OP_ASSERT;
1810 ptr++;
1811 previous = NULL;
1812 break;
1814 case '!':
1815 bravalue = OP_ASSERT_NOT;
1816 ptr++;
1817 previous = NULL;
1818 break;
1820 case ('P'):
1821 ptr++;
1822 if (*ptr=='<')
1824 /* (?P<groupname>...) */
1825 int idlen;
1826 PyObject *string, *intobj;
1828 ptr++;
1829 idlen = get_group_id(ptr, '>', errorptr);
1830 if (*errorptr) {
1831 goto FAILED;
1833 string = PyString_FromStringAndSize((char*)ptr, idlen);
1834 intobj = PyInt_FromLong( brackets[0] + 1 );
1835 if (intobj == NULL || string == NULL)
1837 Py_XDECREF(string);
1838 Py_XDECREF(intobj);
1839 *errorptr = "exception raised";
1840 goto FAILED;
1842 PyDict_SetItem(dictionary, string, intobj);
1843 Py_DECREF(string); Py_DECREF(intobj); /* XXX DECREF commented out! */
1844 ptr += idlen+1; /* Point to rest of expression */
1845 goto do_grouping_bracket;
1847 if (*ptr=='=')
1849 /* (?P=groupname) */
1850 int idlen, refnum;
1851 PyObject *string, *intobj;
1853 ptr++;
1854 idlen = get_group_id(ptr, ')', errorptr);
1855 if (*errorptr) {
1856 goto FAILED;
1858 string = PyString_FromStringAndSize((char *)ptr, idlen);
1859 if (string==NULL) {
1860 *errorptr = "exception raised";
1861 goto FAILED;
1863 intobj = PyDict_GetItem(dictionary, string);
1864 if (intobj==NULL) {
1865 Py_DECREF(string);
1866 *errorptr = "?P= group identifier isn't defined";
1867 goto FAILED;
1870 refnum = PyInt_AsLong(intobj);
1871 Py_DECREF(string);
1872 /* The caller doesn't own the reference to the value
1873 returned from PyDict_GetItem, so intobj is not
1874 DECREF'ed. */
1876 *code++ = OP_REF;
1877 *code++ = refnum;
1878 /* The continue will cause the top-level for() loop to
1879 be resumed, so ptr will be immediately incremented.
1880 Therefore, the following line adds just idlen, not
1881 idlen+1 */
1882 ptr += idlen;
1883 continue;
1885 /* The character after ?P is neither < nor =, so
1886 report an error. Add more Python-extensions here. */
1887 *errorptr="unknown after (?P";
1888 goto FAILED;
1890 case '>': /* "Match once" brackets */
1891 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1893 bravalue = OP_ONCE;
1894 ptr++;
1895 previous = NULL;
1896 break;
1898 /* Else fall through */
1900 default:
1901 *errorptr = ERR12;
1902 goto FAILED;
1906 /* Else we have a referencing group */
1908 else
1910 do_grouping_bracket:
1911 if (++(*brackets) > EXTRACT_MAX)
1913 *errorptr = ERR13;
1914 goto FAILED;
1916 bravalue = OP_BRA + *brackets;
1919 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1920 code into a non-register variable in order to be able to pass its address
1921 because some compilers complain otherwise. */
1923 *code = bravalue;
1925 uschar *mcode = code;
1926 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr, dictionary))
1927 goto FAILED;
1928 code = mcode;
1931 if (*ptr != ')')
1933 *errorptr = ERR14;
1934 goto FAILED;
1936 break;
1938 /* Check \ for being a real metacharacter; if not, fall through and handle
1939 it as a data character at the start of a string. Escape items are checked
1940 for validity in the pre-compiling pass. */
1942 case '\\':
1943 oldptr = ptr;
1944 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1946 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1947 are arranged to be the negation of the corresponding OP_values. For the
1948 back references, the values are ESC_REF plus the reference number. Only
1949 back references and those types that consume a character may be repeated.
1950 We can test for values between ESC_b and ESC_Z for the latter; this may
1951 have to change if any new ones are ever created. */
1953 if (c < 0)
1955 if (-c >= ESC_REF)
1957 int refnum = -c - ESC_REF;
1958 if (*brackets < refnum)
1960 *errorptr = ERR15;
1961 goto FAILED;
1963 previous = code;
1964 *code++ = OP_REF;
1965 *code++ = refnum;
1967 else
1969 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1970 if ( (options & PCRE_LOCALE) != 0)
1972 switch (c)
1974 case (-ESC_b): c = -OP_WORD_BOUNDARY_L; break;
1975 case (-ESC_B): c = -OP_NOT_WORD_BOUNDARY_L; break;
1976 case (-ESC_w): c = -OP_WORDCHAR_L; break;
1977 case (-ESC_W): c = -OP_NOT_WORDCHAR_L; break;
1980 *code++ = -c;
1982 continue;
1985 /* Data character: Reset and fall through */
1987 ptr = oldptr;
1988 c = '\\';
1990 /* Handle a run of data characters until a metacharacter is encountered.
1991 The first character is guaranteed not to be whitespace or # when the
1992 extended flag is set. */
1994 NORMAL_CHAR:
1995 default:
1996 previous = code;
1997 *code = OP_CHARS;
1998 code += 2;
1999 length = 0;
2003 if ((options & PCRE_EXTENDED) != 0)
2005 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
2006 if (c == '#')
2008 while ((c = *(++ptr)) != 0 && c != '\n');
2009 if (c == 0) break;
2010 continue;
2014 /* Backslash may introduce a data char or a metacharacter. Escaped items
2015 are checked for validity in the pre-compiling pass. Stop the string
2016 before a metaitem. */
2018 if (c == '\\')
2020 oldptr = ptr;
2021 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
2022 if (c < 0) { ptr = oldptr; break; }
2025 /* Ordinary character or single-char escape */
2027 *code++ = c;
2028 length++;
2031 /* This "while" is the end of the "do" above. */
2033 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2035 /* Compute the length and set it in the data vector, and advance to
2036 the next state. */
2038 previous[1] = length;
2039 if (length < 255) ptr--;
2040 break;
2042 } /* end of big loop */
2044 /* Control never reaches here by falling through, only by a goto for all the
2045 error states. Pass back the position in the pattern so that it can be displayed
2046 to the user for diagnosing the error. */
2048 FAILED:
2049 *ptrptr = ptr;
2050 return FALSE;
2056 /*************************************************
2057 * Compile sequence of alternatives *
2058 *************************************************/
2060 /* On entry, ptr is pointing past the bracket character, but on return
2061 it points to the closing bracket, or vertical bar, or end of string.
2062 The code variable is pointing at the byte into which the BRA operator has been
2063 stored.
2065 Argument:
2066 options the option bits
2067 brackets -> int containing the number of extracting brackets used
2068 codeptr -> the address of the current code pointer
2069 ptrptr -> the address of the current pattern pointer
2070 errorptr -> pointer to error message
2072 Returns: TRUE on success
2075 static BOOL
2076 compile_regex(int options, int *brackets, uschar **codeptr,
2077 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
2079 const uschar *ptr = *ptrptr;
2080 uschar *code = *codeptr;
2081 uschar *start_bracket = code;
2083 for (;;)
2085 int length;
2086 uschar *last_branch = code;
2088 code += 3;
2089 if (!compile_branch(options, brackets, &code, &ptr, errorptr, dictionary))
2091 *ptrptr = ptr;
2092 return FALSE;
2095 /* Fill in the length of the last branch */
2097 length = code - last_branch;
2098 last_branch[1] = length >> 8;
2099 last_branch[2] = length & 255;
2101 /* Reached end of expression, either ')' or end of pattern. Insert a
2102 terminating ket and the length of the whole bracketed item, and return,
2103 leaving the pointer at the terminating char. */
2105 if (*ptr != '|')
2107 length = code - start_bracket;
2108 *code++ = OP_KET;
2109 *code++ = length >> 8;
2110 *code++ = length & 255;
2111 *codeptr = code;
2112 *ptrptr = ptr;
2113 return TRUE;
2116 /* Another branch follows; insert an "or" node and advance the pointer. */
2118 *code = OP_ALT;
2119 ptr++;
2121 /* Control never reaches here */
2126 /*************************************************
2127 * Check for anchored expression *
2128 *************************************************/
2130 /* Try to find out if this is an anchored regular expression. Consider each
2131 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2132 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2133 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2134 counts, since OP_CIRC can match in the middle.
2136 A branch is also implicitly anchored if it starts with .* because that will try
2137 the rest of the pattern at all possible matching points, so there is no point
2138 trying them again.
2140 Argument: points to start of expression (the bracket)
2141 Returns: TRUE or FALSE
2144 static BOOL
2145 is_anchored(register const uschar *code, BOOL multiline)
2147 do {
2148 int op = (int)code[3];
2149 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
2150 { if (!is_anchored(code+3, multiline)) return FALSE; }
2151 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2152 { if (code[4] != OP_ANY) return FALSE; }
2153 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
2154 code += (code[1] << 8) + code[2];
2156 while (*code == OP_ALT);
2157 return TRUE;
2162 /*************************************************
2163 * Check for start with \n line expression *
2164 *************************************************/
2166 /* This is called for multiline expressions to try to find out if every branch
2167 starts with ^ so that "first char" processing can be done to speed things up.
2169 Argument: points to start of expression (the bracket)
2170 Returns: TRUE or FALSE
2173 static BOOL
2174 is_startline(const uschar *code)
2176 do {
2177 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2178 { if (!is_startline(code+3)) return FALSE; }
2179 else if (code[3] != OP_CIRC) return FALSE;
2180 code += (code[1] << 8) + code[2];
2182 while (*code == OP_ALT);
2183 return TRUE;
2188 /*************************************************
2189 * Check for fixed first char *
2190 *************************************************/
2192 /* Try to find out if there is a fixed first character. This is called for
2193 unanchored expressions, as it speeds up their processing quite considerably.
2194 Consider each alternative branch. If they all start with the same char, or with
2195 a bracket all of whose alternatives start with the same char (recurse ad lib),
2196 then we return that char, otherwise -1.
2198 Argument: points to start of expression (the bracket)
2199 Returns: -1 or the fixed first char
2202 static int
2203 find_firstchar(uschar *code)
2205 register int c = -1;
2208 register int charoffset = 4;
2210 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2212 register int d;
2213 if ((d = find_firstchar(code+3)) < 0) return -1;
2214 if (c < 0) c = d; else if (c != d) return -1;
2217 else switch(code[3])
2219 default:
2220 return -1;
2222 case OP_EXACT: /* Fall through */
2223 charoffset++;
2225 case OP_CHARS: /* Fall through */
2226 charoffset++;
2228 case OP_PLUS:
2229 case OP_MINPLUS:
2230 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
2231 break;
2233 code += (code[1] << 8) + code[2];
2235 while (*code == OP_ALT);
2236 return c;
2241 /*************************************************
2242 * Compile a Regular Expression *
2243 *************************************************/
2245 /* This function takes a string and returns a pointer to a block of store
2246 holding a compiled version of the expression.
2248 Arguments:
2249 pattern the regular expression
2250 options various option bits
2251 errorptr pointer to pointer to error text
2252 erroroffset ptr offset in pattern where error was detected
2254 Returns: pointer to compiled data block, or NULL on error,
2255 with errorptr and erroroffset set
2258 pcre *
2259 pcre_compile(const char *pattern, int options, const char **errorptr,
2260 int *erroroffset, PyObject *dictionary)
2262 real_pcre *re;
2263 int spaces = 0;
2264 int length = 3; /* For initial BRA plus length */
2265 int runlength;
2266 int c, size;
2267 int bracount = 0;
2268 int brastack[200];
2269 int top_backref = 0;
2270 unsigned int brastackptr = 0;
2271 uschar *code;
2272 const uschar *ptr;
2274 #ifdef DEBUG
2275 uschar *code_base, *code_end;
2276 #endif
2278 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2279 can do is just return NULL. */
2281 if (errorptr == NULL) return NULL;
2282 *errorptr = NULL;
2284 /* However, we can give a message for this error */
2286 if (erroroffset == NULL)
2288 *errorptr = ERR16;
2289 return NULL;
2291 *erroroffset = 0;
2293 if ((options & ~PUBLIC_OPTIONS) != 0)
2295 *errorptr = ERR17;
2296 return NULL;
2299 DPRINTF(("------------------------------------------------------------------\n"));
2300 DPRINTF(("%s\n", pattern));
2302 /* The first thing to do is to make a pass over the pattern to compute the
2303 amount of store required to hold the compiled code. This does not have to be
2304 perfect as long as errors are overestimates. At the same time we can detect any
2305 internal flag settings. Make an attempt to correct for any counted white space
2306 if an "extended" flag setting appears late in the pattern. We can't be so
2307 clever for #-comments. */
2309 ptr = (const uschar *)(pattern - 1);
2310 while ((c = *(++ptr)) != 0)
2312 int min, max;
2313 int class_charcount;
2315 if ((pcre_ctypes[c] & ctype_space) != 0)
2317 if ((options & PCRE_EXTENDED) != 0) continue;
2318 spaces++;
2321 if (c == '#' && (options & PCRE_EXTENDED) != 0)
2323 while ((c = *(++ptr)) != 0 && c != '\n');
2324 continue;
2327 switch(c)
2329 /* A backslashed item may be an escaped "normal" character or a
2330 character type. For a "normal" character, put the pointers and
2331 character back so that tests for whitespace etc. in the input
2332 are done correctly. */
2334 case '\\':
2336 const uschar *save_ptr = ptr;
2337 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
2338 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2339 if (c >= 0)
2341 ptr = save_ptr;
2342 c = '\\';
2343 goto NORMAL_CHAR;
2346 length++;
2348 /* A back reference needs an additional char, plus either one or 5
2349 bytes for a repeat. We also need to keep the value of the highest
2350 back reference. */
2352 if (c <= -ESC_REF)
2354 int refnum = -c - ESC_REF;
2355 if (refnum > top_backref) top_backref = refnum;
2356 length++; /* For single back reference */
2357 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
2359 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2360 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2361 if ((min == 0 && (max == 1 || max == -1)) ||
2362 (min == 1 && max == -1))
2363 length++;
2364 else length += 5;
2365 if (ptr[1] == '?') ptr++;
2368 continue;
2370 case '^':
2371 case '.':
2372 case '$':
2373 case '*': /* These repeats won't be after brackets; */
2374 case '+': /* those are handled separately */
2375 case '?':
2376 length++;
2377 continue;
2379 /* This covers the cases of repeats after a single char, metachar, class,
2380 or back reference. */
2382 case '{':
2383 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
2384 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
2385 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2386 if ((min == 0 && (max == 1 || max == -1)) ||
2387 (min == 1 && max == -1))
2388 length++;
2389 else
2391 length--; /* Uncount the original char or metachar */
2392 if (min == 1) length++; else if (min > 0) length += 4;
2393 if (max > 0) length += 4; else length += 2;
2395 if (ptr[1] == '?') ptr++;
2396 continue;
2398 /* An alternation contains an offset to the next branch or ket. */
2399 case '|':
2400 length += 3;
2401 continue;
2403 /* A character class uses 33 characters. Don't worry about character types
2404 that aren't allowed in classes - they'll get picked up during the compile.
2405 A character class that contains only one character uses 2 or 3 bytes,
2406 depending on whether it is negated or not. Notice this where we can. */
2408 case '[':
2409 class_charcount = 0;
2410 if (*(++ptr) == '^') ptr++;
2413 if (*ptr == '\\')
2415 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
2416 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2417 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2419 else class_charcount++;
2420 ptr++;
2422 while (*ptr != 0 && *ptr != ']');
2424 /* Repeats for negated single chars are handled by the general code */
2426 if (class_charcount == 1) length += 3; else
2428 length += 33;
2429 if (options & PCRE_LOCALE) length++; /* Add a byte for the localization flag */
2431 /* A repeat needs either 1 or 5 bytes. */
2433 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
2435 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2436 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2437 if ((min == 0 && (max == 1 || max == -1)) ||
2438 (min == 1 && max == -1))
2439 length++;
2440 else length += 5;
2441 if (ptr[1] == '?') ptr++;
2444 continue;
2446 /* Brackets may be genuine groups or special things */
2448 case '(':
2450 /* Handle special forms of bracket, which all start (? */
2452 if (ptr[1] == '?') switch (c = ptr[2])
2454 /* Skip over comments entirely */
2455 case '#':
2456 ptr += 3;
2457 while (*ptr != 0 && *ptr != ')') ptr++;
2458 if (*ptr == 0)
2460 *errorptr = ERR18;
2461 goto PCRE_ERROR_RETURN;
2463 continue;
2465 /* Non-referencing groups and lookaheads just move the pointer on, and
2466 then behave like a non-special bracket, except that they don't increment
2467 the count of extracting brackets. */
2469 case ':':
2470 case '=':
2471 case '!':
2472 ptr += 2;
2473 break;
2475 case ('P'):
2477 int idlen;
2478 switch (*ptr++) {
2479 case ('<'):
2480 idlen = get_group_id(ptr++, '>', errorptr);
2481 if (*errorptr) goto PCRE_ERROR_RETURN;
2482 ptr += idlen+1;
2483 break;
2484 case ('='):
2485 idlen = get_group_id(ptr++, ')', errorptr);
2486 if (*errorptr) goto PCRE_ERROR_RETURN;
2487 ptr += idlen+1;
2488 length++;
2489 break;
2492 break;
2494 /* Ditto for the "once only" bracket, allowed only if the extra bit
2495 is set. */
2497 case '>':
2498 if ((options & PCRE_EXTRA) != 0)
2500 ptr += 2;
2501 break;
2503 /* Else fall through */
2505 /* Else loop setting valid options until ) is met. Anything else is an
2506 error. */
2508 default:
2509 ptr += 2;
2510 for (;; ptr++)
2512 if ((c = *ptr) == 'i')
2514 options |= PCRE_CASELESS;
2515 continue;
2517 else if ((c = *ptr) == 'L')
2519 options |= PCRE_LOCALE;
2520 continue;
2522 else if ((c = *ptr) == 'm')
2524 options |= PCRE_MULTILINE;
2525 continue;
2527 else if (c == 's')
2529 options |= PCRE_DOTALL;
2530 continue;
2532 else if (c == 'x')
2534 options |= PCRE_EXTENDED;
2535 length -= spaces; /* Already counted spaces */
2536 continue;
2538 else if (c == ')') break;
2540 *errorptr = ERR12;
2541 goto PCRE_ERROR_RETURN;
2543 continue; /* End of this bracket handling */
2546 /* Extracting brackets must be counted so we can process escapes in a
2547 Perlish way. */
2549 else bracount++;
2551 /* Non-special forms of bracket. Save length for computing whole length
2552 at end if there's a repeat that requires duplication of the group. */
2554 if (brastackptr >= sizeof(brastack)/sizeof(int))
2556 *errorptr = ERR19;
2557 goto PCRE_ERROR_RETURN;
2560 brastack[brastackptr++] = length;
2561 length += 3;
2562 continue;
2564 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2565 have to replicate this bracket up to that many times. If brastackptr is
2566 0 this is an unmatched bracket which will generate an error, but take care
2567 not to try to access brastack[-1]. */
2569 case ')':
2570 length += 3;
2572 int minval = 1;
2573 int maxval = 1;
2574 int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
2576 /* Leave ptr at the final char; for read_repeat_counts this happens
2577 automatically; for the others we need an increment. */
2579 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
2581 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
2582 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2584 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2585 else if (c == '+') { maxval = -1; ptr++; }
2586 else if (c == '?') { minval = 0; ptr++; }
2588 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2589 if there is a limited maximum we have to replicate up to maxval-1 times
2590 and allow for a BRAZERO item before each optional copy, as we also have
2591 to do before the first copy if the minimum is zero. */
2593 if (minval == 0) length++;
2594 else if (minval > 1) length += (minval - 1) * duplength;
2595 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
2597 continue;
2599 /* Non-special character. For a run of such characters the length required
2600 is the number of characters + 2, except that the maximum run length is 255.
2601 We won't get a skipped space or a non-data escape or the start of a #
2602 comment as the first character, so the length can't be zero. */
2604 NORMAL_CHAR:
2605 default:
2606 length += 2;
2607 runlength = 0;
2610 if ((pcre_ctypes[c] & ctype_space) != 0)
2612 if ((options & PCRE_EXTENDED) != 0) continue;
2613 spaces++;
2616 if (c == '#' && (options & PCRE_EXTENDED) != 0)
2618 while ((c = *(++ptr)) != 0 && c != '\n');
2619 continue;
2622 /* Backslash may introduce a data char or a metacharacter; stop the
2623 string before the latter. */
2625 if (c == '\\')
2627 const uschar *saveptr = ptr;
2628 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
2629 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2630 if (c < 0) { ptr = saveptr; break; }
2633 /* Ordinary character or single-char escape */
2635 runlength++;
2638 /* This "while" is the end of the "do" above. */
2640 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2642 ptr--;
2643 length += runlength;
2644 continue;
2648 length += 4; /* For final KET and END */
2650 if (length > 65539)
2652 *errorptr = ERR20;
2653 return NULL;
2656 /* Compute the size of data block needed and get it, either from malloc or
2657 externally provided function. We specify "code[0]" in the offsetof() expression
2658 rather than just "code", because it has been reported that one broken compiler
2659 fails on "code" because it is also an independent variable. It should make no
2660 difference to the value of the offsetof(). */
2662 size = length + offsetof(real_pcre, code[0]);
2663 re = (real_pcre *)(pcre_malloc)(size+50);
2665 if (re == NULL)
2667 *errorptr = ERR21;
2668 return NULL;
2671 /* Put in the magic number and the options. */
2673 re->magic_number = MAGIC_NUMBER;
2674 re->options = options;
2676 /* Set up a starting, non-extracting bracket, then compile the expression. On
2677 error, *errorptr will be set non-NULL, so we don't need to look at the result
2678 of the function here. */
2680 ptr = (const uschar *)pattern;
2681 code = re->code;
2682 *code = OP_BRA;
2683 bracount = 0;
2684 (void)compile_regex(options, &bracount, &code, &ptr, errorptr, dictionary);
2685 re->top_bracket = bracount;
2686 re->top_backref = top_backref;
2688 /* If not reached end of pattern on success, there's an excess bracket. */
2690 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2692 /* Fill in the terminating state and check for disastrous overflow, but
2693 if debugging, leave the test till after things are printed out. */
2695 *code++ = OP_END;
2698 #ifndef DEBUG
2699 if (code - re->code > length) *errorptr = ERR23;
2700 #endif
2702 /* Failed to compile */
2704 if (*errorptr != NULL)
2706 (pcre_free)(re);
2707 PCRE_ERROR_RETURN:
2708 *erroroffset = ptr - (const uschar *)pattern;
2709 return NULL;
2712 /* If the anchored option was not passed, set flag if we can determine that it
2713 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2714 we can determine what the first character has to be, because that speeds up
2715 unanchored matches no end. In the case of multiline matches, an alternative is
2716 to set the PCRE_STARTLINE flag if all branches start with ^. */
2718 if ((options & PCRE_ANCHORED) == 0)
2720 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
2721 re->options |= PCRE_ANCHORED;
2722 else
2724 int ch = find_firstchar(re->code);
2725 if (ch >= 0)
2727 re->first_char = ch;
2728 re->options |= PCRE_FIRSTSET;
2730 else if (is_startline(re->code))
2731 re->options |= PCRE_STARTLINE;
2735 /* Print out the compiled data for debugging */
2737 #ifdef DEBUG
2739 printf("Length = %d top_bracket = %d top_backref=%d\n",
2740 length, re->top_bracket, re->top_backref);
2742 if (re->options != 0)
2744 printf("%s%s%s%s%s%s%s%s\n",
2745 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2746 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2747 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2748 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2749 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2750 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2751 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2752 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2755 if ((re->options & PCRE_FIRSTSET) != 0)
2757 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2758 else printf("First char = \\x%02x\n", re->first_char);
2761 code_end = code;
2762 code_base = code = re->code;
2764 while (code < code_end)
2766 int charlength;
2768 printf("%3d ", code - code_base);
2770 if (*code >= OP_BRA)
2772 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2773 code += 2;
2776 else switch(*code)
2778 case OP_CHARS:
2779 charlength = *(++code);
2780 printf("%3d ", charlength);
2781 while (charlength-- > 0)
2782 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2783 break;
2785 case OP_KETRMAX:
2786 case OP_KETRMIN:
2787 case OP_ALT:
2788 case OP_KET:
2789 case OP_ASSERT:
2790 case OP_ASSERT_NOT:
2791 case OP_ONCE:
2792 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2793 code += 2;
2794 break;
2796 case OP_STAR:
2797 case OP_MINSTAR:
2798 case OP_PLUS:
2799 case OP_MINPLUS:
2800 case OP_QUERY:
2801 case OP_MINQUERY:
2802 case OP_TYPESTAR:
2803 case OP_TYPEMINSTAR:
2804 case OP_TYPEPLUS:
2805 case OP_TYPEMINPLUS:
2806 case OP_TYPEQUERY:
2807 case OP_TYPEMINQUERY:
2808 if (*code >= OP_TYPESTAR)
2809 printf(" %s", OP_names[code[1]]);
2810 else if (isprint(c = code[1])) printf(" %c", c);
2811 else printf(" \\x%02x", c);
2812 printf("%s", OP_names[*code++]);
2813 break;
2815 case OP_EXACT:
2816 case OP_UPTO:
2817 case OP_MINUPTO:
2818 if (isprint(c = code[3])) printf(" %c{", c);
2819 else printf(" \\x%02x{", c);
2820 if (*code != OP_EXACT) printf("0,");
2821 printf("%d}", (code[1] << 8) + code[2]);
2822 if (*code == OP_MINUPTO) printf("?");
2823 code += 3;
2824 break;
2826 case OP_TYPEEXACT:
2827 case OP_TYPEUPTO:
2828 case OP_TYPEMINUPTO:
2829 printf(" %s{", OP_names[code[3]]);
2830 if (*code != OP_TYPEEXACT) printf(",");
2831 printf("%d}", (code[1] << 8) + code[2]);
2832 if (*code == OP_TYPEMINUPTO) printf("?");
2833 code += 3;
2834 break;
2836 case OP_NOT:
2837 if (isprint(c = *(++code))) printf(" [^%c]", c);
2838 else printf(" [^\\x%02x]", c);
2839 break;
2841 case OP_NOTSTAR:
2842 case OP_NOTMINSTAR:
2843 case OP_NOTPLUS:
2844 case OP_NOTMINPLUS:
2845 case OP_NOTQUERY:
2846 case OP_NOTMINQUERY:
2847 if (isprint(c = code[1])) printf(" [^%c]", c);
2848 else printf(" [^\\x%02x]", c);
2849 printf("%s", OP_names[*code++]);
2850 break;
2852 case OP_NOTEXACT:
2853 case OP_NOTUPTO:
2854 case OP_NOTMINUPTO:
2855 if (isprint(c = code[3])) printf(" [^%c]{", c);
2856 else printf(" [^\\x%02x]{", c);
2857 if (*code != OP_NOTEXACT) printf(",");
2858 printf("%d}", (code[1] << 8) + code[2]);
2859 if (*code == OP_NOTMINUPTO) printf("?");
2860 code += 3;
2861 break;
2863 case OP_REF:
2864 printf(" \\%d", *(++code));
2865 code ++;
2866 goto CLASS_REF_REPEAT;
2868 case OP_CLASS:
2869 case OP_NEGCLASS:
2870 case OP_CLASS_L:
2872 int i, min, max;
2874 if (*code==OP_CLASS_L)
2876 code++;
2877 printf("Locflag = %i ", *code++);
2878 printf(" [");
2880 else
2882 if (*code++ == OP_CLASS) printf(" [");
2883 else printf(" ^[");
2887 for (i = 0; i < 256; i++)
2889 if ((code[i/8] & (1 << (i&7))) != 0)
2891 int j;
2892 for (j = i+1; j < 256; j++)
2893 if ((code[j/8] & (1 << (j&7))) == 0) break;
2894 if (i == '-' || i == ']') printf("\\");
2895 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2896 if (--j > i)
2898 printf("-");
2899 if (j == '-' || j == ']') printf("\\");
2900 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2902 i = j;
2905 printf("]");
2906 code += 32;
2907 /* code ++;*/
2909 CLASS_REF_REPEAT:
2911 switch(*code)
2913 case OP_CRSTAR:
2914 case OP_CRMINSTAR:
2915 case OP_CRPLUS:
2916 case OP_CRMINPLUS:
2917 case OP_CRQUERY:
2918 case OP_CRMINQUERY:
2919 printf("%s", OP_names[*code]);
2920 break;
2922 case OP_CRRANGE:
2923 case OP_CRMINRANGE:
2924 min = (code[1] << 8) + code[2];
2925 max = (code[3] << 8) + code[4];
2926 if (max == 0) printf("{%d,}", min);
2927 else printf("{%d,%d}", min, max);
2928 if (*code == OP_CRMINRANGE) printf("?");
2929 code += 4;
2930 break;
2932 default:
2933 code--;
2936 break;
2938 /* Anything else is just a one-node item */
2940 default:
2941 printf(" %s", OP_names[*code]);
2942 break;
2945 code++;
2946 printf("\n");
2948 printf("------------------------------------------------------------------\n");
2950 /* This check is done here in the debugging case so that the code that
2951 was compiled can be seen. */
2953 if (code - re->code > length)
2955 printf("length=%i, code length=%i\n", length, code-re->code);
2956 *errorptr = ERR23;
2957 (pcre_free)(re);
2958 *erroroffset = ptr - (uschar *)pattern;
2959 return NULL;
2961 #endif
2963 return (pcre *)re;
2968 /*************************************************
2969 * Match a character type *
2970 *************************************************/
2972 /* Not used in all the places it might be as it's sometimes faster
2973 to put the code inline.
2975 Arguments:
2976 type the character type
2977 c the character
2978 dotall the dotall flag
2980 Returns: TRUE if character is of the type
2983 static BOOL
2984 match_type(int type, int c, BOOL dotall)
2987 #ifdef DEBUG
2988 if (isprint(c)) printf("matching subject %c against ", c);
2989 else printf("matching subject \\x%02x against ", c);
2990 printf("%s\n", OP_names[type]);
2991 #endif
2993 switch(type)
2995 case OP_ANY: return dotall || c != '\n';
2996 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2997 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2998 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2999 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
3000 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
3001 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
3002 case OP_NOT_WORDCHAR_L: return (c!='_' && !isalnum(c));
3003 case OP_WORDCHAR_L: return (c=='_' || isalnum(c));
3005 return FALSE;
3010 /*************************************************
3011 * Match a back-reference *
3012 *************************************************/
3014 /* If a back reference hasn't been set, the match fails.
3016 Arguments:
3017 number reference number
3018 eptr points into the subject
3019 length length to be matched
3020 md points to match data block
3022 Returns: TRUE if matched
3025 static BOOL
3026 match_ref(int number, register const uschar *eptr, int length, match_data *md)
3028 const uschar *p = md->start_subject + md->offset_vector[number];
3030 #ifdef DEBUG
3031 if (eptr >= md->end_subject)
3032 printf("matching subject <null>");
3033 else
3035 printf("matching subject ");
3036 pchars(eptr, length, TRUE, md);
3038 printf(" against backref ");
3039 pchars(p, length, FALSE, md);
3040 printf("\n");
3041 #endif
3043 /* Always fail if not enough characters left */
3045 if (length > md->end_subject - p) return FALSE;
3047 /* Separate the caseless case for speed */
3049 if (md->caseless)
3050 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
3051 else
3052 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3054 return TRUE;
3057 static int free_stack(match_data *md)
3059 /* Free any stack space that was allocated by the call to match(). */
3060 if (md->off_num) PyMem_DEL(md->off_num);
3061 if (md->offset_top) PyMem_DEL(md->offset_top);
3062 if (md->r1) PyMem_DEL(md->r1);
3063 if (md->r2) PyMem_DEL(md->r2);
3064 if (md->eptr) PyMem_DEL((char *)md->eptr);
3065 if (md->ecode) PyMem_DEL((char *)md->ecode);
3066 return 0;
3069 static int grow_stack(match_data *md)
3071 if (md->length != 0)
3073 md->length = md->length + md->length/2;
3075 else
3077 int string_len = md->end_subject - md->start_subject + 1;
3078 if (string_len < 80) {md->length = string_len; }
3079 else {md->length = 80;}
3081 PyMem_RESIZE(md->offset_top, int, md->length);
3082 /* Can't realloc a pointer-to-const; cast const away. */
3083 md->eptr = (const uschar **)PyMem_Realloc((void *)md->eptr,
3084 sizeof(uschar *) * md->length);
3085 md->ecode = (const uschar **)PyMem_Realloc((void *)md->ecode,
3086 sizeof(uschar *) * md->length);
3087 PyMem_RESIZE(md->off_num, int, md->length);
3088 PyMem_RESIZE(md->r1, int, md->length);
3089 PyMem_RESIZE(md->r2, int, md->length);
3090 if (md->offset_top == NULL || md->eptr == NULL || md->ecode == NULL ||
3091 md->off_num == NULL || md->r1 == NULL || md->r2 == NULL)
3093 PyErr_NoMemory();
3094 longjmp(md->error_env, 1);
3096 return 0;
3100 /*************************************************
3101 * Match from current position *
3102 *************************************************/
3104 /* On entry ecode points to the first opcode, and eptr to the first character.
3106 Arguments:
3107 eptr pointer in subject
3108 ecode position in code
3109 offset_top current top pointer
3110 md pointer to "static" info for the match
3112 Returns: TRUE if matched
3115 static BOOL
3116 match(register const uschar *eptr, register const uschar *ecode, int offset_top,
3117 match_data *md)
3119 int save_stack_position = md->point;
3120 match_loop:
3122 #define SUCCEED goto succeed
3123 #define FAIL goto fail
3125 for (;;)
3127 int min, max, ctype;
3128 register int i;
3129 register int c;
3130 BOOL minimize = FALSE;
3132 /* Opening bracket. Check the alternative branches in turn, failing if none
3133 match. We have to set the start offset if required and there is space
3134 in the offset vector so that it is available for subsequent back references
3135 if the bracket matches. However, if the bracket fails, we must put back the
3136 previous value of both offsets in case they were set by a previous copy of
3137 the same bracket. Don't worry about setting the flag for the error case here;
3138 that is handled in the code for KET. */
3140 if ((int)*ecode >= OP_BRA)
3142 int number = (*ecode - OP_BRA) << 1;
3143 int save_offset1 = 0, save_offset2 = 0;
3145 DPRINTF(("start bracket %d\n", number/2));
3147 if (number > 0 && number < md->offset_end)
3149 save_offset1 = md->offset_vector[number];
3150 save_offset2 = md->offset_vector[number+1];
3151 md->offset_vector[number] = eptr - md->start_subject;
3153 DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
3156 /* Recurse for all the alternatives. */
3160 if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
3161 ecode += (ecode[1] << 8) + ecode[2];
3163 while (*ecode == OP_ALT);
3165 DPRINTF(("bracket %d failed\n", number/2));
3167 if (number > 0 && number < md->offset_end)
3169 md->offset_vector[number] = save_offset1;
3170 md->offset_vector[number+1] = save_offset2;
3173 FAIL;
3176 /* Other types of node can be handled by a switch */
3178 switch(*ecode)
3180 case OP_END:
3181 md->end_match_ptr = eptr; /* Record where we ended */
3182 md->end_offset_top = offset_top; /* and how many extracts were taken */
3183 SUCCEED;
3185 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3186 whole thing doesn't match, so we have to get out via a longjmp(). */
3188 case OP_CUT:
3189 if (match(eptr, ecode+1, offset_top, md)) SUCCEED;
3190 longjmp(md->fail_env, 1);
3192 /* Assertion brackets. Check the alternative branches in turn - the
3193 matching won't pass the KET for an assertion. If any one branch matches,
3194 the assertion is true. */
3196 case OP_ASSERT:
3199 if (match(eptr, ecode+3, offset_top, md)) break;
3200 ecode += (ecode[1] << 8) + ecode[2];
3202 while (*ecode == OP_ALT);
3203 if (*ecode == OP_KET) FAIL;
3205 /* Continue from after the assertion, updating the offsets high water
3206 mark, since extracts may have been taken during the assertion. */
3208 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3209 ecode += 3;
3210 offset_top = md->end_offset_top;
3211 continue;
3213 /* Negative assertion: all branches must fail to match */
3215 case OP_ASSERT_NOT:
3218 if (match(eptr, ecode+3, offset_top, md)) FAIL;
3219 ecode += (ecode[1] << 8) + ecode[2];
3221 while (*ecode == OP_ALT);
3222 ecode += 3;
3223 continue;
3225 /* "Once" brackets are like assertion brackets except that after a match,
3226 the point in the subject string is not moved back. Thus there can never be
3227 a move back into the brackets. Check the alternative branches in turn - the
3228 matching won't pass the KET for this kind of subpattern. If any one branch
3229 matches, we carry on, leaving the subject pointer. */
3231 case OP_ONCE:
3234 if (match(eptr, ecode+3, offset_top, md)) break;
3235 ecode += (ecode[1] << 8) + ecode[2];
3237 while (*ecode == OP_ALT);
3238 if (*ecode == OP_KET) FAIL;
3240 /* Continue as from after the assertion, updating the offsets high water
3241 mark, since extracts may have been taken. */
3243 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3244 ecode += 3;
3245 offset_top = md->end_offset_top;
3246 eptr = md->end_match_ptr;
3247 continue;
3249 /* An alternation is the end of a branch; scan along to find the end of the
3250 bracketed group and go to there. */
3252 case OP_ALT:
3253 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3254 break;
3256 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3257 that it may occur zero times. It may repeat infinitely, or not at all -
3258 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3259 repeat limits are compiled as a number of copies, with the optional ones
3260 preceded by BRAZERO or BRAMINZERO. */
3262 case OP_BRAZERO:
3264 const uschar *next = ecode+1;
3265 if (match(eptr, next, offset_top, md)) SUCCEED;
3266 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3267 ecode = next + 3;
3269 break;
3271 case OP_BRAMINZERO:
3273 const uschar *next = ecode+1;
3274 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3275 if (match(eptr, next+3, offset_top, md)) SUCCEED;
3276 ecode++;
3278 break;;
3280 /* End of a group, repeated or non-repeating. If we are at the end of
3281 an assertion "group", stop matching and SUCCEED, but record the
3282 current high water mark for use by positive assertions. */
3284 case OP_KET:
3285 case OP_KETRMIN:
3286 case OP_KETRMAX:
3288 int number;
3289 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3291 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
3293 md->end_match_ptr = eptr; /* For ONCE */
3294 md->end_offset_top = offset_top;
3295 SUCCEED;
3298 /* In all other cases we have to check the group number back at the
3299 start and if necessary complete handling an extraction by setting the
3300 final offset and bumping the high water mark. */
3302 number = (*prev - OP_BRA) << 1;
3304 DPRINTF(("end bracket %d\n", number/2));
3306 if (number > 0)
3308 if (number >= md->offset_end) md->offset_overflow = TRUE; else
3310 md->offset_vector[number+1] = eptr - md->start_subject;
3311 if (offset_top <= number) offset_top = number + 2;
3315 /* For a non-repeating ket, just advance to the next node and continue at
3316 this level. */
3318 if (*ecode == OP_KET)
3320 ecode += 3;
3321 break;
3324 /* The repeating kets try the rest of the pattern or restart from the
3325 preceding bracket, in the appropriate order. */
3327 if (*ecode == OP_KETRMIN)
3329 const uschar *ptr;
3330 if (match(eptr, ecode+3, offset_top, md)) goto succeed;
3331 /* Handle alternation inside the BRA...KET; push the additional
3332 alternatives onto the stack */
3333 ptr=prev;
3334 do {
3335 ptr += (ptr[1]<<8)+ ptr[2];
3336 if (*ptr==OP_ALT)
3338 if (md->length == md->point)
3340 grow_stack(md);
3342 md->offset_top[md->point] = offset_top;
3343 md->eptr[md->point] = eptr;
3344 md->ecode[md->point] = ptr+3;
3345 md->r1[md->point] = 0;
3346 md->r2[md->point] = 0;
3347 md->off_num[md->point] = 0;
3348 md->point++;
3350 } while (*ptr==OP_ALT);
3351 ecode=prev+3; goto match_loop;
3353 else /* OP_KETRMAX */
3355 const uschar *ptr;
3356 /*int points_pushed=0;*/
3358 /* Push one failure point, that will resume matching at the code after
3359 the KETRMAX opcode. */
3360 if (md->length == md->point)
3362 grow_stack(md);
3364 md->offset_top[md->point] = offset_top;
3365 md->eptr[md->point] = eptr;
3366 md->ecode[md->point] = ecode+3;
3367 md->r1[md->point] = md->offset_vector[number];
3368 md->r2[md->point] = md->offset_vector[number+1];
3369 md->off_num[md->point] = number;
3370 md->point++;
3372 md->offset_vector[number] = eptr - md->start_subject;
3373 /* Handle alternation inside the BRA...KET; push each of the
3374 additional alternatives onto the stack */
3375 ptr=prev;
3376 do {
3377 ptr += (ptr[1]<<8)+ ptr[2];
3378 if (*ptr==OP_ALT)
3380 if (md->length == md->point)
3381 if (md->length == md->point)
3383 grow_stack(md);
3385 md->offset_top[md->point] = offset_top;
3386 md->eptr[md->point] = eptr;
3387 md->ecode[md->point] = ptr+3;
3388 md->r1[md->point] = 0;
3389 md->r2[md->point] = 0;
3390 md->off_num[md->point] = 0;
3391 md->point++;
3392 /*points_pushed++;*/
3394 } while (*ptr==OP_ALT);
3395 /* Jump to the first (or only) alternative and resume trying to match */
3396 ecode=prev+3; goto match_loop;
3400 /* Start of subject unless notbol, or after internal newline if multiline */
3402 case OP_CIRC:
3403 if (md->notbol && eptr == md->start_subject) FAIL;
3404 if (md->multiline)
3406 if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
3407 ecode++;
3408 break;
3410 /* ... else fall through */
3412 /* Start of subject assertion */
3414 case OP_SOD:
3415 if (eptr != md->start_subject) FAIL;
3416 ecode++;
3417 break;
3419 /* Assert before internal newline if multiline, or before
3420 a terminating newline unless endonly is set, else end of subject unless
3421 noteol is set. */
3423 case OP_DOLL:
3424 if (md->noteol && eptr >= md->end_subject) FAIL;
3425 if (md->multiline)
3427 if (eptr < md->end_subject && *eptr != '\n') FAIL;
3428 ecode++;
3429 break;
3431 else if (!md->endonly)
3433 if (eptr < md->end_subject - 1 ||
3434 (eptr == md->end_subject - 1 && *eptr != '\n')) FAIL;
3435 ecode++;
3436 break;
3438 /* ... else fall through */
3440 /* End of subject assertion */
3442 case OP_EOD:
3443 if (eptr < md->end_subject) FAIL;
3444 ecode++;
3445 break;
3447 /* Word boundary assertions */
3449 case OP_NOT_WORD_BOUNDARY:
3450 case OP_WORD_BOUNDARY:
3452 BOOL prev_is_word = (eptr != md->start_subject) &&
3453 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3454 BOOL cur_is_word = (eptr < md->end_subject) &&
3455 ((pcre_ctypes[*eptr] & ctype_word) != 0);
3456 if ((*ecode++ == OP_WORD_BOUNDARY)?
3457 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3458 FAIL;
3460 break;
3462 case OP_NOT_WORD_BOUNDARY_L:
3463 case OP_WORD_BOUNDARY_L:
3465 BOOL prev_is_word = (eptr != md->start_subject) &&
3466 (isalnum(eptr[-1]) || eptr[-1]=='_');
3467 BOOL cur_is_word = (eptr < md->end_subject) &&
3468 (isalnum(*eptr) || *eptr=='_');
3469 if ((*ecode++ == OP_WORD_BOUNDARY_L)?
3470 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3471 FAIL;
3473 break;
3476 /* Match a single character type; inline for speed */
3478 case OP_ANY:
3479 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
3480 if (eptr++ >= md->end_subject) FAIL;
3481 ecode++;
3482 break;
3484 case OP_NOT_DIGIT:
3485 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3486 FAIL;
3487 ecode++;
3488 break;
3490 case OP_DIGIT:
3491 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3492 FAIL;
3493 ecode++;
3494 break;
3496 case OP_NOT_WHITESPACE:
3497 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3498 FAIL;
3499 ecode++;
3500 break;
3502 case OP_WHITESPACE:
3503 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3504 FAIL;
3505 ecode++;
3506 break;
3508 case OP_NOT_WORDCHAR:
3509 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3510 FAIL;
3511 ecode++;
3512 break;
3514 case OP_WORDCHAR:
3515 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3516 FAIL;
3517 ecode++;
3518 break;
3520 case OP_NOT_WORDCHAR_L:
3521 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ))
3522 FAIL;
3523 eptr++;
3524 ecode++;
3525 break;
3527 case OP_WORDCHAR_L:
3528 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ))
3529 FAIL;
3530 eptr++;
3531 ecode++;
3532 break;
3534 /* Match a back reference, possibly repeatedly. Look past the end of the
3535 item to see if there is repeat information following. The code is similar
3536 to that for character classes, but repeated for efficiency. Then obey
3537 similar code to character type repeats - written out again for speed.
3538 However, if the referenced string is the empty string, always treat
3539 it as matched, any number of times (otherwise there could be infinite
3540 loops). */
3542 case OP_REF:
3544 int length;
3545 int number = ecode[1] << 1; /* Doubled reference number */
3546 ecode += 2; /* Advance past the item */
3548 if (number >= offset_top || md->offset_vector[number] < 0)
3550 md->errorcode = PCRE_ERROR_BADREF;
3551 FAIL;
3554 length = md->offset_vector[number+1] - md->offset_vector[number];
3556 switch (*ecode)
3558 case OP_CRSTAR:
3559 case OP_CRMINSTAR:
3560 case OP_CRPLUS:
3561 case OP_CRMINPLUS:
3562 case OP_CRQUERY:
3563 case OP_CRMINQUERY:
3564 c = *ecode++ - OP_CRSTAR;
3565 minimize = (c & 1) != 0;
3566 min = rep_min[c]; /* Pick up values from tables; */
3567 max = rep_max[c]; /* zero for max => infinity */
3568 if (max == 0) max = INT_MAX;
3569 break;
3571 case OP_CRRANGE:
3572 case OP_CRMINRANGE:
3573 minimize = (*ecode == OP_CRMINRANGE);
3574 min = (ecode[1] << 8) + ecode[2];
3575 max = (ecode[3] << 8) + ecode[4];
3576 if (max == 0) max = INT_MAX;
3577 ecode += 5;
3578 break;
3580 default: /* No repeat follows */
3581 if (!match_ref(number, eptr, length, md)) FAIL;
3582 eptr += length;
3583 continue; /* With the main loop */
3586 /* If the length of the reference is zero, just continue with the
3587 main loop. */
3589 if (length == 0) continue;
3591 /* First, ensure the minimum number of matches are present. We get back
3592 the length of the reference string explicitly rather than passing the
3593 address of eptr, so that eptr can be a register variable. */
3595 for (i = 1; i <= min; i++)
3597 if (!match_ref(number, eptr, length, md)) FAIL;
3598 eptr += length;
3601 /* If min = max, continue at the same level without recursion.
3602 They are not both allowed to be zero. */
3604 if (min == max) continue;
3606 /* If minimizing, keep trying and advancing the pointer */
3608 if (minimize)
3610 for (i = min;; i++)
3612 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3613 if (i >= max || !match_ref(number, eptr, length, md))
3614 FAIL;
3615 eptr += length;
3617 /* Control never gets here */
3620 /* If maximizing, find the longest string and work backwards */
3622 else
3624 const uschar *pp = eptr;
3625 for (i = min; i < max; i++)
3627 if (!match_ref(number, eptr, length, md)) break;
3628 eptr += length;
3630 while (eptr >= pp)
3632 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3633 eptr -= length;
3635 FAIL;
3638 /* Control never gets here */
3640 /* Match a character class, possibly repeatedly. Look past the end of the
3641 item to see if there is repeat information following. Then obey similar
3642 code to character type repeats - written out again for speed. If caseless
3643 matching was set at runtime but not at compile time, we have to check both
3644 versions of a character, and we have to behave differently for positive and
3645 negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
3646 treated differently. */
3648 case OP_CLASS:
3649 case OP_NEGCLASS:
3651 BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
3652 const uschar *data = ecode + 1; /* Save for matching */
3653 ecode += 33; /* Advance past the item */
3655 switch (*ecode)
3657 case OP_CRSTAR:
3658 case OP_CRMINSTAR:
3659 case OP_CRPLUS:
3660 case OP_CRMINPLUS:
3661 case OP_CRQUERY:
3662 case OP_CRMINQUERY:
3663 c = *ecode++ - OP_CRSTAR;
3664 minimize = (c & 1) != 0;
3665 min = rep_min[c]; /* Pick up values from tables; */
3666 max = rep_max[c]; /* zero for max => infinity */
3667 if (max == 0) max = INT_MAX;
3668 break;
3670 case OP_CRRANGE:
3671 case OP_CRMINRANGE:
3672 minimize = (*ecode == OP_CRMINRANGE);
3673 min = (ecode[1] << 8) + ecode[2];
3674 max = (ecode[3] << 8) + ecode[4];
3675 if (max == 0) max = INT_MAX;
3676 ecode += 5;
3677 break;
3679 default: /* No repeat follows */
3680 min = max = 1;
3681 break;
3684 /* First, ensure the minimum number of matches are present. */
3686 for (i = 1; i <= min; i++)
3688 if (eptr >= md->end_subject) FAIL;
3689 c = *eptr++;
3691 /* Either not runtime caseless, or it was a positive class. For
3692 runtime caseless, continue if either case is in the map. */
3694 if (!nasty_case)
3696 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3697 if (md->runtime_caseless)
3699 c = pcre_fcc[c];
3700 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3704 /* Runtime caseless and it was a negative class. Continue only if
3705 both cases are in the map. */
3707 else
3709 if ((data[c/8] & (1 << (c&7))) == 0) FAIL;
3710 c = pcre_fcc[c];
3711 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3714 FAIL;
3717 /* If max == min we can continue with the main loop without the
3718 need to recurse. */
3720 if (min == max) continue;
3722 /* If minimizing, keep testing the rest of the expression and advancing
3723 the pointer while it matches the class. */
3725 if (minimize)
3727 for (i = min;; i++)
3729 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3730 if (i >= max || eptr >= md->end_subject) FAIL;
3731 c = *eptr++;
3733 /* Either not runtime caseless, or it was a positive class. For
3734 runtime caseless, continue if either case is in the map. */
3736 if (!nasty_case)
3738 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3739 if (md->runtime_caseless)
3741 c = pcre_fcc[c];
3742 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3746 /* Runtime caseless and it was a negative class. Continue only if
3747 both cases are in the map. */
3749 else
3751 if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
3752 c = pcre_fcc[c];
3753 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3756 FAIL;
3758 /* Control never gets here */
3761 /* If maximizing, find the longest possible run, then work backwards. */
3763 else
3765 const uschar *pp = eptr;
3766 for (i = min; i < max; eptr++, i++)
3768 if (eptr >= md->end_subject) break;
3769 c = *eptr;
3771 /* Either not runtime caseless, or it was a positive class. For
3772 runtime caseless, continue if either case is in the map. */
3774 if (!nasty_case)
3776 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3777 if (md->runtime_caseless)
3779 c = pcre_fcc[c];
3780 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3784 /* Runtime caseless and it was a negative class. Continue only if
3785 both cases are in the map. */
3787 else
3789 if ((data[c/8] & (1 << (c&7))) == 0) break;
3790 c = pcre_fcc[c];
3791 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3794 break;
3797 while (eptr >= pp)
3798 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3799 FAIL;
3802 /* Control never gets here */
3804 /* OP_CLASS_L opcode: handles localized character classes */
3806 case OP_CLASS_L:
3808 const uschar *data = ecode + 1; /* Save for matching */
3809 const uschar locale_flag = *data;
3810 ecode++; data++; /* The localization support adds an extra byte */
3812 ecode += 33; /* Advance past the item */
3814 switch (*ecode)
3816 case OP_CRSTAR:
3817 case OP_CRMINSTAR:
3818 case OP_CRPLUS:
3819 case OP_CRMINPLUS:
3820 case OP_CRQUERY:
3821 case OP_CRMINQUERY:
3822 c = *ecode++ - OP_CRSTAR;
3823 minimize = (c & 1) != 0;
3824 min = rep_min[c]; /* Pick up values from tables; */
3825 max = rep_max[c]; /* zero for max => infinity */
3826 if (max == 0) max = INT_MAX;
3827 break;
3829 case OP_CRRANGE:
3830 case OP_CRMINRANGE:
3831 minimize = (*ecode == OP_CRMINRANGE);
3832 min = (ecode[1] << 8) + ecode[2];
3833 max = (ecode[3] << 8) + ecode[4];
3834 if (max == 0) max = INT_MAX;
3835 ecode += 5;
3836 break;
3838 default: /* No repeat follows */
3839 if (eptr >= md->end_subject) FAIL;
3840 c = *eptr++;
3841 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3842 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3843 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3844 #if 0
3845 if ( (locale_flag & 4) && isdigit(c) ) continue; /* Locale \d */
3846 if ( (locale_flag & 8) && !isdigit(c) ) continue; /* Locale \D */
3847 if ( (locale_flag & 16) && isspace(c) ) continue; /* Locale \s */
3848 if ( (locale_flag & 32) && !isspace(c) ) continue; /* Locale \S */
3849 #endif
3851 if (md->runtime_caseless)
3853 c = pcre_fcc[c];
3854 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3856 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3857 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3859 FAIL;
3862 /* First, ensure the minimum number of matches are present. */
3864 for (i = 1; i <= min; i++)
3866 if (eptr >= md->end_subject) FAIL;
3867 c = *eptr++;
3868 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3869 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3870 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3872 if (md->runtime_caseless)
3874 c = pcre_fcc[c];
3875 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3876 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3877 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3879 FAIL;
3882 /* If max == min we can continue with the main loop without the
3883 need to recurse. */
3885 if (min == max) continue;
3887 /* If minimizing, keep testing the rest of the expression and advancing
3888 the pointer while it matches the class. */
3890 if (minimize)
3892 for (i = min;; i++)
3894 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3895 if (i >= max || eptr >= md->end_subject) FAIL;
3896 c = *eptr++;
3897 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3898 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3899 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3901 if (md->runtime_caseless)
3903 c = pcre_fcc[c];
3904 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3905 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3906 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3908 FAIL;
3910 /* Control never gets here */
3913 /* If maximizing, find the longest possible run, then work backwards. */
3915 else
3917 const uschar *pp = eptr;
3918 for (i = min; i < max; eptr++, i++)
3920 if (eptr >= md->end_subject) break;
3921 c = *eptr;
3922 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3923 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3924 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3925 if (md->runtime_caseless)
3927 c = pcre_fcc[c];
3928 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3929 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3930 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3932 break;
3935 while (eptr >= pp)
3936 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3937 FAIL;
3940 /* Control never gets here */
3942 /* Match a run of characters */
3944 case OP_CHARS:
3946 register int length = ecode[1];
3947 ecode += 2;
3949 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3950 if (eptr >= md->end_subject)
3951 printf("matching subject <null> against pattern ");
3952 else
3954 printf("matching subject ");
3955 pchars(eptr, length, TRUE, md);
3956 printf(" against pattern ");
3958 pchars(ecode, length, FALSE, md);
3959 printf("\n");
3960 #endif
3962 if (length > md->end_subject - eptr) FAIL;
3963 if (md->caseless)
3965 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL;
3967 else
3969 while (length-- > 0) if (*ecode++ != *eptr++) FAIL;
3972 break;
3974 /* Match a single character repeatedly; different opcodes share code. */
3976 case OP_EXACT:
3977 min = max = (ecode[1] << 8) + ecode[2];
3978 ecode += 3;
3979 goto REPEATCHAR;
3981 case OP_UPTO:
3982 case OP_MINUPTO:
3983 min = 0;
3984 max = (ecode[1] << 8) + ecode[2];
3985 minimize = *ecode == OP_MINUPTO;
3986 ecode += 3;
3987 goto REPEATCHAR;
3989 case OP_STAR:
3990 case OP_MINSTAR:
3991 case OP_PLUS:
3992 case OP_MINPLUS:
3993 case OP_QUERY:
3994 case OP_MINQUERY:
3995 c = *ecode++ - OP_STAR;
3996 minimize = (c & 1) != 0;
3997 min = rep_min[c]; /* Pick up values from tables; */
3998 max = rep_max[c]; /* zero for max => infinity */
3999 if (max == 0) max = INT_MAX;
4001 /* Common code for all repeated single-character matches. We can give
4002 up quickly if there are fewer than the minimum number of characters left in
4003 the subject. */
4005 REPEATCHAR:
4006 if (min > md->end_subject - eptr) FAIL;
4007 c = *ecode++;
4009 /* The code is duplicated for the caseless and caseful cases, for speed,
4010 since matching characters is likely to be quite common. First, ensure the
4011 minimum number of matches are present. If min = max, continue at the same
4012 level without recursing. Otherwise, if minimizing, keep trying the rest of
4013 the expression and advancing one matching character if failing, up to the
4014 maximum. Alternatively, if maximizing, find the maximum number of
4015 characters and work backwards. */
4017 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4018 max, eptr));
4020 if (md->caseless)
4022 c = pcre_lcc[c];
4023 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
4024 if (min == max) continue;
4025 if (minimize)
4027 for (i = min;; i++)
4029 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4030 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
4031 FAIL;
4033 /* Control never gets here */
4035 else
4037 const uschar *pp = eptr;
4038 for (i = min; i < max; i++)
4040 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
4041 eptr++;
4043 while (eptr >= pp)
4044 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4045 FAIL;
4047 /* Control never gets here */
4050 /* Caseful comparisons */
4052 else
4054 for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
4055 if (min == max) continue;
4056 if (minimize)
4058 for (i = min;; i++)
4060 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4061 if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL;
4063 /* Control never gets here */
4065 else
4067 const uschar *pp = eptr;
4068 for (i = min; i < max; i++)
4070 if (eptr >= md->end_subject || c != *eptr) break;
4071 eptr++;
4073 while (eptr >= pp)
4074 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4075 FAIL;
4078 /* Control never gets here */
4080 /* Match a negated single character */
4082 case OP_NOT:
4083 if (eptr >= md->end_subject) FAIL;
4084 ecode++;
4085 if (md->caseless)
4087 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) FAIL;
4089 else
4091 if (*ecode++ == *eptr++) FAIL;
4093 break;
4095 /* Match a negated single character repeatedly. This is almost a repeat of
4096 the code for a repeated single character, but I haven't found a nice way of
4097 commoning these up that doesn't require a test of the positive/negative
4098 option for each character match. Maybe that wouldn't add very much to the
4099 time taken, but character matching *is* what this is all about... */
4101 case OP_NOTEXACT:
4102 min = max = (ecode[1] << 8) + ecode[2];
4103 ecode += 3;
4104 goto REPEATNOTCHAR;
4106 case OP_NOTUPTO:
4107 case OP_NOTMINUPTO:
4108 min = 0;
4109 max = (ecode[1] << 8) + ecode[2];
4110 minimize = *ecode == OP_NOTMINUPTO;
4111 ecode += 3;
4112 goto REPEATNOTCHAR;
4114 case OP_NOTSTAR:
4115 case OP_NOTMINSTAR:
4116 case OP_NOTPLUS:
4117 case OP_NOTMINPLUS:
4118 case OP_NOTQUERY:
4119 case OP_NOTMINQUERY:
4120 c = *ecode++ - OP_NOTSTAR;
4121 minimize = (c & 1) != 0;
4122 min = rep_min[c]; /* Pick up values from tables; */
4123 max = rep_max[c]; /* zero for max => infinity */
4124 if (max == 0) max = INT_MAX;
4126 /* Common code for all repeated single-character matches. We can give
4127 up quickly if there are fewer than the minimum number of characters left in
4128 the subject. */
4130 REPEATNOTCHAR:
4131 if (min > md->end_subject - eptr) FAIL;
4132 c = *ecode++;
4134 /* The code is duplicated for the caseless and caseful cases, for speed,
4135 since matching characters is likely to be quite common. First, ensure the
4136 minimum number of matches are present. If min = max, continue at the same
4137 level without recursing. Otherwise, if minimizing, keep trying the rest of
4138 the expression and advancing one matching character if failing, up to the
4139 maximum. Alternatively, if maximizing, find the maximum number of
4140 characters and work backwards. */
4142 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4143 max, eptr));
4145 if (md->caseless)
4147 c = pcre_lcc[c];
4148 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) FAIL;
4149 if (min == max) continue;
4150 if (minimize)
4152 for (i = min;; i++)
4154 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4155 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
4156 FAIL;
4158 /* Control never gets here */
4160 else
4162 const uschar *pp = eptr;
4163 for (i = min; i < max; i++)
4165 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
4166 eptr++;
4168 while (eptr >= pp)
4169 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4170 FAIL;
4172 /* Control never gets here */
4175 /* Caseful comparisons */
4177 else
4179 for (i = 1; i <= min; i++) if (c == *eptr++) FAIL;
4180 if (min == max) continue;
4181 if (minimize)
4183 for (i = min;; i++)
4185 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4186 if (i >= max || eptr >= md->end_subject || c == *eptr++) FAIL;
4188 /* Control never gets here */
4190 else
4192 const uschar *pp = eptr;
4193 for (i = min; i < max; i++)
4195 if (eptr >= md->end_subject || c == *eptr) break;
4196 eptr++;
4198 while (eptr >= pp)
4199 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4200 FAIL;
4203 /* Control never gets here */
4205 /* Match a single character type repeatedly; several different opcodes
4206 share code. This is very similar to the code for single characters, but we
4207 repeat it in the interests of efficiency. */
4209 case OP_TYPEEXACT:
4210 min = max = (ecode[1] << 8) + ecode[2];
4211 minimize = TRUE;
4212 ecode += 3;
4213 goto REPEATTYPE;
4215 case OP_TYPEUPTO:
4216 case OP_TYPEMINUPTO:
4217 min = 0;
4218 max = (ecode[1] << 8) + ecode[2];
4219 minimize = *ecode == OP_TYPEMINUPTO;
4220 ecode += 3;
4221 goto REPEATTYPE;
4223 case OP_TYPESTAR:
4224 case OP_TYPEMINSTAR:
4225 case OP_TYPEPLUS:
4226 case OP_TYPEMINPLUS:
4227 case OP_TYPEQUERY:
4228 case OP_TYPEMINQUERY:
4229 c = *ecode++ - OP_TYPESTAR;
4230 minimize = (c & 1) != 0;
4231 min = rep_min[c]; /* Pick up values from tables; */
4232 max = rep_max[c]; /* zero for max => infinity */
4233 if (max == 0) max = INT_MAX;
4235 /* Common code for all repeated single character type matches */
4237 REPEATTYPE:
4238 ctype = *ecode++; /* Code for the character type */
4240 /* First, ensure the minimum number of matches are present. Use inline
4241 code for maximizing the speed, and do the type test once at the start
4242 (i.e. keep it out of the loop). Also test that there are at least the
4243 minimum number of characters before we start. */
4245 if (min > md->end_subject - eptr) FAIL;
4246 if (min > 0) switch(ctype)
4248 case OP_ANY:
4249 if (!md->dotall)
4250 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
4251 else eptr += min;
4252 break;
4254 case OP_NOT_DIGIT:
4255 for (i = 1; i <= min; i++)
4256 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
4257 break;
4259 case OP_DIGIT:
4260 for (i = 1; i <= min; i++)
4261 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
4262 break;
4264 case OP_NOT_WHITESPACE:
4265 for (i = 1; i <= min; i++)
4266 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
4267 break;
4269 case OP_WHITESPACE:
4270 for (i = 1; i <= min; i++)
4271 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
4272 break;
4274 case OP_NOT_WORDCHAR:
4275 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
4276 FAIL;
4277 break;
4279 case OP_WORDCHAR:
4280 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
4281 FAIL;
4282 break;
4284 case OP_NOT_WORDCHAR_L:
4285 for (i = 1; i <= min; i++, eptr++) if (*eptr=='_' || isalnum(*eptr))
4286 FAIL;
4287 break;
4289 case OP_WORDCHAR_L:
4290 for (i = 1; i <= min; i++, eptr++) if (*eptr!='_' && !isalnum(*eptr))
4291 FAIL;
4292 break;
4295 /* If min = max, continue at the same level without recursing */
4297 if (min == max) continue;
4299 /* If minimizing, we have to test the rest of the pattern before each
4300 subsequent match, so inlining isn't much help; just use the function. */
4302 if (minimize)
4304 for (i = min;; i++)
4306 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4307 if (i >= max || eptr >= md->end_subject ||
4308 !match_type(ctype, *eptr++, md->dotall))
4309 FAIL;
4311 /* Control never gets here */
4314 /* If maximizing it is worth using inline code for speed, doing the type
4315 test once at the start (i.e. keep it out of the loop). */
4317 else
4319 const uschar *pp = eptr;
4320 switch(ctype)
4322 case OP_ANY:
4323 if (!md->dotall)
4325 for (i = min; i < max; i++)
4327 if (eptr >= md->end_subject || *eptr == '\n') break;
4328 eptr++;
4331 else
4333 c = max - min;
4334 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4335 eptr += c;
4337 break;
4339 case OP_NOT_DIGIT:
4340 for (i = min; i < max; i++)
4342 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
4343 break;
4344 eptr++;
4346 break;
4348 case OP_DIGIT:
4349 for (i = min; i < max; i++)
4351 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
4352 break;
4353 eptr++;
4355 break;
4357 case OP_NOT_WHITESPACE:
4358 for (i = min; i < max; i++)
4360 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
4361 break;
4362 eptr++;
4364 break;
4366 case OP_WHITESPACE:
4367 for (i = min; i < max; i++)
4369 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
4370 break;
4371 eptr++;
4373 break;
4375 case OP_NOT_WORDCHAR:
4376 for (i = min; i < max; i++)
4378 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
4379 break;
4380 eptr++;
4382 break;
4384 case OP_WORDCHAR:
4385 for (i = min; i < max; i++)
4387 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
4388 break;
4389 eptr++;
4391 break;
4392 case OP_NOT_WORDCHAR_L:
4393 for (i = min; i < max; i++)
4395 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ) )
4396 break;
4397 eptr++;
4399 break;
4401 case OP_WORDCHAR_L:
4402 for (i = min; i < max; i++)
4404 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ) )
4405 break;
4406 eptr++;
4408 break;
4411 while (eptr >= pp)
4412 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4413 FAIL;
4415 /* Control never gets here */
4417 /* There's been some horrible disaster. */
4419 default:
4420 DPRINTF(("Unknown opcode %d\n", *ecode));
4421 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4422 FAIL;
4425 /* Do not stick any code in here without much thought; it is assumed
4426 that "continue" in the code above comes out to here to repeat the main
4427 loop. */
4429 } /* End of main loop */
4430 /* Control never reaches here */
4432 fail:
4433 if (md->point > save_stack_position)
4435 /* If there are still points remaining on the stack, pop the next one off */
4436 int off_num;
4438 md->point--;
4439 offset_top = md->offset_top[md->point];
4440 eptr = md->eptr[md->point];
4441 ecode = md->ecode[md->point];
4442 off_num = md->off_num[md->point];
4443 md->offset_vector[off_num] = md->r1[md->point];
4444 md->offset_vector[off_num+1] = md->r2[md->point];
4445 goto match_loop;
4447 /* Failure, and nothing left on the stack, so end this function call */
4449 /* Restore the top of the stack to where it was before this function
4450 call. This lets us use one stack for everything; recursive calls
4451 can push and pop information, and may increase the stack. When
4452 the call returns, the parent function can resume pushing and
4453 popping wherever it was. */
4455 md->point = save_stack_position;
4456 return FALSE;
4458 succeed:
4459 return TRUE;
4464 /*************************************************
4465 * Segregate setjmp() *
4466 *************************************************/
4468 /* The -Wall option of gcc gives warnings for all local variables when setjmp()
4469 is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
4470 hide it in a separate function. This is called only when PCRE_EXTRA is set,
4471 since it's needed only for the extension \X option, and with any luck, a good
4472 compiler will spot the tail recursion and compile it efficiently.
4474 Arguments:
4475 eptr pointer in subject
4476 ecode position in code
4477 offset_top current top pointer
4478 md pointer to "static" info for the match
4480 Returns: TRUE if matched
4483 static BOOL
4484 match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
4485 match_data *match_block)
4487 return setjmp(match_block->fail_env) == 0 &&
4488 match(eptr, ecode, offset_top, match_block);
4493 /*************************************************
4494 * Execute a Regular Expression *
4495 *************************************************/
4497 /* This function applies a compiled re to a subject string and picks out
4498 portions of the string if it matches. Two elements in the vector are set for
4499 each substring: the offsets to the start and end of the substring.
4501 Arguments:
4502 external_re points to the compiled expression
4503 external_extra points to "hints" from pcre_study() or is NULL
4504 subject points to the subject string
4505 length length of subject string (may contain binary zeros)
4506 options option bits
4507 offsets points to a vector of ints to be filled in with offsets
4508 offsetcount the number of elements in the vector
4510 Returns: > 0 => success; value is the number of elements filled in
4511 = 0 => success, but offsets is not big enough
4512 -1 => failed to match
4513 < -1 => some kind of unexpected problem
4517 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4518 const char *subject, int length, int start_pos, int options,
4519 int *offsets, int offsetcount)
4521 /* The "volatile" directives are to make gcc -Wall stop complaining
4522 that these variables can be clobbered by the longjmp. Hopefully
4523 they won't cost too much performance. */
4524 volatile int resetcount, ocount;
4525 volatile int first_char = -1;
4526 const uschar * volatile start_bits = NULL;
4527 const uschar * volatile start_match = (const uschar *)subject + start_pos;
4528 match_data match_block;
4529 const uschar *end_subject;
4530 const real_pcre *re = (const real_pcre *)external_re;
4531 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4532 volatile BOOL using_temporary_offsets = FALSE;
4533 volatile BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4534 volatile BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4536 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4538 if (re == NULL || subject == NULL ||
4539 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4540 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4542 match_block.start_subject = (const uschar *)subject;
4543 match_block.end_subject = match_block.start_subject + length;
4544 end_subject = match_block.end_subject;
4546 match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
4547 match_block.runtime_caseless = match_block.caseless &&
4548 (re->options & PCRE_CASELESS) == 0;
4550 match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
4551 match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
4552 match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
4554 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4555 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4557 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4559 /* Set the stack state to empty */
4560 match_block.off_num = match_block.offset_top = NULL;
4561 match_block.r1 = match_block.r2 = NULL;
4562 match_block.eptr = match_block.ecode = NULL;
4563 match_block.point = match_block.length = 0;
4565 /* If the expression has got more back references than the offsets supplied can
4566 hold, we get a temporary bit of working store to use during the matching.
4567 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4568 of 2. */
4570 ocount = offsetcount & (-2);
4571 if (re->top_backref > 0 && re->top_backref >= ocount/2)
4573 ocount = re->top_backref * 2 + 2;
4574 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4575 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4576 using_temporary_offsets = TRUE;
4577 DPRINTF(("Got memory to hold back references\n"));
4579 else match_block.offset_vector = offsets;
4581 match_block.offset_end = ocount;
4582 match_block.offset_overflow = FALSE;
4584 /* Compute the minimum number of offsets that we need to reset each time. Doing
4585 this makes a huge difference to execution time when there aren't many brackets
4586 in the pattern. */
4588 resetcount = 2 + re->top_bracket * 2;
4589 if (resetcount > offsetcount) resetcount = ocount;
4591 /* If MULTILINE is set at exec time but was not set at compile time, and the
4592 anchored flag is set, we must re-check because a setting provoked by ^ in the
4593 pattern is not right in multi-line mode. Calling is_anchored() again here does
4594 the right check, because multiline is now set. If it now yields FALSE, the
4595 expression must have had ^ starting some of its branches. Check to see if
4596 that is true for *all* branches, and if so, set the startline flag. */
4598 if (match_block.multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
4599 !is_anchored(re->code, match_block.multiline))
4601 anchored = FALSE;
4602 if (is_startline(re->code)) startline = TRUE;
4605 /* Set up the first character to match, if available. The first_char value is
4606 never set for an anchored regular expression, but the anchoring may be forced
4607 at run time, so we have to test for anchoring. The first char may be unset for
4608 an unanchored pattern, of course. If there's no first char and the pattern was
4609 studied, there may be a bitmap of possible first characters. However, we can
4610 use this only if the caseless state of the studying was correct. */
4612 if (!anchored)
4614 if ((re->options & PCRE_FIRSTSET) != 0)
4616 first_char = re->first_char;
4617 if (match_block.caseless) first_char = pcre_lcc[first_char];
4619 else
4620 if (!startline && extra != NULL &&
4621 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
4622 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
4623 start_bits = extra->start_bits;
4626 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4630 int rc;
4631 register int *iptr = match_block.offset_vector;
4632 register int *iend = iptr + resetcount;
4634 /* Reset the maximum number of extractions we might see. */
4636 while (iptr < iend) *iptr++ = -1;
4638 /* Advance to a unique first char if possible */
4640 if (first_char >= 0)
4642 if (match_block.caseless)
4643 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4644 start_match++;
4645 else
4646 while (start_match < end_subject && *start_match != first_char)
4647 start_match++;
4650 /* Or to just after \n for a multiline match if possible */
4652 else if (startline)
4654 if (start_match > match_block.start_subject)
4656 while (start_match < end_subject && start_match[-1] != '\n')
4657 start_match++;
4661 /* Or to a non-unique first char */
4663 else if (start_bits != NULL)
4665 while (start_match < end_subject)
4667 register int c = *start_match;
4668 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4672 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4673 printf(">>>> Match against: ");
4674 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4675 printf("\n");
4676 #endif
4678 /* When a match occurs, substrings will be set for all internal extractions;
4679 we just need to set up the whole thing as substring 0 before returning. If
4680 there were too many extractions, set the return code to zero. In the case
4681 where we had to get some local store to hold offsets for backreferences, copy
4682 those back references that we can. In this case there need not be overflow
4683 if certain parts of the pattern were not used.
4685 Before starting the match, we have to set up a longjmp() target to enable
4686 the "cut" operation to fail a match completely without backtracking. This
4687 is done in a separate function to avoid compiler warnings. We need not do
4688 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
4689 enabled. */
4691 /* To handle errors such as running out of memory for the failure
4692 stack, we need to save this location via setjmp(), so
4693 error-handling code can call longjmp() to jump out of deeply-nested code. */
4694 if (setjmp(match_block.error_env)==0)
4697 if ((re->options & PCRE_EXTRA) != 0)
4699 if (!match_with_setjmp(start_match, re->code, 2, &match_block))
4700 continue;
4702 else if (!match(start_match, re->code, 2, &match_block)) continue;
4704 /* Copy the offset information from temporary store if necessary */
4706 if (using_temporary_offsets)
4708 if (offsetcount >= 4)
4710 memcpy(offsets + 2, match_block.offset_vector + 2,
4711 (offsetcount - 2) * sizeof(int));
4712 DPRINTF(("Copied offsets from temporary memory\n"));
4714 if (match_block.end_offset_top > offsetcount)
4715 match_block.offset_overflow = TRUE;
4717 DPRINTF(("Freeing temporary memory\n"));
4718 (pcre_free)(match_block.offset_vector);
4721 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4723 if (match_block.offset_end < 2) rc = 0; else
4725 offsets[0] = start_match - match_block.start_subject;
4726 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4729 DPRINTF((">>>> returning %d\n", rc));
4730 free_stack(&match_block);
4731 return rc;
4732 } /* End of (if setjmp(match_block.error_env)...) */
4733 free_stack(&match_block);
4735 /* Return an error code; pcremodule.c will preserve the exception */
4736 if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY;
4738 while (!anchored &&
4739 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4740 start_match++ < end_subject);
4742 if (using_temporary_offsets)
4744 DPRINTF(("Freeing temporary memory\n"));
4745 (pcre_free)(match_block.offset_vector);
4748 #ifdef DEBUG
4749 printf(">>>> returning %d\n", match_block.errorcode);
4750 #endif
4752 free_stack(&match_block);
4753 return match_block.errorcode;
4756 /* End of pcre.c */