The 0.5 release happened on 2/15, not on 2/14. :-)
[python/dscho.git] / Modules / pypcre.c
blob976eb21fc328cb1a4c6b465609db449ebc8bf921
2 /*************************************************
3 * Perl-Compatible Regular Expressions *
4 *************************************************/
6 /* DO NOT EDIT THIS FILE! */
8 /* This file is automatically written by the merge-files.py script
9 included with the PCRE distribution for Python; it's produced from
10 several C files, and code is removed in the process. If you want to
11 modify the code or track down bugs, it will be much easier to work
12 with the code in its original, multiple-file form. Don't edit this
13 file by hand, or submit patches to it.
15 The Python-specific PCRE distribution can be retrieved from
16 http://starship.skyport.net/crew/amk/regex/
18 The unmodified original PCRE distribution is available at
19 ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
20 written by: Philip Hazel <ph10@cam.ac.uk>
22 Extensively modified by the Python String-SIG: <string-sig@python.org>
23 Send bug reports to: <string-sig@python.org>
24 (They'll figure out if it's a bug in PCRE or in the Python-specific
25 changes.)
27 Copyright (c) 1997 University of Cambridge
29 -----------------------------------------------------------------------------
30 Permission is granted to anyone to use this software for any purpose on any
31 computer system, and to redistribute it freely, subject to the following
32 restrictions:
34 1. This software is distributed in the hope that it will be useful,
35 but WITHOUT ANY WARRANTY; without even the implied warranty of
36 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
38 2. The origin of this software must not be misrepresented, either by
39 explicit claim or by omission.
41 3. Altered versions must be plainly marked as such, and must not be
42 misrepresented as being the original software.
43 -----------------------------------------------------------------------------
47 #define FOR_PYTHON
48 #include "pcre-int.h"
49 #include "Python.h"
50 #include "mymalloc.h"
51 #include <ctype.h>
52 #include "graminit.h"
54 /*************************************************
55 * Perl-Compatible Regular Expressions *
56 *************************************************/
58 /* This file is automatically written by the makechartables auxiliary
59 program. If you edit it by hand, you might like to edit the Makefile to
60 prevent its ever being regenerated. */
62 /* This table is a lower casing table. */
64 unsigned char pcre_lcc[] = {
65 0, 1, 2, 3, 4, 5, 6, 7,
66 8, 9, 10, 11, 12, 13, 14, 15,
67 16, 17, 18, 19, 20, 21, 22, 23,
68 24, 25, 26, 27, 28, 29, 30, 31,
69 32, 33, 34, 35, 36, 37, 38, 39,
70 40, 41, 42, 43, 44, 45, 46, 47,
71 48, 49, 50, 51, 52, 53, 54, 55,
72 56, 57, 58, 59, 60, 61, 62, 63,
73 64, 97, 98, 99,100,101,102,103,
74 104,105,106,107,108,109,110,111,
75 112,113,114,115,116,117,118,119,
76 120,121,122, 91, 92, 93, 94, 95,
77 96, 97, 98, 99,100,101,102,103,
78 104,105,106,107,108,109,110,111,
79 112,113,114,115,116,117,118,119,
80 120,121,122,123,124,125,126,127,
81 128,129,130,131,132,133,134,135,
82 136,137,138,139,140,141,142,143,
83 144,145,146,147,148,149,150,151,
84 152,153,154,155,156,157,158,159,
85 160,161,162,163,164,165,166,167,
86 168,169,170,171,172,173,174,175,
87 176,177,178,179,180,181,182,183,
88 184,185,186,187,188,189,190,191,
89 192,193,194,195,196,197,198,199,
90 200,201,202,203,204,205,206,207,
91 208,209,210,211,212,213,214,215,
92 216,217,218,219,220,221,222,223,
93 224,225,226,227,228,229,230,231,
94 232,233,234,235,236,237,238,239,
95 240,241,242,243,244,245,246,247,
96 248,249,250,251,252,253,254,255 };
98 /* This table is a case flipping table. */
100 unsigned char pcre_fcc[] = {
101 0, 1, 2, 3, 4, 5, 6, 7,
102 8, 9, 10, 11, 12, 13, 14, 15,
103 16, 17, 18, 19, 20, 21, 22, 23,
104 24, 25, 26, 27, 28, 29, 30, 31,
105 32, 33, 34, 35, 36, 37, 38, 39,
106 40, 41, 42, 43, 44, 45, 46, 47,
107 48, 49, 50, 51, 52, 53, 54, 55,
108 56, 57, 58, 59, 60, 61, 62, 63,
109 64, 97, 98, 99,100,101,102,103,
110 104,105,106,107,108,109,110,111,
111 112,113,114,115,116,117,118,119,
112 120,121,122, 91, 92, 93, 94, 95,
113 96, 65, 66, 67, 68, 69, 70, 71,
114 72, 73, 74, 75, 76, 77, 78, 79,
115 80, 81, 82, 83, 84, 85, 86, 87,
116 88, 89, 90,123,124,125,126,127,
117 128,129,130,131,132,133,134,135,
118 136,137,138,139,140,141,142,143,
119 144,145,146,147,148,149,150,151,
120 152,153,154,155,156,157,158,159,
121 160,161,162,163,164,165,166,167,
122 168,169,170,171,172,173,174,175,
123 176,177,178,179,180,181,182,183,
124 184,185,186,187,188,189,190,191,
125 192,193,194,195,196,197,198,199,
126 200,201,202,203,204,205,206,207,
127 208,209,210,211,212,213,214,215,
128 216,217,218,219,220,221,222,223,
129 224,225,226,227,228,229,230,231,
130 232,233,234,235,236,237,238,239,
131 240,241,242,243,244,245,246,247,
132 248,249,250,251,252,253,254,255 };
134 /* This table contains bit maps for digits, letters, 'word' chars, and
135 white space. Each map is 32 bytes long and the bits run from the least
136 significant end of each byte. */
138 unsigned char pcre_cbits[] = {
139 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
140 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
141 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
142 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
144 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
145 0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
146 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
147 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
149 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
150 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
151 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
152 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
154 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
155 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
156 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
157 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
159 /* This table identifies various classes of character by individual bits:
160 0x01 white space character
161 0x02 letter
162 0x04 decimal digit
163 0x08 hexadecimal digit
164 0x10 alphanumeric or '_'
165 0x80 regular expression metacharacter or binary zero
168 unsigned char pcre_ctypes[] = {
169 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
170 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */
171 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
172 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
173 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
174 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
175 0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /* 0 - 7 */
176 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
177 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* @ - G */
178 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */
179 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */
180 0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /* X - _ */
181 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* ` - g */
182 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* h - o */
183 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* p - w */
184 0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /* x -127 */
185 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
186 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
187 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
188 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
189 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
190 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
191 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
192 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
193 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
194 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
195 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
196 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
197 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
198 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
199 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
200 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
202 /* End of chartables.c */
203 /*************************************************
204 * Perl-Compatible Regular Expressions *
205 *************************************************/
208 This is a library of functions to support regular expressions whose syntax
209 and semantics are as close as possible to those of the Perl 5 language. See
210 the file Tech.Notes for some information on the internals.
212 Written by: Philip Hazel <ph10@cam.ac.uk>
214 Copyright (c) 1998 University of Cambridge
216 -----------------------------------------------------------------------------
217 Permission is granted to anyone to use this software for any purpose on any
218 computer system, and to redistribute it freely, subject to the following
219 restrictions:
221 1. This software is distributed in the hope that it will be useful,
222 but WITHOUT ANY WARRANTY; without even the implied warranty of
223 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
225 2. The origin of this software must not be misrepresented, either by
226 explicit claim or by omission.
228 3. Altered versions must be plainly marked as such, and must not be
229 misrepresented as being the original software.
230 -----------------------------------------------------------------------------
234 /* Include the internals header, which itself includes Standard C headers plus
235 the external pcre header. */
240 /*************************************************
241 * Create bitmap of starting chars *
242 *************************************************/
244 /* This function scans a compiled unanchored expression and attempts to build a
245 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
246 goes by, we may be able to get more clever at doing this.
248 Arguments:
249 code points to an expression
250 start_bits points to a 32-byte table, initialized to 0
252 Returns: TRUE if table built, FALSE otherwise
255 static BOOL
256 set_start_bits(const uschar *code, uschar *start_bits)
258 register int c;
259 volatile int dummy;
263 const uschar *tcode = code + 3;
264 BOOL try_next = TRUE;
266 while (try_next)
268 try_next = FALSE;
270 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
272 if (!set_start_bits(tcode, start_bits)) return FALSE;
275 else switch(*tcode)
277 default:
278 return FALSE;
280 /* BRAZERO does the bracket, but carries on. */
282 case OP_BRAZERO:
283 case OP_BRAMINZERO:
284 if (!set_start_bits(++tcode, start_bits)) return FALSE;
285 dummy = 1;
286 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
287 tcode += 3;
288 try_next = TRUE;
289 break;
291 /* Single-char * or ? sets the bit and tries the next item */
293 case OP_STAR:
294 case OP_MINSTAR:
295 case OP_QUERY:
296 case OP_MINQUERY:
297 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
298 tcode += 2;
299 try_next = TRUE;
300 break;
302 /* Single-char upto sets the bit and tries the next */
304 case OP_UPTO:
305 case OP_MINUPTO:
306 start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));
307 tcode += 4;
308 try_next = TRUE;
309 break;
311 /* At least one single char sets the bit and stops */
313 case OP_EXACT: /* Fall through */
314 tcode++;
316 case OP_CHARS: /* Fall through */
317 tcode++;
319 case OP_PLUS:
320 case OP_MINPLUS:
321 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
322 break;
324 /* Single character type sets the bits and stops */
326 case OP_NOT_DIGIT:
327 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
328 break;
330 case OP_DIGIT:
331 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
332 break;
334 case OP_NOT_WHITESPACE:
335 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
336 break;
338 case OP_WHITESPACE:
339 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
340 break;
342 case OP_NOT_WORDCHAR:
343 for (c = 0; c < 32; c++)
344 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
345 break;
347 case OP_WORDCHAR:
348 for (c = 0; c < 32; c++)
349 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
350 break;
352 /* One or more character type fudges the pointer and restarts, knowing
353 it will hit a single character type and stop there. */
355 case OP_TYPEPLUS:
356 case OP_TYPEMINPLUS:
357 tcode++;
358 try_next = TRUE;
359 break;
361 case OP_TYPEEXACT:
362 tcode += 3;
363 try_next = TRUE;
364 break;
366 /* Zero or more repeats of character types set the bits and then
367 try again. */
369 case OP_TYPEUPTO:
370 case OP_TYPEMINUPTO:
371 tcode += 2; /* Fall through */
373 case OP_TYPESTAR:
374 case OP_TYPEMINSTAR:
375 case OP_TYPEQUERY:
376 case OP_TYPEMINQUERY:
377 switch(tcode[1])
379 case OP_NOT_DIGIT:
380 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
381 break;
383 case OP_DIGIT:
384 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
385 break;
387 case OP_NOT_WHITESPACE:
388 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
389 break;
391 case OP_WHITESPACE:
392 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
393 break;
395 case OP_NOT_WORDCHAR:
396 for (c = 0; c < 32; c++)
397 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
398 break;
400 case OP_WORDCHAR:
401 for (c = 0; c < 32; c++)
402 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
403 break;
406 tcode += 2;
407 try_next = TRUE;
408 break;
410 /* Character class: set the bits and either carry on or not,
411 according to the repeat count. */
413 case OP_CLASS:
414 case OP_NEGCLASS:
416 tcode++;
417 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
418 tcode += 32;
419 switch (*tcode)
421 case OP_CRSTAR:
422 case OP_CRMINSTAR:
423 case OP_CRQUERY:
424 case OP_CRMINQUERY:
425 tcode++;
426 try_next = TRUE;
427 break;
429 case OP_CRRANGE:
430 case OP_CRMINRANGE:
431 if (((tcode[1] << 8) + tcode[2]) == 0)
433 tcode += 5;
434 try_next = TRUE;
436 break;
439 break; /* End of class handling */
441 } /* End of switch */
442 } /* End of try_next loop */
444 code += (code[1] << 8) + code[2]; /* Advance to next branch */
446 while (*code == OP_ALT);
447 return TRUE;
452 /*************************************************
453 * Study a compiled expression *
454 *************************************************/
456 /* This function is handed a compiled expression that it must study to produce
457 information that will speed up the matching. It returns a pcre_extra block
458 which then gets handed back to pcre_exec().
460 Arguments:
461 re points to the compiled expression
462 options contains option bits
463 errorptr points to where to place error messages;
464 set NULL unless error
466 Returns: pointer to a pcre_extra block,
467 NULL on error or if no optimization possible
470 pcre_extra *
471 pcre_study(const pcre *external_re, int options, const char **errorptr)
473 BOOL caseless;
474 uschar start_bits[32];
475 real_pcre_extra *extra;
476 const real_pcre *re = (const real_pcre *)external_re;
478 *errorptr = NULL;
480 if (re == NULL || re->magic_number != MAGIC_NUMBER)
482 *errorptr = "argument is not a compiled regular expression";
483 return NULL;
486 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
488 *errorptr = "unknown or incorrect option bit(s) set";
489 return NULL;
492 /* Caseless can either be from the compiled regex or from options. */
494 caseless = ((re->options | options) & PCRE_CASELESS) != 0;
496 /* For an anchored pattern, or an unchored pattern that has a first char, or a
497 multiline pattern that matches only at "line starts", no further processing at
498 present. */
500 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
501 return NULL;
503 /* See if we can find a fixed set of initial characters for the pattern. */
505 memset(start_bits, 0, 32 * sizeof(uschar));
506 if (!set_start_bits(re->code, start_bits)) return NULL;
508 /* If this studying is caseless, scan the created bit map and duplicate the
509 bits for any letters. */
511 if (caseless)
513 register int c;
514 for (c = 0; c < 256; c++)
516 if ((start_bits[c/8] & (1 << (c&7))) != 0 &&
517 (pcre_ctypes[c] & ctype_letter) != 0)
519 int d = pcre_fcc[c];
520 start_bits[d/8] |= (1 << (d&7));
525 /* Get an "extra" block and put the information therein. */
527 extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
529 if (extra == NULL)
531 *errorptr = "failed to get memory";
532 return NULL;
535 extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
536 memcpy(extra->start_bits, start_bits, sizeof(start_bits));
538 return (pcre_extra *)extra;
541 /* End of study.c */
542 /*************************************************
543 * Perl-Compatible Regular Expressions *
544 *************************************************/
547 This is a library of functions to support regular expressions whose syntax
548 and semantics are as close as possible to those of the Perl 5 language. See
549 the file Tech.Notes for some information on the internals.
551 Written by: Philip Hazel <ph10@cam.ac.uk>
553 Copyright (c) 1998 University of Cambridge
555 -----------------------------------------------------------------------------
556 Permission is granted to anyone to use this software for any purpose on any
557 computer system, and to redistribute it freely, subject to the following
558 restrictions:
560 1. This software is distributed in the hope that it will be useful,
561 but WITHOUT ANY WARRANTY; without even the implied warranty of
562 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
564 2. The origin of this software must not be misrepresented, either by
565 explicit claim or by omission.
567 3. Altered versions must be plainly marked as such, and must not be
568 misrepresented as being the original software.
569 -----------------------------------------------------------------------------
573 /* Define DEBUG to get debugging output on stdout. */
575 /* #define DEBUG */
577 /* Use a macro for debugging printing, 'cause that eliminates the the use
578 of #ifdef inline, and there are *still* stupid compilers about that don't like
579 indented pre-processor statements. I suppose it's only been 10 years... */
581 #ifdef DEBUG
582 #define DPRINTF(p) printf p
583 #else
584 #define DPRINTF(p) /*nothing*/
585 #endif
587 /* Include the internals header, which itself includes Standard C headers plus
588 the external pcre header. */
593 #ifndef Py_eval_input
594 /* For Python 1.4, graminit.h has to be explicitly included */
595 #define Py_eval_input eval_input
597 #endif /* FOR_PYTHON */
599 /* Allow compilation as C++ source code, should anybody want to do that. */
601 #ifdef __cplusplus
602 #define class pcre_class
603 #endif
606 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
608 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
609 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
611 /* Text forms of OP_ values and things, for debugging (not all used) */
613 #ifdef DEBUG
614 static const char *OP_names[] = {
615 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
616 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
617 "localized \\B", "localized \\b", "localized \\W", "localized \\w",
618 "^", "$", "Any", "chars",
619 "not",
620 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
621 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
622 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
623 "*", "*?", "+", "+?", "?", "??", "{", "{",
624 "class", "negclass", "classL", "Ref",
625 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
626 "Brazero", "Braminzero", "Bra"
628 #endif
630 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
631 are simple data values; negative values are for special things like \d and so
632 on. Zero means further processing is needed (for things like \x), or the escape
633 is invalid. */
635 static const short int escapes[] = {
636 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
637 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
638 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
639 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
640 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
641 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
642 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
643 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
644 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
645 0, 0, 0 /* x - z */
648 /* Definition to allow mutual recursion */
650 static BOOL
651 compile_regex(int, int *, uschar **, const uschar **, const char **,
652 PyObject *);
654 /* Structure for passing "static" information around between the functions
655 doing the matching, so that they are thread-safe. */
657 typedef struct match_data {
658 int errorcode; /* As it says */
659 int *offset_vector; /* Offset vector */
660 int offset_end; /* One past the end */
661 BOOL offset_overflow; /* Set if too many extractions */
662 BOOL caseless; /* Case-independent flag */
663 BOOL runtime_caseless; /* Caseless forced at run time */
664 BOOL multiline; /* Multiline flag */
665 BOOL notbol; /* NOTBOL flag */
666 BOOL noteol; /* NOTEOL flag */
667 BOOL dotall; /* Dot matches any char */
668 BOOL endonly; /* Dollar not before final \n */
669 const uschar *start_subject; /* Start of the subject string */
670 const uschar *end_subject; /* End of the subject string */
671 jmp_buf fail_env; /* Environment for longjump() break out */
672 const uschar *end_match_ptr; /* Subject position at end match */
673 int end_offset_top; /* Highwater mark at end of match */
674 jmp_buf error_env; /* For longjmp() if an error occurs deep inside a
675 matching operation */
676 int length; /* Length of the allocated stacks */
677 int point; /* Point to add next item pushed onto stacks */
678 /* Pointers to the 6 stacks */
679 int *off_num, *offset_top, *r1, *r2;
680 const uschar **eptr, **ecode;
681 } match_data;
685 /*************************************************
686 * Global variables *
687 *************************************************/
689 /* PCRE is thread-clean and doesn't use any global variables in the normal
690 sense. However, it calls memory allocation and free functions via the two
691 indirections below, which are can be changed by the caller, but are shared
692 between all threads. */
694 void *(*pcre_malloc)(size_t) = malloc;
695 void (*pcre_free)(void *) = free;
700 /*************************************************
701 * Return version string *
702 *************************************************/
704 const char *
705 pcre_version(void)
707 return PCRE_VERSION;
713 /*************************************************
714 * Return info about a compiled pattern *
715 *************************************************/
717 /* This function picks potentially useful data out of the private
718 structure.
720 Arguments:
721 external_re points to compiled code
722 optptr where to pass back the options
723 first_char where to pass back the first character,
724 or -1 if multiline and all branches start ^,
725 or -2 otherwise
727 Returns: number of identifying extraction brackets
728 or negative values on error
732 pcre_info(const pcre *external_re, int *optptr, int *first_char)
734 const real_pcre *re = (real_pcre *)external_re;
735 if (re == NULL) return PCRE_ERROR_NULL;
736 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
737 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
738 if (first_char != NULL)
739 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
740 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
741 return re->top_bracket;
747 #ifdef DEBUG
748 /*************************************************
749 * Debugging function to print chars *
750 *************************************************/
752 /* Print a sequence of chars in printable format, stopping at the end of the
753 subject if the requested.
755 Arguments:
756 p points to characters
757 length number to print
758 is_subject TRUE if printing from within md->start_subject
759 md pointer to matching data block, if is_subject is TRUE
761 Returns: nothing
764 static void
765 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
767 int c;
768 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
769 while (length-- > 0)
770 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
772 #endif
777 /*************************************************
778 * Check subpattern for empty operand *
779 *************************************************/
781 /* This function checks a bracketed subpattern to see if any of the paths
782 through it could match an empty string. This is used to diagnose an error if
783 such a subpattern is followed by a quantifier with an unlimited upper bound.
785 Argument:
786 code points to the opening bracket
788 Returns: TRUE or FALSE
791 static BOOL
792 could_be_empty(uschar *code)
794 do {
795 uschar *cc = code + 3;
797 /* Scan along the opcodes for this branch; as soon as we find something
798 that matches a non-empty string, break out and advance to test the next
799 branch. If we get to the end of the branch, return TRUE for the whole
800 sub-expression. */
802 for (;;)
804 /* Test an embedded subpattern; if it could not be empty, break the
805 loop. Otherwise carry on in the branch. */
807 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
809 if (!could_be_empty(cc)) break;
810 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
811 cc += 3;
814 else switch (*cc)
816 /* Reached end of a branch: the subpattern may match the empty string */
818 case OP_ALT:
819 case OP_KET:
820 case OP_KETRMAX:
821 case OP_KETRMIN:
822 return TRUE;
824 /* Skip over entire bracket groups with zero lower bound */
826 case OP_BRAZERO:
827 case OP_BRAMINZERO:
828 cc++;
829 /* Fall through */
831 /* Skip over assertive subpatterns */
833 case OP_ASSERT:
834 case OP_ASSERT_NOT:
835 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
836 cc += 3;
837 break;
839 /* Skip over things that don't match chars */
841 case OP_SOD:
842 case OP_EOD:
843 case OP_CIRC:
844 case OP_DOLL:
845 case OP_NOT_WORD_BOUNDARY:
846 case OP_WORD_BOUNDARY:
847 case OP_NOT_WORD_BOUNDARY_L:
848 case OP_WORD_BOUNDARY_L:
849 cc++;
850 break;
852 /* Skip over simple repeats with zero lower bound */
854 case OP_STAR:
855 case OP_MINSTAR:
856 case OP_QUERY:
857 case OP_MINQUERY:
858 case OP_NOTSTAR:
859 case OP_NOTMINSTAR:
860 case OP_NOTQUERY:
861 case OP_NOTMINQUERY:
862 case OP_TYPESTAR:
863 case OP_TYPEMINSTAR:
864 case OP_TYPEQUERY:
865 case OP_TYPEMINQUERY:
866 cc += 2;
867 break;
869 /* Skip over UPTOs (lower bound is zero) */
871 case OP_UPTO:
872 case OP_MINUPTO:
873 case OP_TYPEUPTO:
874 case OP_TYPEMINUPTO:
875 cc += 4;
876 break;
878 /* Check a class or a back reference for a zero minimum */
880 case OP_CLASS:
881 case OP_NEGCLASS:
882 case OP_REF:
883 case OP_CLASS_L:
884 switch(*cc)
886 case (OP_REF): cc += 2; break;
887 case (OP_CLASS): case (OP_NEGCLASS): cc += 1+32; break;
888 case (OP_CLASS_L): cc += 1+1+32; break;
891 switch (*cc)
893 case OP_CRSTAR:
894 case OP_CRMINSTAR:
895 case OP_CRQUERY:
896 case OP_CRMINQUERY:
897 cc++;
898 break;
900 case OP_CRRANGE:
901 case OP_CRMINRANGE:
902 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
903 cc += 3;
904 break;
906 default:
907 goto NEXT_BRANCH;
909 break;
911 /* Anything else matches at least one character */
913 default:
914 goto NEXT_BRANCH;
918 NEXT_BRANCH:
919 code += (code[1] << 8) + code[2];
921 while (*code == OP_ALT);
923 /* No branches match the empty string */
925 return FALSE;
928 /* Determine the length of a group ID in an expression like
929 (?P<foo_123>...)
930 Arguments:
931 ptr pattern position pointer (say that 3 times fast)
932 finalchar the character that will mark the end of the ID
933 errorptr points to the pointer to the error message
936 static int
937 get_group_id(const uschar *ptr, char finalchar, const char **errorptr)
939 const uschar *start = ptr;
941 /* If the first character is not in \w, or is in \w but is a digit,
942 report an error */
943 if (!(pcre_ctypes[*ptr] & ctype_word) ||
944 (pcre_ctypes[*ptr++] & ctype_digit))
946 *errorptr = "(?P identifier must start with a letter or underscore";
947 return 0;
950 /* Increment ptr until we either hit a null byte, the desired
951 final character, or a non-word character */
952 for(; (*ptr != 0) && (*ptr != finalchar) &&
953 (pcre_ctypes[*ptr] & ctype_word); ptr++)
955 /* Empty loop body */
957 if (*ptr==finalchar)
958 return ptr-start;
959 if (*ptr==0)
961 *errorptr = "unterminated (?P identifier";
962 return 0;
964 *errorptr = "illegal character in (?P identifier";
965 return 0;
968 /*************************************************
969 * Handle escapes *
970 *************************************************/
972 /* This function is called when a \ has been encountered. It either returns a
973 positive value for a simple escape such as \n, or a negative value which
974 encodes one of the more complicated things such as \d. On entry, ptr is
975 pointing at the \. On exit, it is on the final character of the escape
976 sequence.
978 Arguments:
979 ptrptr points to the pattern position pointer
980 errorptr points to the pointer to the error message
981 bracount number of previous extracting brackets
982 options the options bits
983 isclass TRUE if inside a character class
985 Returns: zero or positive => a data character
986 negative => a special escape sequence
987 on error, errorptr is set
990 static int
991 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
992 int options, BOOL isclass)
994 const uschar *ptr = *ptrptr;
995 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
996 int i;
998 if (c == 0) *errorptr = ERR1;
1000 /* Digits or letters may have special meaning; all others are literals. */
1002 else if (c < '0' || c > 'z') {}
1004 /* Do an initial lookup in a table. A non-zero result is something that can be
1005 returned immediately. Otherwise further processing may be required. */
1007 else if ((i = escapes[c - '0']) != 0) c = i;
1009 /* Escapes that need further processing, or are illegal. */
1011 else
1014 switch (c)
1016 /* The handling of escape sequences consisting of a string of digits
1017 starting with one that is not zero is not straightforward. By experiment,
1018 the way Perl works seems to be as follows:
1020 Outside a character class, the digits are read as a decimal number. If the
1021 number is less than 10, or if there are that many previous extracting
1022 left brackets, then it is a back reference. Otherwise, up to three octal
1023 digits are read to form an escaped byte. Thus \123 is likely to be octal
1024 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
1025 value is greater than 377, the least significant 8 bits are taken. Inside a
1026 character class, \ followed by a digit is always an octal number. */
1028 case '1': case '2': case '3': case '4': case '5':
1029 case '6': case '7': case '8': case '9':
1032 /* PYTHON: Try to compute an octal value for a character */
1033 for(c=0, i=0; ptr[i]!=0 && i<3; i++)
1035 if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
1036 c = c * 8 + ptr[i]-'0';
1037 else
1038 break; /* Non-octal character--break out of the loop */
1040 /* It's a character if there were exactly 3 octal digits, or if
1041 we're inside a character class and there was at least one
1042 octal digit. */
1043 if ( (i == 3) || (isclass && i!=0) )
1045 ptr += i-1;
1046 break;
1048 c = ptr[0]; /* Restore the first character after the \ */
1049 c -= '0'; i = 1;
1050 while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
1052 c = c * 10 + ptr[1] - '0';
1053 ptr++; i++;
1055 if (c > 255 - ESC_REF) *errorptr = "back reference too big";
1056 c = -(ESC_REF + c);
1058 break;
1060 /* \0 always starts an octal number, but we may drop through to here with a
1061 larger first octal digit */
1063 case '0':
1064 c -= '0';
1065 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
1066 ptr[1] != '8' && ptr[1] != '9')
1067 c = c * 8 + *(++ptr) - '0';
1068 break;
1070 /* Special escapes not starting with a digit are straightforward */
1072 case 'x':
1073 c = 0;
1074 while ( (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
1076 ptr++;
1077 c = c * 16 + pcre_lcc[*ptr] -
1078 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
1079 c &= 255;
1081 break;
1084 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1085 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1086 for Perl compatibility, it is a literal. */
1088 default:
1089 if ((options & PCRE_EXTRA) != 0) switch(c)
1091 case 'X':
1092 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
1093 break;
1095 default:
1096 *errorptr = ERR3;
1097 break;
1099 break;
1103 *ptrptr = ptr;
1104 return c;
1109 /*************************************************
1110 * Check for counted repeat *
1111 *************************************************/
1113 /* This function is called when a '{' is encountered in a place where it might
1114 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1115 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1116 where the ddds are digits.
1118 Arguments:
1119 p pointer to the first char after '{'
1121 Returns: TRUE or FALSE
1124 static BOOL
1125 is_counted_repeat(const uschar *p)
1127 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1128 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1129 if (*p == '}') return TRUE;
1131 if (*p++ != ',') return FALSE;
1132 if (*p == '}') return TRUE;
1134 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1135 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1136 return (*p == '}');
1141 /*************************************************
1142 * Read repeat counts *
1143 *************************************************/
1145 /* Read an item of the form {n,m} and return the values. This is called only
1146 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1147 so the syntax is guaranteed to be correct, but we need to check the values.
1149 Arguments:
1150 p pointer to first char after '{'
1151 minp pointer to int for min
1152 maxp pointer to int for max
1153 returned as -1 if no max
1154 errorptr points to pointer to error message
1156 Returns: pointer to '}' on success;
1157 current ptr on error, with errorptr set
1160 static const uschar *
1161 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1163 int min = 0;
1164 int max = -1;
1166 while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1168 if (*p == '}') max = min; else
1170 if (*(++p) != '}')
1172 max = 0;
1173 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1174 if (max < min)
1176 *errorptr = ERR4;
1177 return p;
1182 /* Do paranoid checks, then fill in the required variables, and pass back the
1183 pointer to the terminating '}'. */
1185 if (min > 65535 || max > 65535)
1186 *errorptr = ERR5;
1187 else
1189 *minp = min;
1190 *maxp = max;
1192 return p;
1197 /*************************************************
1198 * Compile one branch *
1199 *************************************************/
1201 /* Scan the pattern, compiling it into the code vector.
1203 Arguments:
1204 options the option bits
1205 bracket points to number of brackets used
1206 code points to the pointer to the current code point
1207 ptrptr points to the current pattern pointer
1208 errorptr points to pointer to error message
1210 Returns: TRUE on success
1211 FALSE, with *errorptr set on error
1214 static BOOL
1215 compile_branch(int options, int *brackets, uschar **codeptr,
1216 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
1218 int repeat_type, op_type;
1219 int repeat_min, repeat_max;
1220 int bravalue, length;
1221 int greedy_default, greedy_non_default;
1222 register int c;
1223 register uschar *code = *codeptr;
1224 const uschar *ptr = *ptrptr;
1225 const uschar *oldptr;
1226 uschar *previous = NULL;
1227 uschar class[32];
1228 uschar *class_flag; /* Pointer to the single-byte flag for OP_CLASS_L */
1230 /* Set up the default and non-default settings for greediness */
1232 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1233 greedy_non_default = greedy_default ^ 1;
1235 /* Switch on next character until the end of the branch */
1237 for (;; ptr++)
1239 BOOL negate_class;
1240 int class_charcount;
1241 int class_lastchar;
1243 c = *ptr;
1244 if ((options & PCRE_EXTENDED) != 0)
1246 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1247 if (c == '#')
1249 while ((c = *(++ptr)) != 0 && c != '\n');
1250 continue;
1254 switch(c)
1256 /* The branch terminates at end of string, |, or ). */
1258 case 0:
1259 case '|':
1260 case ')':
1261 *codeptr = code;
1262 *ptrptr = ptr;
1263 return TRUE;
1265 /* Handle single-character metacharacters */
1267 case '^':
1268 previous = NULL;
1269 *code++ = OP_CIRC;
1270 break;
1272 case '$':
1273 previous = NULL;
1274 *code++ = OP_DOLL;
1275 break;
1277 case '.':
1278 previous = code;
1279 *code++ = OP_ANY;
1280 break;
1282 /* Character classes. These always build a 32-byte bitmap of the permitted
1283 characters, except in the special case where there is only one character.
1284 For negated classes, we build the map as usual, then invert it at the end.
1287 case '[':
1288 previous = code;
1289 if (options & PCRE_LOCALE)
1291 *code++ = OP_CLASS_L;
1292 /* Set the flag for localized classes (like \w) to 0 */
1293 class_flag = code;
1294 *class_flag = 0;
1296 else
1298 *code++ = OP_CLASS;
1299 class_flag = NULL;
1302 /* If the first character is '^', set the negation flag, and use a
1303 different opcode. This only matters if caseless matching is specified at
1304 runtime. */
1306 if ((c = *(++ptr)) == '^')
1308 negate_class = TRUE;
1309 if (*(code-1)==OP_CLASS) *(code-1) = OP_NEGCLASS;
1310 c = *(++ptr);
1312 else negate_class = FALSE;
1314 /* Keep a count of chars so that we can optimize the case of just a single
1315 character. */
1317 class_charcount = 0;
1318 class_lastchar = -1;
1320 /* Initialize the 32-char bit map to all zeros. We have to build the
1321 map in a temporary bit of store, in case the class contains only 1
1322 character, because in that case the compiled code doesn't use the
1323 bit map. */
1325 memset(class, 0, 32 * sizeof(uschar));
1327 /* Process characters until ] is reached. By writing this as a "do" it
1328 means that an initial ] is taken as a data character. */
1332 if (c == 0)
1334 *errorptr = ERR6;
1335 goto FAILED;
1338 /* Backslash may introduce a single character, or it may introduce one
1339 of the specials, which just set a flag. Escaped items are checked for
1340 validity in the pre-compiling pass. The sequence \b is a special case.
1341 Inside a class (and only there) it is treated as backspace. Elsewhere
1342 it marks a word boundary. Other escapes have preset maps ready to
1343 or into the one we are building. We assume they have more than one
1344 character in them, so set class_count bigger than one. */
1346 if (c == '\\')
1348 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1349 if (-c == ESC_b) c = '\b';
1350 else if (c < 0)
1352 class_charcount = 10;
1353 switch (-c)
1355 case ESC_d:
1357 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
1359 continue;
1361 case ESC_D:
1363 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
1365 continue;
1367 case ESC_w:
1368 if (options & PCRE_LOCALE)
1370 *class_flag |= 1;
1372 else
1374 for (c = 0; c < 32; c++)
1375 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1377 continue;
1379 case ESC_W:
1380 if (options & PCRE_LOCALE)
1382 *class_flag |= 2;
1384 else
1386 for (c = 0; c < 32; c++)
1387 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1389 continue;
1391 case ESC_s:
1393 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
1395 continue;
1397 case ESC_S:
1399 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
1401 continue;
1403 default:
1404 *errorptr = ERR7;
1405 goto FAILED;
1408 /* Fall through if single character */
1411 /* A single character may be followed by '-' to form a range. However,
1412 Perl does not permit ']' to be the end of the range. A '-' character
1413 here is treated as a literal. */
1415 if (ptr[1] == '-' && ptr[2] != ']')
1417 int d;
1418 ptr += 2;
1419 d = *ptr;
1421 if (d == 0)
1423 *errorptr = ERR6;
1424 goto FAILED;
1427 /* The second part of a range can be a single-character escape, but
1428 not any of the other escapes. */
1430 if (d == '\\')
1432 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1433 if (d < 0)
1435 if (d == -ESC_b) d = '\b'; else
1437 *errorptr = ERR7;
1438 goto FAILED;
1443 if (d < c)
1445 *errorptr = ERR8;
1446 goto FAILED;
1449 for (; c <= d; c++)
1451 class[c/8] |= (1 << (c&7));
1452 if ((options & PCRE_CASELESS) != 0)
1454 int uc = pcre_fcc[c]; /* flip case */
1455 class[uc/8] |= (1 << (uc&7));
1457 class_charcount++; /* in case a one-char range */
1458 class_lastchar = c;
1460 continue; /* Go get the next char in the class */
1463 /* Handle a lone single character - we can get here for a normal
1464 non-escape char, or after \ that introduces a single character. */
1466 class [c/8] |= (1 << (c&7));
1467 if ((options & PCRE_CASELESS) != 0)
1469 c = pcre_fcc[c]; /* flip case */
1470 class[c/8] |= (1 << (c&7));
1472 class_charcount++;
1473 class_lastchar = c;
1476 /* Loop until ']' reached; the check for end of string happens inside the
1477 loop. This "while" is the end of the "do" above. */
1479 while ((c = *(++ptr)) != ']');
1481 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1482 precisely one character. This doesn't need the whole 32-byte bit map.
1483 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1484 it's negative. */
1486 if (class_charcount == 1 && class_lastchar >= 0)
1488 if (negate_class)
1490 code[-1] = OP_NOT;
1492 else
1494 code[-1] = OP_CHARS;
1495 *code++ = 1;
1497 *code++ = class_lastchar;
1500 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1501 the code vector. */
1503 else
1505 /* If this is a localized opcode, bump the code pointer up */
1506 if (class_flag) code++;
1507 if (negate_class)
1509 if (class_flag) *class_flag = (*class_flag) ^ 63;
1510 for (c = 0; c < 32; c++) code[c] = ~class[c];
1512 else
1513 memcpy(code, class, 32);
1514 code += 32;
1516 break;
1518 /* Various kinds of repeat */
1520 case '{':
1521 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1522 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
1523 if (*errorptr != NULL) goto FAILED;
1524 goto REPEAT;
1526 case '*':
1527 repeat_min = 0;
1528 repeat_max = -1;
1529 goto REPEAT;
1531 case '+':
1532 repeat_min = 1;
1533 repeat_max = -1;
1534 goto REPEAT;
1536 case '?':
1537 repeat_min = 0;
1538 repeat_max = 1;
1540 REPEAT:
1541 if (previous == NULL)
1543 *errorptr = ERR9;
1544 goto FAILED;
1547 /* If the next character is '?' this is a minimizing repeat, by default,
1548 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1549 next character. */
1551 if (ptr[1] == '?')
1552 { repeat_type = greedy_non_default; ptr++; }
1553 else repeat_type = greedy_default;
1555 /* If the maximum is zero then the minimum must also be zero; Perl allows
1556 this case, so we do too - by simply omitting the item altogether. */
1558 if (repeat_max == 0) code = previous;
1560 /* If previous was a string of characters, chop off the last one and use it
1561 as the subject of the repeat. If there was only one character, we can
1562 abolish the previous item altogether. */
1564 else if (*previous == OP_CHARS)
1566 int len = previous[1];
1567 if (len == 1)
1569 c = previous[2];
1570 code = previous;
1572 else
1574 c = previous[len+1];
1575 previous[1]--;
1576 code--;
1578 op_type = 0; /* Use single-char op codes */
1579 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1582 /* If previous was a single negated character ([^a] or similar), we use
1583 one of the special opcodes, replacing it. The code is shared with single-
1584 character repeats by adding a suitable offset into repeat_type. */
1586 else if ((int)*previous == OP_NOT)
1588 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1589 c = previous[1];
1590 code = previous;
1591 goto OUTPUT_SINGLE_REPEAT;
1594 /* If previous was a character type match (\d or similar), abolish it and
1595 create a suitable repeat item. The code is shared with single-character
1596 repeats by adding a suitable offset into repeat_type. */
1598 else if ((int)*previous < OP_CIRC || *previous == OP_ANY)
1600 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1601 c = *previous;
1602 code = previous;
1604 OUTPUT_SINGLE_REPEAT:
1605 repeat_type += op_type; /* Combine both values for many cases */
1607 /* A minimum of zero is handled either as the special case * or ?, or as
1608 an UPTO, with the maximum given. */
1610 if (repeat_min == 0)
1612 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1613 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1614 else
1616 *code++ = OP_UPTO + repeat_type;
1617 *code++ = repeat_max >> 8;
1618 *code++ = (repeat_max & 255);
1622 /* The case {1,} is handled as the special case + */
1624 else if (repeat_min == 1 && repeat_max == -1)
1625 *code++ = OP_PLUS + repeat_type;
1627 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1628 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1630 else
1632 if (repeat_min != 1)
1634 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1635 *code++ = repeat_min >> 8;
1636 *code++ = (repeat_min & 255);
1639 /* If the mininum is 1 and the previous item was a character string,
1640 we either have to put back the item that got cancelled if the string
1641 length was 1, or add the character back onto the end of a longer
1642 string. For a character type nothing need be done; it will just get
1643 put back naturally. Note that the final character is always going to
1644 get added below. */
1646 else if (*previous == OP_CHARS)
1648 if (code == previous) code += 2; else previous[1]++;
1651 /* For a single negated character we also have to put back the
1652 item that got cancelled. */
1654 else if (*previous == OP_NOT) code++;
1656 /* If the maximum is unlimited, insert an OP_STAR. */
1658 if (repeat_max < 0)
1660 *code++ = c;
1661 *code++ = OP_STAR + repeat_type;
1664 /* Else insert an UPTO if the max is greater than the min. */
1666 else if (repeat_max != repeat_min)
1668 *code++ = c;
1669 repeat_max -= repeat_min;
1670 *code++ = OP_UPTO + repeat_type;
1671 *code++ = repeat_max >> 8;
1672 *code++ = (repeat_max & 255);
1676 /* The character or character type itself comes last in all cases. */
1678 *code++ = c;
1681 /* If previous was a character class or a back reference, we put the repeat
1682 stuff after it. */
1684 else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1685 *previous==OP_CLASS_L || *previous == OP_REF)
1687 if (repeat_min == 0 && repeat_max == -1)
1688 *code++ = OP_CRSTAR + repeat_type;
1689 else if (repeat_min == 1 && repeat_max == -1)
1690 *code++ = OP_CRPLUS + repeat_type;
1691 else if (repeat_min == 0 && repeat_max == 1)
1692 *code++ = OP_CRQUERY + repeat_type;
1693 else
1695 *code++ = OP_CRRANGE + repeat_type;
1696 *code++ = repeat_min >> 8;
1697 *code++ = repeat_min & 255;
1698 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1699 *code++ = repeat_max >> 8;
1700 *code++ = repeat_max & 255;
1704 /* If previous was a bracket group, we may have to replicate it in certain
1705 cases. If the maximum repeat count is unlimited, check that the bracket
1706 group cannot match the empty string, and diagnose an error if it can. */
1708 else if ((int)*previous >= OP_BRA)
1710 int i;
1711 int len = code - previous;
1713 if (repeat_max == -1 && could_be_empty(previous))
1715 *errorptr = ERR10;
1716 goto FAILED;
1719 /* If the minimum is greater than zero, and the maximum is unlimited or
1720 equal to the minimum, the first copy remains where it is, and is
1721 replicated up to the minimum number of times. This case includes the +
1722 repeat, but of course no replication is needed in that case. */
1724 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1726 for (i = 1; i < repeat_min; i++)
1728 memcpy(code, previous, len);
1729 code += len;
1733 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1734 Then, if there is a fixed upper limit, replicated up to that many times,
1735 sticking BRAZERO in front of all the optional ones. */
1737 else
1739 if (repeat_min == 0)
1741 memmove(previous+1, previous, len);
1742 code++;
1743 *previous++ = OP_BRAZERO + repeat_type;
1746 for (i = 1; i < repeat_min; i++)
1748 memcpy(code, previous, len);
1749 code += len;
1752 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1754 *code++ = OP_BRAZERO + repeat_type;
1755 memcpy(code, previous, len);
1756 code += len;
1760 /* If the maximum is unlimited, set a repeater in the final copy. */
1762 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1765 /* Else there's some kind of shambles */
1767 else
1769 *errorptr = ERR11;
1770 goto FAILED;
1773 /* In all case we no longer have a previous item. */
1775 previous = NULL;
1776 break;
1779 /* Start of nested bracket sub-expression, or comment or lookahead.
1780 First deal with special things that can come after a bracket; all are
1781 introduced by ?, and the appearance of any of them means that this is not a
1782 referencing group. They were checked for validity in the first pass over
1783 the string, so we don't have to check for syntax errors here. */
1785 case '(':
1786 previous = code; /* Only real brackets can be repeated */
1787 if (*(++ptr) == '?')
1789 bravalue = OP_BRA;
1791 switch (*(++ptr))
1793 case '#':
1794 case 'i':
1795 case 'L':
1796 case 'm':
1797 case 's':
1798 case 'x':
1799 ptr++;
1800 while (*ptr != ')') ptr++;
1801 previous = NULL;
1802 continue;
1804 case ':': /* Non-extracting bracket */
1805 ptr++;
1806 break;
1808 case '=': /* Assertions can't be repeated */
1809 bravalue = OP_ASSERT;
1810 ptr++;
1811 previous = NULL;
1812 break;
1814 case '!':
1815 bravalue = OP_ASSERT_NOT;
1816 ptr++;
1817 previous = NULL;
1818 break;
1820 case ('P'):
1821 ptr++;
1822 if (*ptr=='<')
1824 /* (?P<groupname>...) */
1825 int idlen;
1826 PyObject *string, *intobj;
1828 ptr++;
1829 idlen = get_group_id(ptr, '>', errorptr);
1830 if (*errorptr) {
1831 goto FAILED;
1833 string = PyString_FromStringAndSize((char*)ptr, idlen);
1834 intobj = PyInt_FromLong( brackets[0] + 1 );
1835 if (intobj == NULL || string == NULL)
1837 Py_XDECREF(string);
1838 Py_XDECREF(intobj);
1839 *errorptr = "exception raised";
1840 goto FAILED;
1842 PyDict_SetItem(dictionary, string, intobj);
1843 Py_DECREF(string); Py_DECREF(intobj); /* XXX DECREF commented out! */
1844 ptr += idlen+1; /* Point to rest of expression */
1845 goto do_grouping_bracket;
1847 if (*ptr=='=')
1849 /* (?P=groupname) */
1850 int idlen, refnum;
1851 PyObject *string, *intobj;
1853 ptr++;
1854 idlen = get_group_id(ptr, ')', errorptr);
1855 if (*errorptr) {
1856 goto FAILED;
1858 string = PyString_FromStringAndSize((char *)ptr, idlen);
1859 if (string==NULL) {
1860 *errorptr = "exception raised";
1861 goto FAILED;
1863 intobj = PyDict_GetItem(dictionary, string);
1864 if (intobj==NULL) {
1865 Py_DECREF(string);
1866 *errorptr = "?P= group identifier isn't defined";
1867 goto FAILED;
1870 refnum = PyInt_AsLong(intobj);
1871 Py_DECREF(string);
1872 /* The caller doesn't own the reference to the value
1873 returned from PyDict_GetItem, so intobj is not
1874 DECREF'ed. */
1876 *code++ = OP_REF;
1877 *code++ = refnum;
1878 /* The continue will cause the top-level for() loop to
1879 be resumed, so ptr will be immediately incremented.
1880 Therefore, the following line adds just idlen, not
1881 idlen+1 */
1882 ptr += idlen;
1883 continue;
1885 /* The character after ?P is neither < nor =, so
1886 report an error. Add more Python-extensions here. */
1887 *errorptr="unknown after (?P";
1888 goto FAILED;
1890 case '>': /* "Match once" brackets */
1891 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1893 bravalue = OP_ONCE;
1894 ptr++;
1895 previous = NULL;
1896 break;
1898 /* Else fall through */
1900 default:
1901 *errorptr = ERR12;
1902 goto FAILED;
1906 /* Else we have a referencing group */
1908 else
1910 do_grouping_bracket:
1911 if (++(*brackets) > EXTRACT_MAX)
1913 *errorptr = ERR13;
1914 goto FAILED;
1916 bravalue = OP_BRA + *brackets;
1919 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1920 code into a non-register variable in order to be able to pass its address
1921 because some compilers complain otherwise. */
1923 *code = bravalue;
1925 uschar *mcode = code;
1926 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr, dictionary))
1927 goto FAILED;
1928 code = mcode;
1931 if (*ptr != ')')
1933 *errorptr = ERR14;
1934 goto FAILED;
1936 break;
1938 /* Check \ for being a real metacharacter; if not, fall through and handle
1939 it as a data character at the start of a string. Escape items are checked
1940 for validity in the pre-compiling pass. */
1942 case '\\':
1943 oldptr = ptr;
1944 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1946 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1947 are arranged to be the negation of the corresponding OP_values. For the
1948 back references, the values are ESC_REF plus the reference number. Only
1949 back references and those types that consume a character may be repeated.
1950 We can test for values between ESC_b and ESC_Z for the latter; this may
1951 have to change if any new ones are ever created. */
1953 if (c < 0)
1955 if (-c >= ESC_REF)
1957 int refnum = -c - ESC_REF;
1958 if (*brackets < refnum)
1960 *errorptr = ERR15;
1961 goto FAILED;
1963 previous = code;
1964 *code++ = OP_REF;
1965 *code++ = refnum;
1967 else
1969 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1970 if ( (options & PCRE_LOCALE) != 0)
1972 switch (c)
1974 case (-ESC_b): c = -OP_WORD_BOUNDARY_L; break;
1975 case (-ESC_B): c = -OP_NOT_WORD_BOUNDARY_L; break;
1976 case (-ESC_w): c = -OP_WORDCHAR_L; break;
1977 case (-ESC_W): c = -OP_NOT_WORDCHAR_L; break;
1980 *code++ = -c;
1982 continue;
1985 /* Data character: Reset and fall through */
1987 ptr = oldptr;
1988 c = '\\';
1990 /* Handle a run of data characters until a metacharacter is encountered.
1991 The first character is guaranteed not to be whitespace or # when the
1992 extended flag is set. */
1994 NORMAL_CHAR:
1995 default:
1996 previous = code;
1997 *code = OP_CHARS;
1998 code += 2;
1999 length = 0;
2003 if ((options & PCRE_EXTENDED) != 0)
2005 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
2006 if (c == '#')
2008 while ((c = *(++ptr)) != 0 && c != '\n');
2009 if (c == 0) break;
2010 continue;
2014 /* Backslash may introduce a data char or a metacharacter. Escaped items
2015 are checked for validity in the pre-compiling pass. Stop the string
2016 before a metaitem. */
2018 if (c == '\\')
2020 oldptr = ptr;
2021 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
2022 if (c < 0) { ptr = oldptr; break; }
2025 /* Ordinary character or single-char escape */
2027 *code++ = c;
2028 length++;
2031 /* This "while" is the end of the "do" above. */
2033 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2035 /* Compute the length and set it in the data vector, and advance to
2036 the next state. */
2038 previous[1] = length;
2039 if (length < 255) ptr--;
2040 break;
2042 } /* end of big loop */
2044 /* Control never reaches here by falling through, only by a goto for all the
2045 error states. Pass back the position in the pattern so that it can be displayed
2046 to the user for diagnosing the error. */
2048 FAILED:
2049 *ptrptr = ptr;
2050 return FALSE;
2056 /*************************************************
2057 * Compile sequence of alternatives *
2058 *************************************************/
2060 /* On entry, ptr is pointing past the bracket character, but on return
2061 it points to the closing bracket, or vertical bar, or end of string.
2062 The code variable is pointing at the byte into which the BRA operator has been
2063 stored.
2065 Argument:
2066 options the option bits
2067 brackets -> int containing the number of extracting brackets used
2068 codeptr -> the address of the current code pointer
2069 ptrptr -> the address of the current pattern pointer
2070 errorptr -> pointer to error message
2072 Returns: TRUE on success
2075 static BOOL
2076 compile_regex(int options, int *brackets, uschar **codeptr,
2077 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
2079 const uschar *ptr = *ptrptr;
2080 uschar *code = *codeptr;
2081 uschar *start_bracket = code;
2083 for (;;)
2085 int length;
2086 uschar *last_branch = code;
2088 code += 3;
2089 if (!compile_branch(options, brackets, &code, &ptr, errorptr, dictionary))
2091 *ptrptr = ptr;
2092 return FALSE;
2095 /* Fill in the length of the last branch */
2097 length = code - last_branch;
2098 last_branch[1] = length >> 8;
2099 last_branch[2] = length & 255;
2101 /* Reached end of expression, either ')' or end of pattern. Insert a
2102 terminating ket and the length of the whole bracketed item, and return,
2103 leaving the pointer at the terminating char. */
2105 if (*ptr != '|')
2107 length = code - start_bracket;
2108 *code++ = OP_KET;
2109 *code++ = length >> 8;
2110 *code++ = length & 255;
2111 *codeptr = code;
2112 *ptrptr = ptr;
2113 return TRUE;
2116 /* Another branch follows; insert an "or" node and advance the pointer. */
2118 *code = OP_ALT;
2119 ptr++;
2121 /* Control never reaches here */
2126 /*************************************************
2127 * Check for anchored expression *
2128 *************************************************/
2130 /* Try to find out if this is an anchored regular expression. Consider each
2131 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2132 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2133 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2134 counts, since OP_CIRC can match in the middle.
2136 A branch is also implicitly anchored if it starts with .* because that will try
2137 the rest of the pattern at all possible matching points, so there is no point
2138 trying them again.
2140 Argument: points to start of expression (the bracket)
2141 Returns: TRUE or FALSE
2144 static BOOL
2145 is_anchored(register const uschar *code, BOOL multiline)
2147 do {
2148 int op = (int)code[3];
2149 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
2150 { if (!is_anchored(code+3, multiline)) return FALSE; }
2151 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2152 { if (code[4] != OP_ANY) return FALSE; }
2153 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
2154 code += (code[1] << 8) + code[2];
2156 while (*code == OP_ALT);
2157 return TRUE;
2162 /*************************************************
2163 * Check for start with \n line expression *
2164 *************************************************/
2166 /* This is called for multiline expressions to try to find out if every branch
2167 starts with ^ so that "first char" processing can be done to speed things up.
2169 Argument: points to start of expression (the bracket)
2170 Returns: TRUE or FALSE
2173 static BOOL
2174 is_startline(const uschar *code)
2176 do {
2177 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2178 { if (!is_startline(code+3)) return FALSE; }
2179 else if (code[3] != OP_CIRC) return FALSE;
2180 code += (code[1] << 8) + code[2];
2182 while (*code == OP_ALT);
2183 return TRUE;
2188 /*************************************************
2189 * Check for fixed first char *
2190 *************************************************/
2192 /* Try to find out if there is a fixed first character. This is called for
2193 unanchored expressions, as it speeds up their processing quite considerably.
2194 Consider each alternative branch. If they all start with the same char, or with
2195 a bracket all of whose alternatives start with the same char (recurse ad lib),
2196 then we return that char, otherwise -1.
2198 Argument: points to start of expression (the bracket)
2199 Returns: -1 or the fixed first char
2202 static int
2203 find_firstchar(uschar *code)
2205 register int c = -1;
2208 register int charoffset = 4;
2210 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2212 register int d;
2213 if ((d = find_firstchar(code+3)) < 0) return -1;
2214 if (c < 0) c = d; else if (c != d) return -1;
2217 else switch(code[3])
2219 default:
2220 return -1;
2222 case OP_EXACT: /* Fall through */
2223 charoffset++;
2225 case OP_CHARS: /* Fall through */
2226 charoffset++;
2228 case OP_PLUS:
2229 case OP_MINPLUS:
2230 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
2231 break;
2233 code += (code[1] << 8) + code[2];
2235 while (*code == OP_ALT);
2236 return c;
2241 /*************************************************
2242 * Compile a Regular Expression *
2243 *************************************************/
2245 /* This function takes a string and returns a pointer to a block of store
2246 holding a compiled version of the expression.
2248 Arguments:
2249 pattern the regular expression
2250 options various option bits
2251 errorptr pointer to pointer to error text
2252 erroroffset ptr offset in pattern where error was detected
2254 Returns: pointer to compiled data block, or NULL on error,
2255 with errorptr and erroroffset set
2258 pcre *
2259 pcre_compile(const char *pattern, int options, const char **errorptr,
2260 int *erroroffset, PyObject *dictionary)
2262 real_pcre *re;
2263 int spaces = 0;
2264 int length = 3; /* For initial BRA plus length */
2265 int runlength;
2266 int c, size;
2267 int bracount = 0;
2268 int brastack[200];
2269 int top_backref = 0;
2270 unsigned int brastackptr = 0;
2271 uschar *code;
2272 const uschar *ptr;
2274 #ifdef DEBUG
2275 uschar *code_base, *code_end;
2276 #endif
2278 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2279 can do is just return NULL. */
2281 if (errorptr == NULL) return NULL;
2282 *errorptr = NULL;
2284 /* However, we can give a message for this error */
2286 if (erroroffset == NULL)
2288 *errorptr = ERR16;
2289 return NULL;
2291 *erroroffset = 0;
2293 if ((options & ~PUBLIC_OPTIONS) != 0)
2295 *errorptr = ERR17;
2296 return NULL;
2299 DPRINTF(("------------------------------------------------------------------\n"));
2300 DPRINTF(("%s\n", pattern));
2302 /* The first thing to do is to make a pass over the pattern to compute the
2303 amount of store required to hold the compiled code. This does not have to be
2304 perfect as long as errors are overestimates. At the same time we can detect any
2305 internal flag settings. Make an attempt to correct for any counted white space
2306 if an "extended" flag setting appears late in the pattern. We can't be so
2307 clever for #-comments. */
2309 ptr = (const uschar *)(pattern - 1);
2310 while ((c = *(++ptr)) != 0)
2312 int min, max;
2313 int class_charcount;
2315 if ((pcre_ctypes[c] & ctype_space) != 0)
2317 if ((options & PCRE_EXTENDED) != 0) continue;
2318 spaces++;
2321 if (c == '#' && (options & PCRE_EXTENDED) != 0)
2323 while ((c = *(++ptr)) != 0 && c != '\n');
2324 continue;
2327 switch(c)
2329 /* A backslashed item may be an escaped "normal" character or a
2330 character type. For a "normal" character, put the pointers and
2331 character back so that tests for whitespace etc. in the input
2332 are done correctly. */
2334 case '\\':
2336 const uschar *save_ptr = ptr;
2337 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
2338 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2339 if (c >= 0)
2341 ptr = save_ptr;
2342 c = '\\';
2343 goto NORMAL_CHAR;
2346 length++;
2348 /* A back reference needs an additional char, plus either one or 5
2349 bytes for a repeat. We also need to keep the value of the highest
2350 back reference. */
2352 if (c <= -ESC_REF)
2354 int refnum = -c - ESC_REF;
2355 if (refnum > top_backref) top_backref = refnum;
2356 length++; /* For single back reference */
2357 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
2359 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2360 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2361 if ((min == 0 && (max == 1 || max == -1)) ||
2362 (min == 1 && max == -1))
2363 length++;
2364 else length += 5;
2365 if (ptr[1] == '?') ptr++;
2368 continue;
2370 case '^':
2371 case '.':
2372 case '$':
2373 case '*': /* These repeats won't be after brackets; */
2374 case '+': /* those are handled separately */
2375 case '?':
2376 length++;
2377 continue;
2379 /* This covers the cases of repeats after a single char, metachar, class,
2380 or back reference. */
2382 case '{':
2383 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
2384 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
2385 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2386 if ((min == 0 && (max == 1 || max == -1)) ||
2387 (min == 1 && max == -1))
2388 length++;
2389 else
2391 length--; /* Uncount the original char or metachar */
2392 if (min == 1) length++; else if (min > 0) length += 4;
2393 if (max > 0) length += 4; else length += 2;
2395 if (ptr[1] == '?') ptr++;
2396 continue;
2398 /* An alternation contains an offset to the next branch or ket. */
2399 case '|':
2400 length += 3;
2401 continue;
2403 /* A character class uses 33 characters. Don't worry about character types
2404 that aren't allowed in classes - they'll get picked up during the compile.
2405 A character class that contains only one character uses 2 or 3 bytes,
2406 depending on whether it is negated or not. Notice this where we can. */
2408 case '[':
2409 class_charcount = 0;
2410 if (*(++ptr) == '^') ptr++;
2413 if (*ptr == '\\')
2415 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
2416 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2417 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2419 else class_charcount++;
2420 ptr++;
2422 while (*ptr != 0 && *ptr != ']');
2424 /* Repeats for negated single chars are handled by the general code */
2426 if (class_charcount == 1) length += 3; else
2428 length += 33;
2429 if (options & PCRE_LOCALE) length++; /* Add a byte for the localization flag */
2431 /* A repeat needs either 1 or 5 bytes. */
2433 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
2435 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2436 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2437 if ((min == 0 && (max == 1 || max == -1)) ||
2438 (min == 1 && max == -1))
2439 length++;
2440 else length += 5;
2441 if (ptr[1] == '?') ptr++;
2444 continue;
2446 /* Brackets may be genuine groups or special things */
2448 case '(':
2450 /* Handle special forms of bracket, which all start (? */
2452 if (ptr[1] == '?') switch (c = ptr[2])
2454 /* Skip over comments entirely */
2455 case '#':
2456 ptr += 3;
2457 while (*ptr != 0 && *ptr != ')') ptr++;
2458 if (*ptr == 0)
2460 *errorptr = ERR18;
2461 goto PCRE_ERROR_RETURN;
2463 continue;
2465 /* Non-referencing groups and lookaheads just move the pointer on, and
2466 then behave like a non-special bracket, except that they don't increment
2467 the count of extracting brackets. */
2469 case ':':
2470 case '=':
2471 case '!':
2472 ptr += 2;
2473 break;
2475 case ('P'):
2477 int idlen;
2478 switch (*ptr++) {
2479 case ('<'):
2480 idlen = get_group_id(ptr++, '>', errorptr);
2481 if (*errorptr) goto PCRE_ERROR_RETURN;
2482 ptr += idlen+1;
2483 break;
2484 case ('='):
2485 idlen = get_group_id(ptr++, ')', errorptr);
2486 if (*errorptr) goto PCRE_ERROR_RETURN;
2487 ptr += idlen+1;
2488 length++;
2489 break;
2492 break;
2494 /* Ditto for the "once only" bracket, allowed only if the extra bit
2495 is set. */
2497 case '>':
2498 if ((options & PCRE_EXTRA) != 0)
2500 ptr += 2;
2501 break;
2503 /* Else fall through */
2505 /* Else loop setting valid options until ) is met. Anything else is an
2506 error. */
2508 default:
2509 ptr += 2;
2510 for (;; ptr++)
2512 if ((c = *ptr) == 'i')
2514 options |= PCRE_CASELESS;
2515 continue;
2517 else if ((c = *ptr) == 'L')
2519 options |= PCRE_LOCALE;
2520 continue;
2522 else if ((c = *ptr) == 'm')
2524 options |= PCRE_MULTILINE;
2525 continue;
2527 else if (c == 's')
2529 options |= PCRE_DOTALL;
2530 continue;
2532 else if (c == 'x')
2534 options |= PCRE_EXTENDED;
2535 length -= spaces; /* Already counted spaces */
2536 continue;
2538 else if (c == ')') break;
2540 *errorptr = ERR12;
2541 goto PCRE_ERROR_RETURN;
2543 continue; /* End of this bracket handling */
2546 /* Extracting brackets must be counted so we can process escapes in a
2547 Perlish way. */
2549 else bracount++;
2551 /* Non-special forms of bracket. Save length for computing whole length
2552 at end if there's a repeat that requires duplication of the group. */
2554 if (brastackptr >= sizeof(brastack)/sizeof(int))
2556 *errorptr = ERR19;
2557 goto PCRE_ERROR_RETURN;
2560 brastack[brastackptr++] = length;
2561 length += 3;
2562 continue;
2564 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2565 have to replicate this bracket up to that many times. If brastackptr is
2566 0 this is an unmatched bracket which will generate an error, but take care
2567 not to try to access brastack[-1]. */
2569 case ')':
2570 length += 3;
2572 int minval = 1;
2573 int maxval = 1;
2574 int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
2576 /* Leave ptr at the final char; for read_repeat_counts this happens
2577 automatically; for the others we need an increment. */
2579 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
2581 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
2582 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2584 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2585 else if (c == '+') { maxval = -1; ptr++; }
2586 else if (c == '?') { minval = 0; ptr++; }
2588 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2589 if there is a limited maximum we have to replicate up to maxval-1 times
2590 and allow for a BRAZERO item before each optional copy, as we also have
2591 to do before the first copy if the minimum is zero. */
2593 if (minval == 0) length++;
2594 else if (minval > 1) length += (minval - 1) * duplength;
2595 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
2597 continue;
2599 /* Non-special character. For a run of such characters the length required
2600 is the number of characters + 2, except that the maximum run length is 255.
2601 We won't get a skipped space or a non-data escape or the start of a #
2602 comment as the first character, so the length can't be zero. */
2604 NORMAL_CHAR:
2605 default:
2606 length += 2;
2607 runlength = 0;
2610 if ((pcre_ctypes[c] & ctype_space) != 0)
2612 if ((options & PCRE_EXTENDED) != 0) continue;
2613 spaces++;
2616 if (c == '#' && (options & PCRE_EXTENDED) != 0)
2618 while ((c = *(++ptr)) != 0 && c != '\n');
2619 continue;
2622 /* Backslash may introduce a data char or a metacharacter; stop the
2623 string before the latter. */
2625 if (c == '\\')
2627 const uschar *saveptr = ptr;
2628 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
2629 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2630 if (c < 0) { ptr = saveptr; break; }
2633 /* Ordinary character or single-char escape */
2635 runlength++;
2638 /* This "while" is the end of the "do" above. */
2640 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2642 ptr--;
2643 length += runlength;
2644 continue;
2648 length += 4; /* For final KET and END */
2650 if (length > 65539)
2652 *errorptr = ERR20;
2653 return NULL;
2656 /* Compute the size of data block needed and get it, either from malloc or
2657 externally provided function. We specify "code[0]" in the offsetof() expression
2658 rather than just "code", because it has been reported that one broken compiler
2659 fails on "code" because it is also an independent variable. It should make no
2660 difference to the value of the offsetof(). */
2662 size = length + offsetof(real_pcre, code[0]);
2663 re = (real_pcre *)(pcre_malloc)(size+50);
2665 if (re == NULL)
2667 *errorptr = ERR21;
2668 return NULL;
2671 /* Put in the magic number and the options. */
2673 re->magic_number = MAGIC_NUMBER;
2674 re->options = options;
2676 /* Set up a starting, non-extracting bracket, then compile the expression. On
2677 error, *errorptr will be set non-NULL, so we don't need to look at the result
2678 of the function here. */
2680 ptr = (const uschar *)pattern;
2681 code = re->code;
2682 *code = OP_BRA;
2683 bracount = 0;
2684 (void)compile_regex(options, &bracount, &code, &ptr, errorptr, dictionary);
2685 re->top_bracket = bracount;
2686 re->top_backref = top_backref;
2688 /* If not reached end of pattern on success, there's an excess bracket. */
2690 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2692 /* Fill in the terminating state and check for disastrous overflow, but
2693 if debugging, leave the test till after things are printed out. */
2695 *code++ = OP_END;
2698 #ifndef DEBUG
2699 if (code - re->code > length) *errorptr = ERR23;
2700 #endif
2702 /* Failed to compile */
2704 if (*errorptr != NULL)
2706 (pcre_free)(re);
2707 PCRE_ERROR_RETURN:
2708 *erroroffset = ptr - (const uschar *)pattern;
2709 return NULL;
2712 /* If the anchored option was not passed, set flag if we can determine that it
2713 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2714 we can determine what the first character has to be, because that speeds up
2715 unanchored matches no end. In the case of multiline matches, an alternative is
2716 to set the PCRE_STARTLINE flag if all branches start with ^. */
2718 if ((options & PCRE_ANCHORED) == 0)
2720 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
2721 re->options |= PCRE_ANCHORED;
2722 else
2724 int ch = find_firstchar(re->code);
2725 if (ch >= 0)
2727 re->first_char = ch;
2728 re->options |= PCRE_FIRSTSET;
2730 else if (is_startline(re->code))
2731 re->options |= PCRE_STARTLINE;
2735 /* Print out the compiled data for debugging */
2737 #ifdef DEBUG
2739 printf("Length = %d top_bracket = %d top_backref=%d\n",
2740 length, re->top_bracket, re->top_backref);
2742 if (re->options != 0)
2744 printf("%s%s%s%s%s%s%s%s\n",
2745 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2746 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2747 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2748 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2749 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2750 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2751 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2752 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2755 if ((re->options & PCRE_FIRSTSET) != 0)
2757 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2758 else printf("First char = \\x%02x\n", re->first_char);
2761 code_end = code;
2762 code_base = code = re->code;
2764 while (code < code_end)
2766 int charlength;
2768 printf("%3d ", code - code_base);
2770 if (*code >= OP_BRA)
2772 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2773 code += 2;
2776 else switch(*code)
2778 case OP_CHARS:
2779 charlength = *(++code);
2780 printf("%3d ", charlength);
2781 while (charlength-- > 0)
2782 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2783 break;
2785 case OP_KETRMAX:
2786 case OP_KETRMIN:
2787 case OP_ALT:
2788 case OP_KET:
2789 case OP_ASSERT:
2790 case OP_ASSERT_NOT:
2791 case OP_ONCE:
2792 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2793 code += 2;
2794 break;
2796 case OP_STAR:
2797 case OP_MINSTAR:
2798 case OP_PLUS:
2799 case OP_MINPLUS:
2800 case OP_QUERY:
2801 case OP_MINQUERY:
2802 case OP_TYPESTAR:
2803 case OP_TYPEMINSTAR:
2804 case OP_TYPEPLUS:
2805 case OP_TYPEMINPLUS:
2806 case OP_TYPEQUERY:
2807 case OP_TYPEMINQUERY:
2808 if (*code >= OP_TYPESTAR)
2809 printf(" %s", OP_names[code[1]]);
2810 else if (isprint(c = code[1])) printf(" %c", c);
2811 else printf(" \\x%02x", c);
2812 printf("%s", OP_names[*code++]);
2813 break;
2815 case OP_EXACT:
2816 case OP_UPTO:
2817 case OP_MINUPTO:
2818 if (isprint(c = code[3])) printf(" %c{", c);
2819 else printf(" \\x%02x{", c);
2820 if (*code != OP_EXACT) printf("0,");
2821 printf("%d}", (code[1] << 8) + code[2]);
2822 if (*code == OP_MINUPTO) printf("?");
2823 code += 3;
2824 break;
2826 case OP_TYPEEXACT:
2827 case OP_TYPEUPTO:
2828 case OP_TYPEMINUPTO:
2829 printf(" %s{", OP_names[code[3]]);
2830 if (*code != OP_TYPEEXACT) printf(",");
2831 printf("%d}", (code[1] << 8) + code[2]);
2832 if (*code == OP_TYPEMINUPTO) printf("?");
2833 code += 3;
2834 break;
2836 case OP_NOT:
2837 if (isprint(c = *(++code))) printf(" [^%c]", c);
2838 else printf(" [^\\x%02x]", c);
2839 break;
2841 case OP_NOTSTAR:
2842 case OP_NOTMINSTAR:
2843 case OP_NOTPLUS:
2844 case OP_NOTMINPLUS:
2845 case OP_NOTQUERY:
2846 case OP_NOTMINQUERY:
2847 if (isprint(c = code[1])) printf(" [^%c]", c);
2848 else printf(" [^\\x%02x]", c);
2849 printf("%s", OP_names[*code++]);
2850 break;
2852 case OP_NOTEXACT:
2853 case OP_NOTUPTO:
2854 case OP_NOTMINUPTO:
2855 if (isprint(c = code[3])) printf(" [^%c]{", c);
2856 else printf(" [^\\x%02x]{", c);
2857 if (*code != OP_NOTEXACT) printf(",");
2858 printf("%d}", (code[1] << 8) + code[2]);
2859 if (*code == OP_NOTMINUPTO) printf("?");
2860 code += 3;
2861 break;
2863 case OP_REF:
2864 printf(" \\%d", *(++code));
2865 code ++;
2866 goto CLASS_REF_REPEAT;
2868 case OP_CLASS:
2869 case OP_NEGCLASS:
2870 case OP_CLASS_L:
2872 int i, min, max;
2874 if (*code==OP_CLASS_L)
2876 code++;
2877 printf("Locflag = %i ", *code++);
2878 printf(" [");
2880 else
2882 if (*code++ == OP_CLASS) printf(" [");
2883 else printf(" ^[");
2887 for (i = 0; i < 256; i++)
2889 if ((code[i/8] & (1 << (i&7))) != 0)
2891 int j;
2892 for (j = i+1; j < 256; j++)
2893 if ((code[j/8] & (1 << (j&7))) == 0) break;
2894 if (i == '-' || i == ']') printf("\\");
2895 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2896 if (--j > i)
2898 printf("-");
2899 if (j == '-' || j == ']') printf("\\");
2900 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2902 i = j;
2905 printf("]");
2906 code += 32;
2907 /* code ++;*/
2909 CLASS_REF_REPEAT:
2911 switch(*code)
2913 case OP_CRSTAR:
2914 case OP_CRMINSTAR:
2915 case OP_CRPLUS:
2916 case OP_CRMINPLUS:
2917 case OP_CRQUERY:
2918 case OP_CRMINQUERY:
2919 printf("%s", OP_names[*code]);
2920 break;
2922 case OP_CRRANGE:
2923 case OP_CRMINRANGE:
2924 min = (code[1] << 8) + code[2];
2925 max = (code[3] << 8) + code[4];
2926 if (max == 0) printf("{%d,}", min);
2927 else printf("{%d,%d}", min, max);
2928 if (*code == OP_CRMINRANGE) printf("?");
2929 code += 4;
2930 break;
2932 default:
2933 code--;
2936 break;
2938 /* Anything else is just a one-node item */
2940 default:
2941 printf(" %s", OP_names[*code]);
2942 break;
2945 code++;
2946 printf("\n");
2948 printf("------------------------------------------------------------------\n");
2950 /* This check is done here in the debugging case so that the code that
2951 was compiled can be seen. */
2953 if (code - re->code > length)
2955 printf("length=%i, code length=%i\n", length, code-re->code);
2956 *errorptr = ERR23;
2957 (pcre_free)(re);
2958 *erroroffset = ptr - (uschar *)pattern;
2959 return NULL;
2961 #endif
2963 return (pcre *)re;
2968 /*************************************************
2969 * Match a character type *
2970 *************************************************/
2972 /* Not used in all the places it might be as it's sometimes faster
2973 to put the code inline.
2975 Arguments:
2976 type the character type
2977 c the character
2978 dotall the dotall flag
2980 Returns: TRUE if character is of the type
2983 static BOOL
2984 match_type(int type, int c, BOOL dotall)
2987 #ifdef DEBUG
2988 if (isprint(c)) printf("matching subject %c against ", c);
2989 else printf("matching subject \\x%02x against ", c);
2990 printf("%s\n", OP_names[type]);
2991 #endif
2993 switch(type)
2995 case OP_ANY: return dotall || c != '\n';
2996 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2997 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2998 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2999 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
3000 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
3001 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
3002 case OP_NOT_WORDCHAR_L: return (c!='_' && !isalnum(c));
3003 case OP_WORDCHAR_L: return (c=='_' || isalnum(c));
3005 return FALSE;
3010 /*************************************************
3011 * Match a back-reference *
3012 *************************************************/
3014 /* If a back reference hasn't been set, the match fails.
3016 Arguments:
3017 number reference number
3018 eptr points into the subject
3019 length length to be matched
3020 md points to match data block
3022 Returns: TRUE if matched
3025 static BOOL
3026 match_ref(int number, register const uschar *eptr, int length, match_data *md)
3028 const uschar *p = md->start_subject + md->offset_vector[number];
3030 #ifdef DEBUG
3031 if (eptr >= md->end_subject)
3032 printf("matching subject <null>");
3033 else
3035 printf("matching subject ");
3036 pchars(eptr, length, TRUE, md);
3038 printf(" against backref ");
3039 pchars(p, length, FALSE, md);
3040 printf("\n");
3041 #endif
3043 /* Always fail if not enough characters left */
3045 if (length > md->end_subject - p) return FALSE;
3047 /* Separate the caseless case for speed */
3049 if (md->caseless)
3050 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
3051 else
3052 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3054 return TRUE;
3057 static int free_stack(match_data *md)
3059 /* Free any stack space that was allocated by the call to match(). */
3060 if (md->off_num) free(md->off_num);
3061 if (md->offset_top) free(md->offset_top);
3062 if (md->r1) free(md->r1);
3063 if (md->r2) free(md->r2);
3064 if (md->eptr) free((char *)md->eptr);
3065 if (md->ecode) free((char *)md->ecode);
3066 return 0;
3069 static int grow_stack(match_data *md)
3071 if (md->length != 0)
3073 md->length = md->length + md->length/2;
3075 else
3077 int string_len = md->end_subject - md->start_subject + 1;
3078 if (string_len < 80) {md->length = string_len; }
3079 else {md->length = 80;}
3081 PyMem_RESIZE(md->offset_top, int, md->length);
3082 PyMem_RESIZE(md->eptr, const uschar *, md->length);
3083 PyMem_RESIZE(md->ecode, const uschar *, md->length);
3084 PyMem_RESIZE(md->off_num, int, md->length);
3085 PyMem_RESIZE(md->r1, int, md->length);
3086 PyMem_RESIZE(md->r2, int, md->length);
3087 if (md->offset_top == NULL || md->eptr == NULL || md->ecode == NULL ||
3088 md->off_num == NULL || md->r1 == NULL || md->r2 == NULL)
3090 PyErr_NoMemory();
3091 longjmp(md->error_env, 1);
3093 return 0;
3097 /*************************************************
3098 * Match from current position *
3099 *************************************************/
3101 /* On entry ecode points to the first opcode, and eptr to the first character.
3103 Arguments:
3104 eptr pointer in subject
3105 ecode position in code
3106 offset_top current top pointer
3107 md pointer to "static" info for the match
3109 Returns: TRUE if matched
3112 static BOOL
3113 match(register const uschar *eptr, register const uschar *ecode, int offset_top,
3114 match_data *md)
3116 int save_stack_position = md->point;
3117 match_loop:
3119 #define SUCCEED goto succeed
3120 #define FAIL goto fail
3122 for (;;)
3124 int min, max, ctype;
3125 register int i;
3126 register int c;
3127 BOOL minimize = FALSE;
3129 /* Opening bracket. Check the alternative branches in turn, failing if none
3130 match. We have to set the start offset if required and there is space
3131 in the offset vector so that it is available for subsequent back references
3132 if the bracket matches. However, if the bracket fails, we must put back the
3133 previous value of both offsets in case they were set by a previous copy of
3134 the same bracket. Don't worry about setting the flag for the error case here;
3135 that is handled in the code for KET. */
3137 if ((int)*ecode >= OP_BRA)
3139 int number = (*ecode - OP_BRA) << 1;
3140 int save_offset1 = 0, save_offset2 = 0;
3142 DPRINTF(("start bracket %d\n", number/2));
3144 if (number > 0 && number < md->offset_end)
3146 save_offset1 = md->offset_vector[number];
3147 save_offset2 = md->offset_vector[number+1];
3148 md->offset_vector[number] = eptr - md->start_subject;
3150 DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
3153 /* Recurse for all the alternatives. */
3157 if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
3158 ecode += (ecode[1] << 8) + ecode[2];
3160 while (*ecode == OP_ALT);
3162 DPRINTF(("bracket %d failed\n", number/2));
3164 if (number > 0 && number < md->offset_end)
3166 md->offset_vector[number] = save_offset1;
3167 md->offset_vector[number+1] = save_offset2;
3170 FAIL;
3173 /* Other types of node can be handled by a switch */
3175 switch(*ecode)
3177 case OP_END:
3178 md->end_match_ptr = eptr; /* Record where we ended */
3179 md->end_offset_top = offset_top; /* and how many extracts were taken */
3180 SUCCEED;
3182 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3183 whole thing doesn't match, so we have to get out via a longjmp(). */
3185 case OP_CUT:
3186 if (match(eptr, ecode+1, offset_top, md)) SUCCEED;
3187 longjmp(md->fail_env, 1);
3189 /* Assertion brackets. Check the alternative branches in turn - the
3190 matching won't pass the KET for an assertion. If any one branch matches,
3191 the assertion is true. */
3193 case OP_ASSERT:
3196 if (match(eptr, ecode+3, offset_top, md)) break;
3197 ecode += (ecode[1] << 8) + ecode[2];
3199 while (*ecode == OP_ALT);
3200 if (*ecode == OP_KET) FAIL;
3202 /* Continue from after the assertion, updating the offsets high water
3203 mark, since extracts may have been taken during the assertion. */
3205 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3206 ecode += 3;
3207 offset_top = md->end_offset_top;
3208 continue;
3210 /* Negative assertion: all branches must fail to match */
3212 case OP_ASSERT_NOT:
3215 if (match(eptr, ecode+3, offset_top, md)) FAIL;
3216 ecode += (ecode[1] << 8) + ecode[2];
3218 while (*ecode == OP_ALT);
3219 ecode += 3;
3220 continue;
3222 /* "Once" brackets are like assertion brackets except that after a match,
3223 the point in the subject string is not moved back. Thus there can never be
3224 a move back into the brackets. Check the alternative branches in turn - the
3225 matching won't pass the KET for this kind of subpattern. If any one branch
3226 matches, we carry on, leaving the subject pointer. */
3228 case OP_ONCE:
3231 if (match(eptr, ecode+3, offset_top, md)) break;
3232 ecode += (ecode[1] << 8) + ecode[2];
3234 while (*ecode == OP_ALT);
3235 if (*ecode == OP_KET) FAIL;
3237 /* Continue as from after the assertion, updating the offsets high water
3238 mark, since extracts may have been taken. */
3240 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3241 ecode += 3;
3242 offset_top = md->end_offset_top;
3243 eptr = md->end_match_ptr;
3244 continue;
3246 /* An alternation is the end of a branch; scan along to find the end of the
3247 bracketed group and go to there. */
3249 case OP_ALT:
3250 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3251 break;
3253 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3254 that it may occur zero times. It may repeat infinitely, or not at all -
3255 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3256 repeat limits are compiled as a number of copies, with the optional ones
3257 preceded by BRAZERO or BRAMINZERO. */
3259 case OP_BRAZERO:
3261 const uschar *next = ecode+1;
3262 if (match(eptr, next, offset_top, md)) SUCCEED;
3263 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3264 ecode = next + 3;
3266 break;
3268 case OP_BRAMINZERO:
3270 const uschar *next = ecode+1;
3271 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3272 if (match(eptr, next+3, offset_top, md)) SUCCEED;
3273 ecode++;
3275 break;;
3277 /* End of a group, repeated or non-repeating. If we are at the end of
3278 an assertion "group", stop matching and SUCCEED, but record the
3279 current high water mark for use by positive assertions. */
3281 case OP_KET:
3282 case OP_KETRMIN:
3283 case OP_KETRMAX:
3285 int number;
3286 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3288 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
3290 md->end_match_ptr = eptr; /* For ONCE */
3291 md->end_offset_top = offset_top;
3292 SUCCEED;
3295 /* In all other cases we have to check the group number back at the
3296 start and if necessary complete handling an extraction by setting the
3297 final offset and bumping the high water mark. */
3299 number = (*prev - OP_BRA) << 1;
3301 DPRINTF(("end bracket %d\n", number/2));
3303 if (number > 0)
3305 if (number >= md->offset_end) md->offset_overflow = TRUE; else
3307 md->offset_vector[number+1] = eptr - md->start_subject;
3308 if (offset_top <= number) offset_top = number + 2;
3312 /* For a non-repeating ket, just advance to the next node and continue at
3313 this level. */
3315 if (*ecode == OP_KET)
3317 ecode += 3;
3318 break;
3321 /* The repeating kets try the rest of the pattern or restart from the
3322 preceding bracket, in the appropriate order. */
3324 if (*ecode == OP_KETRMIN)
3326 const uschar *ptr;
3327 if (match(eptr, ecode+3, offset_top, md)) goto succeed;
3328 /* Handle alternation inside the BRA...KET; push the additional
3329 alternatives onto the stack */
3330 ptr=prev;
3331 do {
3332 ptr += (ptr[1]<<8)+ ptr[2];
3333 if (*ptr==OP_ALT)
3335 if (md->length == md->point)
3337 grow_stack(md);
3339 md->offset_top[md->point] = offset_top;
3340 md->eptr[md->point] = eptr;
3341 md->ecode[md->point] = ptr+3;
3342 md->r1[md->point] = 0;
3343 md->r2[md->point] = 0;
3344 md->off_num[md->point] = 0;
3345 md->point++;
3347 } while (*ptr==OP_ALT);
3348 ecode=prev+3; goto match_loop;
3350 else /* OP_KETRMAX */
3352 const uschar *ptr;
3353 /*int points_pushed=0;*/
3355 /* Push one failure point, that will resume matching at the code after
3356 the KETRMAX opcode. */
3357 if (md->length == md->point)
3359 grow_stack(md);
3361 md->offset_top[md->point] = offset_top;
3362 md->eptr[md->point] = eptr;
3363 md->ecode[md->point] = ecode+3;
3364 md->r1[md->point] = md->offset_vector[number];
3365 md->r2[md->point] = md->offset_vector[number+1];
3366 md->off_num[md->point] = number;
3367 md->point++;
3369 md->offset_vector[number] = eptr - md->start_subject;
3370 /* Handle alternation inside the BRA...KET; push each of the
3371 additional alternatives onto the stack */
3372 ptr=prev;
3373 do {
3374 ptr += (ptr[1]<<8)+ ptr[2];
3375 if (*ptr==OP_ALT)
3377 if (md->length == md->point)
3378 if (md->length == md->point)
3380 grow_stack(md);
3382 md->offset_top[md->point] = offset_top;
3383 md->eptr[md->point] = eptr;
3384 md->ecode[md->point] = ptr+3;
3385 md->r1[md->point] = 0;
3386 md->r2[md->point] = 0;
3387 md->off_num[md->point] = 0;
3388 md->point++;
3389 /*points_pushed++;*/
3391 } while (*ptr==OP_ALT);
3392 /* Jump to the first (or only) alternative and resume trying to match */
3393 ecode=prev+3; goto match_loop;
3397 /* Start of subject unless notbol, or after internal newline if multiline */
3399 case OP_CIRC:
3400 if (md->notbol && eptr == md->start_subject) FAIL;
3401 if (md->multiline)
3403 if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
3404 ecode++;
3405 break;
3407 /* ... else fall through */
3409 /* Start of subject assertion */
3411 case OP_SOD:
3412 if (eptr != md->start_subject) FAIL;
3413 ecode++;
3414 break;
3416 /* Assert before internal newline if multiline, or before
3417 a terminating newline unless endonly is set, else end of subject unless
3418 noteol is set. */
3420 case OP_DOLL:
3421 if (md->noteol && eptr >= md->end_subject) FAIL;
3422 if (md->multiline)
3424 if (eptr < md->end_subject && *eptr != '\n') FAIL;
3425 ecode++;
3426 break;
3428 else if (!md->endonly)
3430 if (eptr < md->end_subject - 1 ||
3431 (eptr == md->end_subject - 1 && *eptr != '\n')) FAIL;
3432 ecode++;
3433 break;
3435 /* ... else fall through */
3437 /* End of subject assertion */
3439 case OP_EOD:
3440 if (eptr < md->end_subject) FAIL;
3441 ecode++;
3442 break;
3444 /* Word boundary assertions */
3446 case OP_NOT_WORD_BOUNDARY:
3447 case OP_WORD_BOUNDARY:
3449 BOOL prev_is_word = (eptr != md->start_subject) &&
3450 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3451 BOOL cur_is_word = (eptr < md->end_subject) &&
3452 ((pcre_ctypes[*eptr] & ctype_word) != 0);
3453 if ((*ecode++ == OP_WORD_BOUNDARY)?
3454 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3455 FAIL;
3457 break;
3459 case OP_NOT_WORD_BOUNDARY_L:
3460 case OP_WORD_BOUNDARY_L:
3462 BOOL prev_is_word = (eptr != md->start_subject) &&
3463 (isalnum(eptr[-1]) || eptr[-1]=='_');
3464 BOOL cur_is_word = (eptr < md->end_subject) &&
3465 (isalnum(*eptr) || *eptr=='_');
3466 if ((*ecode++ == OP_WORD_BOUNDARY_L)?
3467 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3468 FAIL;
3470 break;
3473 /* Match a single character type; inline for speed */
3475 case OP_ANY:
3476 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
3477 if (eptr++ >= md->end_subject) FAIL;
3478 ecode++;
3479 break;
3481 case OP_NOT_DIGIT:
3482 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3483 FAIL;
3484 ecode++;
3485 break;
3487 case OP_DIGIT:
3488 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3489 FAIL;
3490 ecode++;
3491 break;
3493 case OP_NOT_WHITESPACE:
3494 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3495 FAIL;
3496 ecode++;
3497 break;
3499 case OP_WHITESPACE:
3500 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3501 FAIL;
3502 ecode++;
3503 break;
3505 case OP_NOT_WORDCHAR:
3506 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3507 FAIL;
3508 ecode++;
3509 break;
3511 case OP_WORDCHAR:
3512 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3513 FAIL;
3514 ecode++;
3515 break;
3517 case OP_NOT_WORDCHAR_L:
3518 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ))
3519 FAIL;
3520 eptr++;
3521 ecode++;
3522 break;
3524 case OP_WORDCHAR_L:
3525 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ))
3526 FAIL;
3527 eptr++;
3528 ecode++;
3529 break;
3531 /* Match a back reference, possibly repeatedly. Look past the end of the
3532 item to see if there is repeat information following. The code is similar
3533 to that for character classes, but repeated for efficiency. Then obey
3534 similar code to character type repeats - written out again for speed.
3535 However, if the referenced string is the empty string, always treat
3536 it as matched, any number of times (otherwise there could be infinite
3537 loops). */
3539 case OP_REF:
3541 int length;
3542 int number = ecode[1] << 1; /* Doubled reference number */
3543 ecode += 2; /* Advance past the item */
3545 if (number >= offset_top || md->offset_vector[number] < 0)
3547 md->errorcode = PCRE_ERROR_BADREF;
3548 FAIL;
3551 length = md->offset_vector[number+1] - md->offset_vector[number];
3553 switch (*ecode)
3555 case OP_CRSTAR:
3556 case OP_CRMINSTAR:
3557 case OP_CRPLUS:
3558 case OP_CRMINPLUS:
3559 case OP_CRQUERY:
3560 case OP_CRMINQUERY:
3561 c = *ecode++ - OP_CRSTAR;
3562 minimize = (c & 1) != 0;
3563 min = rep_min[c]; /* Pick up values from tables; */
3564 max = rep_max[c]; /* zero for max => infinity */
3565 if (max == 0) max = INT_MAX;
3566 break;
3568 case OP_CRRANGE:
3569 case OP_CRMINRANGE:
3570 minimize = (*ecode == OP_CRMINRANGE);
3571 min = (ecode[1] << 8) + ecode[2];
3572 max = (ecode[3] << 8) + ecode[4];
3573 if (max == 0) max = INT_MAX;
3574 ecode += 5;
3575 break;
3577 default: /* No repeat follows */
3578 if (!match_ref(number, eptr, length, md)) FAIL;
3579 eptr += length;
3580 continue; /* With the main loop */
3583 /* If the length of the reference is zero, just continue with the
3584 main loop. */
3586 if (length == 0) continue;
3588 /* First, ensure the minimum number of matches are present. We get back
3589 the length of the reference string explicitly rather than passing the
3590 address of eptr, so that eptr can be a register variable. */
3592 for (i = 1; i <= min; i++)
3594 if (!match_ref(number, eptr, length, md)) FAIL;
3595 eptr += length;
3598 /* If min = max, continue at the same level without recursion.
3599 They are not both allowed to be zero. */
3601 if (min == max) continue;
3603 /* If minimizing, keep trying and advancing the pointer */
3605 if (minimize)
3607 for (i = min;; i++)
3609 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3610 if (i >= max || !match_ref(number, eptr, length, md))
3611 FAIL;
3612 eptr += length;
3614 /* Control never gets here */
3617 /* If maximizing, find the longest string and work backwards */
3619 else
3621 const uschar *pp = eptr;
3622 for (i = min; i < max; i++)
3624 if (!match_ref(number, eptr, length, md)) break;
3625 eptr += length;
3627 while (eptr >= pp)
3629 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3630 eptr -= length;
3632 FAIL;
3635 /* Control never gets here */
3637 /* Match a character class, possibly repeatedly. Look past the end of the
3638 item to see if there is repeat information following. Then obey similar
3639 code to character type repeats - written out again for speed. If caseless
3640 matching was set at runtime but not at compile time, we have to check both
3641 versions of a character, and we have to behave differently for positive and
3642 negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
3643 treated differently. */
3645 case OP_CLASS:
3646 case OP_NEGCLASS:
3648 BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
3649 const uschar *data = ecode + 1; /* Save for matching */
3650 ecode += 33; /* Advance past the item */
3652 switch (*ecode)
3654 case OP_CRSTAR:
3655 case OP_CRMINSTAR:
3656 case OP_CRPLUS:
3657 case OP_CRMINPLUS:
3658 case OP_CRQUERY:
3659 case OP_CRMINQUERY:
3660 c = *ecode++ - OP_CRSTAR;
3661 minimize = (c & 1) != 0;
3662 min = rep_min[c]; /* Pick up values from tables; */
3663 max = rep_max[c]; /* zero for max => infinity */
3664 if (max == 0) max = INT_MAX;
3665 break;
3667 case OP_CRRANGE:
3668 case OP_CRMINRANGE:
3669 minimize = (*ecode == OP_CRMINRANGE);
3670 min = (ecode[1] << 8) + ecode[2];
3671 max = (ecode[3] << 8) + ecode[4];
3672 if (max == 0) max = INT_MAX;
3673 ecode += 5;
3674 break;
3676 default: /* No repeat follows */
3677 min = max = 1;
3678 break;
3681 /* First, ensure the minimum number of matches are present. */
3683 for (i = 1; i <= min; i++)
3685 if (eptr >= md->end_subject) FAIL;
3686 c = *eptr++;
3688 /* Either not runtime caseless, or it was a positive class. For
3689 runtime caseless, continue if either case is in the map. */
3691 if (!nasty_case)
3693 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3694 if (md->runtime_caseless)
3696 c = pcre_fcc[c];
3697 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3701 /* Runtime caseless and it was a negative class. Continue only if
3702 both cases are in the map. */
3704 else
3706 if ((data[c/8] & (1 << (c&7))) == 0) FAIL;
3707 c = pcre_fcc[c];
3708 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3711 FAIL;
3714 /* If max == min we can continue with the main loop without the
3715 need to recurse. */
3717 if (min == max) continue;
3719 /* If minimizing, keep testing the rest of the expression and advancing
3720 the pointer while it matches the class. */
3722 if (minimize)
3724 for (i = min;; i++)
3726 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3727 if (i >= max || eptr >= md->end_subject) FAIL;
3728 c = *eptr++;
3730 /* Either not runtime caseless, or it was a positive class. For
3731 runtime caseless, continue if either case is in the map. */
3733 if (!nasty_case)
3735 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3736 if (md->runtime_caseless)
3738 c = pcre_fcc[c];
3739 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3743 /* Runtime caseless and it was a negative class. Continue only if
3744 both cases are in the map. */
3746 else
3748 if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
3749 c = pcre_fcc[c];
3750 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3753 FAIL;
3755 /* Control never gets here */
3758 /* If maximizing, find the longest possible run, then work backwards. */
3760 else
3762 const uschar *pp = eptr;
3763 for (i = min; i < max; eptr++, i++)
3765 if (eptr >= md->end_subject) break;
3766 c = *eptr;
3768 /* Either not runtime caseless, or it was a positive class. For
3769 runtime caseless, continue if either case is in the map. */
3771 if (!nasty_case)
3773 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3774 if (md->runtime_caseless)
3776 c = pcre_fcc[c];
3777 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3781 /* Runtime caseless and it was a negative class. Continue only if
3782 both cases are in the map. */
3784 else
3786 if ((data[c/8] & (1 << (c&7))) == 0) break;
3787 c = pcre_fcc[c];
3788 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3791 break;
3794 while (eptr >= pp)
3795 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3796 FAIL;
3799 /* Control never gets here */
3801 /* OP_CLASS_L opcode: handles localized character classes */
3803 case OP_CLASS_L:
3805 const uschar *data = ecode + 1; /* Save for matching */
3806 const uschar locale_flag = *data;
3807 ecode++; data++; /* The localization support adds an extra byte */
3809 ecode += 33; /* Advance past the item */
3811 switch (*ecode)
3813 case OP_CRSTAR:
3814 case OP_CRMINSTAR:
3815 case OP_CRPLUS:
3816 case OP_CRMINPLUS:
3817 case OP_CRQUERY:
3818 case OP_CRMINQUERY:
3819 c = *ecode++ - OP_CRSTAR;
3820 minimize = (c & 1) != 0;
3821 min = rep_min[c]; /* Pick up values from tables; */
3822 max = rep_max[c]; /* zero for max => infinity */
3823 if (max == 0) max = INT_MAX;
3824 break;
3826 case OP_CRRANGE:
3827 case OP_CRMINRANGE:
3828 minimize = (*ecode == OP_CRMINRANGE);
3829 min = (ecode[1] << 8) + ecode[2];
3830 max = (ecode[3] << 8) + ecode[4];
3831 if (max == 0) max = INT_MAX;
3832 ecode += 5;
3833 break;
3835 default: /* No repeat follows */
3836 if (eptr >= md->end_subject) FAIL;
3837 c = *eptr++;
3838 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3839 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3840 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3841 #if 0
3842 if ( (locale_flag & 4) && isdigit(c) ) continue; /* Locale \d */
3843 if ( (locale_flag & 8) && !isdigit(c) ) continue; /* Locale \D */
3844 if ( (locale_flag & 16) && isspace(c) ) continue; /* Locale \s */
3845 if ( (locale_flag & 32) && !isspace(c) ) continue; /* Locale \S */
3846 #endif
3848 if (md->runtime_caseless)
3850 c = pcre_fcc[c];
3851 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3853 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3854 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3856 FAIL;
3859 /* First, ensure the minimum number of matches are present. */
3861 for (i = 1; i <= min; i++)
3863 if (eptr >= md->end_subject) FAIL;
3864 c = *eptr++;
3865 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3866 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3867 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3869 if (md->runtime_caseless)
3871 c = pcre_fcc[c];
3872 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3873 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3874 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3876 FAIL;
3879 /* If max == min we can continue with the main loop without the
3880 need to recurse. */
3882 if (min == max) continue;
3884 /* If minimizing, keep testing the rest of the expression and advancing
3885 the pointer while it matches the class. */
3887 if (minimize)
3889 for (i = min;; i++)
3891 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3892 if (i >= max || eptr >= md->end_subject) FAIL;
3893 c = *eptr++;
3894 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3895 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3896 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3898 if (md->runtime_caseless)
3900 c = pcre_fcc[c];
3901 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3902 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3903 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3905 FAIL;
3907 /* Control never gets here */
3910 /* If maximizing, find the longest possible run, then work backwards. */
3912 else
3914 const uschar *pp = eptr;
3915 for (i = min; i < max; eptr++, i++)
3917 if (eptr >= md->end_subject) break;
3918 c = *eptr;
3919 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3920 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3921 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3922 if (md->runtime_caseless)
3924 c = pcre_fcc[c];
3925 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3926 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3927 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3929 break;
3932 while (eptr >= pp)
3933 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3934 FAIL;
3937 /* Control never gets here */
3939 /* Match a run of characters */
3941 case OP_CHARS:
3943 register int length = ecode[1];
3944 ecode += 2;
3946 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3947 if (eptr >= md->end_subject)
3948 printf("matching subject <null> against pattern ");
3949 else
3951 printf("matching subject ");
3952 pchars(eptr, length, TRUE, md);
3953 printf(" against pattern ");
3955 pchars(ecode, length, FALSE, md);
3956 printf("\n");
3957 #endif
3959 if (length > md->end_subject - eptr) FAIL;
3960 if (md->caseless)
3962 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL;
3964 else
3966 while (length-- > 0) if (*ecode++ != *eptr++) FAIL;
3969 break;
3971 /* Match a single character repeatedly; different opcodes share code. */
3973 case OP_EXACT:
3974 min = max = (ecode[1] << 8) + ecode[2];
3975 ecode += 3;
3976 goto REPEATCHAR;
3978 case OP_UPTO:
3979 case OP_MINUPTO:
3980 min = 0;
3981 max = (ecode[1] << 8) + ecode[2];
3982 minimize = *ecode == OP_MINUPTO;
3983 ecode += 3;
3984 goto REPEATCHAR;
3986 case OP_STAR:
3987 case OP_MINSTAR:
3988 case OP_PLUS:
3989 case OP_MINPLUS:
3990 case OP_QUERY:
3991 case OP_MINQUERY:
3992 c = *ecode++ - OP_STAR;
3993 minimize = (c & 1) != 0;
3994 min = rep_min[c]; /* Pick up values from tables; */
3995 max = rep_max[c]; /* zero for max => infinity */
3996 if (max == 0) max = INT_MAX;
3998 /* Common code for all repeated single-character matches. We can give
3999 up quickly if there are fewer than the minimum number of characters left in
4000 the subject. */
4002 REPEATCHAR:
4003 if (min > md->end_subject - eptr) FAIL;
4004 c = *ecode++;
4006 /* The code is duplicated for the caseless and caseful cases, for speed,
4007 since matching characters is likely to be quite common. First, ensure the
4008 minimum number of matches are present. If min = max, continue at the same
4009 level without recursing. Otherwise, if minimizing, keep trying the rest of
4010 the expression and advancing one matching character if failing, up to the
4011 maximum. Alternatively, if maximizing, find the maximum number of
4012 characters and work backwards. */
4014 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4015 max, eptr));
4017 if (md->caseless)
4019 c = pcre_lcc[c];
4020 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
4021 if (min == max) continue;
4022 if (minimize)
4024 for (i = min;; i++)
4026 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4027 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
4028 FAIL;
4030 /* Control never gets here */
4032 else
4034 const uschar *pp = eptr;
4035 for (i = min; i < max; i++)
4037 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
4038 eptr++;
4040 while (eptr >= pp)
4041 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4042 FAIL;
4044 /* Control never gets here */
4047 /* Caseful comparisons */
4049 else
4051 for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
4052 if (min == max) continue;
4053 if (minimize)
4055 for (i = min;; i++)
4057 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4058 if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL;
4060 /* Control never gets here */
4062 else
4064 const uschar *pp = eptr;
4065 for (i = min; i < max; i++)
4067 if (eptr >= md->end_subject || c != *eptr) break;
4068 eptr++;
4070 while (eptr >= pp)
4071 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4072 FAIL;
4075 /* Control never gets here */
4077 /* Match a negated single character */
4079 case OP_NOT:
4080 if (eptr >= md->end_subject) FAIL;
4081 ecode++;
4082 if (md->caseless)
4084 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) FAIL;
4086 else
4088 if (*ecode++ == *eptr++) FAIL;
4090 break;
4092 /* Match a negated single character repeatedly. This is almost a repeat of
4093 the code for a repeated single character, but I haven't found a nice way of
4094 commoning these up that doesn't require a test of the positive/negative
4095 option for each character match. Maybe that wouldn't add very much to the
4096 time taken, but character matching *is* what this is all about... */
4098 case OP_NOTEXACT:
4099 min = max = (ecode[1] << 8) + ecode[2];
4100 ecode += 3;
4101 goto REPEATNOTCHAR;
4103 case OP_NOTUPTO:
4104 case OP_NOTMINUPTO:
4105 min = 0;
4106 max = (ecode[1] << 8) + ecode[2];
4107 minimize = *ecode == OP_NOTMINUPTO;
4108 ecode += 3;
4109 goto REPEATNOTCHAR;
4111 case OP_NOTSTAR:
4112 case OP_NOTMINSTAR:
4113 case OP_NOTPLUS:
4114 case OP_NOTMINPLUS:
4115 case OP_NOTQUERY:
4116 case OP_NOTMINQUERY:
4117 c = *ecode++ - OP_NOTSTAR;
4118 minimize = (c & 1) != 0;
4119 min = rep_min[c]; /* Pick up values from tables; */
4120 max = rep_max[c]; /* zero for max => infinity */
4121 if (max == 0) max = INT_MAX;
4123 /* Common code for all repeated single-character matches. We can give
4124 up quickly if there are fewer than the minimum number of characters left in
4125 the subject. */
4127 REPEATNOTCHAR:
4128 if (min > md->end_subject - eptr) FAIL;
4129 c = *ecode++;
4131 /* The code is duplicated for the caseless and caseful cases, for speed,
4132 since matching characters is likely to be quite common. First, ensure the
4133 minimum number of matches are present. If min = max, continue at the same
4134 level without recursing. Otherwise, if minimizing, keep trying the rest of
4135 the expression and advancing one matching character if failing, up to the
4136 maximum. Alternatively, if maximizing, find the maximum number of
4137 characters and work backwards. */
4139 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4140 max, eptr));
4142 if (md->caseless)
4144 c = pcre_lcc[c];
4145 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) FAIL;
4146 if (min == max) continue;
4147 if (minimize)
4149 for (i = min;; i++)
4151 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4152 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
4153 FAIL;
4155 /* Control never gets here */
4157 else
4159 const uschar *pp = eptr;
4160 for (i = min; i < max; i++)
4162 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
4163 eptr++;
4165 while (eptr >= pp)
4166 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4167 FAIL;
4169 /* Control never gets here */
4172 /* Caseful comparisons */
4174 else
4176 for (i = 1; i <= min; i++) if (c == *eptr++) FAIL;
4177 if (min == max) continue;
4178 if (minimize)
4180 for (i = min;; i++)
4182 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4183 if (i >= max || eptr >= md->end_subject || c == *eptr++) FAIL;
4185 /* Control never gets here */
4187 else
4189 const uschar *pp = eptr;
4190 for (i = min; i < max; i++)
4192 if (eptr >= md->end_subject || c == *eptr) break;
4193 eptr++;
4195 while (eptr >= pp)
4196 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4197 FAIL;
4200 /* Control never gets here */
4202 /* Match a single character type repeatedly; several different opcodes
4203 share code. This is very similar to the code for single characters, but we
4204 repeat it in the interests of efficiency. */
4206 case OP_TYPEEXACT:
4207 min = max = (ecode[1] << 8) + ecode[2];
4208 minimize = TRUE;
4209 ecode += 3;
4210 goto REPEATTYPE;
4212 case OP_TYPEUPTO:
4213 case OP_TYPEMINUPTO:
4214 min = 0;
4215 max = (ecode[1] << 8) + ecode[2];
4216 minimize = *ecode == OP_TYPEMINUPTO;
4217 ecode += 3;
4218 goto REPEATTYPE;
4220 case OP_TYPESTAR:
4221 case OP_TYPEMINSTAR:
4222 case OP_TYPEPLUS:
4223 case OP_TYPEMINPLUS:
4224 case OP_TYPEQUERY:
4225 case OP_TYPEMINQUERY:
4226 c = *ecode++ - OP_TYPESTAR;
4227 minimize = (c & 1) != 0;
4228 min = rep_min[c]; /* Pick up values from tables; */
4229 max = rep_max[c]; /* zero for max => infinity */
4230 if (max == 0) max = INT_MAX;
4232 /* Common code for all repeated single character type matches */
4234 REPEATTYPE:
4235 ctype = *ecode++; /* Code for the character type */
4237 /* First, ensure the minimum number of matches are present. Use inline
4238 code for maximizing the speed, and do the type test once at the start
4239 (i.e. keep it out of the loop). Also test that there are at least the
4240 minimum number of characters before we start. */
4242 if (min > md->end_subject - eptr) FAIL;
4243 if (min > 0) switch(ctype)
4245 case OP_ANY:
4246 if (!md->dotall)
4247 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
4248 else eptr += min;
4249 break;
4251 case OP_NOT_DIGIT:
4252 for (i = 1; i <= min; i++)
4253 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
4254 break;
4256 case OP_DIGIT:
4257 for (i = 1; i <= min; i++)
4258 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
4259 break;
4261 case OP_NOT_WHITESPACE:
4262 for (i = 1; i <= min; i++)
4263 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
4264 break;
4266 case OP_WHITESPACE:
4267 for (i = 1; i <= min; i++)
4268 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
4269 break;
4271 case OP_NOT_WORDCHAR:
4272 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
4273 FAIL;
4274 break;
4276 case OP_WORDCHAR:
4277 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
4278 FAIL;
4279 break;
4281 case OP_NOT_WORDCHAR_L:
4282 for (i = 1; i <= min; i++, eptr++) if (*eptr=='_' || isalnum(*eptr))
4283 FAIL;
4284 break;
4286 case OP_WORDCHAR_L:
4287 for (i = 1; i <= min; i++, eptr++) if (*eptr!='_' && !isalnum(*eptr))
4288 FAIL;
4289 break;
4292 /* If min = max, continue at the same level without recursing */
4294 if (min == max) continue;
4296 /* If minimizing, we have to test the rest of the pattern before each
4297 subsequent match, so inlining isn't much help; just use the function. */
4299 if (minimize)
4301 for (i = min;; i++)
4303 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4304 if (i >= max || eptr >= md->end_subject ||
4305 !match_type(ctype, *eptr++, md->dotall))
4306 FAIL;
4308 /* Control never gets here */
4311 /* If maximizing it is worth using inline code for speed, doing the type
4312 test once at the start (i.e. keep it out of the loop). */
4314 else
4316 const uschar *pp = eptr;
4317 switch(ctype)
4319 case OP_ANY:
4320 if (!md->dotall)
4322 for (i = min; i < max; i++)
4324 if (eptr >= md->end_subject || *eptr == '\n') break;
4325 eptr++;
4328 else
4330 c = max - min;
4331 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4332 eptr += c;
4334 break;
4336 case OP_NOT_DIGIT:
4337 for (i = min; i < max; i++)
4339 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
4340 break;
4341 eptr++;
4343 break;
4345 case OP_DIGIT:
4346 for (i = min; i < max; i++)
4348 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
4349 break;
4350 eptr++;
4352 break;
4354 case OP_NOT_WHITESPACE:
4355 for (i = min; i < max; i++)
4357 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
4358 break;
4359 eptr++;
4361 break;
4363 case OP_WHITESPACE:
4364 for (i = min; i < max; i++)
4366 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
4367 break;
4368 eptr++;
4370 break;
4372 case OP_NOT_WORDCHAR:
4373 for (i = min; i < max; i++)
4375 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
4376 break;
4377 eptr++;
4379 break;
4381 case OP_WORDCHAR:
4382 for (i = min; i < max; i++)
4384 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
4385 break;
4386 eptr++;
4388 break;
4389 case OP_NOT_WORDCHAR_L:
4390 for (i = min; i < max; i++)
4392 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ) )
4393 break;
4394 eptr++;
4396 break;
4398 case OP_WORDCHAR_L:
4399 for (i = min; i < max; i++)
4401 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ) )
4402 break;
4403 eptr++;
4405 break;
4408 while (eptr >= pp)
4409 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4410 FAIL;
4412 /* Control never gets here */
4414 /* There's been some horrible disaster. */
4416 default:
4417 DPRINTF(("Unknown opcode %d\n", *ecode));
4418 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4419 FAIL;
4422 /* Do not stick any code in here without much thought; it is assumed
4423 that "continue" in the code above comes out to here to repeat the main
4424 loop. */
4426 } /* End of main loop */
4427 /* Control never reaches here */
4429 fail:
4430 if (md->point > save_stack_position)
4432 /* If there are still points remaining on the stack, pop the next one off */
4433 int off_num;
4435 md->point--;
4436 offset_top = md->offset_top[md->point];
4437 eptr = md->eptr[md->point];
4438 ecode = md->ecode[md->point];
4439 off_num = md->off_num[md->point];
4440 md->offset_vector[off_num] = md->r1[md->point];
4441 md->offset_vector[off_num+1] = md->r2[md->point];
4442 goto match_loop;
4444 /* Failure, and nothing left on the stack, so end this function call */
4446 /* Restore the top of the stack to where it was before this function
4447 call. This lets us use one stack for everything; recursive calls
4448 can push and pop information, and may increase the stack. When
4449 the call returns, the parent function can resume pushing and
4450 popping wherever it was. */
4452 md->point = save_stack_position;
4453 return FALSE;
4455 succeed:
4456 return TRUE;
4461 /*************************************************
4462 * Segregate setjmp() *
4463 *************************************************/
4465 /* The -Wall option of gcc gives warnings for all local variables when setjmp()
4466 is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
4467 hide it in a separate function. This is called only when PCRE_EXTRA is set,
4468 since it's needed only for the extension \X option, and with any luck, a good
4469 compiler will spot the tail recursion and compile it efficiently.
4471 Arguments:
4472 eptr pointer in subject
4473 ecode position in code
4474 offset_top current top pointer
4475 md pointer to "static" info for the match
4477 Returns: TRUE if matched
4480 static BOOL
4481 match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
4482 match_data *match_block)
4484 return setjmp(match_block->fail_env) == 0 &&
4485 match(eptr, ecode, offset_top, match_block);
4490 /*************************************************
4491 * Execute a Regular Expression *
4492 *************************************************/
4494 /* This function applies a compiled re to a subject string and picks out
4495 portions of the string if it matches. Two elements in the vector are set for
4496 each substring: the offsets to the start and end of the substring.
4498 Arguments:
4499 external_re points to the compiled expression
4500 external_extra points to "hints" from pcre_study() or is NULL
4501 subject points to the subject string
4502 length length of subject string (may contain binary zeros)
4503 options option bits
4504 offsets points to a vector of ints to be filled in with offsets
4505 offsetcount the number of elements in the vector
4507 Returns: > 0 => success; value is the number of elements filled in
4508 = 0 => success, but offsets is not big enough
4509 -1 => failed to match
4510 < -1 => some kind of unexpected problem
4514 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4515 const char *subject, int length, int start_pos, int options,
4516 int *offsets, int offsetcount)
4518 /* The "volatile" directives are to make gcc -Wall stop complaining
4519 that these variables can be clobbered by the longjmp. Hopefully
4520 they won't cost too much performance. */
4521 volatile int resetcount, ocount;
4522 volatile int first_char = -1;
4523 match_data match_block;
4524 const uschar *start_bits = NULL;
4525 const uschar *start_match = (const uschar *)subject + start_pos;
4526 const uschar *end_subject;
4527 const real_pcre *re = (const real_pcre *)external_re;
4528 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4529 volatile BOOL using_temporary_offsets = FALSE;
4530 volatile BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4531 volatile BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4533 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4535 if (re == NULL || subject == NULL ||
4536 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4537 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4539 match_block.start_subject = (const uschar *)subject;
4540 match_block.end_subject = match_block.start_subject + length;
4541 end_subject = match_block.end_subject;
4543 match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
4544 match_block.runtime_caseless = match_block.caseless &&
4545 (re->options & PCRE_CASELESS) == 0;
4547 match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
4548 match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
4549 match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
4551 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4552 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4554 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4556 /* Set the stack state to empty */
4557 match_block.off_num = match_block.offset_top = NULL;
4558 match_block.r1 = match_block.r2 = NULL;
4559 match_block.eptr = match_block.ecode = NULL;
4560 match_block.point = match_block.length = 0;
4562 /* If the expression has got more back references than the offsets supplied can
4563 hold, we get a temporary bit of working store to use during the matching.
4564 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4565 of 2. */
4567 ocount = offsetcount & (-2);
4568 if (re->top_backref > 0 && re->top_backref >= ocount/2)
4570 ocount = re->top_backref * 2 + 2;
4571 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4572 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4573 using_temporary_offsets = TRUE;
4574 DPRINTF(("Got memory to hold back references\n"));
4576 else match_block.offset_vector = offsets;
4578 match_block.offset_end = ocount;
4579 match_block.offset_overflow = FALSE;
4581 /* Compute the minimum number of offsets that we need to reset each time. Doing
4582 this makes a huge difference to execution time when there aren't many brackets
4583 in the pattern. */
4585 resetcount = 2 + re->top_bracket * 2;
4586 if (resetcount > offsetcount) resetcount = ocount;
4588 /* If MULTILINE is set at exec time but was not set at compile time, and the
4589 anchored flag is set, we must re-check because a setting provoked by ^ in the
4590 pattern is not right in multi-line mode. Calling is_anchored() again here does
4591 the right check, because multiline is now set. If it now yields FALSE, the
4592 expression must have had ^ starting some of its branches. Check to see if
4593 that is true for *all* branches, and if so, set the startline flag. */
4595 if (match_block.multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
4596 !is_anchored(re->code, match_block.multiline))
4598 anchored = FALSE;
4599 if (is_startline(re->code)) startline = TRUE;
4602 /* Set up the first character to match, if available. The first_char value is
4603 never set for an anchored regular expression, but the anchoring may be forced
4604 at run time, so we have to test for anchoring. The first char may be unset for
4605 an unanchored pattern, of course. If there's no first char and the pattern was
4606 studied, the may be a bitmap of possible first characters. However, we can
4607 use this only if the caseless state of the studying was correct. */
4609 if (!anchored)
4611 if ((re->options & PCRE_FIRSTSET) != 0)
4613 first_char = re->first_char;
4614 if (match_block.caseless) first_char = pcre_lcc[first_char];
4616 else
4617 if (!startline && extra != NULL &&
4618 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
4619 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
4620 start_bits = extra->start_bits;
4623 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4627 int rc;
4628 register int *iptr = match_block.offset_vector;
4629 register int *iend = iptr + resetcount;
4631 /* Reset the maximum number of extractions we might see. */
4633 while (iptr < iend) *iptr++ = -1;
4635 /* Advance to a unique first char if possible */
4637 if (first_char >= 0)
4639 if (match_block.caseless)
4640 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4641 start_match++;
4642 else
4643 while (start_match < end_subject && *start_match != first_char)
4644 start_match++;
4647 /* Or to just after \n for a multiline match if possible */
4649 else if (startline)
4651 if (start_match > match_block.start_subject)
4653 while (start_match < end_subject && start_match[-1] != '\n')
4654 start_match++;
4658 /* Or to a non-unique first char */
4660 else if (start_bits != NULL)
4662 while (start_match < end_subject)
4664 register int c = *start_match;
4665 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4669 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4670 printf(">>>> Match against: ");
4671 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4672 printf("\n");
4673 #endif
4675 /* When a match occurs, substrings will be set for all internal extractions;
4676 we just need to set up the whole thing as substring 0 before returning. If
4677 there were too many extractions, set the return code to zero. In the case
4678 where we had to get some local store to hold offsets for backreferences, copy
4679 those back references that we can. In this case there need not be overflow
4680 if certain parts of the pattern were not used.
4682 Before starting the match, we have to set up a longjmp() target to enable
4683 the "cut" operation to fail a match completely without backtracking. This
4684 is done in a separate function to avoid compiler warnings. We need not do
4685 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
4686 enabled. */
4688 /* To handle errors such as running out of memory for the failure
4689 stack, we need to save this location via setjmp(), so
4690 error-handling code can call longjmp() to jump out of deeply-nested code. */
4691 if (setjmp(match_block.error_env)==0)
4694 if ((re->options & PCRE_EXTRA) != 0)
4696 if (!match_with_setjmp(start_match, re->code, 2, &match_block))
4697 continue;
4699 else if (!match(start_match, re->code, 2, &match_block)) continue;
4701 /* Copy the offset information from temporary store if necessary */
4703 if (using_temporary_offsets)
4705 if (offsetcount >= 4)
4707 memcpy(offsets + 2, match_block.offset_vector + 2,
4708 (offsetcount - 2) * sizeof(int));
4709 DPRINTF(("Copied offsets from temporary memory\n"));
4711 if (match_block.end_offset_top > offsetcount)
4712 match_block.offset_overflow = TRUE;
4714 DPRINTF(("Freeing temporary memory\n"));
4715 (pcre_free)(match_block.offset_vector);
4718 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4720 if (match_block.offset_end < 2) rc = 0; else
4722 offsets[0] = start_match - match_block.start_subject;
4723 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4726 DPRINTF((">>>> returning %d\n", rc));
4727 free_stack(&match_block);
4728 return rc;
4729 } /* End of (if setjmp(match_block.error_env)...) */
4730 free_stack(&match_block);
4732 /* Return an error code; pcremodule.c will preserve the exception */
4733 if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY;
4735 while (!anchored &&
4736 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4737 start_match++ < end_subject);
4739 if (using_temporary_offsets)
4741 DPRINTF(("Freeing temporary memory\n"));
4742 (pcre_free)(match_block.offset_vector);
4745 #ifdef DEBUG
4746 printf(">>>> returning %d\n", match_block.errorcode);
4747 #endif
4749 free_stack(&match_block);
4750 return match_block.errorcode;
4753 /* End of pcre.c */