Improved some error messages for command line processing.
[python/dscho.git] / Modules / pypcre.c
blob96baa89c03627facf62811d8713b99556829f668
2 /*************************************************
3 * Perl-Compatible Regular Expressions *
4 *************************************************/
6 /* DO NOT EDIT THIS FILE! */
8 /* This file is automatically written by the merge-files.py script
9 included with the PCRE distribution for Python; it's produced from
10 several C files, and code is removed in the process. If you want to
11 modify the code or track down bugs, it will be much easier to work
12 with the code in its original, multiple-file form. Don't edit this
13 file by hand, or submit patches to it.
15 The Python-specific PCRE distribution can be retrieved from
16 http://starship.skyport.net/crew/amk/regex/
18 The unmodified original PCRE distribution is available at
19 ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
20 written by: Philip Hazel <ph10@cam.ac.uk>
22 Extensively modified by the Python String-SIG: <string-sig@python.org>
23 Send bug reports to: <string-sig@python.org>
24 (They'll figure out if it's a bug in PCRE or in the Python-specific
25 changes.)
27 Copyright (c) 1997 University of Cambridge
29 -----------------------------------------------------------------------------
30 Permission is granted to anyone to use this software for any purpose on any
31 computer system, and to redistribute it freely, subject to the following
32 restrictions:
34 1. This software is distributed in the hope that it will be useful,
35 but WITHOUT ANY WARRANTY; without even the implied warranty of
36 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
38 2. The origin of this software must not be misrepresented, either by
39 explicit claim or by omission.
41 3. Altered versions must be plainly marked as such, and must not be
42 misrepresented as being the original software.
43 -----------------------------------------------------------------------------
47 #define FOR_PYTHON
48 #include "pcre-int.h"
49 #include "Python.h"
50 #include "mymalloc.h"
51 #include <ctype.h>
52 #include "graminit.h"
54 /*************************************************
55 * Perl-Compatible Regular Expressions *
56 *************************************************/
58 /* This file is automatically written by the makechartables auxiliary
59 program. If you edit it by hand, you might like to edit the Makefile to
60 prevent its ever being regenerated. */
62 /* This table is a lower casing table. */
64 unsigned char pcre_lcc[] = {
65 0, 1, 2, 3, 4, 5, 6, 7,
66 8, 9, 10, 11, 12, 13, 14, 15,
67 16, 17, 18, 19, 20, 21, 22, 23,
68 24, 25, 26, 27, 28, 29, 30, 31,
69 32, 33, 34, 35, 36, 37, 38, 39,
70 40, 41, 42, 43, 44, 45, 46, 47,
71 48, 49, 50, 51, 52, 53, 54, 55,
72 56, 57, 58, 59, 60, 61, 62, 63,
73 64, 97, 98, 99,100,101,102,103,
74 104,105,106,107,108,109,110,111,
75 112,113,114,115,116,117,118,119,
76 120,121,122, 91, 92, 93, 94, 95,
77 96, 97, 98, 99,100,101,102,103,
78 104,105,106,107,108,109,110,111,
79 112,113,114,115,116,117,118,119,
80 120,121,122,123,124,125,126,127,
81 128,129,130,131,132,133,134,135,
82 136,137,138,139,140,141,142,143,
83 144,145,146,147,148,149,150,151,
84 152,153,154,155,156,157,158,159,
85 160,161,162,163,164,165,166,167,
86 168,169,170,171,172,173,174,175,
87 176,177,178,179,180,181,182,183,
88 184,185,186,187,188,189,190,191,
89 192,193,194,195,196,197,198,199,
90 200,201,202,203,204,205,206,207,
91 208,209,210,211,212,213,214,215,
92 216,217,218,219,220,221,222,223,
93 224,225,226,227,228,229,230,231,
94 232,233,234,235,236,237,238,239,
95 240,241,242,243,244,245,246,247,
96 248,249,250,251,252,253,254,255 };
98 /* This table is a case flipping table. */
100 unsigned char pcre_fcc[] = {
101 0, 1, 2, 3, 4, 5, 6, 7,
102 8, 9, 10, 11, 12, 13, 14, 15,
103 16, 17, 18, 19, 20, 21, 22, 23,
104 24, 25, 26, 27, 28, 29, 30, 31,
105 32, 33, 34, 35, 36, 37, 38, 39,
106 40, 41, 42, 43, 44, 45, 46, 47,
107 48, 49, 50, 51, 52, 53, 54, 55,
108 56, 57, 58, 59, 60, 61, 62, 63,
109 64, 97, 98, 99,100,101,102,103,
110 104,105,106,107,108,109,110,111,
111 112,113,114,115,116,117,118,119,
112 120,121,122, 91, 92, 93, 94, 95,
113 96, 65, 66, 67, 68, 69, 70, 71,
114 72, 73, 74, 75, 76, 77, 78, 79,
115 80, 81, 82, 83, 84, 85, 86, 87,
116 88, 89, 90,123,124,125,126,127,
117 128,129,130,131,132,133,134,135,
118 136,137,138,139,140,141,142,143,
119 144,145,146,147,148,149,150,151,
120 152,153,154,155,156,157,158,159,
121 160,161,162,163,164,165,166,167,
122 168,169,170,171,172,173,174,175,
123 176,177,178,179,180,181,182,183,
124 184,185,186,187,188,189,190,191,
125 192,193,194,195,196,197,198,199,
126 200,201,202,203,204,205,206,207,
127 208,209,210,211,212,213,214,215,
128 216,217,218,219,220,221,222,223,
129 224,225,226,227,228,229,230,231,
130 232,233,234,235,236,237,238,239,
131 240,241,242,243,244,245,246,247,
132 248,249,250,251,252,253,254,255 };
134 /* This table contains bit maps for digits, letters, 'word' chars, and
135 white space. Each map is 32 bytes long and the bits run from the least
136 significant end of each byte. */
138 unsigned char pcre_cbits[] = {
139 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
140 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
141 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
142 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
144 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
145 0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
146 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
147 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
149 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
150 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
151 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
152 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
154 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
155 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
156 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
157 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
159 /* This table identifies various classes of character by individual bits:
160 0x01 white space character
161 0x02 letter
162 0x04 decimal digit
163 0x08 hexadecimal digit
164 0x10 alphanumeric or '_'
165 0x80 regular expression metacharacter or binary zero
168 unsigned char pcre_ctypes[] = {
169 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
170 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */
171 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
172 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
173 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
174 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
175 0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /* 0 - 7 */
176 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
177 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* @ - G */
178 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */
179 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */
180 0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /* X - _ */
181 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* ` - g */
182 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* h - o */
183 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* p - w */
184 0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /* x -127 */
185 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
186 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
187 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
188 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
189 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
190 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
191 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
192 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
193 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
194 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
195 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
196 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
197 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
198 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
199 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
200 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
202 /* End of chartables.c */
203 /*************************************************
204 * Perl-Compatible Regular Expressions *
205 *************************************************/
208 This is a library of functions to support regular expressions whose syntax
209 and semantics are as close as possible to those of the Perl 5 language. See
210 the file Tech.Notes for some information on the internals.
212 Written by: Philip Hazel <ph10@cam.ac.uk>
214 Copyright (c) 1998 University of Cambridge
216 -----------------------------------------------------------------------------
217 Permission is granted to anyone to use this software for any purpose on any
218 computer system, and to redistribute it freely, subject to the following
219 restrictions:
221 1. This software is distributed in the hope that it will be useful,
222 but WITHOUT ANY WARRANTY; without even the implied warranty of
223 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
225 2. The origin of this software must not be misrepresented, either by
226 explicit claim or by omission.
228 3. Altered versions must be plainly marked as such, and must not be
229 misrepresented as being the original software.
230 -----------------------------------------------------------------------------
234 /* Include the internals header, which itself includes Standard C headers plus
235 the external pcre header. */
240 /*************************************************
241 * Create bitmap of starting chars *
242 *************************************************/
244 /* This function scans a compiled unanchored expression and attempts to build a
245 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
246 goes by, we may be able to get more clever at doing this.
248 Arguments:
249 code points to an expression
250 start_bits points to a 32-byte table, initialized to 0
252 Returns: TRUE if table built, FALSE otherwise
255 static BOOL
256 set_start_bits(const uschar *code, uschar *start_bits)
258 register int c;
262 const uschar *tcode = code + 3;
263 BOOL try_next = TRUE;
265 while (try_next)
267 try_next = FALSE;
269 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
271 if (!set_start_bits(tcode, start_bits)) return FALSE;
274 else switch(*tcode)
276 default:
277 return FALSE;
279 /* BRAZERO does the bracket, but carries on. */
281 case OP_BRAZERO:
282 case OP_BRAMINZERO:
283 if (!set_start_bits(++tcode, start_bits)) return FALSE;
284 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
285 tcode += 3;
286 try_next = TRUE;
287 break;
289 /* Single-char * or ? sets the bit and tries the next item */
291 case OP_STAR:
292 case OP_MINSTAR:
293 case OP_QUERY:
294 case OP_MINQUERY:
295 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
296 tcode += 2;
297 try_next = TRUE;
298 break;
300 /* Single-char upto sets the bit and tries the next */
302 case OP_UPTO:
303 case OP_MINUPTO:
304 start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));
305 tcode += 4;
306 try_next = TRUE;
307 break;
309 /* At least one single char sets the bit and stops */
311 case OP_EXACT: /* Fall through */
312 tcode++;
314 case OP_CHARS: /* Fall through */
315 tcode++;
317 case OP_PLUS:
318 case OP_MINPLUS:
319 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
320 break;
322 /* Single character type sets the bits and stops */
324 case OP_NOT_DIGIT:
325 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
326 break;
328 case OP_DIGIT:
329 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
330 break;
332 case OP_NOT_WHITESPACE:
333 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
334 break;
336 case OP_WHITESPACE:
337 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
338 break;
340 case OP_NOT_WORDCHAR:
341 for (c = 0; c < 32; c++)
342 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
343 break;
345 case OP_WORDCHAR:
346 for (c = 0; c < 32; c++)
347 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
348 break;
350 /* One or more character type fudges the pointer and restarts, knowing
351 it will hit a single character type and stop there. */
353 case OP_TYPEPLUS:
354 case OP_TYPEMINPLUS:
355 tcode++;
356 try_next = TRUE;
357 break;
359 case OP_TYPEEXACT:
360 tcode += 3;
361 try_next = TRUE;
362 break;
364 /* Zero or more repeats of character types set the bits and then
365 try again. */
367 case OP_TYPEUPTO:
368 case OP_TYPEMINUPTO:
369 tcode += 2; /* Fall through */
371 case OP_TYPESTAR:
372 case OP_TYPEMINSTAR:
373 case OP_TYPEQUERY:
374 case OP_TYPEMINQUERY:
375 switch(tcode[1])
377 case OP_NOT_DIGIT:
378 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
379 break;
381 case OP_DIGIT:
382 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
383 break;
385 case OP_NOT_WHITESPACE:
386 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
387 break;
389 case OP_WHITESPACE:
390 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
391 break;
393 case OP_NOT_WORDCHAR:
394 for (c = 0; c < 32; c++)
395 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
396 break;
398 case OP_WORDCHAR:
399 for (c = 0; c < 32; c++)
400 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
401 break;
404 tcode += 2;
405 try_next = TRUE;
406 break;
408 /* Character class: set the bits and either carry on or not,
409 according to the repeat count. */
411 case OP_CLASS:
412 case OP_NEGCLASS:
414 tcode++;
415 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
416 tcode += 32;
417 switch (*tcode)
419 case OP_CRSTAR:
420 case OP_CRMINSTAR:
421 case OP_CRQUERY:
422 case OP_CRMINQUERY:
423 tcode++;
424 try_next = TRUE;
425 break;
427 case OP_CRRANGE:
428 case OP_CRMINRANGE:
429 if (((tcode[1] << 8) + tcode[2]) == 0)
431 tcode += 5;
432 try_next = TRUE;
434 break;
437 break; /* End of class handling */
439 } /* End of switch */
440 } /* End of try_next loop */
442 code += (code[1] << 8) + code[2]; /* Advance to next branch */
444 while (*code == OP_ALT);
445 return TRUE;
450 /*************************************************
451 * Study a compiled expression *
452 *************************************************/
454 /* This function is handed a compiled expression that it must study to produce
455 information that will speed up the matching. It returns a pcre_extra block
456 which then gets handed back to pcre_exec().
458 Arguments:
459 re points to the compiled expression
460 options contains option bits
461 errorptr points to where to place error messages;
462 set NULL unless error
464 Returns: pointer to a pcre_extra block,
465 NULL on error or if no optimization possible
468 pcre_extra *
469 pcre_study(const pcre *external_re, int options, const char **errorptr)
471 BOOL caseless;
472 uschar start_bits[32];
473 real_pcre_extra *extra;
474 const real_pcre *re = (const real_pcre *)external_re;
476 *errorptr = NULL;
478 if (re == NULL || re->magic_number != MAGIC_NUMBER)
480 *errorptr = "argument is not a compiled regular expression";
481 return NULL;
484 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
486 *errorptr = "unknown or incorrect option bit(s) set";
487 return NULL;
490 /* Caseless can either be from the compiled regex or from options. */
492 caseless = ((re->options | options) & PCRE_CASELESS) != 0;
494 /* For an anchored pattern, or an unchored pattern that has a first char, or a
495 multiline pattern that matches only at "line starts", no further processing at
496 present. */
498 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
499 return NULL;
501 /* See if we can find a fixed set of initial characters for the pattern. */
503 memset(start_bits, 0, 32 * sizeof(uschar));
504 if (!set_start_bits(re->code, start_bits)) return NULL;
506 /* If this studying is caseless, scan the created bit map and duplicate the
507 bits for any letters. */
509 if (caseless)
511 register int c;
512 for (c = 0; c < 256; c++)
514 if ((start_bits[c/8] & (1 << (c&7))) != 0 &&
515 (pcre_ctypes[c] & ctype_letter) != 0)
517 int d = pcre_fcc[c];
518 start_bits[d/8] |= (1 << (d&7));
523 /* Get an "extra" block and put the information therein. */
525 extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
527 if (extra == NULL)
529 *errorptr = "failed to get memory";
530 return NULL;
533 extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
534 memcpy(extra->start_bits, start_bits, sizeof(start_bits));
536 return (pcre_extra *)extra;
539 /* End of study.c */
540 /*************************************************
541 * Perl-Compatible Regular Expressions *
542 *************************************************/
545 This is a library of functions to support regular expressions whose syntax
546 and semantics are as close as possible to those of the Perl 5 language. See
547 the file Tech.Notes for some information on the internals.
549 Written by: Philip Hazel <ph10@cam.ac.uk>
551 Copyright (c) 1998 University of Cambridge
553 -----------------------------------------------------------------------------
554 Permission is granted to anyone to use this software for any purpose on any
555 computer system, and to redistribute it freely, subject to the following
556 restrictions:
558 1. This software is distributed in the hope that it will be useful,
559 but WITHOUT ANY WARRANTY; without even the implied warranty of
560 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
562 2. The origin of this software must not be misrepresented, either by
563 explicit claim or by omission.
565 3. Altered versions must be plainly marked as such, and must not be
566 misrepresented as being the original software.
567 -----------------------------------------------------------------------------
571 /* Define DEBUG to get debugging output on stdout. */
573 /* #define DEBUG */
575 /* Use a macro for debugging printing, 'cause that eliminates the the use
576 of #ifdef inline, and there are *still* stupid compilers about that don't like
577 indented pre-processor statements. I suppose it's only been 10 years... */
579 #ifdef DEBUG
580 #define DPRINTF(p) printf p
581 #else
582 #define DPRINTF(p) /*nothing*/
583 #endif
585 /* Include the internals header, which itself includes Standard C headers plus
586 the external pcre header. */
591 #ifndef Py_eval_input
592 /* For Python 1.4, graminit.h has to be explicitly included */
593 #define Py_eval_input eval_input
595 #endif /* FOR_PYTHON */
597 /* Allow compilation as C++ source code, should anybody want to do that. */
599 #ifdef __cplusplus
600 #define class pcre_class
601 #endif
604 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
606 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
607 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
609 /* Text forms of OP_ values and things, for debugging (not all used) */
611 #ifdef DEBUG
612 static const char *OP_names[] = {
613 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
614 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
615 "localized \\B", "localized \\b", "localized \\W", "localized \\w",
616 "^", "$", "Any", "chars",
617 "not",
618 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
619 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
620 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
621 "*", "*?", "+", "+?", "?", "??", "{", "{",
622 "class", "negclass", "classL", "Ref",
623 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
624 "Brazero", "Braminzero", "Bra"
626 #endif
628 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
629 are simple data values; negative values are for special things like \d and so
630 on. Zero means further processing is needed (for things like \x), or the escape
631 is invalid. */
633 static const short int escapes[] = {
634 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
635 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
636 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
637 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
638 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
639 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
640 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
641 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
642 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
643 0, 0, 0 /* x - z */
646 /* Definition to allow mutual recursion */
648 static BOOL
649 compile_regex(int, int *, uschar **, const uschar **, const char **,
650 PyObject *);
652 /* Structure for passing "static" information around between the functions
653 doing the matching, so that they are thread-safe. */
655 typedef struct match_data {
656 int errorcode; /* As it says */
657 int *offset_vector; /* Offset vector */
658 int offset_end; /* One past the end */
659 BOOL offset_overflow; /* Set if too many extractions */
660 BOOL caseless; /* Case-independent flag */
661 BOOL runtime_caseless; /* Caseless forced at run time */
662 BOOL multiline; /* Multiline flag */
663 BOOL notbol; /* NOTBOL flag */
664 BOOL noteol; /* NOTEOL flag */
665 BOOL dotall; /* Dot matches any char */
666 BOOL endonly; /* Dollar not before final \n */
667 const uschar *start_subject; /* Start of the subject string */
668 const uschar *end_subject; /* End of the subject string */
669 jmp_buf fail_env; /* Environment for longjump() break out */
670 const uschar *end_match_ptr; /* Subject position at end match */
671 int end_offset_top; /* Highwater mark at end of match */
672 jmp_buf error_env; /* For longjmp() if an error occurs deep inside a
673 matching operation */
674 int length; /* Length of the allocated stacks */
675 int point; /* Point to add next item pushed onto stacks */
676 /* Pointers to the 6 stacks */
677 int *off_num, *offset_top, *r1, *r2;
678 const uschar **eptr, **ecode;
679 } match_data;
683 /*************************************************
684 * Global variables *
685 *************************************************/
687 /* PCRE is thread-clean and doesn't use any global variables in the normal
688 sense. However, it calls memory allocation and free functions via the two
689 indirections below, which are can be changed by the caller, but are shared
690 between all threads. */
692 void *(*pcre_malloc)(size_t) = malloc;
693 void (*pcre_free)(void *) = free;
698 /*************************************************
699 * Return version string *
700 *************************************************/
702 const char *
703 pcre_version(void)
705 return PCRE_VERSION;
711 /*************************************************
712 * Return info about a compiled pattern *
713 *************************************************/
715 /* This function picks potentially useful data out of the private
716 structure.
718 Arguments:
719 external_re points to compiled code
720 optptr where to pass back the options
721 first_char where to pass back the first character,
722 or -1 if multiline and all branches start ^,
723 or -2 otherwise
725 Returns: number of identifying extraction brackets
726 or negative values on error
730 pcre_info(const pcre *external_re, int *optptr, int *first_char)
732 const real_pcre *re = (real_pcre *)external_re;
733 if (re == NULL) return PCRE_ERROR_NULL;
734 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
735 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
736 if (first_char != NULL)
737 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
738 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
739 return re->top_bracket;
745 #ifdef DEBUG
746 /*************************************************
747 * Debugging function to print chars *
748 *************************************************/
750 /* Print a sequence of chars in printable format, stopping at the end of the
751 subject if the requested.
753 Arguments:
754 p points to characters
755 length number to print
756 is_subject TRUE if printing from within md->start_subject
757 md pointer to matching data block, if is_subject is TRUE
759 Returns: nothing
762 static void
763 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
765 int c;
766 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
767 while (length-- > 0)
768 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
770 #endif
775 /*************************************************
776 * Check subpattern for empty operand *
777 *************************************************/
779 /* This function checks a bracketed subpattern to see if any of the paths
780 through it could match an empty string. This is used to diagnose an error if
781 such a subpattern is followed by a quantifier with an unlimited upper bound.
783 Argument:
784 code points to the opening bracket
786 Returns: TRUE or FALSE
789 static BOOL
790 could_be_empty(uschar *code)
792 do {
793 uschar *cc = code + 3;
795 /* Scan along the opcodes for this branch; as soon as we find something
796 that matches a non-empty string, break out and advance to test the next
797 branch. If we get to the end of the branch, return TRUE for the whole
798 sub-expression. */
800 for (;;)
802 /* Test an embedded subpattern; if it could not be empty, break the
803 loop. Otherwise carry on in the branch. */
805 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
807 if (!could_be_empty(cc)) break;
808 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
809 cc += 3;
812 else switch (*cc)
814 /* Reached end of a branch: the subpattern may match the empty string */
816 case OP_ALT:
817 case OP_KET:
818 case OP_KETRMAX:
819 case OP_KETRMIN:
820 return TRUE;
822 /* Skip over entire bracket groups with zero lower bound */
824 case OP_BRAZERO:
825 case OP_BRAMINZERO:
826 cc++;
827 /* Fall through */
829 /* Skip over assertive subpatterns */
831 case OP_ASSERT:
832 case OP_ASSERT_NOT:
833 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
834 cc += 3;
835 break;
837 /* Skip over things that don't match chars */
839 case OP_SOD:
840 case OP_EOD:
841 case OP_CIRC:
842 case OP_DOLL:
843 case OP_NOT_WORD_BOUNDARY:
844 case OP_WORD_BOUNDARY:
845 case OP_NOT_WORD_BOUNDARY_L:
846 case OP_WORD_BOUNDARY_L:
847 cc++;
848 break;
850 /* Skip over simple repeats with zero lower bound */
852 case OP_STAR:
853 case OP_MINSTAR:
854 case OP_QUERY:
855 case OP_MINQUERY:
856 case OP_NOTSTAR:
857 case OP_NOTMINSTAR:
858 case OP_NOTQUERY:
859 case OP_NOTMINQUERY:
860 case OP_TYPESTAR:
861 case OP_TYPEMINSTAR:
862 case OP_TYPEQUERY:
863 case OP_TYPEMINQUERY:
864 cc += 2;
865 break;
867 /* Skip over UPTOs (lower bound is zero) */
869 case OP_UPTO:
870 case OP_MINUPTO:
871 case OP_TYPEUPTO:
872 case OP_TYPEMINUPTO:
873 cc += 4;
874 break;
876 /* Check a class or a back reference for a zero minimum */
878 case OP_CLASS:
879 case OP_NEGCLASS:
880 case OP_REF:
881 case OP_CLASS_L:
882 switch(*cc)
884 case (OP_REF): cc += 2; break;
885 case (OP_CLASS): case (OP_NEGCLASS): cc += 1+32; break;
886 case (OP_CLASS_L): cc += 1+1+32; break;
889 switch (*cc)
891 case OP_CRSTAR:
892 case OP_CRMINSTAR:
893 case OP_CRQUERY:
894 case OP_CRMINQUERY:
895 cc++;
896 break;
898 case OP_CRRANGE:
899 case OP_CRMINRANGE:
900 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
901 cc += 3;
902 break;
904 default:
905 goto NEXT_BRANCH;
907 break;
909 /* Anything else matches at least one character */
911 default:
912 goto NEXT_BRANCH;
916 NEXT_BRANCH:
917 code += (code[1] << 8) + code[2];
919 while (*code == OP_ALT);
921 /* No branches match the empty string */
923 return FALSE;
926 /* Determine the length of a group ID in an expression like
927 (?P<foo_123>...)
928 Arguments:
929 ptr pattern position pointer (say that 3 times fast)
930 finalchar the character that will mark the end of the ID
931 errorptr points to the pointer to the error message
934 static int
935 get_group_id(const uschar *ptr, char finalchar, const char **errorptr)
937 const uschar *start = ptr;
939 /* If the first character is not in \w, or is in \w but is a digit,
940 report an error */
941 if (!(pcre_ctypes[*ptr] & ctype_word) ||
942 (pcre_ctypes[*ptr++] & ctype_digit))
944 *errorptr = "(?P identifier must start with a letter or underscore";
945 return 0;
948 /* Increment ptr until we either hit a null byte, the desired
949 final character, or a non-word character */
950 for(; (*ptr != 0) && (*ptr != finalchar) &&
951 (pcre_ctypes[*ptr] & ctype_word); ptr++)
953 /* Empty loop body */
955 if (*ptr==finalchar)
956 return ptr-start;
957 if (*ptr==0)
959 *errorptr = "unterminated (?P identifier";
960 return 0;
962 *errorptr = "illegal character in (?P identifier";
963 return 0;
966 /*************************************************
967 * Handle escapes *
968 *************************************************/
970 /* This function is called when a \ has been encountered. It either returns a
971 positive value for a simple escape such as \n, or a negative value which
972 encodes one of the more complicated things such as \d. On entry, ptr is
973 pointing at the \. On exit, it is on the final character of the escape
974 sequence.
976 Arguments:
977 ptrptr points to the pattern position pointer
978 errorptr points to the pointer to the error message
979 bracount number of previous extracting brackets
980 options the options bits
981 isclass TRUE if inside a character class
983 Returns: zero or positive => a data character
984 negative => a special escape sequence
985 on error, errorptr is set
988 static int
989 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
990 int options, BOOL isclass)
992 const uschar *ptr = *ptrptr;
993 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
994 int i;
996 if (c == 0) *errorptr = ERR1;
998 /* Digits or letters may have special meaning; all others are literals. */
1000 else if (c < '0' || c > 'z') {}
1002 /* Do an initial lookup in a table. A non-zero result is something that can be
1003 returned immediately. Otherwise further processing may be required. */
1005 else if ((i = escapes[c - '0']) != 0) c = i;
1007 /* Escapes that need further processing, or are illegal. */
1009 else
1012 switch (c)
1014 /* The handling of escape sequences consisting of a string of digits
1015 starting with one that is not zero is not straightforward. By experiment,
1016 the way Perl works seems to be as follows:
1018 Outside a character class, the digits are read as a decimal number. If the
1019 number is less than 10, or if there are that many previous extracting
1020 left brackets, then it is a back reference. Otherwise, up to three octal
1021 digits are read to form an escaped byte. Thus \123 is likely to be octal
1022 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
1023 value is greater than 377, the least significant 8 bits are taken. Inside a
1024 character class, \ followed by a digit is always an octal number. */
1026 case '1': case '2': case '3': case '4': case '5':
1027 case '6': case '7': case '8': case '9':
1030 /* PYTHON: Try to compute an octal value for a character */
1031 for(c=0, i=0; ptr[i]!=0 && i<3; i++)
1033 if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
1034 c = c * 8 + ptr[i]-'0';
1035 else
1036 break; /* Non-octal character--break out of the loop */
1038 /* It's a character if there were exactly 3 octal digits, or if
1039 we're inside a character class and there was at least one
1040 octal digit. */
1041 if ( (i == 3) || (isclass && i!=0) )
1043 ptr += i-1;
1044 break;
1046 c = ptr[0]; /* Restore the first character after the \ */
1047 c -= '0'; i = 1;
1048 while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
1050 c = c * 10 + ptr[1] - '0';
1051 ptr++; i++;
1053 if (c > 255 - ESC_REF) *errorptr = "back reference too big";
1054 c = -(ESC_REF + c);
1056 break;
1058 /* \0 always starts an octal number, but we may drop through to here with a
1059 larger first octal digit */
1061 case '0':
1062 c -= '0';
1063 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
1064 ptr[1] != '8' && ptr[1] != '9')
1065 c = c * 8 + *(++ptr) - '0';
1066 break;
1068 /* Special escapes not starting with a digit are straightforward */
1070 case 'x':
1071 c = 0;
1072 while ( (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
1074 ptr++;
1075 c = c * 16 + pcre_lcc[*ptr] -
1076 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
1077 c &= 255;
1079 break;
1082 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1083 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1084 for Perl compatibility, it is a literal. */
1086 default:
1087 if ((options & PCRE_EXTRA) != 0) switch(c)
1089 case 'X':
1090 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
1091 break;
1093 default:
1094 *errorptr = ERR3;
1095 break;
1097 break;
1101 *ptrptr = ptr;
1102 return c;
1107 /*************************************************
1108 * Check for counted repeat *
1109 *************************************************/
1111 /* This function is called when a '{' is encountered in a place where it might
1112 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1113 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1114 where the ddds are digits.
1116 Arguments:
1117 p pointer to the first char after '{'
1119 Returns: TRUE or FALSE
1122 static BOOL
1123 is_counted_repeat(const uschar *p)
1125 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1126 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1127 if (*p == '}') return TRUE;
1129 if (*p++ != ',') return FALSE;
1130 if (*p == '}') return TRUE;
1132 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1133 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1134 return (*p == '}');
1139 /*************************************************
1140 * Read repeat counts *
1141 *************************************************/
1143 /* Read an item of the form {n,m} and return the values. This is called only
1144 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1145 so the syntax is guaranteed to be correct, but we need to check the values.
1147 Arguments:
1148 p pointer to first char after '{'
1149 minp pointer to int for min
1150 maxp pointer to int for max
1151 returned as -1 if no max
1152 errorptr points to pointer to error message
1154 Returns: pointer to '}' on success;
1155 current ptr on error, with errorptr set
1158 static const uschar *
1159 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1161 int min = 0;
1162 int max = -1;
1164 while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1166 if (*p == '}') max = min; else
1168 if (*(++p) != '}')
1170 max = 0;
1171 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1172 if (max < min)
1174 *errorptr = ERR4;
1175 return p;
1180 /* Do paranoid checks, then fill in the required variables, and pass back the
1181 pointer to the terminating '}'. */
1183 if (min > 65535 || max > 65535)
1184 *errorptr = ERR5;
1185 else
1187 *minp = min;
1188 *maxp = max;
1190 return p;
1195 /*************************************************
1196 * Compile one branch *
1197 *************************************************/
1199 /* Scan the pattern, compiling it into the code vector.
1201 Arguments:
1202 options the option bits
1203 bracket points to number of brackets used
1204 code points to the pointer to the current code point
1205 ptrptr points to the current pattern pointer
1206 errorptr points to pointer to error message
1208 Returns: TRUE on success
1209 FALSE, with *errorptr set on error
1212 static BOOL
1213 compile_branch(int options, int *brackets, uschar **codeptr,
1214 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
1216 int repeat_type, op_type;
1217 int repeat_min, repeat_max;
1218 int bravalue, length;
1219 int greedy_default, greedy_non_default;
1220 register int c;
1221 register uschar *code = *codeptr;
1222 const uschar *ptr = *ptrptr;
1223 const uschar *oldptr;
1224 uschar *previous = NULL;
1225 uschar class[32];
1226 uschar *class_flag; /* Pointer to the single-byte flag for OP_CLASS_L */
1228 /* Set up the default and non-default settings for greediness */
1230 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1231 greedy_non_default = greedy_default ^ 1;
1233 /* Switch on next character until the end of the branch */
1235 for (;; ptr++)
1237 BOOL negate_class;
1238 int class_charcount;
1239 int class_lastchar;
1241 c = *ptr;
1242 if ((options & PCRE_EXTENDED) != 0)
1244 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1245 if (c == '#')
1247 while ((c = *(++ptr)) != 0 && c != '\n');
1248 continue;
1252 switch(c)
1254 /* The branch terminates at end of string, |, or ). */
1256 case 0:
1257 case '|':
1258 case ')':
1259 *codeptr = code;
1260 *ptrptr = ptr;
1261 return TRUE;
1263 /* Handle single-character metacharacters */
1265 case '^':
1266 previous = NULL;
1267 *code++ = OP_CIRC;
1268 break;
1270 case '$':
1271 previous = NULL;
1272 *code++ = OP_DOLL;
1273 break;
1275 case '.':
1276 previous = code;
1277 *code++ = OP_ANY;
1278 break;
1280 /* Character classes. These always build a 32-byte bitmap of the permitted
1281 characters, except in the special case where there is only one character.
1282 For negated classes, we build the map as usual, then invert it at the end.
1285 case '[':
1286 previous = code;
1287 if (options & PCRE_LOCALE)
1289 *code++ = OP_CLASS_L;
1290 /* Set the flag for localized classes (like \w) to 0 */
1291 class_flag = code;
1292 *class_flag = 0;
1294 else
1296 *code++ = OP_CLASS;
1297 class_flag = NULL;
1300 /* If the first character is '^', set the negation flag, and use a
1301 different opcode. This only matters if caseless matching is specified at
1302 runtime. */
1304 if ((c = *(++ptr)) == '^')
1306 negate_class = TRUE;
1307 if (*(code-1)==OP_CLASS) *(code-1) = OP_NEGCLASS;
1308 c = *(++ptr);
1310 else negate_class = FALSE;
1312 /* Keep a count of chars so that we can optimize the case of just a single
1313 character. */
1315 class_charcount = 0;
1316 class_lastchar = -1;
1318 /* Initialize the 32-char bit map to all zeros. We have to build the
1319 map in a temporary bit of store, in case the class contains only 1
1320 character, because in that case the compiled code doesn't use the
1321 bit map. */
1323 memset(class, 0, 32 * sizeof(uschar));
1325 /* Process characters until ] is reached. By writing this as a "do" it
1326 means that an initial ] is taken as a data character. */
1330 if (c == 0)
1332 *errorptr = ERR6;
1333 goto FAILED;
1336 /* Backslash may introduce a single character, or it may introduce one
1337 of the specials, which just set a flag. Escaped items are checked for
1338 validity in the pre-compiling pass. The sequence \b is a special case.
1339 Inside a class (and only there) it is treated as backspace. Elsewhere
1340 it marks a word boundary. Other escapes have preset maps ready to
1341 or into the one we are building. We assume they have more than one
1342 character in them, so set class_count bigger than one. */
1344 if (c == '\\')
1346 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1347 if (-c == ESC_b) c = '\b';
1348 else if (c < 0)
1350 class_charcount = 10;
1351 switch (-c)
1353 case ESC_d:
1355 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
1357 continue;
1359 case ESC_D:
1361 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
1363 continue;
1365 case ESC_w:
1366 if (options & PCRE_LOCALE)
1368 *class_flag |= 1;
1370 else
1372 for (c = 0; c < 32; c++)
1373 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1375 continue;
1377 case ESC_W:
1378 if (options & PCRE_LOCALE)
1380 *class_flag |= 2;
1382 else
1384 for (c = 0; c < 32; c++)
1385 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1387 continue;
1389 case ESC_s:
1391 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
1393 continue;
1395 case ESC_S:
1397 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
1399 continue;
1401 default:
1402 *errorptr = ERR7;
1403 goto FAILED;
1406 /* Fall through if single character */
1409 /* A single character may be followed by '-' to form a range. However,
1410 Perl does not permit ']' to be the end of the range. A '-' character
1411 here is treated as a literal. */
1413 if (ptr[1] == '-' && ptr[2] != ']')
1415 int d;
1416 ptr += 2;
1417 d = *ptr;
1419 if (d == 0)
1421 *errorptr = ERR6;
1422 goto FAILED;
1425 /* The second part of a range can be a single-character escape, but
1426 not any of the other escapes. */
1428 if (d == '\\')
1430 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1431 if (d < 0)
1433 if (d == -ESC_b) d = '\b'; else
1435 *errorptr = ERR7;
1436 goto FAILED;
1441 if (d < c)
1443 *errorptr = ERR8;
1444 goto FAILED;
1447 for (; c <= d; c++)
1449 class[c/8] |= (1 << (c&7));
1450 if ((options & PCRE_CASELESS) != 0)
1452 int uc = pcre_fcc[c]; /* flip case */
1453 class[uc/8] |= (1 << (uc&7));
1455 class_charcount++; /* in case a one-char range */
1456 class_lastchar = c;
1458 continue; /* Go get the next char in the class */
1461 /* Handle a lone single character - we can get here for a normal
1462 non-escape char, or after \ that introduces a single character. */
1464 class [c/8] |= (1 << (c&7));
1465 if ((options & PCRE_CASELESS) != 0)
1467 c = pcre_fcc[c]; /* flip case */
1468 class[c/8] |= (1 << (c&7));
1470 class_charcount++;
1471 class_lastchar = c;
1474 /* Loop until ']' reached; the check for end of string happens inside the
1475 loop. This "while" is the end of the "do" above. */
1477 while ((c = *(++ptr)) != ']');
1479 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1480 precisely one character. This doesn't need the whole 32-byte bit map.
1481 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1482 it's negative. */
1484 if (class_charcount == 1 && class_lastchar >= 0)
1486 if (negate_class)
1488 code[-1] = OP_NOT;
1490 else
1492 code[-1] = OP_CHARS;
1493 *code++ = 1;
1495 *code++ = class_lastchar;
1498 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1499 the code vector. */
1501 else
1503 /* If this is a localized opcode, bump the code pointer up */
1504 if (class_flag) code++;
1505 if (negate_class)
1507 if (class_flag) *class_flag = (*class_flag) ^ 63;
1508 for (c = 0; c < 32; c++) code[c] = ~class[c];
1510 else
1511 memcpy(code, class, 32);
1512 code += 32;
1514 break;
1516 /* Various kinds of repeat */
1518 case '{':
1519 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1520 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
1521 if (*errorptr != NULL) goto FAILED;
1522 goto REPEAT;
1524 case '*':
1525 repeat_min = 0;
1526 repeat_max = -1;
1527 goto REPEAT;
1529 case '+':
1530 repeat_min = 1;
1531 repeat_max = -1;
1532 goto REPEAT;
1534 case '?':
1535 repeat_min = 0;
1536 repeat_max = 1;
1538 REPEAT:
1539 if (previous == NULL)
1541 *errorptr = ERR9;
1542 goto FAILED;
1545 /* If the next character is '?' this is a minimizing repeat, by default,
1546 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1547 next character. */
1549 if (ptr[1] == '?')
1550 { repeat_type = greedy_non_default; ptr++; }
1551 else repeat_type = greedy_default;
1553 /* If the maximum is zero then the minimum must also be zero; Perl allows
1554 this case, so we do too - by simply omitting the item altogether. */
1556 if (repeat_max == 0) code = previous;
1558 /* If previous was a string of characters, chop off the last one and use it
1559 as the subject of the repeat. If there was only one character, we can
1560 abolish the previous item altogether. */
1562 else if (*previous == OP_CHARS)
1564 int len = previous[1];
1565 if (len == 1)
1567 c = previous[2];
1568 code = previous;
1570 else
1572 c = previous[len+1];
1573 previous[1]--;
1574 code--;
1576 op_type = 0; /* Use single-char op codes */
1577 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1580 /* If previous was a single negated character ([^a] or similar), we use
1581 one of the special opcodes, replacing it. The code is shared with single-
1582 character repeats by adding a suitable offset into repeat_type. */
1584 else if ((int)*previous == OP_NOT)
1586 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1587 c = previous[1];
1588 code = previous;
1589 goto OUTPUT_SINGLE_REPEAT;
1592 /* If previous was a character type match (\d or similar), abolish it and
1593 create a suitable repeat item. The code is shared with single-character
1594 repeats by adding a suitable offset into repeat_type. */
1596 else if ((int)*previous < OP_CIRC || *previous == OP_ANY)
1598 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1599 c = *previous;
1600 code = previous;
1602 OUTPUT_SINGLE_REPEAT:
1603 repeat_type += op_type; /* Combine both values for many cases */
1605 /* A minimum of zero is handled either as the special case * or ?, or as
1606 an UPTO, with the maximum given. */
1608 if (repeat_min == 0)
1610 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1611 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1612 else
1614 *code++ = OP_UPTO + repeat_type;
1615 *code++ = repeat_max >> 8;
1616 *code++ = (repeat_max & 255);
1620 /* The case {1,} is handled as the special case + */
1622 else if (repeat_min == 1 && repeat_max == -1)
1623 *code++ = OP_PLUS + repeat_type;
1625 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1626 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1628 else
1630 if (repeat_min != 1)
1632 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1633 *code++ = repeat_min >> 8;
1634 *code++ = (repeat_min & 255);
1637 /* If the mininum is 1 and the previous item was a character string,
1638 we either have to put back the item that got cancelled if the string
1639 length was 1, or add the character back onto the end of a longer
1640 string. For a character type nothing need be done; it will just get
1641 put back naturally. Note that the final character is always going to
1642 get added below. */
1644 else if (*previous == OP_CHARS)
1646 if (code == previous) code += 2; else previous[1]++;
1649 /* For a single negated character we also have to put back the
1650 item that got cancelled. */
1652 else if (*previous == OP_NOT) code++;
1654 /* If the maximum is unlimited, insert an OP_STAR. */
1656 if (repeat_max < 0)
1658 *code++ = c;
1659 *code++ = OP_STAR + repeat_type;
1662 /* Else insert an UPTO if the max is greater than the min. */
1664 else if (repeat_max != repeat_min)
1666 *code++ = c;
1667 repeat_max -= repeat_min;
1668 *code++ = OP_UPTO + repeat_type;
1669 *code++ = repeat_max >> 8;
1670 *code++ = (repeat_max & 255);
1674 /* The character or character type itself comes last in all cases. */
1676 *code++ = c;
1679 /* If previous was a character class or a back reference, we put the repeat
1680 stuff after it. */
1682 else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1683 *previous==OP_CLASS_L || *previous == OP_REF)
1685 if (repeat_min == 0 && repeat_max == -1)
1686 *code++ = OP_CRSTAR + repeat_type;
1687 else if (repeat_min == 1 && repeat_max == -1)
1688 *code++ = OP_CRPLUS + repeat_type;
1689 else if (repeat_min == 0 && repeat_max == 1)
1690 *code++ = OP_CRQUERY + repeat_type;
1691 else
1693 *code++ = OP_CRRANGE + repeat_type;
1694 *code++ = repeat_min >> 8;
1695 *code++ = repeat_min & 255;
1696 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1697 *code++ = repeat_max >> 8;
1698 *code++ = repeat_max & 255;
1702 /* If previous was a bracket group, we may have to replicate it in certain
1703 cases. If the maximum repeat count is unlimited, check that the bracket
1704 group cannot match the empty string, and diagnose an error if it can. */
1706 else if ((int)*previous >= OP_BRA)
1708 int i;
1709 int len = code - previous;
1711 if (repeat_max == -1 && could_be_empty(previous))
1713 *errorptr = ERR10;
1714 goto FAILED;
1717 /* If the minimum is greater than zero, and the maximum is unlimited or
1718 equal to the minimum, the first copy remains where it is, and is
1719 replicated up to the minimum number of times. This case includes the +
1720 repeat, but of course no replication is needed in that case. */
1722 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1724 for (i = 1; i < repeat_min; i++)
1726 memcpy(code, previous, len);
1727 code += len;
1731 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1732 Then, if there is a fixed upper limit, replicated up to that many times,
1733 sticking BRAZERO in front of all the optional ones. */
1735 else
1737 if (repeat_min == 0)
1739 memmove(previous+1, previous, len);
1740 code++;
1741 *previous++ = OP_BRAZERO + repeat_type;
1744 for (i = 1; i < repeat_min; i++)
1746 memcpy(code, previous, len);
1747 code += len;
1750 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1752 *code++ = OP_BRAZERO + repeat_type;
1753 memcpy(code, previous, len);
1754 code += len;
1758 /* If the maximum is unlimited, set a repeater in the final copy. */
1760 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1763 /* Else there's some kind of shambles */
1765 else
1767 *errorptr = ERR11;
1768 goto FAILED;
1771 /* In all case we no longer have a previous item. */
1773 previous = NULL;
1774 break;
1777 /* Start of nested bracket sub-expression, or comment or lookahead.
1778 First deal with special things that can come after a bracket; all are
1779 introduced by ?, and the appearance of any of them means that this is not a
1780 referencing group. They were checked for validity in the first pass over
1781 the string, so we don't have to check for syntax errors here. */
1783 case '(':
1784 previous = code; /* Only real brackets can be repeated */
1785 if (*(++ptr) == '?')
1787 bravalue = OP_BRA;
1789 switch (*(++ptr))
1791 case '#':
1792 case 'i':
1793 case 'L':
1794 case 'm':
1795 case 's':
1796 case 'x':
1797 ptr++;
1798 while (*ptr != ')') ptr++;
1799 previous = NULL;
1800 continue;
1802 case ':': /* Non-extracting bracket */
1803 ptr++;
1804 break;
1806 case '=': /* Assertions can't be repeated */
1807 bravalue = OP_ASSERT;
1808 ptr++;
1809 previous = NULL;
1810 break;
1812 case '!':
1813 bravalue = OP_ASSERT_NOT;
1814 ptr++;
1815 previous = NULL;
1816 break;
1818 case ('P'):
1819 ptr++;
1820 if (*ptr=='<')
1822 /* (?P<groupname>...) */
1823 int idlen;
1824 PyObject *string, *intobj;
1826 ptr++;
1827 idlen = get_group_id(ptr, '>', errorptr);
1828 if (*errorptr) {
1829 goto FAILED;
1831 string = PyString_FromStringAndSize((char*)ptr, idlen);
1832 intobj = PyInt_FromLong( brackets[0] + 1 );
1833 if (intobj == NULL || string == NULL)
1835 Py_XDECREF(string);
1836 Py_XDECREF(intobj);
1837 *errorptr = "exception raised";
1838 goto FAILED;
1840 PyDict_SetItem(dictionary, string, intobj);
1841 Py_DECREF(string); Py_DECREF(intobj); /* XXX DECREF commented out! */
1842 ptr += idlen+1; /* Point to rest of expression */
1843 goto do_grouping_bracket;
1845 if (*ptr=='=')
1847 /* (?P=groupname) */
1848 int idlen, refnum;
1849 PyObject *string, *intobj;
1851 ptr++;
1852 idlen = get_group_id(ptr, ')', errorptr);
1853 if (*errorptr) {
1854 goto FAILED;
1856 string = PyString_FromStringAndSize((char *)ptr, idlen);
1857 if (string==NULL) {
1858 *errorptr = "exception raised";
1859 goto FAILED;
1861 intobj = PyDict_GetItem(dictionary, string);
1862 if (intobj==NULL) {
1863 Py_DECREF(string);
1864 *errorptr = "?P= group identifier isn't defined";
1865 goto FAILED;
1868 refnum = PyInt_AsLong(intobj);
1869 Py_DECREF(string);
1870 /* The caller doesn't own the reference to the value
1871 returned from PyDict_GetItem, so intobj is not
1872 DECREF'ed. */
1874 *code++ = OP_REF;
1875 *code++ = refnum;
1876 /* The continue will cause the top-level for() loop to
1877 be resumed, so ptr will be immediately incremented.
1878 Therefore, the following line adds just idlen, not
1879 idlen+1 */
1880 ptr += idlen;
1881 continue;
1883 /* The character after ?P is neither < nor =, so
1884 report an error. Add more Python-extensions here. */
1885 *errorptr="unknown after (?P";
1886 goto FAILED;
1888 case '>': /* "Match once" brackets */
1889 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1891 bravalue = OP_ONCE;
1892 ptr++;
1893 previous = NULL;
1894 break;
1896 /* Else fall through */
1898 default:
1899 *errorptr = ERR12;
1900 goto FAILED;
1904 /* Else we have a referencing group */
1906 else
1908 do_grouping_bracket:
1909 if (++(*brackets) > EXTRACT_MAX)
1911 *errorptr = ERR13;
1912 goto FAILED;
1914 bravalue = OP_BRA + *brackets;
1917 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1918 code into a non-register variable in order to be able to pass its address
1919 because some compilers complain otherwise. */
1921 *code = bravalue;
1923 uschar *mcode = code;
1924 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr, dictionary))
1925 goto FAILED;
1926 code = mcode;
1929 if (*ptr != ')')
1931 *errorptr = ERR14;
1932 goto FAILED;
1934 break;
1936 /* Check \ for being a real metacharacter; if not, fall through and handle
1937 it as a data character at the start of a string. Escape items are checked
1938 for validity in the pre-compiling pass. */
1940 case '\\':
1941 oldptr = ptr;
1942 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1944 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1945 are arranged to be the negation of the corresponding OP_values. For the
1946 back references, the values are ESC_REF plus the reference number. Only
1947 back references and those types that consume a character may be repeated.
1948 We can test for values between ESC_b and ESC_Z for the latter; this may
1949 have to change if any new ones are ever created. */
1951 if (c < 0)
1953 if (-c >= ESC_REF)
1955 int refnum = -c - ESC_REF;
1956 if (*brackets < refnum)
1958 *errorptr = ERR15;
1959 goto FAILED;
1961 previous = code;
1962 *code++ = OP_REF;
1963 *code++ = refnum;
1965 else
1967 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1968 if ( (options & PCRE_LOCALE) != 0)
1970 switch (c)
1972 case (-ESC_b): c = -OP_WORD_BOUNDARY_L; break;
1973 case (-ESC_B): c = -OP_NOT_WORD_BOUNDARY_L; break;
1974 case (-ESC_w): c = -OP_WORDCHAR_L; break;
1975 case (-ESC_W): c = -OP_NOT_WORDCHAR_L; break;
1978 *code++ = -c;
1980 continue;
1983 /* Data character: Reset and fall through */
1985 ptr = oldptr;
1986 c = '\\';
1988 /* Handle a run of data characters until a metacharacter is encountered.
1989 The first character is guaranteed not to be whitespace or # when the
1990 extended flag is set. */
1992 NORMAL_CHAR:
1993 default:
1994 previous = code;
1995 *code = OP_CHARS;
1996 code += 2;
1997 length = 0;
2001 if ((options & PCRE_EXTENDED) != 0)
2003 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
2004 if (c == '#')
2006 while ((c = *(++ptr)) != 0 && c != '\n');
2007 if (c == 0) break;
2008 continue;
2012 /* Backslash may introduce a data char or a metacharacter. Escaped items
2013 are checked for validity in the pre-compiling pass. Stop the string
2014 before a metaitem. */
2016 if (c == '\\')
2018 oldptr = ptr;
2019 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
2020 if (c < 0) { ptr = oldptr; break; }
2023 /* Ordinary character or single-char escape */
2025 *code++ = c;
2026 length++;
2029 /* This "while" is the end of the "do" above. */
2031 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2033 /* Compute the length and set it in the data vector, and advance to
2034 the next state. */
2036 previous[1] = length;
2037 if (length < 255) ptr--;
2038 break;
2040 } /* end of big loop */
2042 /* Control never reaches here by falling through, only by a goto for all the
2043 error states. Pass back the position in the pattern so that it can be displayed
2044 to the user for diagnosing the error. */
2046 FAILED:
2047 *ptrptr = ptr;
2048 return FALSE;
2054 /*************************************************
2055 * Compile sequence of alternatives *
2056 *************************************************/
2058 /* On entry, ptr is pointing past the bracket character, but on return
2059 it points to the closing bracket, or vertical bar, or end of string.
2060 The code variable is pointing at the byte into which the BRA operator has been
2061 stored.
2063 Argument:
2064 options the option bits
2065 brackets -> int containing the number of extracting brackets used
2066 codeptr -> the address of the current code pointer
2067 ptrptr -> the address of the current pattern pointer
2068 errorptr -> pointer to error message
2070 Returns: TRUE on success
2073 static BOOL
2074 compile_regex(int options, int *brackets, uschar **codeptr,
2075 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
2077 const uschar *ptr = *ptrptr;
2078 uschar *code = *codeptr;
2079 uschar *start_bracket = code;
2081 for (;;)
2083 int length;
2084 uschar *last_branch = code;
2086 code += 3;
2087 if (!compile_branch(options, brackets, &code, &ptr, errorptr, dictionary))
2089 *ptrptr = ptr;
2090 return FALSE;
2093 /* Fill in the length of the last branch */
2095 length = code - last_branch;
2096 last_branch[1] = length >> 8;
2097 last_branch[2] = length & 255;
2099 /* Reached end of expression, either ')' or end of pattern. Insert a
2100 terminating ket and the length of the whole bracketed item, and return,
2101 leaving the pointer at the terminating char. */
2103 if (*ptr != '|')
2105 length = code - start_bracket;
2106 *code++ = OP_KET;
2107 *code++ = length >> 8;
2108 *code++ = length & 255;
2109 *codeptr = code;
2110 *ptrptr = ptr;
2111 return TRUE;
2114 /* Another branch follows; insert an "or" node and advance the pointer. */
2116 *code = OP_ALT;
2117 ptr++;
2119 /* Control never reaches here */
2124 /*************************************************
2125 * Check for anchored expression *
2126 *************************************************/
2128 /* Try to find out if this is an anchored regular expression. Consider each
2129 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2130 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2131 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2132 counts, since OP_CIRC can match in the middle.
2134 A branch is also implicitly anchored if it starts with .* because that will try
2135 the rest of the pattern at all possible matching points, so there is no point
2136 trying them again.
2138 Argument: points to start of expression (the bracket)
2139 Returns: TRUE or FALSE
2142 static BOOL
2143 is_anchored(register const uschar *code, BOOL multiline)
2145 do {
2146 int op = (int)code[3];
2147 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
2148 { if (!is_anchored(code+3, multiline)) return FALSE; }
2149 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2150 { if (code[4] != OP_ANY) return FALSE; }
2151 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
2152 code += (code[1] << 8) + code[2];
2154 while (*code == OP_ALT);
2155 return TRUE;
2160 /*************************************************
2161 * Check for start with \n line expression *
2162 *************************************************/
2164 /* This is called for multiline expressions to try to find out if every branch
2165 starts with ^ so that "first char" processing can be done to speed things up.
2167 Argument: points to start of expression (the bracket)
2168 Returns: TRUE or FALSE
2171 static BOOL
2172 is_startline(const uschar *code)
2174 do {
2175 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2176 { if (!is_startline(code+3)) return FALSE; }
2177 else if (code[3] != OP_CIRC) return FALSE;
2178 code += (code[1] << 8) + code[2];
2180 while (*code == OP_ALT);
2181 return TRUE;
2186 /*************************************************
2187 * Check for fixed first char *
2188 *************************************************/
2190 /* Try to find out if there is a fixed first character. This is called for
2191 unanchored expressions, as it speeds up their processing quite considerably.
2192 Consider each alternative branch. If they all start with the same char, or with
2193 a bracket all of whose alternatives start with the same char (recurse ad lib),
2194 then we return that char, otherwise -1.
2196 Argument: points to start of expression (the bracket)
2197 Returns: -1 or the fixed first char
2200 static int
2201 find_firstchar(uschar *code)
2203 register int c = -1;
2206 register int charoffset = 4;
2208 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2210 register int d;
2211 if ((d = find_firstchar(code+3)) < 0) return -1;
2212 if (c < 0) c = d; else if (c != d) return -1;
2215 else switch(code[3])
2217 default:
2218 return -1;
2220 case OP_EXACT: /* Fall through */
2221 charoffset++;
2223 case OP_CHARS: /* Fall through */
2224 charoffset++;
2226 case OP_PLUS:
2227 case OP_MINPLUS:
2228 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
2229 break;
2231 code += (code[1] << 8) + code[2];
2233 while (*code == OP_ALT);
2234 return c;
2239 /*************************************************
2240 * Compile a Regular Expression *
2241 *************************************************/
2243 /* This function takes a string and returns a pointer to a block of store
2244 holding a compiled version of the expression.
2246 Arguments:
2247 pattern the regular expression
2248 options various option bits
2249 errorptr pointer to pointer to error text
2250 erroroffset ptr offset in pattern where error was detected
2252 Returns: pointer to compiled data block, or NULL on error,
2253 with errorptr and erroroffset set
2256 pcre *
2257 pcre_compile(const char *pattern, int options, const char **errorptr,
2258 int *erroroffset, PyObject *dictionary)
2260 real_pcre *re;
2261 int spaces = 0;
2262 int length = 3; /* For initial BRA plus length */
2263 int runlength;
2264 int c, size;
2265 int bracount = 0;
2266 int brastack[200];
2267 int top_backref = 0;
2268 unsigned int brastackptr = 0;
2269 uschar *code;
2270 const uschar *ptr;
2272 #ifdef DEBUG
2273 uschar *code_base, *code_end;
2274 #endif
2276 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2277 can do is just return NULL. */
2279 if (errorptr == NULL) return NULL;
2280 *errorptr = NULL;
2282 /* However, we can give a message for this error */
2284 if (erroroffset == NULL)
2286 *errorptr = ERR16;
2287 return NULL;
2289 *erroroffset = 0;
2291 if ((options & ~PUBLIC_OPTIONS) != 0)
2293 *errorptr = ERR17;
2294 return NULL;
2297 DPRINTF(("------------------------------------------------------------------\n"));
2298 DPRINTF(("%s\n", pattern));
2300 /* The first thing to do is to make a pass over the pattern to compute the
2301 amount of store required to hold the compiled code. This does not have to be
2302 perfect as long as errors are overestimates. At the same time we can detect any
2303 internal flag settings. Make an attempt to correct for any counted white space
2304 if an "extended" flag setting appears late in the pattern. We can't be so
2305 clever for #-comments. */
2307 ptr = (const uschar *)(pattern - 1);
2308 while ((c = *(++ptr)) != 0)
2310 int min, max;
2311 int class_charcount;
2313 if ((pcre_ctypes[c] & ctype_space) != 0)
2315 if ((options & PCRE_EXTENDED) != 0) continue;
2316 spaces++;
2319 if (c == '#' && (options & PCRE_EXTENDED) != 0)
2321 while ((c = *(++ptr)) != 0 && c != '\n');
2322 continue;
2325 switch(c)
2327 /* A backslashed item may be an escaped "normal" character or a
2328 character type. For a "normal" character, put the pointers and
2329 character back so that tests for whitespace etc. in the input
2330 are done correctly. */
2332 case '\\':
2334 const uschar *save_ptr = ptr;
2335 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
2336 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2337 if (c >= 0)
2339 ptr = save_ptr;
2340 c = '\\';
2341 goto NORMAL_CHAR;
2344 length++;
2346 /* A back reference needs an additional char, plus either one or 5
2347 bytes for a repeat. We also need to keep the value of the highest
2348 back reference. */
2350 if (c <= -ESC_REF)
2352 int refnum = -c - ESC_REF;
2353 if (refnum > top_backref) top_backref = refnum;
2354 length++; /* For single back reference */
2355 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
2357 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2358 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2359 if ((min == 0 && (max == 1 || max == -1)) ||
2360 (min == 1 && max == -1))
2361 length++;
2362 else length += 5;
2363 if (ptr[1] == '?') ptr++;
2366 continue;
2368 case '^':
2369 case '.':
2370 case '$':
2371 case '*': /* These repeats won't be after brackets; */
2372 case '+': /* those are handled separately */
2373 case '?':
2374 length++;
2375 continue;
2377 /* This covers the cases of repeats after a single char, metachar, class,
2378 or back reference. */
2380 case '{':
2381 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
2382 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
2383 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2384 if ((min == 0 && (max == 1 || max == -1)) ||
2385 (min == 1 && max == -1))
2386 length++;
2387 else
2389 length--; /* Uncount the original char or metachar */
2390 if (min == 1) length++; else if (min > 0) length += 4;
2391 if (max > 0) length += 4; else length += 2;
2393 if (ptr[1] == '?') ptr++;
2394 continue;
2396 /* An alternation contains an offset to the next branch or ket. */
2397 case '|':
2398 length += 3;
2399 continue;
2401 /* A character class uses 33 characters. Don't worry about character types
2402 that aren't allowed in classes - they'll get picked up during the compile.
2403 A character class that contains only one character uses 2 or 3 bytes,
2404 depending on whether it is negated or not. Notice this where we can. */
2406 case '[':
2407 class_charcount = 0;
2408 if (*(++ptr) == '^') ptr++;
2411 if (*ptr == '\\')
2413 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
2414 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2415 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2417 else class_charcount++;
2418 ptr++;
2420 while (*ptr != 0 && *ptr != ']');
2422 /* Repeats for negated single chars are handled by the general code */
2424 if (class_charcount == 1) length += 3; else
2426 length += 33;
2427 if (options & PCRE_LOCALE) length++; /* Add a byte for the localization flag */
2429 /* A repeat needs either 1 or 5 bytes. */
2431 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
2433 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2434 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2435 if ((min == 0 && (max == 1 || max == -1)) ||
2436 (min == 1 && max == -1))
2437 length++;
2438 else length += 5;
2439 if (ptr[1] == '?') ptr++;
2442 continue;
2444 /* Brackets may be genuine groups or special things */
2446 case '(':
2448 /* Handle special forms of bracket, which all start (? */
2450 if (ptr[1] == '?') switch (c = ptr[2])
2452 /* Skip over comments entirely */
2453 case '#':
2454 ptr += 3;
2455 while (*ptr != 0 && *ptr != ')') ptr++;
2456 if (*ptr == 0)
2458 *errorptr = ERR18;
2459 goto PCRE_ERROR_RETURN;
2461 continue;
2463 /* Non-referencing groups and lookaheads just move the pointer on, and
2464 then behave like a non-special bracket, except that they don't increment
2465 the count of extracting brackets. */
2467 case ':':
2468 case '=':
2469 case '!':
2470 ptr += 2;
2471 break;
2473 case ('P'):
2475 int idlen;
2476 switch (*ptr++) {
2477 case ('<'):
2478 idlen = get_group_id(ptr++, '>', errorptr);
2479 if (*errorptr) goto PCRE_ERROR_RETURN;
2480 ptr += idlen+1;
2481 break;
2482 case ('='):
2483 idlen = get_group_id(ptr++, ')', errorptr);
2484 if (*errorptr) goto PCRE_ERROR_RETURN;
2485 ptr += idlen+1;
2486 length++;
2487 break;
2490 break;
2492 /* Ditto for the "once only" bracket, allowed only if the extra bit
2493 is set. */
2495 case '>':
2496 if ((options & PCRE_EXTRA) != 0)
2498 ptr += 2;
2499 break;
2501 /* Else fall through */
2503 /* Else loop setting valid options until ) is met. Anything else is an
2504 error. */
2506 default:
2507 ptr += 2;
2508 for (;; ptr++)
2510 if ((c = *ptr) == 'i')
2512 options |= PCRE_CASELESS;
2513 continue;
2515 else if ((c = *ptr) == 'L')
2517 options |= PCRE_LOCALE;
2518 continue;
2520 else if ((c = *ptr) == 'm')
2522 options |= PCRE_MULTILINE;
2523 continue;
2525 else if (c == 's')
2527 options |= PCRE_DOTALL;
2528 continue;
2530 else if (c == 'x')
2532 options |= PCRE_EXTENDED;
2533 length -= spaces; /* Already counted spaces */
2534 continue;
2536 else if (c == ')') break;
2538 *errorptr = ERR12;
2539 goto PCRE_ERROR_RETURN;
2541 continue; /* End of this bracket handling */
2544 /* Extracting brackets must be counted so we can process escapes in a
2545 Perlish way. */
2547 else bracount++;
2549 /* Non-special forms of bracket. Save length for computing whole length
2550 at end if there's a repeat that requires duplication of the group. */
2552 if (brastackptr >= sizeof(brastack)/sizeof(int))
2554 *errorptr = ERR19;
2555 goto PCRE_ERROR_RETURN;
2558 brastack[brastackptr++] = length;
2559 length += 3;
2560 continue;
2562 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2563 have to replicate this bracket up to that many times. If brastackptr is
2564 0 this is an unmatched bracket which will generate an error, but take care
2565 not to try to access brastack[-1]. */
2567 case ')':
2568 length += 3;
2570 int minval = 1;
2571 int maxval = 1;
2572 int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
2574 /* Leave ptr at the final char; for read_repeat_counts this happens
2575 automatically; for the others we need an increment. */
2577 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
2579 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
2580 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2582 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2583 else if (c == '+') { maxval = -1; ptr++; }
2584 else if (c == '?') { minval = 0; ptr++; }
2586 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2587 if there is a limited maximum we have to replicate up to maxval-1 times
2588 and allow for a BRAZERO item before each optional copy, as we also have
2589 to do before the first copy if the minimum is zero. */
2591 if (minval == 0) length++;
2592 else if (minval > 1) length += (minval - 1) * duplength;
2593 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
2595 continue;
2597 /* Non-special character. For a run of such characters the length required
2598 is the number of characters + 2, except that the maximum run length is 255.
2599 We won't get a skipped space or a non-data escape or the start of a #
2600 comment as the first character, so the length can't be zero. */
2602 NORMAL_CHAR:
2603 default:
2604 length += 2;
2605 runlength = 0;
2608 if ((pcre_ctypes[c] & ctype_space) != 0)
2610 if ((options & PCRE_EXTENDED) != 0) continue;
2611 spaces++;
2614 if (c == '#' && (options & PCRE_EXTENDED) != 0)
2616 while ((c = *(++ptr)) != 0 && c != '\n');
2617 continue;
2620 /* Backslash may introduce a data char or a metacharacter; stop the
2621 string before the latter. */
2623 if (c == '\\')
2625 const uschar *saveptr = ptr;
2626 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
2627 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2628 if (c < 0) { ptr = saveptr; break; }
2631 /* Ordinary character or single-char escape */
2633 runlength++;
2636 /* This "while" is the end of the "do" above. */
2638 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2640 ptr--;
2641 length += runlength;
2642 continue;
2646 length += 4; /* For final KET and END */
2648 if (length > 65539)
2650 *errorptr = ERR20;
2651 return NULL;
2654 /* Compute the size of data block needed and get it, either from malloc or
2655 externally provided function. We specify "code[0]" in the offsetof() expression
2656 rather than just "code", because it has been reported that one broken compiler
2657 fails on "code" because it is also an independent variable. It should make no
2658 difference to the value of the offsetof(). */
2660 size = length + offsetof(real_pcre, code[0]);
2661 re = (real_pcre *)(pcre_malloc)(size+50);
2663 if (re == NULL)
2665 *errorptr = ERR21;
2666 return NULL;
2669 /* Put in the magic number and the options. */
2671 re->magic_number = MAGIC_NUMBER;
2672 re->options = options;
2674 /* Set up a starting, non-extracting bracket, then compile the expression. On
2675 error, *errorptr will be set non-NULL, so we don't need to look at the result
2676 of the function here. */
2678 ptr = (const uschar *)pattern;
2679 code = re->code;
2680 *code = OP_BRA;
2681 bracount = 0;
2682 (void)compile_regex(options, &bracount, &code, &ptr, errorptr, dictionary);
2683 re->top_bracket = bracount;
2684 re->top_backref = top_backref;
2686 /* If not reached end of pattern on success, there's an excess bracket. */
2688 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2690 /* Fill in the terminating state and check for disastrous overflow, but
2691 if debugging, leave the test till after things are printed out. */
2693 *code++ = OP_END;
2696 #ifndef DEBUG
2697 if (code - re->code > length) *errorptr = ERR23;
2698 #endif
2700 /* Failed to compile */
2702 if (*errorptr != NULL)
2704 (pcre_free)(re);
2705 PCRE_ERROR_RETURN:
2706 *erroroffset = ptr - (const uschar *)pattern;
2707 return NULL;
2710 /* If the anchored option was not passed, set flag if we can determine that it
2711 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2712 we can determine what the first character has to be, because that speeds up
2713 unanchored matches no end. In the case of multiline matches, an alternative is
2714 to set the PCRE_STARTLINE flag if all branches start with ^. */
2716 if ((options & PCRE_ANCHORED) == 0)
2718 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
2719 re->options |= PCRE_ANCHORED;
2720 else
2722 int ch = find_firstchar(re->code);
2723 if (ch >= 0)
2725 re->first_char = ch;
2726 re->options |= PCRE_FIRSTSET;
2728 else if (is_startline(re->code))
2729 re->options |= PCRE_STARTLINE;
2733 /* Print out the compiled data for debugging */
2735 #ifdef DEBUG
2737 printf("Length = %d top_bracket = %d top_backref=%d\n",
2738 length, re->top_bracket, re->top_backref);
2740 if (re->options != 0)
2742 printf("%s%s%s%s%s%s%s%s\n",
2743 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2744 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2745 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2746 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2747 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2748 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2749 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2750 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2753 if ((re->options & PCRE_FIRSTSET) != 0)
2755 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2756 else printf("First char = \\x%02x\n", re->first_char);
2759 code_end = code;
2760 code_base = code = re->code;
2762 while (code < code_end)
2764 int charlength;
2766 printf("%3d ", code - code_base);
2768 if (*code >= OP_BRA)
2770 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2771 code += 2;
2774 else switch(*code)
2776 case OP_CHARS:
2777 charlength = *(++code);
2778 printf("%3d ", charlength);
2779 while (charlength-- > 0)
2780 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2781 break;
2783 case OP_KETRMAX:
2784 case OP_KETRMIN:
2785 case OP_ALT:
2786 case OP_KET:
2787 case OP_ASSERT:
2788 case OP_ASSERT_NOT:
2789 case OP_ONCE:
2790 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2791 code += 2;
2792 break;
2794 case OP_STAR:
2795 case OP_MINSTAR:
2796 case OP_PLUS:
2797 case OP_MINPLUS:
2798 case OP_QUERY:
2799 case OP_MINQUERY:
2800 case OP_TYPESTAR:
2801 case OP_TYPEMINSTAR:
2802 case OP_TYPEPLUS:
2803 case OP_TYPEMINPLUS:
2804 case OP_TYPEQUERY:
2805 case OP_TYPEMINQUERY:
2806 if (*code >= OP_TYPESTAR)
2807 printf(" %s", OP_names[code[1]]);
2808 else if (isprint(c = code[1])) printf(" %c", c);
2809 else printf(" \\x%02x", c);
2810 printf("%s", OP_names[*code++]);
2811 break;
2813 case OP_EXACT:
2814 case OP_UPTO:
2815 case OP_MINUPTO:
2816 if (isprint(c = code[3])) printf(" %c{", c);
2817 else printf(" \\x%02x{", c);
2818 if (*code != OP_EXACT) printf("0,");
2819 printf("%d}", (code[1] << 8) + code[2]);
2820 if (*code == OP_MINUPTO) printf("?");
2821 code += 3;
2822 break;
2824 case OP_TYPEEXACT:
2825 case OP_TYPEUPTO:
2826 case OP_TYPEMINUPTO:
2827 printf(" %s{", OP_names[code[3]]);
2828 if (*code != OP_TYPEEXACT) printf(",");
2829 printf("%d}", (code[1] << 8) + code[2]);
2830 if (*code == OP_TYPEMINUPTO) printf("?");
2831 code += 3;
2832 break;
2834 case OP_NOT:
2835 if (isprint(c = *(++code))) printf(" [^%c]", c);
2836 else printf(" [^\\x%02x]", c);
2837 break;
2839 case OP_NOTSTAR:
2840 case OP_NOTMINSTAR:
2841 case OP_NOTPLUS:
2842 case OP_NOTMINPLUS:
2843 case OP_NOTQUERY:
2844 case OP_NOTMINQUERY:
2845 if (isprint(c = code[1])) printf(" [^%c]", c);
2846 else printf(" [^\\x%02x]", c);
2847 printf("%s", OP_names[*code++]);
2848 break;
2850 case OP_NOTEXACT:
2851 case OP_NOTUPTO:
2852 case OP_NOTMINUPTO:
2853 if (isprint(c = code[3])) printf(" [^%c]{", c);
2854 else printf(" [^\\x%02x]{", c);
2855 if (*code != OP_NOTEXACT) printf(",");
2856 printf("%d}", (code[1] << 8) + code[2]);
2857 if (*code == OP_NOTMINUPTO) printf("?");
2858 code += 3;
2859 break;
2861 case OP_REF:
2862 printf(" \\%d", *(++code));
2863 code ++;
2864 goto CLASS_REF_REPEAT;
2866 case OP_CLASS:
2867 case OP_NEGCLASS:
2868 case OP_CLASS_L:
2870 int i, min, max;
2872 if (*code==OP_CLASS_L)
2874 code++;
2875 printf("Locflag = %i ", *code++);
2876 printf(" [");
2878 else
2880 if (*code++ == OP_CLASS) printf(" [");
2881 else printf(" ^[");
2885 for (i = 0; i < 256; i++)
2887 if ((code[i/8] & (1 << (i&7))) != 0)
2889 int j;
2890 for (j = i+1; j < 256; j++)
2891 if ((code[j/8] & (1 << (j&7))) == 0) break;
2892 if (i == '-' || i == ']') printf("\\");
2893 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2894 if (--j > i)
2896 printf("-");
2897 if (j == '-' || j == ']') printf("\\");
2898 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2900 i = j;
2903 printf("]");
2904 code += 32;
2905 /* code ++;*/
2907 CLASS_REF_REPEAT:
2909 switch(*code)
2911 case OP_CRSTAR:
2912 case OP_CRMINSTAR:
2913 case OP_CRPLUS:
2914 case OP_CRMINPLUS:
2915 case OP_CRQUERY:
2916 case OP_CRMINQUERY:
2917 printf("%s", OP_names[*code]);
2918 break;
2920 case OP_CRRANGE:
2921 case OP_CRMINRANGE:
2922 min = (code[1] << 8) + code[2];
2923 max = (code[3] << 8) + code[4];
2924 if (max == 0) printf("{%d,}", min);
2925 else printf("{%d,%d}", min, max);
2926 if (*code == OP_CRMINRANGE) printf("?");
2927 code += 4;
2928 break;
2930 default:
2931 code--;
2934 break;
2936 /* Anything else is just a one-node item */
2938 default:
2939 printf(" %s", OP_names[*code]);
2940 break;
2943 code++;
2944 printf("\n");
2946 printf("------------------------------------------------------------------\n");
2948 /* This check is done here in the debugging case so that the code that
2949 was compiled can be seen. */
2951 if (code - re->code > length)
2953 printf("length=%i, code length=%i\n", length, code-re->code);
2954 *errorptr = ERR23;
2955 (pcre_free)(re);
2956 *erroroffset = ptr - (uschar *)pattern;
2957 return NULL;
2959 #endif
2961 return (pcre *)re;
2966 /*************************************************
2967 * Match a character type *
2968 *************************************************/
2970 /* Not used in all the places it might be as it's sometimes faster
2971 to put the code inline.
2973 Arguments:
2974 type the character type
2975 c the character
2976 dotall the dotall flag
2978 Returns: TRUE if character is of the type
2981 static BOOL
2982 match_type(int type, int c, BOOL dotall)
2985 #ifdef DEBUG
2986 if (isprint(c)) printf("matching subject %c against ", c);
2987 else printf("matching subject \\x%02x against ", c);
2988 printf("%s\n", OP_names[type]);
2989 #endif
2991 switch(type)
2993 case OP_ANY: return dotall || c != '\n';
2994 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2995 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2996 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2997 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2998 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2999 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
3000 case OP_NOT_WORDCHAR_L: return (c!='_' && !isalnum(c));
3001 case OP_WORDCHAR_L: return (c=='_' || isalnum(c));
3003 return FALSE;
3008 /*************************************************
3009 * Match a back-reference *
3010 *************************************************/
3012 /* If a back reference hasn't been set, the match fails.
3014 Arguments:
3015 number reference number
3016 eptr points into the subject
3017 length length to be matched
3018 md points to match data block
3020 Returns: TRUE if matched
3023 static BOOL
3024 match_ref(int number, register const uschar *eptr, int length, match_data *md)
3026 const uschar *p = md->start_subject + md->offset_vector[number];
3028 #ifdef DEBUG
3029 if (eptr >= md->end_subject)
3030 printf("matching subject <null>");
3031 else
3033 printf("matching subject ");
3034 pchars(eptr, length, TRUE, md);
3036 printf(" against backref ");
3037 pchars(p, length, FALSE, md);
3038 printf("\n");
3039 #endif
3041 /* Always fail if not enough characters left */
3043 if (length > md->end_subject - p) return FALSE;
3045 /* Separate the caseless case for speed */
3047 if (md->caseless)
3048 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
3049 else
3050 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3052 return TRUE;
3055 static int free_stack(match_data *md)
3057 /* Free any stack space that was allocated by the call to match(). */
3058 if (md->off_num) free(md->off_num);
3059 if (md->offset_top) free(md->offset_top);
3060 if (md->r1) free(md->r1);
3061 if (md->r2) free(md->r2);
3062 if (md->eptr) free((char *)md->eptr);
3063 if (md->ecode) free((char *)md->ecode);
3064 return 0;
3067 static int grow_stack(match_data *md)
3069 if (md->length != 0)
3071 md->length = md->length + md->length/2;
3073 else
3075 int string_len = md->end_subject - md->start_subject + 1;
3076 if (string_len < 80) {md->length = string_len; }
3077 else {md->length = 80;}
3079 PyMem_RESIZE(md->offset_top, int, md->length);
3080 PyMem_RESIZE(md->eptr, const uschar *, md->length);
3081 PyMem_RESIZE(md->ecode, const uschar *, md->length);
3082 PyMem_RESIZE(md->off_num, int, md->length);
3083 PyMem_RESIZE(md->r1, int, md->length);
3084 PyMem_RESIZE(md->r2, int, md->length);
3085 if (md->offset_top == NULL || md->eptr == NULL || md->ecode == NULL ||
3086 md->off_num == NULL || md->r1 == NULL || md->r2 == NULL)
3088 PyErr_NoMemory();
3089 longjmp(md->error_env, 1);
3091 return 0;
3095 /*************************************************
3096 * Match from current position *
3097 *************************************************/
3099 /* On entry ecode points to the first opcode, and eptr to the first character.
3101 Arguments:
3102 eptr pointer in subject
3103 ecode position in code
3104 offset_top current top pointer
3105 md pointer to "static" info for the match
3107 Returns: TRUE if matched
3110 static BOOL
3111 match(register const uschar *eptr, register const uschar *ecode, int offset_top,
3112 match_data *md)
3114 int save_stack_position = md->point;
3115 match_loop:
3117 #define SUCCEED goto succeed
3118 #define FAIL goto fail
3120 for (;;)
3122 int min, max, ctype;
3123 register int i;
3124 register int c;
3125 BOOL minimize = FALSE;
3127 /* Opening bracket. Check the alternative branches in turn, failing if none
3128 match. We have to set the start offset if required and there is space
3129 in the offset vector so that it is available for subsequent back references
3130 if the bracket matches. However, if the bracket fails, we must put back the
3131 previous value of both offsets in case they were set by a previous copy of
3132 the same bracket. Don't worry about setting the flag for the error case here;
3133 that is handled in the code for KET. */
3135 if ((int)*ecode >= OP_BRA)
3137 int number = (*ecode - OP_BRA) << 1;
3138 int save_offset1 = 0, save_offset2 = 0;
3140 DPRINTF(("start bracket %d\n", number/2));
3142 if (number > 0 && number < md->offset_end)
3144 save_offset1 = md->offset_vector[number];
3145 save_offset2 = md->offset_vector[number+1];
3146 md->offset_vector[number] = eptr - md->start_subject;
3148 DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
3151 /* Recurse for all the alternatives. */
3155 if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
3156 ecode += (ecode[1] << 8) + ecode[2];
3158 while (*ecode == OP_ALT);
3160 DPRINTF(("bracket %d failed\n", number/2));
3162 if (number > 0 && number < md->offset_end)
3164 md->offset_vector[number] = save_offset1;
3165 md->offset_vector[number+1] = save_offset2;
3168 FAIL;
3171 /* Other types of node can be handled by a switch */
3173 switch(*ecode)
3175 case OP_END:
3176 md->end_match_ptr = eptr; /* Record where we ended */
3177 md->end_offset_top = offset_top; /* and how many extracts were taken */
3178 SUCCEED;
3180 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3181 whole thing doesn't match, so we have to get out via a longjmp(). */
3183 case OP_CUT:
3184 if (match(eptr, ecode+1, offset_top, md)) SUCCEED;
3185 longjmp(md->fail_env, 1);
3187 /* Assertion brackets. Check the alternative branches in turn - the
3188 matching won't pass the KET for an assertion. If any one branch matches,
3189 the assertion is true. */
3191 case OP_ASSERT:
3194 if (match(eptr, ecode+3, offset_top, md)) break;
3195 ecode += (ecode[1] << 8) + ecode[2];
3197 while (*ecode == OP_ALT);
3198 if (*ecode == OP_KET) FAIL;
3200 /* Continue from after the assertion, updating the offsets high water
3201 mark, since extracts may have been taken during the assertion. */
3203 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3204 ecode += 3;
3205 offset_top = md->end_offset_top;
3206 continue;
3208 /* Negative assertion: all branches must fail to match */
3210 case OP_ASSERT_NOT:
3213 if (match(eptr, ecode+3, offset_top, md)) FAIL;
3214 ecode += (ecode[1] << 8) + ecode[2];
3216 while (*ecode == OP_ALT);
3217 ecode += 3;
3218 continue;
3220 /* "Once" brackets are like assertion brackets except that after a match,
3221 the point in the subject string is not moved back. Thus there can never be
3222 a move back into the brackets. Check the alternative branches in turn - the
3223 matching won't pass the KET for this kind of subpattern. If any one branch
3224 matches, we carry on, leaving the subject pointer. */
3226 case OP_ONCE:
3229 if (match(eptr, ecode+3, offset_top, md)) break;
3230 ecode += (ecode[1] << 8) + ecode[2];
3232 while (*ecode == OP_ALT);
3233 if (*ecode == OP_KET) FAIL;
3235 /* Continue as from after the assertion, updating the offsets high water
3236 mark, since extracts may have been taken. */
3238 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3239 ecode += 3;
3240 offset_top = md->end_offset_top;
3241 eptr = md->end_match_ptr;
3242 continue;
3244 /* An alternation is the end of a branch; scan along to find the end of the
3245 bracketed group and go to there. */
3247 case OP_ALT:
3248 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3249 break;
3251 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3252 that it may occur zero times. It may repeat infinitely, or not at all -
3253 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3254 repeat limits are compiled as a number of copies, with the optional ones
3255 preceded by BRAZERO or BRAMINZERO. */
3257 case OP_BRAZERO:
3259 const uschar *next = ecode+1;
3260 if (match(eptr, next, offset_top, md)) SUCCEED;
3261 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3262 ecode = next + 3;
3264 break;
3266 case OP_BRAMINZERO:
3268 const uschar *next = ecode+1;
3269 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3270 if (match(eptr, next+3, offset_top, md)) SUCCEED;
3271 ecode++;
3273 break;;
3275 /* End of a group, repeated or non-repeating. If we are at the end of
3276 an assertion "group", stop matching and SUCCEED, but record the
3277 current high water mark for use by positive assertions. */
3279 case OP_KET:
3280 case OP_KETRMIN:
3281 case OP_KETRMAX:
3283 int number;
3284 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3286 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
3288 md->end_match_ptr = eptr; /* For ONCE */
3289 md->end_offset_top = offset_top;
3290 SUCCEED;
3293 /* In all other cases we have to check the group number back at the
3294 start and if necessary complete handling an extraction by setting the
3295 final offset and bumping the high water mark. */
3297 number = (*prev - OP_BRA) << 1;
3299 DPRINTF(("end bracket %d\n", number/2));
3301 if (number > 0)
3303 if (number >= md->offset_end) md->offset_overflow = TRUE; else
3305 md->offset_vector[number+1] = eptr - md->start_subject;
3306 if (offset_top <= number) offset_top = number + 2;
3310 /* For a non-repeating ket, just advance to the next node and continue at
3311 this level. */
3313 if (*ecode == OP_KET)
3315 ecode += 3;
3316 break;
3319 /* The repeating kets try the rest of the pattern or restart from the
3320 preceding bracket, in the appropriate order. */
3322 if (*ecode == OP_KETRMIN)
3324 const uschar *ptr;
3325 if (match(eptr, ecode+3, offset_top, md)) goto succeed;
3326 /* Handle alternation inside the BRA...KET; push the additional
3327 alternatives onto the stack */
3328 ptr=prev;
3329 do {
3330 ptr += (ptr[1]<<8)+ ptr[2];
3331 if (*ptr==OP_ALT)
3333 if (md->length == md->point)
3335 grow_stack(md);
3337 md->offset_top[md->point] = offset_top;
3338 md->eptr[md->point] = eptr;
3339 md->ecode[md->point] = ptr+3;
3340 md->r1[md->point] = 0;
3341 md->r2[md->point] = 0;
3342 md->off_num[md->point] = 0;
3343 md->point++;
3345 } while (*ptr==OP_ALT);
3346 ecode=prev+3; goto match_loop;
3348 else /* OP_KETRMAX */
3350 const uschar *ptr;
3351 /*int points_pushed=0;*/
3353 /* Push one failure point, that will resume matching at the code after
3354 the KETRMAX opcode. */
3355 if (md->length == md->point)
3357 grow_stack(md);
3359 md->offset_top[md->point] = offset_top;
3360 md->eptr[md->point] = eptr;
3361 md->ecode[md->point] = ecode+3;
3362 md->r1[md->point] = md->offset_vector[number];
3363 md->r2[md->point] = md->offset_vector[number+1];
3364 md->off_num[md->point] = number;
3365 md->point++;
3367 md->offset_vector[number] = eptr - md->start_subject;
3368 /* Handle alternation inside the BRA...KET; push each of the
3369 additional alternatives onto the stack */
3370 ptr=prev;
3371 do {
3372 ptr += (ptr[1]<<8)+ ptr[2];
3373 if (*ptr==OP_ALT)
3375 if (md->length == md->point)
3376 if (md->length == md->point)
3378 grow_stack(md);
3380 md->offset_top[md->point] = offset_top;
3381 md->eptr[md->point] = eptr;
3382 md->ecode[md->point] = ptr+3;
3383 md->r1[md->point] = 0;
3384 md->r2[md->point] = 0;
3385 md->off_num[md->point] = 0;
3386 md->point++;
3387 /*points_pushed++;*/
3389 } while (*ptr==OP_ALT);
3390 /* Jump to the first (or only) alternative and resume trying to match */
3391 ecode=prev+3; goto match_loop;
3395 /* Start of subject unless notbol, or after internal newline if multiline */
3397 case OP_CIRC:
3398 if (md->notbol && eptr == md->start_subject) FAIL;
3399 if (md->multiline)
3401 if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
3402 ecode++;
3403 break;
3405 /* ... else fall through */
3407 /* Start of subject assertion */
3409 case OP_SOD:
3410 if (eptr != md->start_subject) FAIL;
3411 ecode++;
3412 break;
3414 /* Assert before internal newline if multiline, or before
3415 a terminating newline unless endonly is set, else end of subject unless
3416 noteol is set. */
3418 case OP_DOLL:
3419 if (md->noteol && eptr >= md->end_subject) FAIL;
3420 if (md->multiline)
3422 if (eptr < md->end_subject && *eptr != '\n') FAIL;
3423 ecode++;
3424 break;
3426 else if (!md->endonly)
3428 if (eptr < md->end_subject - 1 ||
3429 (eptr == md->end_subject - 1 && *eptr != '\n')) FAIL;
3430 ecode++;
3431 break;
3433 /* ... else fall through */
3435 /* End of subject assertion */
3437 case OP_EOD:
3438 if (eptr < md->end_subject) FAIL;
3439 ecode++;
3440 break;
3442 /* Word boundary assertions */
3444 case OP_NOT_WORD_BOUNDARY:
3445 case OP_WORD_BOUNDARY:
3447 BOOL prev_is_word = (eptr != md->start_subject) &&
3448 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3449 BOOL cur_is_word = (eptr < md->end_subject) &&
3450 ((pcre_ctypes[*eptr] & ctype_word) != 0);
3451 if ((*ecode++ == OP_WORD_BOUNDARY)?
3452 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3453 FAIL;
3455 break;
3457 case OP_NOT_WORD_BOUNDARY_L:
3458 case OP_WORD_BOUNDARY_L:
3460 BOOL prev_is_word = (eptr != md->start_subject) &&
3461 (isalnum(eptr[-1]) || eptr[-1]=='_');
3462 BOOL cur_is_word = (eptr < md->end_subject) &&
3463 (isalnum(*eptr) || *eptr=='_');
3464 if ((*ecode++ == OP_WORD_BOUNDARY_L)?
3465 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3466 FAIL;
3468 break;
3471 /* Match a single character type; inline for speed */
3473 case OP_ANY:
3474 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
3475 if (eptr++ >= md->end_subject) FAIL;
3476 ecode++;
3477 break;
3479 case OP_NOT_DIGIT:
3480 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3481 FAIL;
3482 ecode++;
3483 break;
3485 case OP_DIGIT:
3486 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3487 FAIL;
3488 ecode++;
3489 break;
3491 case OP_NOT_WHITESPACE:
3492 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3493 FAIL;
3494 ecode++;
3495 break;
3497 case OP_WHITESPACE:
3498 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3499 FAIL;
3500 ecode++;
3501 break;
3503 case OP_NOT_WORDCHAR:
3504 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3505 FAIL;
3506 ecode++;
3507 break;
3509 case OP_WORDCHAR:
3510 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3511 FAIL;
3512 ecode++;
3513 break;
3515 case OP_NOT_WORDCHAR_L:
3516 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ))
3517 FAIL;
3518 eptr++;
3519 ecode++;
3520 break;
3522 case OP_WORDCHAR_L:
3523 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ))
3524 FAIL;
3525 eptr++;
3526 ecode++;
3527 break;
3529 /* Match a back reference, possibly repeatedly. Look past the end of the
3530 item to see if there is repeat information following. The code is similar
3531 to that for character classes, but repeated for efficiency. Then obey
3532 similar code to character type repeats - written out again for speed.
3533 However, if the referenced string is the empty string, always treat
3534 it as matched, any number of times (otherwise there could be infinite
3535 loops). */
3537 case OP_REF:
3539 int length;
3540 int number = ecode[1] << 1; /* Doubled reference number */
3541 ecode += 2; /* Advance past the item */
3543 if (number >= offset_top || md->offset_vector[number] < 0)
3545 md->errorcode = PCRE_ERROR_BADREF;
3546 FAIL;
3549 length = md->offset_vector[number+1] - md->offset_vector[number];
3551 switch (*ecode)
3553 case OP_CRSTAR:
3554 case OP_CRMINSTAR:
3555 case OP_CRPLUS:
3556 case OP_CRMINPLUS:
3557 case OP_CRQUERY:
3558 case OP_CRMINQUERY:
3559 c = *ecode++ - OP_CRSTAR;
3560 minimize = (c & 1) != 0;
3561 min = rep_min[c]; /* Pick up values from tables; */
3562 max = rep_max[c]; /* zero for max => infinity */
3563 if (max == 0) max = INT_MAX;
3564 break;
3566 case OP_CRRANGE:
3567 case OP_CRMINRANGE:
3568 minimize = (*ecode == OP_CRMINRANGE);
3569 min = (ecode[1] << 8) + ecode[2];
3570 max = (ecode[3] << 8) + ecode[4];
3571 if (max == 0) max = INT_MAX;
3572 ecode += 5;
3573 break;
3575 default: /* No repeat follows */
3576 if (!match_ref(number, eptr, length, md)) FAIL;
3577 eptr += length;
3578 continue; /* With the main loop */
3581 /* If the length of the reference is zero, just continue with the
3582 main loop. */
3584 if (length == 0) continue;
3586 /* First, ensure the minimum number of matches are present. We get back
3587 the length of the reference string explicitly rather than passing the
3588 address of eptr, so that eptr can be a register variable. */
3590 for (i = 1; i <= min; i++)
3592 if (!match_ref(number, eptr, length, md)) FAIL;
3593 eptr += length;
3596 /* If min = max, continue at the same level without recursion.
3597 They are not both allowed to be zero. */
3599 if (min == max) continue;
3601 /* If minimizing, keep trying and advancing the pointer */
3603 if (minimize)
3605 for (i = min;; i++)
3607 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3608 if (i >= max || !match_ref(number, eptr, length, md))
3609 FAIL;
3610 eptr += length;
3612 /* Control never gets here */
3615 /* If maximizing, find the longest string and work backwards */
3617 else
3619 const uschar *pp = eptr;
3620 for (i = min; i < max; i++)
3622 if (!match_ref(number, eptr, length, md)) break;
3623 eptr += length;
3625 while (eptr >= pp)
3627 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3628 eptr -= length;
3630 FAIL;
3633 /* Control never gets here */
3635 /* Match a character class, possibly repeatedly. Look past the end of the
3636 item to see if there is repeat information following. Then obey similar
3637 code to character type repeats - written out again for speed. If caseless
3638 matching was set at runtime but not at compile time, we have to check both
3639 versions of a character, and we have to behave differently for positive and
3640 negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
3641 treated differently. */
3643 case OP_CLASS:
3644 case OP_NEGCLASS:
3646 BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
3647 const uschar *data = ecode + 1; /* Save for matching */
3648 ecode += 33; /* Advance past the item */
3650 switch (*ecode)
3652 case OP_CRSTAR:
3653 case OP_CRMINSTAR:
3654 case OP_CRPLUS:
3655 case OP_CRMINPLUS:
3656 case OP_CRQUERY:
3657 case OP_CRMINQUERY:
3658 c = *ecode++ - OP_CRSTAR;
3659 minimize = (c & 1) != 0;
3660 min = rep_min[c]; /* Pick up values from tables; */
3661 max = rep_max[c]; /* zero for max => infinity */
3662 if (max == 0) max = INT_MAX;
3663 break;
3665 case OP_CRRANGE:
3666 case OP_CRMINRANGE:
3667 minimize = (*ecode == OP_CRMINRANGE);
3668 min = (ecode[1] << 8) + ecode[2];
3669 max = (ecode[3] << 8) + ecode[4];
3670 if (max == 0) max = INT_MAX;
3671 ecode += 5;
3672 break;
3674 default: /* No repeat follows */
3675 min = max = 1;
3676 break;
3679 /* First, ensure the minimum number of matches are present. */
3681 for (i = 1; i <= min; i++)
3683 if (eptr >= md->end_subject) FAIL;
3684 c = *eptr++;
3686 /* Either not runtime caseless, or it was a positive class. For
3687 runtime caseless, continue if either case is in the map. */
3689 if (!nasty_case)
3691 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3692 if (md->runtime_caseless)
3694 c = pcre_fcc[c];
3695 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3699 /* Runtime caseless and it was a negative class. Continue only if
3700 both cases are in the map. */
3702 else
3704 if ((data[c/8] & (1 << (c&7))) == 0) FAIL;
3705 c = pcre_fcc[c];
3706 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3709 FAIL;
3712 /* If max == min we can continue with the main loop without the
3713 need to recurse. */
3715 if (min == max) continue;
3717 /* If minimizing, keep testing the rest of the expression and advancing
3718 the pointer while it matches the class. */
3720 if (minimize)
3722 for (i = min;; i++)
3724 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3725 if (i >= max || eptr >= md->end_subject) FAIL;
3726 c = *eptr++;
3728 /* Either not runtime caseless, or it was a positive class. For
3729 runtime caseless, continue if either case is in the map. */
3731 if (!nasty_case)
3733 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3734 if (md->runtime_caseless)
3736 c = pcre_fcc[c];
3737 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3741 /* Runtime caseless and it was a negative class. Continue only if
3742 both cases are in the map. */
3744 else
3746 if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
3747 c = pcre_fcc[c];
3748 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3751 FAIL;
3753 /* Control never gets here */
3756 /* If maximizing, find the longest possible run, then work backwards. */
3758 else
3760 const uschar *pp = eptr;
3761 for (i = min; i < max; eptr++, i++)
3763 if (eptr >= md->end_subject) break;
3764 c = *eptr;
3766 /* Either not runtime caseless, or it was a positive class. For
3767 runtime caseless, continue if either case is in the map. */
3769 if (!nasty_case)
3771 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3772 if (md->runtime_caseless)
3774 c = pcre_fcc[c];
3775 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3779 /* Runtime caseless and it was a negative class. Continue only if
3780 both cases are in the map. */
3782 else
3784 if ((data[c/8] & (1 << (c&7))) == 0) break;
3785 c = pcre_fcc[c];
3786 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3789 break;
3792 while (eptr >= pp)
3793 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3794 FAIL;
3797 /* Control never gets here */
3799 /* OP_CLASS_L opcode: handles localized character classes */
3801 case OP_CLASS_L:
3803 const uschar *data = ecode + 1; /* Save for matching */
3804 const uschar locale_flag = *data;
3805 ecode++; data++; /* The localization support adds an extra byte */
3807 ecode += 33; /* Advance past the item */
3809 switch (*ecode)
3811 case OP_CRSTAR:
3812 case OP_CRMINSTAR:
3813 case OP_CRPLUS:
3814 case OP_CRMINPLUS:
3815 case OP_CRQUERY:
3816 case OP_CRMINQUERY:
3817 c = *ecode++ - OP_CRSTAR;
3818 minimize = (c & 1) != 0;
3819 min = rep_min[c]; /* Pick up values from tables; */
3820 max = rep_max[c]; /* zero for max => infinity */
3821 if (max == 0) max = INT_MAX;
3822 break;
3824 case OP_CRRANGE:
3825 case OP_CRMINRANGE:
3826 minimize = (*ecode == OP_CRMINRANGE);
3827 min = (ecode[1] << 8) + ecode[2];
3828 max = (ecode[3] << 8) + ecode[4];
3829 if (max == 0) max = INT_MAX;
3830 ecode += 5;
3831 break;
3833 default: /* No repeat follows */
3834 if (eptr >= md->end_subject) FAIL;
3835 c = *eptr++;
3836 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3837 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3838 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3839 #if 0
3840 if ( (locale_flag & 4) && isdigit(c) ) continue; /* Locale \d */
3841 if ( (locale_flag & 8) && !isdigit(c) ) continue; /* Locale \D */
3842 if ( (locale_flag & 16) && isspace(c) ) continue; /* Locale \s */
3843 if ( (locale_flag & 32) && !isspace(c) ) continue; /* Locale \S */
3844 #endif
3846 if (md->runtime_caseless)
3848 c = pcre_fcc[c];
3849 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3851 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3852 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3854 FAIL;
3857 /* First, ensure the minimum number of matches are present. */
3859 for (i = 1; i <= min; i++)
3861 if (eptr >= md->end_subject) FAIL;
3862 c = *eptr++;
3863 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3864 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3865 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3867 if (md->runtime_caseless)
3869 c = pcre_fcc[c];
3870 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3871 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3872 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3874 FAIL;
3877 /* If max == min we can continue with the main loop without the
3878 need to recurse. */
3880 if (min == max) continue;
3882 /* If minimizing, keep testing the rest of the expression and advancing
3883 the pointer while it matches the class. */
3885 if (minimize)
3887 for (i = min;; i++)
3889 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3890 if (i >= max || eptr >= md->end_subject) FAIL;
3891 c = *eptr++;
3892 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3893 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3894 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3896 if (md->runtime_caseless)
3898 c = pcre_fcc[c];
3899 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3900 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3901 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3903 FAIL;
3905 /* Control never gets here */
3908 /* If maximizing, find the longest possible run, then work backwards. */
3910 else
3912 const uschar *pp = eptr;
3913 for (i = min; i < max; eptr++, i++)
3915 if (eptr >= md->end_subject) break;
3916 c = *eptr;
3917 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3918 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3919 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3920 if (md->runtime_caseless)
3922 c = pcre_fcc[c];
3923 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3924 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3925 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3927 break;
3930 while (eptr >= pp)
3931 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3932 FAIL;
3935 /* Control never gets here */
3937 /* Match a run of characters */
3939 case OP_CHARS:
3941 register int length = ecode[1];
3942 ecode += 2;
3944 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3945 if (eptr >= md->end_subject)
3946 printf("matching subject <null> against pattern ");
3947 else
3949 printf("matching subject ");
3950 pchars(eptr, length, TRUE, md);
3951 printf(" against pattern ");
3953 pchars(ecode, length, FALSE, md);
3954 printf("\n");
3955 #endif
3957 if (length > md->end_subject - eptr) FAIL;
3958 if (md->caseless)
3960 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL;
3962 else
3964 while (length-- > 0) if (*ecode++ != *eptr++) FAIL;
3967 break;
3969 /* Match a single character repeatedly; different opcodes share code. */
3971 case OP_EXACT:
3972 min = max = (ecode[1] << 8) + ecode[2];
3973 ecode += 3;
3974 goto REPEATCHAR;
3976 case OP_UPTO:
3977 case OP_MINUPTO:
3978 min = 0;
3979 max = (ecode[1] << 8) + ecode[2];
3980 minimize = *ecode == OP_MINUPTO;
3981 ecode += 3;
3982 goto REPEATCHAR;
3984 case OP_STAR:
3985 case OP_MINSTAR:
3986 case OP_PLUS:
3987 case OP_MINPLUS:
3988 case OP_QUERY:
3989 case OP_MINQUERY:
3990 c = *ecode++ - OP_STAR;
3991 minimize = (c & 1) != 0;
3992 min = rep_min[c]; /* Pick up values from tables; */
3993 max = rep_max[c]; /* zero for max => infinity */
3994 if (max == 0) max = INT_MAX;
3996 /* Common code for all repeated single-character matches. We can give
3997 up quickly if there are fewer than the minimum number of characters left in
3998 the subject. */
4000 REPEATCHAR:
4001 if (min > md->end_subject - eptr) FAIL;
4002 c = *ecode++;
4004 /* The code is duplicated for the caseless and caseful cases, for speed,
4005 since matching characters is likely to be quite common. First, ensure the
4006 minimum number of matches are present. If min = max, continue at the same
4007 level without recursing. Otherwise, if minimizing, keep trying the rest of
4008 the expression and advancing one matching character if failing, up to the
4009 maximum. Alternatively, if maximizing, find the maximum number of
4010 characters and work backwards. */
4012 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4013 max, eptr));
4015 if (md->caseless)
4017 c = pcre_lcc[c];
4018 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
4019 if (min == max) continue;
4020 if (minimize)
4022 for (i = min;; i++)
4024 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4025 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
4026 FAIL;
4028 /* Control never gets here */
4030 else
4032 const uschar *pp = eptr;
4033 for (i = min; i < max; i++)
4035 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
4036 eptr++;
4038 while (eptr >= pp)
4039 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4040 FAIL;
4042 /* Control never gets here */
4045 /* Caseful comparisons */
4047 else
4049 for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
4050 if (min == max) continue;
4051 if (minimize)
4053 for (i = min;; i++)
4055 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4056 if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL;
4058 /* Control never gets here */
4060 else
4062 const uschar *pp = eptr;
4063 for (i = min; i < max; i++)
4065 if (eptr >= md->end_subject || c != *eptr) break;
4066 eptr++;
4068 while (eptr >= pp)
4069 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4070 FAIL;
4073 /* Control never gets here */
4075 /* Match a negated single character */
4077 case OP_NOT:
4078 if (eptr >= md->end_subject) FAIL;
4079 ecode++;
4080 if (md->caseless)
4082 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) FAIL;
4084 else
4086 if (*ecode++ == *eptr++) FAIL;
4088 break;
4090 /* Match a negated single character repeatedly. This is almost a repeat of
4091 the code for a repeated single character, but I haven't found a nice way of
4092 commoning these up that doesn't require a test of the positive/negative
4093 option for each character match. Maybe that wouldn't add very much to the
4094 time taken, but character matching *is* what this is all about... */
4096 case OP_NOTEXACT:
4097 min = max = (ecode[1] << 8) + ecode[2];
4098 ecode += 3;
4099 goto REPEATNOTCHAR;
4101 case OP_NOTUPTO:
4102 case OP_NOTMINUPTO:
4103 min = 0;
4104 max = (ecode[1] << 8) + ecode[2];
4105 minimize = *ecode == OP_NOTMINUPTO;
4106 ecode += 3;
4107 goto REPEATNOTCHAR;
4109 case OP_NOTSTAR:
4110 case OP_NOTMINSTAR:
4111 case OP_NOTPLUS:
4112 case OP_NOTMINPLUS:
4113 case OP_NOTQUERY:
4114 case OP_NOTMINQUERY:
4115 c = *ecode++ - OP_NOTSTAR;
4116 minimize = (c & 1) != 0;
4117 min = rep_min[c]; /* Pick up values from tables; */
4118 max = rep_max[c]; /* zero for max => infinity */
4119 if (max == 0) max = INT_MAX;
4121 /* Common code for all repeated single-character matches. We can give
4122 up quickly if there are fewer than the minimum number of characters left in
4123 the subject. */
4125 REPEATNOTCHAR:
4126 if (min > md->end_subject - eptr) FAIL;
4127 c = *ecode++;
4129 /* The code is duplicated for the caseless and caseful cases, for speed,
4130 since matching characters is likely to be quite common. First, ensure the
4131 minimum number of matches are present. If min = max, continue at the same
4132 level without recursing. Otherwise, if minimizing, keep trying the rest of
4133 the expression and advancing one matching character if failing, up to the
4134 maximum. Alternatively, if maximizing, find the maximum number of
4135 characters and work backwards. */
4137 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4138 max, eptr));
4140 if (md->caseless)
4142 c = pcre_lcc[c];
4143 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) FAIL;
4144 if (min == max) continue;
4145 if (minimize)
4147 for (i = min;; i++)
4149 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4150 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
4151 FAIL;
4153 /* Control never gets here */
4155 else
4157 const uschar *pp = eptr;
4158 for (i = min; i < max; i++)
4160 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
4161 eptr++;
4163 while (eptr >= pp)
4164 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4165 FAIL;
4167 /* Control never gets here */
4170 /* Caseful comparisons */
4172 else
4174 for (i = 1; i <= min; i++) if (c == *eptr++) FAIL;
4175 if (min == max) continue;
4176 if (minimize)
4178 for (i = min;; i++)
4180 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4181 if (i >= max || eptr >= md->end_subject || c == *eptr++) FAIL;
4183 /* Control never gets here */
4185 else
4187 const uschar *pp = eptr;
4188 for (i = min; i < max; i++)
4190 if (eptr >= md->end_subject || c == *eptr) break;
4191 eptr++;
4193 while (eptr >= pp)
4194 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4195 FAIL;
4198 /* Control never gets here */
4200 /* Match a single character type repeatedly; several different opcodes
4201 share code. This is very similar to the code for single characters, but we
4202 repeat it in the interests of efficiency. */
4204 case OP_TYPEEXACT:
4205 min = max = (ecode[1] << 8) + ecode[2];
4206 minimize = TRUE;
4207 ecode += 3;
4208 goto REPEATTYPE;
4210 case OP_TYPEUPTO:
4211 case OP_TYPEMINUPTO:
4212 min = 0;
4213 max = (ecode[1] << 8) + ecode[2];
4214 minimize = *ecode == OP_TYPEMINUPTO;
4215 ecode += 3;
4216 goto REPEATTYPE;
4218 case OP_TYPESTAR:
4219 case OP_TYPEMINSTAR:
4220 case OP_TYPEPLUS:
4221 case OP_TYPEMINPLUS:
4222 case OP_TYPEQUERY:
4223 case OP_TYPEMINQUERY:
4224 c = *ecode++ - OP_TYPESTAR;
4225 minimize = (c & 1) != 0;
4226 min = rep_min[c]; /* Pick up values from tables; */
4227 max = rep_max[c]; /* zero for max => infinity */
4228 if (max == 0) max = INT_MAX;
4230 /* Common code for all repeated single character type matches */
4232 REPEATTYPE:
4233 ctype = *ecode++; /* Code for the character type */
4235 /* First, ensure the minimum number of matches are present. Use inline
4236 code for maximizing the speed, and do the type test once at the start
4237 (i.e. keep it out of the loop). Also test that there are at least the
4238 minimum number of characters before we start. */
4240 if (min > md->end_subject - eptr) FAIL;
4241 if (min > 0) switch(ctype)
4243 case OP_ANY:
4244 if (!md->dotall)
4245 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
4246 else eptr += min;
4247 break;
4249 case OP_NOT_DIGIT:
4250 for (i = 1; i <= min; i++)
4251 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
4252 break;
4254 case OP_DIGIT:
4255 for (i = 1; i <= min; i++)
4256 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
4257 break;
4259 case OP_NOT_WHITESPACE:
4260 for (i = 1; i <= min; i++)
4261 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
4262 break;
4264 case OP_WHITESPACE:
4265 for (i = 1; i <= min; i++)
4266 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
4267 break;
4269 case OP_NOT_WORDCHAR:
4270 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
4271 FAIL;
4272 break;
4274 case OP_WORDCHAR:
4275 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
4276 FAIL;
4277 break;
4279 case OP_NOT_WORDCHAR_L:
4280 for (i = 1; i <= min; i++, eptr++) if (*eptr=='_' || isalnum(*eptr))
4281 FAIL;
4282 break;
4284 case OP_WORDCHAR_L:
4285 for (i = 1; i <= min; i++, eptr++) if (*eptr!='_' && !isalnum(*eptr))
4286 FAIL;
4287 break;
4290 /* If min = max, continue at the same level without recursing */
4292 if (min == max) continue;
4294 /* If minimizing, we have to test the rest of the pattern before each
4295 subsequent match, so inlining isn't much help; just use the function. */
4297 if (minimize)
4299 for (i = min;; i++)
4301 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4302 if (i >= max || eptr >= md->end_subject ||
4303 !match_type(ctype, *eptr++, md->dotall))
4304 FAIL;
4306 /* Control never gets here */
4309 /* If maximizing it is worth using inline code for speed, doing the type
4310 test once at the start (i.e. keep it out of the loop). */
4312 else
4314 const uschar *pp = eptr;
4315 switch(ctype)
4317 case OP_ANY:
4318 if (!md->dotall)
4320 for (i = min; i < max; i++)
4322 if (eptr >= md->end_subject || *eptr == '\n') break;
4323 eptr++;
4326 else
4328 c = max - min;
4329 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4330 eptr += c;
4332 break;
4334 case OP_NOT_DIGIT:
4335 for (i = min; i < max; i++)
4337 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
4338 break;
4339 eptr++;
4341 break;
4343 case OP_DIGIT:
4344 for (i = min; i < max; i++)
4346 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
4347 break;
4348 eptr++;
4350 break;
4352 case OP_NOT_WHITESPACE:
4353 for (i = min; i < max; i++)
4355 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
4356 break;
4357 eptr++;
4359 break;
4361 case OP_WHITESPACE:
4362 for (i = min; i < max; i++)
4364 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
4365 break;
4366 eptr++;
4368 break;
4370 case OP_NOT_WORDCHAR:
4371 for (i = min; i < max; i++)
4373 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
4374 break;
4375 eptr++;
4377 break;
4379 case OP_WORDCHAR:
4380 for (i = min; i < max; i++)
4382 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
4383 break;
4384 eptr++;
4386 break;
4387 case OP_NOT_WORDCHAR_L:
4388 for (i = min; i < max; i++)
4390 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ) )
4391 break;
4392 eptr++;
4394 break;
4396 case OP_WORDCHAR_L:
4397 for (i = min; i < max; i++)
4399 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ) )
4400 break;
4401 eptr++;
4403 break;
4406 while (eptr >= pp)
4407 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4408 FAIL;
4410 /* Control never gets here */
4412 /* There's been some horrible disaster. */
4414 default:
4415 DPRINTF(("Unknown opcode %d\n", *ecode));
4416 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4417 FAIL;
4420 /* Do not stick any code in here without much thought; it is assumed
4421 that "continue" in the code above comes out to here to repeat the main
4422 loop. */
4424 } /* End of main loop */
4425 /* Control never reaches here */
4427 fail:
4428 if (md->point > save_stack_position)
4430 /* If there are still points remaining on the stack, pop the next one off */
4431 int off_num;
4433 md->point--;
4434 offset_top = md->offset_top[md->point];
4435 eptr = md->eptr[md->point];
4436 ecode = md->ecode[md->point];
4437 off_num = md->off_num[md->point];
4438 md->offset_vector[off_num] = md->r1[md->point];
4439 md->offset_vector[off_num+1] = md->r2[md->point];
4440 goto match_loop;
4442 /* Failure, and nothing left on the stack, so end this function call */
4444 /* Restore the top of the stack to where it was before this function
4445 call. This lets us use one stack for everything; recursive calls
4446 can push and pop information, and may increase the stack. When
4447 the call returns, the parent function can resume pushing and
4448 popping wherever it was. */
4450 md->point = save_stack_position;
4451 return FALSE;
4453 succeed:
4454 return TRUE;
4459 /*************************************************
4460 * Segregate setjmp() *
4461 *************************************************/
4463 /* The -Wall option of gcc gives warnings for all local variables when setjmp()
4464 is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
4465 hide it in a separate function. This is called only when PCRE_EXTRA is set,
4466 since it's needed only for the extension \X option, and with any luck, a good
4467 compiler will spot the tail recursion and compile it efficiently.
4469 Arguments:
4470 eptr pointer in subject
4471 ecode position in code
4472 offset_top current top pointer
4473 md pointer to "static" info for the match
4475 Returns: TRUE if matched
4478 static BOOL
4479 match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
4480 match_data *match_block)
4482 return setjmp(match_block->fail_env) == 0 &&
4483 match(eptr, ecode, offset_top, match_block);
4488 /*************************************************
4489 * Execute a Regular Expression *
4490 *************************************************/
4492 /* This function applies a compiled re to a subject string and picks out
4493 portions of the string if it matches. Two elements in the vector are set for
4494 each substring: the offsets to the start and end of the substring.
4496 Arguments:
4497 external_re points to the compiled expression
4498 external_extra points to "hints" from pcre_study() or is NULL
4499 subject points to the subject string
4500 length length of subject string (may contain binary zeros)
4501 options option bits
4502 offsets points to a vector of ints to be filled in with offsets
4503 offsetcount the number of elements in the vector
4505 Returns: > 0 => success; value is the number of elements filled in
4506 = 0 => success, but offsets is not big enough
4507 -1 => failed to match
4508 < -1 => some kind of unexpected problem
4512 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4513 const char *subject, int length, int start_pos, int options,
4514 int *offsets, int offsetcount)
4516 /* The "volatile" directives are to make gcc -Wall stop complaining
4517 that these variables can be clobbered by the longjmp. Hopefully
4518 they won't cost too much performance. */
4519 volatile int resetcount, ocount;
4520 volatile int first_char = -1;
4521 match_data match_block;
4522 const uschar *start_bits = NULL;
4523 const uschar *start_match = (const uschar *)subject + start_pos;
4524 const uschar *end_subject;
4525 const real_pcre *re = (const real_pcre *)external_re;
4526 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4527 volatile BOOL using_temporary_offsets = FALSE;
4528 volatile BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4529 volatile BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4531 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4533 if (re == NULL || subject == NULL ||
4534 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4535 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4537 match_block.start_subject = (const uschar *)subject;
4538 match_block.end_subject = match_block.start_subject + length;
4539 end_subject = match_block.end_subject;
4541 match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
4542 match_block.runtime_caseless = match_block.caseless &&
4543 (re->options & PCRE_CASELESS) == 0;
4545 match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
4546 match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
4547 match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
4549 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4550 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4552 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4554 /* Set the stack state to empty */
4555 match_block.off_num = match_block.offset_top = NULL;
4556 match_block.r1 = match_block.r2 = NULL;
4557 match_block.eptr = match_block.ecode = NULL;
4558 match_block.point = match_block.length = 0;
4560 /* If the expression has got more back references than the offsets supplied can
4561 hold, we get a temporary bit of working store to use during the matching.
4562 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4563 of 2. */
4565 ocount = offsetcount & (-2);
4566 if (re->top_backref > 0 && re->top_backref >= ocount/2)
4568 ocount = re->top_backref * 2 + 2;
4569 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4570 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4571 using_temporary_offsets = TRUE;
4572 DPRINTF(("Got memory to hold back references\n"));
4574 else match_block.offset_vector = offsets;
4576 match_block.offset_end = ocount;
4577 match_block.offset_overflow = FALSE;
4579 /* Compute the minimum number of offsets that we need to reset each time. Doing
4580 this makes a huge difference to execution time when there aren't many brackets
4581 in the pattern. */
4583 resetcount = 2 + re->top_bracket * 2;
4584 if (resetcount > offsetcount) resetcount = ocount;
4586 /* If MULTILINE is set at exec time but was not set at compile time, and the
4587 anchored flag is set, we must re-check because a setting provoked by ^ in the
4588 pattern is not right in multi-line mode. Calling is_anchored() again here does
4589 the right check, because multiline is now set. If it now yields FALSE, the
4590 expression must have had ^ starting some of its branches. Check to see if
4591 that is true for *all* branches, and if so, set the startline flag. */
4593 if (match_block.multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
4594 !is_anchored(re->code, match_block.multiline))
4596 anchored = FALSE;
4597 if (is_startline(re->code)) startline = TRUE;
4600 /* Set up the first character to match, if available. The first_char value is
4601 never set for an anchored regular expression, but the anchoring may be forced
4602 at run time, so we have to test for anchoring. The first char may be unset for
4603 an unanchored pattern, of course. If there's no first char and the pattern was
4604 studied, the may be a bitmap of possible first characters. However, we can
4605 use this only if the caseless state of the studying was correct. */
4607 if (!anchored)
4609 if ((re->options & PCRE_FIRSTSET) != 0)
4611 first_char = re->first_char;
4612 if (match_block.caseless) first_char = pcre_lcc[first_char];
4614 else
4615 if (!startline && extra != NULL &&
4616 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
4617 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
4618 start_bits = extra->start_bits;
4621 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4625 int rc;
4626 register int *iptr = match_block.offset_vector;
4627 register int *iend = iptr + resetcount;
4629 /* Reset the maximum number of extractions we might see. */
4631 while (iptr < iend) *iptr++ = -1;
4633 /* Advance to a unique first char if possible */
4635 if (first_char >= 0)
4637 if (match_block.caseless)
4638 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4639 start_match++;
4640 else
4641 while (start_match < end_subject && *start_match != first_char)
4642 start_match++;
4645 /* Or to just after \n for a multiline match if possible */
4647 else if (startline)
4649 if (start_match > match_block.start_subject)
4651 while (start_match < end_subject && start_match[-1] != '\n')
4652 start_match++;
4656 /* Or to a non-unique first char */
4658 else if (start_bits != NULL)
4660 while (start_match < end_subject)
4662 register int c = *start_match;
4663 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4667 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4668 printf(">>>> Match against: ");
4669 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4670 printf("\n");
4671 #endif
4673 /* When a match occurs, substrings will be set for all internal extractions;
4674 we just need to set up the whole thing as substring 0 before returning. If
4675 there were too many extractions, set the return code to zero. In the case
4676 where we had to get some local store to hold offsets for backreferences, copy
4677 those back references that we can. In this case there need not be overflow
4678 if certain parts of the pattern were not used.
4680 Before starting the match, we have to set up a longjmp() target to enable
4681 the "cut" operation to fail a match completely without backtracking. This
4682 is done in a separate function to avoid compiler warnings. We need not do
4683 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
4684 enabled. */
4686 /* To handle errors such as running out of memory for the failure
4687 stack, we need to save this location via setjmp(), so
4688 error-handling code can call longjmp() to jump out of deeply-nested code. */
4689 if (setjmp(match_block.error_env)==0)
4692 if ((re->options & PCRE_EXTRA) != 0)
4694 if (!match_with_setjmp(start_match, re->code, 2, &match_block))
4695 continue;
4697 else if (!match(start_match, re->code, 2, &match_block)) continue;
4699 /* Copy the offset information from temporary store if necessary */
4701 if (using_temporary_offsets)
4703 if (offsetcount >= 4)
4705 memcpy(offsets + 2, match_block.offset_vector + 2,
4706 (offsetcount - 2) * sizeof(int));
4707 DPRINTF(("Copied offsets from temporary memory\n"));
4709 if (match_block.end_offset_top > offsetcount)
4710 match_block.offset_overflow = TRUE;
4712 DPRINTF(("Freeing temporary memory\n"));
4713 (pcre_free)(match_block.offset_vector);
4716 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4718 if (match_block.offset_end < 2) rc = 0; else
4720 offsets[0] = start_match - match_block.start_subject;
4721 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4724 DPRINTF((">>>> returning %d\n", rc));
4725 free_stack(&match_block);
4726 return rc;
4727 } /* End of (if setjmp(match_block.error_env)...) */
4728 free_stack(&match_block);
4730 /* Return an error code; pcremodule.c will preserve the exception */
4731 if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY;
4733 while (!anchored &&
4734 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4735 start_match++ < end_subject);
4737 if (using_temporary_offsets)
4739 DPRINTF(("Freeing temporary memory\n"));
4740 (pcre_free)(match_block.offset_vector);
4743 #ifdef DEBUG
4744 printf(">>>> returning %d\n", match_block.errorcode);
4745 #endif
4747 return match_block.errorcode;
4750 /* End of pcre.c */