This commit was manufactured by cvs2svn to create tag 'r22a4-fork'.
[python/dscho.git] / Modules / pypcre.c
blobc6a14ec52813a27e04b4c3b006343c277bcfff68
2 /*************************************************
3 * Perl-Compatible Regular Expressions *
4 *************************************************/
6 /* DO NOT EDIT THIS FILE! */
8 /* This file is automatically written by the merge-files.py script
9 included with the PCRE distribution for Python; it's produced from
10 several C files, and code is removed in the process. If you want to
11 modify the code or track down bugs, it will be much easier to work
12 with the code in its original, multiple-file form. Don't edit this
13 file by hand, or submit patches to it.
15 The Python-specific PCRE distribution can be retrieved from
16 http://starship.skyport.net/crew/amk/regex/
18 The unmodified original PCRE distribution is available at
19 ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
20 written by: Philip Hazel <ph10@cam.ac.uk>
22 Extensively modified by the Python String-SIG: <string-sig@python.org>
23 Send bug reports to: <string-sig@python.org>
24 (They'll figure out if it's a bug in PCRE or in the Python-specific
25 changes.)
27 Copyright (c) 1997 University of Cambridge
29 -----------------------------------------------------------------------------
30 Permission is granted to anyone to use this software for any purpose on any
31 computer system, and to redistribute it freely, subject to the following
32 restrictions:
34 1. This software is distributed in the hope that it will be useful,
35 but WITHOUT ANY WARRANTY; without even the implied warranty of
36 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
38 2. The origin of this software must not be misrepresented, either by
39 explicit claim or by omission.
41 3. Altered versions must be plainly marked as such, and must not be
42 misrepresented as being the original software.
43 -----------------------------------------------------------------------------
47 #define FOR_PYTHON
48 #include "pcre-int.h"
49 #include "Python.h"
50 #include <ctype.h>
51 #include "graminit.h"
53 /*************************************************
54 * Perl-Compatible Regular Expressions *
55 *************************************************/
57 /* This file is automatically written by the makechartables auxiliary
58 program. If you edit it by hand, you might like to edit the Makefile to
59 prevent its ever being regenerated. */
61 /* This table is a lower casing table. */
/* Lower-casing table, indexed by byte value: 'A'-'Z' (65-90) map to
'a'-'z' (97-122); every other byte maps to itself. */

unsigned char pcre_lcc[] = {
    0,  1,  2,  3,  4,  5,  6,  7,
    8,  9, 10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23,
   24, 25, 26, 27, 28, 29, 30, 31,
   32, 33, 34, 35, 36, 37, 38, 39,
   40, 41, 42, 43, 44, 45, 46, 47,
   48, 49, 50, 51, 52, 53, 54, 55,
   56, 57, 58, 59, 60, 61, 62, 63,
   64, 97, 98, 99,100,101,102,103,
  104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,
  120,121,122, 91, 92, 93, 94, 95,
   96, 97, 98, 99,100,101,102,103,
  104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,
  120,121,122,123,124,125,126,127,
  128,129,130,131,132,133,134,135,
  136,137,138,139,140,141,142,143,
  144,145,146,147,148,149,150,151,
  152,153,154,155,156,157,158,159,
  160,161,162,163,164,165,166,167,
  168,169,170,171,172,173,174,175,
  176,177,178,179,180,181,182,183,
  184,185,186,187,188,189,190,191,
  192,193,194,195,196,197,198,199,
  200,201,202,203,204,205,206,207,
  208,209,210,211,212,213,214,215,
  216,217,218,219,220,221,222,223,
  224,225,226,227,228,229,230,231,
  232,233,234,235,236,237,238,239,
  240,241,242,243,244,245,246,247,
  248,249,250,251,252,253,254,255 };
97 /* This table is a case flipping table. */
/* Case-flipping table, indexed by byte value: 'A'-'Z' map to 'a'-'z' and
'a'-'z' map to 'A'-'Z'; every other byte maps to itself. */

unsigned char pcre_fcc[] = {
    0,  1,  2,  3,  4,  5,  6,  7,
    8,  9, 10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23,
   24, 25, 26, 27, 28, 29, 30, 31,
   32, 33, 34, 35, 36, 37, 38, 39,
   40, 41, 42, 43, 44, 45, 46, 47,
   48, 49, 50, 51, 52, 53, 54, 55,
   56, 57, 58, 59, 60, 61, 62, 63,
   64, 97, 98, 99,100,101,102,103,
  104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,
  120,121,122, 91, 92, 93, 94, 95,
   96, 65, 66, 67, 68, 69, 70, 71,
   72, 73, 74, 75, 76, 77, 78, 79,
   80, 81, 82, 83, 84, 85, 86, 87,
   88, 89, 90,123,124,125,126,127,
  128,129,130,131,132,133,134,135,
  136,137,138,139,140,141,142,143,
  144,145,146,147,148,149,150,151,
  152,153,154,155,156,157,158,159,
  160,161,162,163,164,165,166,167,
  168,169,170,171,172,173,174,175,
  176,177,178,179,180,181,182,183,
  184,185,186,187,188,189,190,191,
  192,193,194,195,196,197,198,199,
  200,201,202,203,204,205,206,207,
  208,209,210,211,212,213,214,215,
  216,217,218,219,220,221,222,223,
  224,225,226,227,228,229,230,231,
  232,233,234,235,236,237,238,239,
  240,241,242,243,244,245,246,247,
  248,249,250,251,252,253,254,255 };
133 /* This table contains bit maps for digits, letters, 'word' chars, and
134 white space. Each map is 32 bytes long and the bits run from the least
135 significant end of each byte. */
/* Four consecutive 32-byte bit maps (128 bytes in all). Bit (c & 7) of byte
(c / 8) within a map is set when character c belongs to that map's class. */

unsigned char pcre_cbits[] = {
  /* Map 0: decimal digits '0'-'9' */
  0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,

  /* Map 1: letters 'A'-'Z' and 'a'-'z' */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,

  /* Map 2: 'word' characters - digits, letters and '_' */
  0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
  0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,

  /* Map 3: white space - bytes 9-13 and the space character */
  0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
/* This table identifies various classes of character by individual bits:
  0x01   white space character
  0x02   letter
  0x04   decimal digit
  0x08   hexadecimal digit
  0x10   alphanumeric or '_'
  0x20   octal digit (set in the table for '0'-'7' only)
  0x80   regular expression metacharacter or binary zero
*/
/* Per-character class flags, indexed by byte value. Each entry is an OR of:
  0x01 white space, 0x02 letter, 0x04 decimal digit, 0x08 hexadecimal digit,
  0x10 alphanumeric or '_', 0x80 regex metacharacter or binary zero.
The entries for '0'-'7' additionally carry 0x20, which '8' and '9' lack
(presumably the octal-digit flag tested via ctype_odigit - confirm against
pcre-int.h). */

unsigned char pcre_ctypes[] = {
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /*    - '  */
  0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /*  ( - /  */
  0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /*  8 - ?  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  @ - G  */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  H - O  */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  P - W  */
  0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /*  X - _  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  ` - g  */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  h - o  */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  p - w  */
  0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
201 /* End of chartables.c */
202 /*************************************************
203 * Perl-Compatible Regular Expressions *
204 *************************************************/
207 This is a library of functions to support regular expressions whose syntax
208 and semantics are as close as possible to those of the Perl 5 language. See
209 the file Tech.Notes for some information on the internals.
211 Written by: Philip Hazel <ph10@cam.ac.uk>
213 Copyright (c) 1998 University of Cambridge
215 -----------------------------------------------------------------------------
216 Permission is granted to anyone to use this software for any purpose on any
217 computer system, and to redistribute it freely, subject to the following
218 restrictions:
220 1. This software is distributed in the hope that it will be useful,
221 but WITHOUT ANY WARRANTY; without even the implied warranty of
222 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
224 2. The origin of this software must not be misrepresented, either by
225 explicit claim or by omission.
227 3. Altered versions must be plainly marked as such, and must not be
228 misrepresented as being the original software.
229 -----------------------------------------------------------------------------
233 /* Include the internals header, which itself includes Standard C headers plus
234 the external pcre header. */
239 /*************************************************
240 * Create bitmap of starting chars *
241 *************************************************/
243 /* This function scans a compiled unanchored expression and attempts to build a
244 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
245 goes by, we may be able to get more clever at doing this.
247 Arguments:
248 code points to an expression
249 start_bits points to a 32-byte table, initialized to 0
251 Returns: TRUE if table built, FALSE otherwise
254 static BOOL
255 set_start_bits(const uschar *code, uschar *start_bits)
257 register int c;
258 volatile int dummy;
262 const uschar *tcode = code + 3;
263 BOOL try_next = TRUE;
265 while (try_next)
267 try_next = FALSE;
269 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
271 if (!set_start_bits(tcode, start_bits)) return FALSE;
274 else switch(*tcode)
276 default:
277 return FALSE;
279 /* BRAZERO does the bracket, but carries on. */
281 case OP_BRAZERO:
282 case OP_BRAMINZERO:
283 if (!set_start_bits(++tcode, start_bits)) return FALSE;
284 dummy = 1;
285 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
286 tcode += 3;
287 try_next = TRUE;
288 break;
290 /* Single-char * or ? sets the bit and tries the next item */
292 case OP_STAR:
293 case OP_MINSTAR:
294 case OP_QUERY:
295 case OP_MINQUERY:
296 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
297 tcode += 2;
298 try_next = TRUE;
299 break;
301 /* Single-char upto sets the bit and tries the next */
303 case OP_UPTO:
304 case OP_MINUPTO:
305 start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));
306 tcode += 4;
307 try_next = TRUE;
308 break;
310 /* At least one single char sets the bit and stops */
312 case OP_EXACT: /* Fall through */
313 tcode++;
315 case OP_CHARS: /* Fall through */
316 tcode++;
318 case OP_PLUS:
319 case OP_MINPLUS:
320 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
321 break;
323 /* Single character type sets the bits and stops */
325 case OP_NOT_DIGIT:
326 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
327 break;
329 case OP_DIGIT:
330 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
331 break;
333 case OP_NOT_WHITESPACE:
334 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
335 break;
337 case OP_WHITESPACE:
338 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
339 break;
341 case OP_NOT_WORDCHAR:
342 for (c = 0; c < 32; c++)
343 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
344 break;
346 case OP_WORDCHAR:
347 for (c = 0; c < 32; c++)
348 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
349 break;
351 /* One or more character type fudges the pointer and restarts, knowing
352 it will hit a single character type and stop there. */
354 case OP_TYPEPLUS:
355 case OP_TYPEMINPLUS:
356 tcode++;
357 try_next = TRUE;
358 break;
360 case OP_TYPEEXACT:
361 tcode += 3;
362 try_next = TRUE;
363 break;
365 /* Zero or more repeats of character types set the bits and then
366 try again. */
368 case OP_TYPEUPTO:
369 case OP_TYPEMINUPTO:
370 tcode += 2; /* Fall through */
372 case OP_TYPESTAR:
373 case OP_TYPEMINSTAR:
374 case OP_TYPEQUERY:
375 case OP_TYPEMINQUERY:
376 switch(tcode[1])
378 case OP_NOT_DIGIT:
379 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
380 break;
382 case OP_DIGIT:
383 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
384 break;
386 case OP_NOT_WHITESPACE:
387 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
388 break;
390 case OP_WHITESPACE:
391 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
392 break;
394 case OP_NOT_WORDCHAR:
395 for (c = 0; c < 32; c++)
396 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
397 break;
399 case OP_WORDCHAR:
400 for (c = 0; c < 32; c++)
401 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
402 break;
405 tcode += 2;
406 try_next = TRUE;
407 break;
409 /* Character class: set the bits and either carry on or not,
410 according to the repeat count. */
412 case OP_CLASS:
413 case OP_NEGCLASS:
415 tcode++;
416 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
417 tcode += 32;
418 switch (*tcode)
420 case OP_CRSTAR:
421 case OP_CRMINSTAR:
422 case OP_CRQUERY:
423 case OP_CRMINQUERY:
424 tcode++;
425 try_next = TRUE;
426 break;
428 case OP_CRRANGE:
429 case OP_CRMINRANGE:
430 if (((tcode[1] << 8) + tcode[2]) == 0)
432 tcode += 5;
433 try_next = TRUE;
435 break;
438 break; /* End of class handling */
440 } /* End of switch */
441 } /* End of try_next loop */
443 code += (code[1] << 8) + code[2]; /* Advance to next branch */
445 while (*code == OP_ALT);
446 return TRUE;
451 /*************************************************
452 * Study a compiled expression *
453 *************************************************/
455 /* This function is handed a compiled expression that it must study to produce
456 information that will speed up the matching. It returns a pcre_extra block
457 which then gets handed back to pcre_exec().
459 Arguments:
460 re points to the compiled expression
461 options contains option bits
462 errorptr points to where to place error messages;
463 set NULL unless error
465 Returns: pointer to a pcre_extra block,
466 NULL on error or if no optimization possible
469 pcre_extra *
470 pcre_study(const pcre *external_re, int options, const char **errorptr)
472 BOOL caseless;
473 uschar start_bits[32];
474 real_pcre_extra *extra;
475 const real_pcre *re = (const real_pcre *)external_re;
477 *errorptr = NULL;
479 if (re == NULL || re->magic_number != MAGIC_NUMBER)
481 *errorptr = "argument is not a compiled regular expression";
482 return NULL;
485 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
487 *errorptr = "unknown or incorrect option bit(s) set";
488 return NULL;
491 /* Caseless can either be from the compiled regex or from options. */
493 caseless = ((re->options | options) & PCRE_CASELESS) != 0;
495 /* For an anchored pattern, or an unanchored pattern that has a first char, or a
496 multiline pattern that matches only at "line starts", no further processing at
497 present. */
499 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
500 return NULL;
502 /* See if we can find a fixed set of initial characters for the pattern. */
504 memset(start_bits, 0, 32 * sizeof(uschar));
505 if (!set_start_bits(re->code, start_bits)) return NULL;
507 /* If this studying is caseless, scan the created bit map and duplicate the
508 bits for any letters. */
510 if (caseless)
512 register int c;
513 for (c = 0; c < 256; c++)
515 if ((start_bits[c/8] & (1 << (c&7))) != 0 &&
516 (pcre_ctypes[c] & ctype_letter) != 0)
518 int d = pcre_fcc[c];
519 start_bits[d/8] |= (1 << (d&7));
524 /* Get an "extra" block and put the information therein. */
526 extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
528 if (extra == NULL)
530 *errorptr = "failed to get memory";
531 return NULL;
534 extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
535 memcpy(extra->start_bits, start_bits, sizeof(start_bits));
537 return (pcre_extra *)extra;
540 /* End of study.c */
541 /*************************************************
542 * Perl-Compatible Regular Expressions *
543 *************************************************/
546 This is a library of functions to support regular expressions whose syntax
547 and semantics are as close as possible to those of the Perl 5 language. See
548 the file Tech.Notes for some information on the internals.
550 Written by: Philip Hazel <ph10@cam.ac.uk>
552 Copyright (c) 1998 University of Cambridge
554 -----------------------------------------------------------------------------
555 Permission is granted to anyone to use this software for any purpose on any
556 computer system, and to redistribute it freely, subject to the following
557 restrictions:
559 1. This software is distributed in the hope that it will be useful,
560 but WITHOUT ANY WARRANTY; without even the implied warranty of
561 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
563 2. The origin of this software must not be misrepresented, either by
564 explicit claim or by omission.
566 3. Altered versions must be plainly marked as such, and must not be
567 misrepresented as being the original software.
568 -----------------------------------------------------------------------------
572 /* Define DEBUG to get debugging output on stdout. */
574 /* #define DEBUG */
/* Use a macro for debugging printing, 'cause that eliminates the use
of #ifdef inline, and there are *still* stupid compilers about that don't like
indented pre-processor statements. I suppose it's only been 10 years... */
/* DPRINTF takes one parenthesized printf argument list, e.g.
DPRINTF(("n=%d\n", n)); it expands to a printf call when DEBUG is defined
and to nothing otherwise. */

#ifdef DEBUG
#define DPRINTF(p) printf p
#else
#define DPRINTF(p) /*nothing*/
#endif
586 /* Include the internals header, which itself includes Standard C headers plus
587 the external pcre header. */
/* Fall back to the grammar symbol when the Python headers do not provide
Py_eval_input themselves. */

#ifndef Py_eval_input
/* For Python 1.4, graminit.h has to be explicitly included */
#define Py_eval_input eval_input
#endif /* Py_eval_input */

/* Allow compilation as C++ source code, should anybody want to do that. */

#ifdef __cplusplus
#define class pcre_class
#endif
605 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
/* Minimum and maximum repeat counts for the six common repeat forms, held in
parallel arrays; a maximum of 0 means "no upper limit". Reading the pairs in
order gives {0,inf} {0,inf} {1,inf} {1,inf} {0,1} {0,1}. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
610 /* Text forms of OP_ values and things, for debugging (not all used) */
#ifdef DEBUG
/* Printable names for the OP_ opcodes, for debugging output only.
NOTE(review): the order presumably follows the OP_ enumeration declared in
pcre-int.h - confirm before adding or reordering entries. */
static const char *OP_names[] = {
  "End", "\\A", "\\B", "\\b", "\\D", "\\d",
  "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
  "localized \\B", "localized \\b", "localized \\W", "localized \\w",
  "^", "$", "Any", "chars",
  "not",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{",
  "class", "negclass", "classL", "Ref",
  "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
  "Brazero", "Braminzero", "Bra"
};
#endif
629 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
630 are simple data values; negative values are for special things like \d and so
631 on. Zero means further processing is needed (for things like \x), or the escape
632 is invalid. */
634 static const short int escapes[] = {
635 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
636 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
637 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
638 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
639 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
640 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
641 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
642 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
643 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
644 0, 0, 0 /* x - z */
647 /* Definition to allow mutual recursion */
649 static BOOL
650 compile_regex(int, int *, uschar **, const uschar **, const char **,
651 PyObject *);
653 /* Structure for passing "static" information around between the functions
654 doing the matching, so that they are thread-safe. */
656 typedef struct match_data {
657 int errorcode; /* As it says */
658 int *offset_vector; /* Offset vector */
659 int offset_end; /* One past the end */
660 BOOL offset_overflow; /* Set if too many extractions */
661 BOOL caseless; /* Case-independent flag */
662 BOOL runtime_caseless; /* Caseless forced at run time */
663 BOOL multiline; /* Multiline flag */
664 BOOL notbol; /* NOTBOL flag */
665 BOOL noteol; /* NOTEOL flag */
666 BOOL dotall; /* Dot matches any char */
667 BOOL endonly; /* Dollar not before final \n */
668 const uschar *start_subject; /* Start of the subject string */
669 const uschar *end_subject; /* End of the subject string */
670 jmp_buf fail_env; /* Environment for longjump() break out */
671 const uschar *end_match_ptr; /* Subject position at end match */
672 int end_offset_top; /* Highwater mark at end of match */
673 jmp_buf error_env; /* For longjmp() if an error occurs deep inside a
674 matching operation */
675 int length; /* Length of the allocated stacks */
676 int point; /* Point to add next item pushed onto stacks */
677 /* Pointers to the 6 stacks */
678 int *off_num, *offset_top, *r1, *r2;
679 const uschar **eptr, **ecode;
680 } match_data;
684 /*************************************************
685 * Global variables *
686 *************************************************/
/* PCRE is thread-clean and doesn't use any global variables in the normal
sense. However, it calls memory allocation and free functions via the two
indirections below, which can be changed by the caller, but are shared
between all threads. */
/* Allocation hooks, defaulting to the C library allocator. They are plain
globals, so replacing either one affects every user in the process. */

void *(*pcre_malloc)(size_t) = malloc;
void  (*pcre_free)(void *) = free;
699 /*************************************************
700 * Return version string *
701 *************************************************/
703 const char *
704 pcre_version(void)
706 return PCRE_VERSION;
712 /*************************************************
713 * Return info about a compiled pattern *
714 *************************************************/
716 /* This function picks potentially useful data out of the private
717 structure.
719 Arguments:
720 external_re points to compiled code
721 optptr where to pass back the options
722 first_char where to pass back the first character,
723 or -1 if multiline and all branches start ^,
724 or -2 otherwise
726 Returns: number of identifying extraction brackets
727 or negative values on error
731 pcre_info(const pcre *external_re, int *optptr, int *first_char)
733 const real_pcre *re = (real_pcre *)external_re;
734 if (re == NULL) return PCRE_ERROR_NULL;
735 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
736 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
737 if (first_char != NULL)
738 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
739 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
740 return re->top_bracket;
746 #ifdef DEBUG
747 /*************************************************
748 * Debugging function to print chars *
749 *************************************************/
/* Print a sequence of chars in printable format, stopping at the end of the
subject if requested.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE

Returns:     nothing
*/
763 static void
764 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
766 int c;
767 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
768 while (length-- > 0)
769 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
771 #endif
776 /*************************************************
777 * Check subpattern for empty operand *
778 *************************************************/
780 /* This function checks a bracketed subpattern to see if any of the paths
781 through it could match an empty string. This is used to diagnose an error if
782 such a subpattern is followed by a quantifier with an unlimited upper bound.
784 Argument:
785 code points to the opening bracket
787 Returns: TRUE or FALSE
790 static BOOL
791 could_be_empty(uschar *code)
793 do {
794 uschar *cc = code + 3;
796 /* Scan along the opcodes for this branch; as soon as we find something
797 that matches a non-empty string, break out and advance to test the next
798 branch. If we get to the end of the branch, return TRUE for the whole
799 sub-expression. */
801 for (;;)
803 /* Test an embedded subpattern; if it could not be empty, break the
804 loop. Otherwise carry on in the branch. */
806 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
808 if (!could_be_empty(cc)) break;
809 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
810 cc += 3;
813 else switch (*cc)
815 /* Reached end of a branch: the subpattern may match the empty string */
817 case OP_ALT:
818 case OP_KET:
819 case OP_KETRMAX:
820 case OP_KETRMIN:
821 return TRUE;
823 /* Skip over entire bracket groups with zero lower bound */
825 case OP_BRAZERO:
826 case OP_BRAMINZERO:
827 cc++;
828 /* Fall through */
830 /* Skip over assertive subpatterns */
832 case OP_ASSERT:
833 case OP_ASSERT_NOT:
834 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
835 cc += 3;
836 break;
838 /* Skip over things that don't match chars */
840 case OP_SOD:
841 case OP_EOD:
842 case OP_CIRC:
843 case OP_DOLL:
844 case OP_NOT_WORD_BOUNDARY:
845 case OP_WORD_BOUNDARY:
846 case OP_NOT_WORD_BOUNDARY_L:
847 case OP_WORD_BOUNDARY_L:
848 cc++;
849 break;
851 /* Skip over simple repeats with zero lower bound */
853 case OP_STAR:
854 case OP_MINSTAR:
855 case OP_QUERY:
856 case OP_MINQUERY:
857 case OP_NOTSTAR:
858 case OP_NOTMINSTAR:
859 case OP_NOTQUERY:
860 case OP_NOTMINQUERY:
861 case OP_TYPESTAR:
862 case OP_TYPEMINSTAR:
863 case OP_TYPEQUERY:
864 case OP_TYPEMINQUERY:
865 cc += 2;
866 break;
868 /* Skip over UPTOs (lower bound is zero) */
870 case OP_UPTO:
871 case OP_MINUPTO:
872 case OP_TYPEUPTO:
873 case OP_TYPEMINUPTO:
874 cc += 4;
875 break;
877 /* Check a class or a back reference for a zero minimum */
879 case OP_CLASS:
880 case OP_NEGCLASS:
881 case OP_REF:
882 case OP_CLASS_L:
883 switch(*cc)
885 case (OP_REF): cc += 2; break;
886 case (OP_CLASS): case (OP_NEGCLASS): cc += 1+32; break;
887 case (OP_CLASS_L): cc += 1+1+32; break;
890 switch (*cc)
892 case OP_CRSTAR:
893 case OP_CRMINSTAR:
894 case OP_CRQUERY:
895 case OP_CRMINQUERY:
896 cc++;
897 break;
899 case OP_CRRANGE:
900 case OP_CRMINRANGE:
901 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
902 cc += 3;
903 break;
905 default:
906 goto NEXT_BRANCH;
908 break;
910 /* Anything else matches at least one character */
912 default:
913 goto NEXT_BRANCH;
917 NEXT_BRANCH:
918 code += (code[1] << 8) + code[2];
920 while (*code == OP_ALT);
922 /* No branches match the empty string */
924 return FALSE;
927 /* Determine the length of a group ID in an expression like
928 (?P<foo_123>...)
929 Arguments:
930 ptr pattern position pointer (say that 3 times fast)
931 finalchar the character that will mark the end of the ID
932 errorptr points to the pointer to the error message
935 static int
936 get_group_id(const uschar *ptr, char finalchar, const char **errorptr)
938 const uschar *start = ptr;
940 /* If the first character is not in \w, or is in \w but is a digit,
941 report an error */
942 if (!(pcre_ctypes[*ptr] & ctype_word) ||
943 (pcre_ctypes[*ptr++] & ctype_digit))
945 *errorptr = "(?P identifier must start with a letter or underscore";
946 return 0;
949 /* Increment ptr until we either hit a null byte, the desired
950 final character, or a non-word character */
951 for(; (*ptr != 0) && (*ptr != finalchar) &&
952 (pcre_ctypes[*ptr] & ctype_word); ptr++)
954 /* Empty loop body */
956 if (*ptr==finalchar)
957 return ptr-start;
958 if (*ptr==0)
960 *errorptr = "unterminated (?P identifier";
961 return 0;
963 *errorptr = "illegal character in (?P identifier";
964 return 0;
967 /*************************************************
968 * Handle escapes *
969 *************************************************/
971 /* This function is called when a \ has been encountered. It either returns a
972 positive value for a simple escape such as \n, or a negative value which
973 encodes one of the more complicated things such as \d. On entry, ptr is
974 pointing at the \. On exit, it is on the final character of the escape
975 sequence.
977 Arguments:
978 ptrptr points to the pattern position pointer
979 errorptr points to the pointer to the error message
980 bracount number of previous extracting brackets
981 options the options bits
982 isclass TRUE if inside a character class
984 Returns: zero or positive => a data character
985 negative => a special escape sequence
986 on error, errorptr is set
989 static int
990 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
991 int options, BOOL isclass)
993 const uschar *ptr = *ptrptr;
994 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
995 int i;
997 if (c == 0) *errorptr = ERR1;
999 /* Digits or letters may have special meaning; all others are literals. */
1001 else if (c < '0' || c > 'z') {}
1003 /* Do an initial lookup in a table. A non-zero result is something that can be
1004 returned immediately. Otherwise further processing may be required. */
1006 else if ((i = escapes[c - '0']) != 0) c = i;
1008 /* Escapes that need further processing, or are illegal. */
1010 else
1013 switch (c)
1015 /* The handling of escape sequences consisting of a string of digits
1016 starting with one that is not zero is not straightforward. By experiment,
1017 the way Perl works seems to be as follows:
1019 Outside a character class, the digits are read as a decimal number. If the
1020 number is less than 10, or if there are that many previous extracting
1021 left brackets, then it is a back reference. Otherwise, up to three octal
1022 digits are read to form an escaped byte. Thus \123 is likely to be octal
1023 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
1024 value is greater than 377, the least significant 8 bits are taken. Inside a
1025 character class, \ followed by a digit is always an octal number. */
1027 case '1': case '2': case '3': case '4': case '5':
1028 case '6': case '7': case '8': case '9':
1031 /* PYTHON: Try to compute an octal value for a character */
1032 for(c=0, i=0; ptr[i]!=0 && i<3; i++)
1034 if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
1035 c = (c * 8 + ptr[i]-'0') & 255;
1036 else
1037 break; /* Non-octal character--break out of the loop */
1039 /* It's a character if there were exactly 3 octal digits, or if
1040 we're inside a character class and there was at least one
1041 octal digit. */
1042 if ( (i == 3) || (isclass && i!=0) )
1044 ptr += i-1;
1045 break;
1047 c = ptr[0]; /* Restore the first character after the \ */
1048 c -= '0'; i = 1;
1049 while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
1051 c = c * 10 + ptr[1] - '0';
1052 ptr++; i++;
1054 if (c > 255 - ESC_REF) *errorptr = "back reference too big";
1055 c = -(ESC_REF + c);
1057 break;
1059 /* \0 always starts an octal number, but we may drop through to here with a
1060 larger first octal digit */
1062 case '0':
1063 c -= '0';
1064 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
1065 ptr[1] != '8' && ptr[1] != '9')
1066 c = (c * 8 + *(++ptr) - '0') & 255;
1067 break;
1069 /* Special escapes not starting with a digit are straightforward */
1071 case 'x':
1072 c = 0;
1073 while ( (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
1075 ptr++;
1076 c = c * 16 + pcre_lcc[*ptr] -
1077 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
1078 c &= 255;
1080 break;
1083 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1084 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1085 for Perl compatibility, it is a literal. */
1087 default:
1088 if ((options & PCRE_EXTRA) != 0) switch(c)
1090 case 'X':
1091 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
1092 break;
1094 default:
1095 *errorptr = ERR3;
1096 break;
1098 break;
1102 *ptrptr = ptr;
1103 return c;
1108 /*************************************************
1109 * Check for counted repeat *
1110 *************************************************/
1112 /* This function is called when a '{' is encountered in a place where it might
1113 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1114 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1115 where the ddds are digits.
1117 Arguments:
1118 p pointer to the first char after '{'
1120 Returns: TRUE or FALSE
1123 static BOOL
1124 is_counted_repeat(const uschar *p)
1126 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1127 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1128 if (*p == '}') return TRUE;
1130 if (*p++ != ',') return FALSE;
1131 if (*p == '}') return TRUE;
1133 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1134 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1135 return (*p == '}');
1140 /*************************************************
1141 * Read repeat counts *
1142 *************************************************/
1144 /* Read an item of the form {n,m} and return the values. This is called only
1145 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1146 so the syntax is guaranteed to be correct, but we need to check the values.
1148 Arguments:
1149 p pointer to first char after '{'
1150 minp pointer to int for min
1151 maxp pointer to int for max
1152 returned as -1 if no max
1153 errorptr points to pointer to error message
1155 Returns: pointer to '}' on success;
1156 current ptr on error, with errorptr set
1159 static const uschar *
1160 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1162 int min = 0;
1163 int max = -1;
1165 while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1167 if (*p == '}') max = min; else
1169 if (*(++p) != '}')
1171 max = 0;
1172 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1173 if (max < min)
1175 *errorptr = ERR4;
1176 return p;
1181 /* Do paranoid checks, then fill in the required variables, and pass back the
1182 pointer to the terminating '}'. */
1184 if (min > 65535 || max > 65535)
1185 *errorptr = ERR5;
1186 else
1188 *minp = min;
1189 *maxp = max;
1191 return p;
1196 /*************************************************
1197 * Compile one branch *
1198 *************************************************/
1200 /* Scan the pattern, compiling it into the code vector.
1202 Arguments:
1203 options the option bits
1204 bracket points to number of brackets used
1205 code points to the pointer to the current code point
1206 ptrptr points to the current pattern pointer
1207 errorptr points to pointer to error message
1209 Returns: TRUE on success
1210 FALSE, with *errorptr set on error
1213 static BOOL
1214 compile_branch(int options, int *brackets, uschar **codeptr,
1215 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
1217 int repeat_type, op_type;
1218 int repeat_min, repeat_max;
1219 int bravalue, length;
1220 int greedy_default, greedy_non_default;
1221 register int c;
1222 register uschar *code = *codeptr;
1223 const uschar *ptr = *ptrptr;
1224 const uschar *oldptr;
1225 uschar *previous = NULL;
1226 uschar class[32];
1227 uschar *class_flag; /* Pointer to the single-byte flag for OP_CLASS_L */
1229 /* Set up the default and non-default settings for greediness */
1231 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1232 greedy_non_default = greedy_default ^ 1;
1234 /* Switch on next character until the end of the branch */
1236 for (;; ptr++)
1238 BOOL negate_class;
1239 int class_charcount;
1240 int class_lastchar;
1242 c = *ptr;
1243 if ((options & PCRE_EXTENDED) != 0)
1245 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1246 if (c == '#')
1248 while ((c = *(++ptr)) != 0 && c != '\n');
1249 continue;
1253 switch(c)
1255 /* The branch terminates at end of string, |, or ). */
1257 case 0:
1258 case '|':
1259 case ')':
1260 *codeptr = code;
1261 *ptrptr = ptr;
1262 return TRUE;
1264 /* Handle single-character metacharacters */
1266 case '^':
1267 previous = NULL;
1268 *code++ = OP_CIRC;
1269 break;
1271 case '$':
1272 previous = NULL;
1273 *code++ = OP_DOLL;
1274 break;
1276 case '.':
1277 previous = code;
1278 *code++ = OP_ANY;
1279 break;
1281 /* Character classes. These always build a 32-byte bitmap of the permitted
1282 characters, except in the special case where there is only one character.
1283 For negated classes, we build the map as usual, then invert it at the end.
1286 case '[':
1287 previous = code;
1288 if (options & PCRE_LOCALE)
1290 *code++ = OP_CLASS_L;
1291 /* Set the flag for localized classes (like \w) to 0 */
1292 class_flag = code;
1293 *class_flag = 0;
1295 else
1297 *code++ = OP_CLASS;
1298 class_flag = NULL;
1301 /* If the first character is '^', set the negation flag, and use a
1302 different opcode. This only matters if caseless matching is specified at
1303 runtime. */
1305 if ((c = *(++ptr)) == '^')
1307 negate_class = TRUE;
1308 if (*(code-1)==OP_CLASS) *(code-1) = OP_NEGCLASS;
1309 c = *(++ptr);
1311 else negate_class = FALSE;
1313 /* Keep a count of chars so that we can optimize the case of just a single
1314 character. */
1316 class_charcount = 0;
1317 class_lastchar = -1;
1319 /* Initialize the 32-char bit map to all zeros. We have to build the
1320 map in a temporary bit of store, in case the class contains only 1
1321 character, because in that case the compiled code doesn't use the
1322 bit map. */
1324 memset(class, 0, 32 * sizeof(uschar));
1326 /* Process characters until ] is reached. By writing this as a "do" it
1327 means that an initial ] is taken as a data character. */
1331 if (c == 0)
1333 *errorptr = ERR6;
1334 goto FAILED;
1337 /* Backslash may introduce a single character, or it may introduce one
1338 of the specials, which just set a flag. Escaped items are checked for
1339 validity in the pre-compiling pass. The sequence \b is a special case.
1340 Inside a class (and only there) it is treated as backspace. Elsewhere
1341 it marks a word boundary. Other escapes have preset maps ready to
1342 or into the one we are building. We assume they have more than one
1343 character in them, so set class_count bigger than one. */
1345 if (c == '\\')
1347 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1348 if (-c == ESC_b) c = '\b';
1349 else if (c < 0)
1351 class_charcount = 10;
1352 switch (-c)
1354 case ESC_d:
1356 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
1358 continue;
1360 case ESC_D:
1362 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
1364 continue;
1366 case ESC_w:
1367 if (options & PCRE_LOCALE)
1369 *class_flag |= 1;
1371 else
1373 for (c = 0; c < 32; c++)
1374 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1376 continue;
1378 case ESC_W:
1379 if (options & PCRE_LOCALE)
1381 *class_flag |= 2;
1383 else
1385 for (c = 0; c < 32; c++)
1386 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1388 continue;
1390 case ESC_s:
1392 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
1394 continue;
1396 case ESC_S:
1398 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
1400 continue;
1402 default:
1403 *errorptr = ERR7;
1404 goto FAILED;
1407 /* Fall through if single character */
1410 /* A single character may be followed by '-' to form a range. However,
1411 Perl does not permit ']' to be the end of the range. A '-' character
1412 here is treated as a literal. */
1414 if (ptr[1] == '-' && ptr[2] != ']')
1416 int d;
1417 ptr += 2;
1418 d = *ptr;
1420 if (d == 0)
1422 *errorptr = ERR6;
1423 goto FAILED;
1426 /* The second part of a range can be a single-character escape, but
1427 not any of the other escapes. */
1429 if (d == '\\')
1431 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1432 if (d < 0)
1434 if (d == -ESC_b) d = '\b'; else
1436 *errorptr = ERR7;
1437 goto FAILED;
1442 if (d < c)
1444 *errorptr = ERR8;
1445 goto FAILED;
1448 for (; c <= d; c++)
1450 class[c/8] |= (1 << (c&7));
1451 if ((options & PCRE_CASELESS) != 0)
1453 int uc = pcre_fcc[c]; /* flip case */
1454 class[uc/8] |= (1 << (uc&7));
1456 class_charcount++; /* in case a one-char range */
1457 class_lastchar = c;
1459 continue; /* Go get the next char in the class */
1462 /* Handle a lone single character - we can get here for a normal
1463 non-escape char, or after \ that introduces a single character. */
1465 class [c/8] |= (1 << (c&7));
1466 if ((options & PCRE_CASELESS) != 0)
1468 c = pcre_fcc[c]; /* flip case */
1469 class[c/8] |= (1 << (c&7));
1471 class_charcount++;
1472 class_lastchar = c;
1475 /* Loop until ']' reached; the check for end of string happens inside the
1476 loop. This "while" is the end of the "do" above. */
1478 while ((c = *(++ptr)) != ']');
1480 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1481 precisely one character. This doesn't need the whole 32-byte bit map.
1482 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1483 it's negative. */
1485 if (class_charcount == 1 && class_lastchar >= 0)
1487 if (negate_class)
1489 code[-1] = OP_NOT;
1491 else
1493 code[-1] = OP_CHARS;
1494 *code++ = 1;
1496 *code++ = class_lastchar;
1499 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1500 the code vector. */
1502 else
1504 /* If this is a localized opcode, bump the code pointer up */
1505 if (class_flag) code++;
1506 if (negate_class)
1508 if (class_flag) *class_flag = (*class_flag) ^ 63;
1509 for (c = 0; c < 32; c++) code[c] = ~class[c];
1511 else
1512 memcpy(code, class, 32);
1513 code += 32;
1515 break;
1517 /* Various kinds of repeat */
1519 case '{':
1520 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1521 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
1522 if (*errorptr != NULL) goto FAILED;
1523 goto REPEAT;
1525 case '*':
1526 repeat_min = 0;
1527 repeat_max = -1;
1528 goto REPEAT;
1530 case '+':
1531 repeat_min = 1;
1532 repeat_max = -1;
1533 goto REPEAT;
1535 case '?':
1536 repeat_min = 0;
1537 repeat_max = 1;
1539 REPEAT:
1540 if (previous == NULL)
1542 *errorptr = ERR9;
1543 goto FAILED;
1546 /* If the next character is '?' this is a minimizing repeat, by default,
1547 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1548 next character. */
1550 if (ptr[1] == '?')
1551 { repeat_type = greedy_non_default; ptr++; }
1552 else repeat_type = greedy_default;
1554 /* If the maximum is zero then the minimum must also be zero; Perl allows
1555 this case, so we do too - by simply omitting the item altogether. */
1557 if (repeat_max == 0) code = previous;
1559 /* If previous was a string of characters, chop off the last one and use it
1560 as the subject of the repeat. If there was only one character, we can
1561 abolish the previous item altogether. */
1563 else if (*previous == OP_CHARS)
1565 int len = previous[1];
1566 if (len == 1)
1568 c = previous[2];
1569 code = previous;
1571 else
1573 c = previous[len+1];
1574 previous[1]--;
1575 code--;
1577 op_type = 0; /* Use single-char op codes */
1578 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1581 /* If previous was a single negated character ([^a] or similar), we use
1582 one of the special opcodes, replacing it. The code is shared with single-
1583 character repeats by adding a suitable offset into repeat_type. */
1585 else if ((int)*previous == OP_NOT)
1587 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1588 c = previous[1];
1589 code = previous;
1590 goto OUTPUT_SINGLE_REPEAT;
1593 /* If previous was a character type match (\d or similar), abolish it and
1594 create a suitable repeat item. The code is shared with single-character
1595 repeats by adding a suitable offset into repeat_type. */
1597 else if ((int)*previous < OP_CIRC || *previous == OP_ANY)
1599 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1600 c = *previous;
1601 code = previous;
1603 OUTPUT_SINGLE_REPEAT:
1604 repeat_type += op_type; /* Combine both values for many cases */
1606 /* A minimum of zero is handled either as the special case * or ?, or as
1607 an UPTO, with the maximum given. */
1609 if (repeat_min == 0)
1611 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1612 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1613 else
1615 *code++ = OP_UPTO + repeat_type;
1616 *code++ = repeat_max >> 8;
1617 *code++ = (repeat_max & 255);
1621 /* The case {1,} is handled as the special case + */
1623 else if (repeat_min == 1 && repeat_max == -1)
1624 *code++ = OP_PLUS + repeat_type;
1626 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1627 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1629 else
1631 if (repeat_min != 1)
1633 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1634 *code++ = repeat_min >> 8;
1635 *code++ = (repeat_min & 255);
1638 /* If the minimum is 1 and the previous item was a character string,
1639 we either have to put back the item that got canceled if the string
1640 length was 1, or add the character back onto the end of a longer
1641 string. For a character type nothing need be done; it will just get
1642 put back naturally. Note that the final character is always going to
1643 get added below. */
1645 else if (*previous == OP_CHARS)
1647 if (code == previous) code += 2; else previous[1]++;
1650 /* For a single negated character we also have to put back the
1651 item that got canceled. */
1653 else if (*previous == OP_NOT) code++;
1655 /* If the maximum is unlimited, insert an OP_STAR. */
1657 if (repeat_max < 0)
1659 *code++ = c;
1660 *code++ = OP_STAR + repeat_type;
1663 /* Else insert an UPTO if the max is greater than the min. */
1665 else if (repeat_max != repeat_min)
1667 *code++ = c;
1668 repeat_max -= repeat_min;
1669 *code++ = OP_UPTO + repeat_type;
1670 *code++ = repeat_max >> 8;
1671 *code++ = (repeat_max & 255);
1675 /* The character or character type itself comes last in all cases. */
1677 *code++ = c;
1680 /* If previous was a character class or a back reference, we put the repeat
1681 stuff after it. */
1683 else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1684 *previous==OP_CLASS_L || *previous == OP_REF)
1686 if (repeat_min == 0 && repeat_max == -1)
1687 *code++ = OP_CRSTAR + repeat_type;
1688 else if (repeat_min == 1 && repeat_max == -1)
1689 *code++ = OP_CRPLUS + repeat_type;
1690 else if (repeat_min == 0 && repeat_max == 1)
1691 *code++ = OP_CRQUERY + repeat_type;
1692 else
1694 *code++ = OP_CRRANGE + repeat_type;
1695 *code++ = repeat_min >> 8;
1696 *code++ = repeat_min & 255;
1697 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1698 *code++ = repeat_max >> 8;
1699 *code++ = repeat_max & 255;
1703 /* If previous was a bracket group, we may have to replicate it in certain
1704 cases. If the maximum repeat count is unlimited, check that the bracket
1705 group cannot match the empty string, and diagnose an error if it can. */
1707 else if ((int)*previous >= OP_BRA)
1709 int i;
1710 int len = code - previous;
1712 if (repeat_max == -1 && could_be_empty(previous))
1714 *errorptr = ERR10;
1715 goto FAILED;
1718 /* If the minimum is greater than zero, and the maximum is unlimited or
1719 equal to the minimum, the first copy remains where it is, and is
1720 replicated up to the minimum number of times. This case includes the +
1721 repeat, but of course no replication is needed in that case. */
1723 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1725 for (i = 1; i < repeat_min; i++)
1727 memcpy(code, previous, len);
1728 code += len;
1732 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1733 Then, if there is a fixed upper limit, replicated up to that many times,
1734 sticking BRAZERO in front of all the optional ones. */
1736 else
1738 if (repeat_min == 0)
1740 memmove(previous+1, previous, len);
1741 code++;
1742 *previous++ = OP_BRAZERO + repeat_type;
1745 for (i = 1; i < repeat_min; i++)
1747 memcpy(code, previous, len);
1748 code += len;
1751 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1753 *code++ = OP_BRAZERO + repeat_type;
1754 memcpy(code, previous, len);
1755 code += len;
1759 /* If the maximum is unlimited, set a repeater in the final copy. */
1761 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1764 /* Else there's some kind of shambles */
1766 else
1768 *errorptr = ERR11;
1769 goto FAILED;
1772 /* In all case we no longer have a previous item. */
1774 previous = NULL;
1775 break;
1778 /* Start of nested bracket sub-expression, or comment or lookahead.
1779 First deal with special things that can come after a bracket; all are
1780 introduced by ?, and the appearance of any of them means that this is not a
1781 referencing group. They were checked for validity in the first pass over
1782 the string, so we don't have to check for syntax errors here. */
1784 case '(':
1785 previous = code; /* Only real brackets can be repeated */
1786 if (*(++ptr) == '?')
1788 bravalue = OP_BRA;
1790 switch (*(++ptr))
1792 case '#':
1793 case 'i':
1794 case 'L':
1795 case 'm':
1796 case 's':
1797 case 'x':
1798 ptr++;
1799 while (*ptr != ')') ptr++;
1800 previous = NULL;
1801 continue;
1803 case ':': /* Non-extracting bracket */
1804 ptr++;
1805 break;
1807 case '=': /* Assertions can't be repeated */
1808 bravalue = OP_ASSERT;
1809 ptr++;
1810 previous = NULL;
1811 break;
1813 case '!':
1814 bravalue = OP_ASSERT_NOT;
1815 ptr++;
1816 previous = NULL;
1817 break;
1819 case ('P'):
1820 ptr++;
1821 if (*ptr=='<')
1823 /* (?P<groupname>...) */
1824 int idlen;
1825 PyObject *string, *intobj;
1827 ptr++;
1828 idlen = get_group_id(ptr, '>', errorptr);
1829 if (*errorptr) {
1830 goto FAILED;
1832 string = PyString_FromStringAndSize((char*)ptr, idlen);
1833 intobj = PyInt_FromLong( brackets[0] + 1 );
1834 if (intobj == NULL || string == NULL)
1836 Py_XDECREF(string);
1837 Py_XDECREF(intobj);
1838 *errorptr = "exception raised";
1839 goto FAILED;
1841 PyDict_SetItem(dictionary, string, intobj);
1842 Py_DECREF(string); Py_DECREF(intobj); /* XXX DECREF commented out! */
1843 ptr += idlen+1; /* Point to rest of expression */
1844 goto do_grouping_bracket;
1846 if (*ptr=='=')
1848 /* (?P=groupname) */
1849 int idlen, refnum;
1850 PyObject *string, *intobj;
1852 ptr++;
1853 idlen = get_group_id(ptr, ')', errorptr);
1854 if (*errorptr) {
1855 goto FAILED;
1857 string = PyString_FromStringAndSize((char *)ptr, idlen);
1858 if (string==NULL) {
1859 *errorptr = "exception raised";
1860 goto FAILED;
1862 intobj = PyDict_GetItem(dictionary, string);
1863 if (intobj==NULL) {
1864 Py_DECREF(string);
1865 *errorptr = "?P= group identifier isn't defined";
1866 goto FAILED;
1869 refnum = PyInt_AsLong(intobj);
1870 Py_DECREF(string);
1871 /* The caller doesn't own the reference to the value
1872 returned from PyDict_GetItem, so intobj is not
1873 DECREF'ed. */
1875 *code++ = OP_REF;
1876 *code++ = refnum;
1877 /* The continue will cause the top-level for() loop to
1878 be resumed, so ptr will be immediately incremented.
1879 Therefore, the following line adds just idlen, not
1880 idlen+1 */
1881 ptr += idlen;
1882 continue;
1884 /* The character after ?P is neither < nor =, so
1885 report an error. Add more Python-extensions here. */
1886 *errorptr="unknown after (?P";
1887 goto FAILED;
1889 case '>': /* "Match once" brackets */
1890 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1892 bravalue = OP_ONCE;
1893 ptr++;
1894 previous = NULL;
1895 break;
1897 /* Else fall through */
1899 default:
1900 *errorptr = ERR12;
1901 goto FAILED;
1905 /* Else we have a referencing group */
1907 else
1909 do_grouping_bracket:
1910 if (++(*brackets) > EXTRACT_MAX)
1912 *errorptr = ERR13;
1913 goto FAILED;
1915 bravalue = OP_BRA + *brackets;
1918 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1919 code into a non-register variable in order to be able to pass its address
1920 because some compilers complain otherwise. */
1922 *code = bravalue;
1924 uschar *mcode = code;
1925 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr, dictionary))
1926 goto FAILED;
1927 code = mcode;
1930 if (*ptr != ')')
1932 *errorptr = ERR14;
1933 goto FAILED;
1935 break;
1937 /* Check \ for being a real metacharacter; if not, fall through and handle
1938 it as a data character at the start of a string. Escape items are checked
1939 for validity in the pre-compiling pass. */
1941 case '\\':
1942 oldptr = ptr;
1943 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1945 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1946 are arranged to be the negation of the corresponding OP_values. For the
1947 back references, the values are ESC_REF plus the reference number. Only
1948 back references and those types that consume a character may be repeated.
1949 We can test for values between ESC_b and ESC_Z for the latter; this may
1950 have to change if any new ones are ever created. */
1952 if (c < 0)
1954 if (-c >= ESC_REF)
1956 int refnum = -c - ESC_REF;
1957 if (*brackets < refnum)
1959 *errorptr = ERR15;
1960 goto FAILED;
1962 previous = code;
1963 *code++ = OP_REF;
1964 *code++ = refnum;
1966 else
1968 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1969 if ( (options & PCRE_LOCALE) != 0)
1971 switch (c)
1973 case (-ESC_b): c = -OP_WORD_BOUNDARY_L; break;
1974 case (-ESC_B): c = -OP_NOT_WORD_BOUNDARY_L; break;
1975 case (-ESC_w): c = -OP_WORDCHAR_L; break;
1976 case (-ESC_W): c = -OP_NOT_WORDCHAR_L; break;
1979 *code++ = -c;
1981 continue;
1984 /* Data character: Reset and fall through */
1986 ptr = oldptr;
1987 c = '\\';
1989 /* Handle a run of data characters until a metacharacter is encountered.
1990 The first character is guaranteed not to be whitespace or # when the
1991 extended flag is set. */
1993 NORMAL_CHAR:
1994 default:
1995 previous = code;
1996 *code = OP_CHARS;
1997 code += 2;
1998 length = 0;
2002 if ((options & PCRE_EXTENDED) != 0)
2004 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
2005 if (c == '#')
2007 while ((c = *(++ptr)) != 0 && c != '\n');
2008 if (c == 0) break;
2009 continue;
2013 /* Backslash may introduce a data char or a metacharacter. Escaped items
2014 are checked for validity in the pre-compiling pass. Stop the string
2015 before a metaitem. */
2017 if (c == '\\')
2019 oldptr = ptr;
2020 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
2021 if (c < 0) { ptr = oldptr; break; }
2024 /* Ordinary character or single-char escape */
2026 *code++ = c;
2027 length++;
2030 /* This "while" is the end of the "do" above. */
2032 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2034 /* Compute the length and set it in the data vector, and advance to
2035 the next state. */
2037 previous[1] = length;
2038 if (length < 255) ptr--;
2039 break;
2041 } /* end of big loop */
2043 /* Control never reaches here by falling through, only by a goto for all the
2044 error states. Pass back the position in the pattern so that it can be displayed
2045 to the user for diagnosing the error. */
2047 FAILED:
2048 *ptrptr = ptr;
2049 return FALSE;
2055 /*************************************************
2056 * Compile sequence of alternatives *
2057 *************************************************/
2059 /* On entry, ptr is pointing past the bracket character, but on return
2060 it points to the closing bracket, or vertical bar, or end of string.
2061 The code variable is pointing at the byte into which the BRA operator has been
2062 stored.
2064 Argument:
2065 options the option bits
2066 brackets -> int containing the number of extracting brackets used
2067 codeptr -> the address of the current code pointer
2068 ptrptr -> the address of the current pattern pointer
2069 errorptr -> pointer to error message
2071 Returns: TRUE on success
2074 static BOOL
2075 compile_regex(int options, int *brackets, uschar **codeptr,
2076 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
2078 const uschar *ptr = *ptrptr;
2079 uschar *code = *codeptr;
2080 uschar *start_bracket = code;
2082 for (;;)
2084 int length;
2085 uschar *last_branch = code;
2087 code += 3;
2088 if (!compile_branch(options, brackets, &code, &ptr, errorptr, dictionary))
2090 *ptrptr = ptr;
2091 return FALSE;
2094 /* Fill in the length of the last branch */
2096 length = code - last_branch;
2097 last_branch[1] = length >> 8;
2098 last_branch[2] = length & 255;
2100 /* Reached end of expression, either ')' or end of pattern. Insert a
2101 terminating ket and the length of the whole bracketed item, and return,
2102 leaving the pointer at the terminating char. */
2104 if (*ptr != '|')
2106 length = code - start_bracket;
2107 *code++ = OP_KET;
2108 *code++ = length >> 8;
2109 *code++ = length & 255;
2110 *codeptr = code;
2111 *ptrptr = ptr;
2112 return TRUE;
2115 /* Another branch follows; insert an "or" node and advance the pointer. */
2117 *code = OP_ALT;
2118 ptr++;
2120 /* Control never reaches here */
2125 /*************************************************
2126 * Check for anchored expression *
2127 *************************************************/
2129 /* Try to find out if this is an anchored regular expression. Consider each
2130 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2131 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2132 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2133 counts, since OP_CIRC can match in the middle.
2135 A branch is also implicitly anchored if it starts with .* because that will try
2136 the rest of the pattern at all possible matching points, so there is no point
2137 trying them again.
2139 Argument: points to start of expression (the bracket)
2140 Returns: TRUE or FALSE
2143 static BOOL
2144 is_anchored(register const uschar *code, BOOL multiline)
2146 do {
2147 int op = (int)code[3];
2148 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
2149 { if (!is_anchored(code+3, multiline)) return FALSE; }
2150 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2151 { if (code[4] != OP_ANY) return FALSE; }
2152 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
2153 code += (code[1] << 8) + code[2];
2155 while (*code == OP_ALT);
2156 return TRUE;
2161 /*************************************************
2162 * Check for start with \n line expression *
2163 *************************************************/
2165 /* This is called for multiline expressions to try to find out if every branch
2166 starts with ^ so that "first char" processing can be done to speed things up.
2168 Argument: points to start of expression (the bracket)
2169 Returns: TRUE or FALSE
2172 static BOOL
2173 is_startline(const uschar *code)
2175 do {
2176 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2177 { if (!is_startline(code+3)) return FALSE; }
2178 else if (code[3] != OP_CIRC) return FALSE;
2179 code += (code[1] << 8) + code[2];
2181 while (*code == OP_ALT);
2182 return TRUE;
2187 /*************************************************
2188 * Check for fixed first char *
2189 *************************************************/
2191 /* Try to find out if there is a fixed first character. This is called for
2192 unanchored expressions, as it speeds up their processing quite considerably.
2193 Consider each alternative branch. If they all start with the same char, or with
2194 a bracket all of whose alternatives start with the same char (recurse ad lib),
2195 then we return that char, otherwise -1.
2197 Argument: points to start of expression (the bracket)
2198 Returns: -1 or the fixed first char
2201 static int
2202 find_firstchar(uschar *code)
2204 register int c = -1;
2207 register int charoffset = 4;
2209 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2211 register int d;
2212 if ((d = find_firstchar(code+3)) < 0) return -1;
2213 if (c < 0) c = d; else if (c != d) return -1;
2216 else switch(code[3])
2218 default:
2219 return -1;
2221 case OP_EXACT: /* Fall through */
2222 charoffset++;
2224 case OP_CHARS: /* Fall through */
2225 charoffset++;
2227 case OP_PLUS:
2228 case OP_MINPLUS:
2229 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
2230 break;
2232 code += (code[1] << 8) + code[2];
2234 while (*code == OP_ALT);
2235 return c;
2240 /*************************************************
2241 * Compile a Regular Expression *
2242 *************************************************/
2244 /* This function takes a string and returns a pointer to a block of store
2245 holding a compiled version of the expression.
2247 Arguments:
2248 pattern the regular expression
2249 options various option bits
2250 errorptr pointer to pointer to error text
2251 erroroffset ptr offset in pattern where error was detected
2253 Returns: pointer to compiled data block, or NULL on error,
2254 with errorptr and erroroffset set
2257 pcre *
2258 pcre_compile(const char *pattern, int options, const char **errorptr,
2259 int *erroroffset, PyObject *dictionary)
2261 real_pcre *re;
2262 int spaces = 0;
2263 int length = 3; /* For initial BRA plus length */
2264 int runlength;
2265 int c, size;
2266 int bracount = 0;
2267 int brastack[200];
2268 int top_backref = 0;
2269 unsigned int brastackptr = 0;
2270 uschar *code;
2271 const uschar *ptr;
2273 #ifdef DEBUG
2274 uschar *code_base, *code_end;
2275 #endif
2277 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2278 can do is just return NULL. */
2280 if (errorptr == NULL) return NULL;
2281 *errorptr = NULL;
2283 /* However, we can give a message for this error */
2285 if (erroroffset == NULL)
2287 *errorptr = ERR16;
2288 return NULL;
2290 *erroroffset = 0;
2292 if ((options & ~PUBLIC_OPTIONS) != 0)
2294 *errorptr = ERR17;
2295 return NULL;
2298 DPRINTF(("------------------------------------------------------------------\n"));
2299 DPRINTF(("%s\n", pattern));
2301 /* The first thing to do is to make a pass over the pattern to compute the
2302 amount of store required to hold the compiled code. This does not have to be
2303 perfect as long as errors are overestimates. At the same time we can detect any
2304 internal flag settings. Make an attempt to correct for any counted white space
2305 if an "extended" flag setting appears late in the pattern. We can't be so
2306 clever for #-comments. */
2308 ptr = (const uschar *)(pattern - 1);
2309 while ((c = *(++ptr)) != 0)
2311 int min, max;
2312 int class_charcount;
2314 if ((pcre_ctypes[c] & ctype_space) != 0)
2316 if ((options & PCRE_EXTENDED) != 0) continue;
2317 spaces++;
2320 if (c == '#' && (options & PCRE_EXTENDED) != 0)
2322 while ((c = *(++ptr)) != 0 && c != '\n');
2323 continue;
2326 switch(c)
2328 /* A backslashed item may be an escaped "normal" character or a
2329 character type. For a "normal" character, put the pointers and
2330 character back so that tests for whitespace etc. in the input
2331 are done correctly. */
2333 case '\\':
2335 const uschar *save_ptr = ptr;
2336 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
2337 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2338 if (c >= 0)
2340 ptr = save_ptr;
2341 c = '\\';
2342 goto NORMAL_CHAR;
2345 length++;
2347 /* A back reference needs an additional char, plus either one or 5
2348 bytes for a repeat. We also need to keep the value of the highest
2349 back reference. */
2351 if (c <= -ESC_REF)
2353 int refnum = -c - ESC_REF;
2354 if (refnum > top_backref) top_backref = refnum;
2355 length++; /* For single back reference */
2356 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
2358 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2359 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2360 if ((min == 0 && (max == 1 || max == -1)) ||
2361 (min == 1 && max == -1))
2362 length++;
2363 else length += 5;
2364 if (ptr[1] == '?') ptr++;
2367 continue;
2369 case '^':
2370 case '.':
2371 case '$':
2372 case '*': /* These repeats won't be after brackets; */
2373 case '+': /* those are handled separately */
2374 case '?':
2375 length++;
2376 continue;
2378 /* This covers the cases of repeats after a single char, metachar, class,
2379 or back reference. */
2381 case '{':
2382 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
2383 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
2384 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2385 if ((min == 0 && (max == 1 || max == -1)) ||
2386 (min == 1 && max == -1))
2387 length++;
2388 else
2390 length--; /* Uncount the original char or metachar */
2391 if (min == 1) length++; else if (min > 0) length += 4;
2392 if (max > 0) length += 4; else length += 2;
2394 if (ptr[1] == '?') ptr++;
2395 continue;
2397 /* An alternation contains an offset to the next branch or ket. */
2398 case '|':
2399 length += 3;
2400 continue;
2402 /* A character class uses 33 characters. Don't worry about character types
2403 that aren't allowed in classes - they'll get picked up during the compile.
2404 A character class that contains only one character uses 2 or 3 bytes,
2405 depending on whether it is negated or not. Notice this where we can. */
2407 case '[':
2408 class_charcount = 0;
2409 if (*(++ptr) == '^') ptr++;
2412 if (*ptr == '\\')
2414 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
2415 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2416 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2418 else class_charcount++;
2419 ptr++;
2421 while (*ptr != 0 && *ptr != ']');
2423 /* Repeats for negated single chars are handled by the general code */
2425 if (class_charcount == 1) length += 3; else
2427 length += 33;
2428 if (options & PCRE_LOCALE) length++; /* Add a byte for the localization flag */
2430 /* A repeat needs either 1 or 5 bytes. */
2432 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
2434 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2435 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2436 if ((min == 0 && (max == 1 || max == -1)) ||
2437 (min == 1 && max == -1))
2438 length++;
2439 else length += 5;
2440 if (ptr[1] == '?') ptr++;
2443 continue;
2445 /* Brackets may be genuine groups or special things */
2447 case '(':
2449 /* Handle special forms of bracket, which all start (? */
2451 if (ptr[1] == '?') switch (c = ptr[2])
2453 /* Skip over comments entirely */
2454 case '#':
2455 ptr += 3;
2456 while (*ptr != 0 && *ptr != ')') ptr++;
2457 if (*ptr == 0)
2459 *errorptr = ERR18;
2460 goto PCRE_ERROR_RETURN;
2462 continue;
2464 /* Non-referencing groups and lookaheads just move the pointer on, and
2465 then behave like a non-special bracket, except that they don't increment
2466 the count of extracting brackets. */
2468 case ':':
2469 case '=':
2470 case '!':
2471 ptr += 2;
2472 break;
2474 case ('P'):
2476 int idlen;
2477 switch (*ptr++) {
2478 case ('<'):
2479 idlen = get_group_id(ptr++, '>', errorptr);
2480 if (*errorptr) goto PCRE_ERROR_RETURN;
2481 ptr += idlen+1;
2482 break;
2483 case ('='):
2484 idlen = get_group_id(ptr++, ')', errorptr);
2485 if (*errorptr) goto PCRE_ERROR_RETURN;
2486 ptr += idlen+1;
2487 length++;
2488 break;
2491 break;
2493 /* Ditto for the "once only" bracket, allowed only if the extra bit
2494 is set. */
2496 case '>':
2497 if ((options & PCRE_EXTRA) != 0)
2499 ptr += 2;
2500 break;
2502 /* Else fall through */
2504 /* Else loop setting valid options until ) is met. Anything else is an
2505 error. */
2507 default:
2508 ptr += 2;
2509 for (;; ptr++)
2511 if ((c = *ptr) == 'i')
2513 options |= PCRE_CASELESS;
2514 continue;
2516 else if ((c = *ptr) == 'L')
2518 options |= PCRE_LOCALE;
2519 continue;
2521 else if ((c = *ptr) == 'm')
2523 options |= PCRE_MULTILINE;
2524 continue;
2526 else if (c == 's')
2528 options |= PCRE_DOTALL;
2529 continue;
2531 else if (c == 'x')
2533 options |= PCRE_EXTENDED;
2534 length -= spaces; /* Already counted spaces */
2535 continue;
2537 else if (c == ')') break;
2539 *errorptr = ERR12;
2540 goto PCRE_ERROR_RETURN;
2542 continue; /* End of this bracket handling */
2545 /* Extracting brackets must be counted so we can process escapes in a
2546 Perlish way. */
2548 else bracount++;
2550 /* Non-special forms of bracket. Save length for computing whole length
2551 at end if there's a repeat that requires duplication of the group. */
2553 if (brastackptr >= sizeof(brastack)/sizeof(int))
2555 *errorptr = ERR19;
2556 goto PCRE_ERROR_RETURN;
2559 brastack[brastackptr++] = length;
2560 length += 3;
2561 continue;
2563 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2564 have to replicate this bracket up to that many times. If brastackptr is
2565 0 this is an unmatched bracket which will generate an error, but take care
2566 not to try to access brastack[-1]. */
2568 case ')':
2569 length += 3;
2571 int minval = 1;
2572 int maxval = 1;
2573 int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
2575 /* Leave ptr at the final char; for read_repeat_counts this happens
2576 automatically; for the others we need an increment. */
2578 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
2580 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
2581 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2583 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2584 else if (c == '+') { maxval = -1; ptr++; }
2585 else if (c == '?') { minval = 0; ptr++; }
2587 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2588 if there is a limited maximum we have to replicate up to maxval-1 times
2589 and allow for a BRAZERO item before each optional copy, as we also have
2590 to do before the first copy if the minimum is zero. */
2592 if (minval == 0) length++;
2593 else if (minval > 1) length += (minval - 1) * duplength;
2594 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
2596 continue;
2598 /* Non-special character. For a run of such characters the length required
2599 is the number of characters + 2, except that the maximum run length is 255.
2600 We won't get a skipped space or a non-data escape or the start of a #
2601 comment as the first character, so the length can't be zero. */
2603 NORMAL_CHAR:
2604 default:
2605 length += 2;
2606 runlength = 0;
2609 if ((pcre_ctypes[c] & ctype_space) != 0)
2611 if ((options & PCRE_EXTENDED) != 0) continue;
2612 spaces++;
2615 if (c == '#' && (options & PCRE_EXTENDED) != 0)
2617 while ((c = *(++ptr)) != 0 && c != '\n');
2618 continue;
2621 /* Backslash may introduce a data char or a metacharacter; stop the
2622 string before the latter. */
2624 if (c == '\\')
2626 const uschar *saveptr = ptr;
2627 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
2628 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2629 if (c < 0) { ptr = saveptr; break; }
2632 /* Ordinary character or single-char escape */
2634 runlength++;
2637 /* This "while" is the end of the "do" above. */
2639 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2641 ptr--;
2642 length += runlength;
2643 continue;
2647 length += 4; /* For final KET and END */
2649 if (length > 65539)
2651 *errorptr = ERR20;
2652 return NULL;
2655 /* Compute the size of data block needed and get it, either from malloc or
2656 externally provided function. We specify "code[0]" in the offsetof() expression
2657 rather than just "code", because it has been reported that one broken compiler
2658 fails on "code" because it is also an independent variable. It should make no
2659 difference to the value of the offsetof(). */
2661 size = length + offsetof(real_pcre, code[0]);
2662 re = (real_pcre *)(pcre_malloc)(size+50);
2664 if (re == NULL)
2666 *errorptr = ERR21;
2667 return NULL;
2670 /* Put in the magic number and the options. */
2672 re->magic_number = MAGIC_NUMBER;
2673 re->options = options;
2675 /* Set up a starting, non-extracting bracket, then compile the expression. On
2676 error, *errorptr will be set non-NULL, so we don't need to look at the result
2677 of the function here. */
2679 ptr = (const uschar *)pattern;
2680 code = re->code;
2681 *code = OP_BRA;
2682 bracount = 0;
2683 (void)compile_regex(options, &bracount, &code, &ptr, errorptr, dictionary);
2684 re->top_bracket = bracount;
2685 re->top_backref = top_backref;
2687 /* If not reached end of pattern on success, there's an excess bracket. */
2689 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2691 /* Fill in the terminating state and check for disastrous overflow, but
2692 if debugging, leave the test till after things are printed out. */
2694 *code++ = OP_END;
2697 #ifndef DEBUG
2698 if (code - re->code > length) *errorptr = ERR23;
2699 #endif
2701 /* Failed to compile */
2703 if (*errorptr != NULL)
2705 (pcre_free)(re);
2706 PCRE_ERROR_RETURN:
2707 *erroroffset = ptr - (const uschar *)pattern;
2708 return NULL;
2711 /* If the anchored option was not passed, set flag if we can determine that it
2712 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2713 we can determine what the first character has to be, because that speeds up
2714 unanchored matches no end. In the case of multiline matches, an alternative is
2715 to set the PCRE_STARTLINE flag if all branches start with ^. */
2717 if ((options & PCRE_ANCHORED) == 0)
2719 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
2720 re->options |= PCRE_ANCHORED;
2721 else
2723 int ch = find_firstchar(re->code);
2724 if (ch >= 0)
2726 re->first_char = ch;
2727 re->options |= PCRE_FIRSTSET;
2729 else if (is_startline(re->code))
2730 re->options |= PCRE_STARTLINE;
2734 /* Print out the compiled data for debugging */
2736 #ifdef DEBUG
2738 printf("Length = %d top_bracket = %d top_backref=%d\n",
2739 length, re->top_bracket, re->top_backref);
2741 if (re->options != 0)
2743 printf("%s%s%s%s%s%s%s%s\n",
2744 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2745 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2746 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2747 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2748 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2749 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2750 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2751 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2754 if ((re->options & PCRE_FIRSTSET) != 0)
2756 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2757 else printf("First char = \\x%02x\n", re->first_char);
2760 code_end = code;
2761 code_base = code = re->code;
2763 while (code < code_end)
2765 int charlength;
2767 printf("%3d ", code - code_base);
2769 if (*code >= OP_BRA)
2771 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2772 code += 2;
2775 else switch(*code)
2777 case OP_CHARS:
2778 charlength = *(++code);
2779 printf("%3d ", charlength);
2780 while (charlength-- > 0)
2781 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2782 break;
2784 case OP_KETRMAX:
2785 case OP_KETRMIN:
2786 case OP_ALT:
2787 case OP_KET:
2788 case OP_ASSERT:
2789 case OP_ASSERT_NOT:
2790 case OP_ONCE:
2791 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2792 code += 2;
2793 break;
2795 case OP_STAR:
2796 case OP_MINSTAR:
2797 case OP_PLUS:
2798 case OP_MINPLUS:
2799 case OP_QUERY:
2800 case OP_MINQUERY:
2801 case OP_TYPESTAR:
2802 case OP_TYPEMINSTAR:
2803 case OP_TYPEPLUS:
2804 case OP_TYPEMINPLUS:
2805 case OP_TYPEQUERY:
2806 case OP_TYPEMINQUERY:
2807 if (*code >= OP_TYPESTAR)
2808 printf(" %s", OP_names[code[1]]);
2809 else if (isprint(c = code[1])) printf(" %c", c);
2810 else printf(" \\x%02x", c);
2811 printf("%s", OP_names[*code++]);
2812 break;
2814 case OP_EXACT:
2815 case OP_UPTO:
2816 case OP_MINUPTO:
2817 if (isprint(c = code[3])) printf(" %c{", c);
2818 else printf(" \\x%02x{", c);
2819 if (*code != OP_EXACT) printf("0,");
2820 printf("%d}", (code[1] << 8) + code[2]);
2821 if (*code == OP_MINUPTO) printf("?");
2822 code += 3;
2823 break;
2825 case OP_TYPEEXACT:
2826 case OP_TYPEUPTO:
2827 case OP_TYPEMINUPTO:
2828 printf(" %s{", OP_names[code[3]]);
2829 if (*code != OP_TYPEEXACT) printf(",");
2830 printf("%d}", (code[1] << 8) + code[2]);
2831 if (*code == OP_TYPEMINUPTO) printf("?");
2832 code += 3;
2833 break;
2835 case OP_NOT:
2836 if (isprint(c = *(++code))) printf(" [^%c]", c);
2837 else printf(" [^\\x%02x]", c);
2838 break;
2840 case OP_NOTSTAR:
2841 case OP_NOTMINSTAR:
2842 case OP_NOTPLUS:
2843 case OP_NOTMINPLUS:
2844 case OP_NOTQUERY:
2845 case OP_NOTMINQUERY:
2846 if (isprint(c = code[1])) printf(" [^%c]", c);
2847 else printf(" [^\\x%02x]", c);
2848 printf("%s", OP_names[*code++]);
2849 break;
2851 case OP_NOTEXACT:
2852 case OP_NOTUPTO:
2853 case OP_NOTMINUPTO:
2854 if (isprint(c = code[3])) printf(" [^%c]{", c);
2855 else printf(" [^\\x%02x]{", c);
2856 if (*code != OP_NOTEXACT) printf(",");
2857 printf("%d}", (code[1] << 8) + code[2]);
2858 if (*code == OP_NOTMINUPTO) printf("?");
2859 code += 3;
2860 break;
2862 case OP_REF:
2863 printf(" \\%d", *(++code));
2864 code ++;
2865 goto CLASS_REF_REPEAT;
2867 case OP_CLASS:
2868 case OP_NEGCLASS:
2869 case OP_CLASS_L:
2871 int i, min, max;
2873 if (*code==OP_CLASS_L)
2875 code++;
2876 printf("Locflag = %i ", *code++);
2877 printf(" [");
2879 else
2881 if (*code++ == OP_CLASS) printf(" [");
2882 else printf(" ^[");
2886 for (i = 0; i < 256; i++)
2888 if ((code[i/8] & (1 << (i&7))) != 0)
2890 int j;
2891 for (j = i+1; j < 256; j++)
2892 if ((code[j/8] & (1 << (j&7))) == 0) break;
2893 if (i == '-' || i == ']') printf("\\");
2894 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2895 if (--j > i)
2897 printf("-");
2898 if (j == '-' || j == ']') printf("\\");
2899 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2901 i = j;
2904 printf("]");
2905 code += 32;
2906 /* code ++;*/
2908 CLASS_REF_REPEAT:
2910 switch(*code)
2912 case OP_CRSTAR:
2913 case OP_CRMINSTAR:
2914 case OP_CRPLUS:
2915 case OP_CRMINPLUS:
2916 case OP_CRQUERY:
2917 case OP_CRMINQUERY:
2918 printf("%s", OP_names[*code]);
2919 break;
2921 case OP_CRRANGE:
2922 case OP_CRMINRANGE:
2923 min = (code[1] << 8) + code[2];
2924 max = (code[3] << 8) + code[4];
2925 if (max == 0) printf("{%d,}", min);
2926 else printf("{%d,%d}", min, max);
2927 if (*code == OP_CRMINRANGE) printf("?");
2928 code += 4;
2929 break;
2931 default:
2932 code--;
2935 break;
2937 /* Anything else is just a one-node item */
2939 default:
2940 printf(" %s", OP_names[*code]);
2941 break;
2944 code++;
2945 printf("\n");
2947 printf("------------------------------------------------------------------\n");
2949 /* This check is done here in the debugging case so that the code that
2950 was compiled can be seen. */
2952 if (code - re->code > length)
2954 printf("length=%i, code length=%i\n", length, code-re->code);
2955 *errorptr = ERR23;
2956 (pcre_free)(re);
2957 *erroroffset = ptr - (uschar *)pattern;
2958 return NULL;
2960 #endif
2962 return (pcre *)re;
2967 /*************************************************
2968 * Match a character type *
2969 *************************************************/
2971 /* Not used in all the places it might be as it's sometimes faster
2972 to put the code inline.
2974 Arguments:
2975 type the character type
2976 c the character
2977 dotall the dotall flag
2979 Returns: TRUE if character is of the type
2982 static BOOL
2983 match_type(int type, int c, BOOL dotall)
2986 #ifdef DEBUG
2987 if (isprint(c)) printf("matching subject %c against ", c);
2988 else printf("matching subject \\x%02x against ", c);
2989 printf("%s\n", OP_names[type]);
2990 #endif
2992 switch(type)
2994 case OP_ANY: return dotall || c != '\n';
2995 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2996 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2997 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2998 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2999 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
3000 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
3001 case OP_NOT_WORDCHAR_L: return (c!='_' && !isalnum(c));
3002 case OP_WORDCHAR_L: return (c=='_' || isalnum(c));
3004 return FALSE;
3009 /*************************************************
3010 * Match a back-reference *
3011 *************************************************/
3013 /* If a back reference hasn't been set, the match fails.
3015 Arguments:
3016 number reference number
3017 eptr points into the subject
3018 length length to be matched
3019 md points to match data block
3021 Returns: TRUE if matched
3024 static BOOL
3025 match_ref(int number, register const uschar *eptr, int length, match_data *md)
3027 const uschar *p = md->start_subject + md->offset_vector[number];
3029 #ifdef DEBUG
3030 if (eptr >= md->end_subject)
3031 printf("matching subject <null>");
3032 else
3034 printf("matching subject ");
3035 pchars(eptr, length, TRUE, md);
3037 printf(" against backref ");
3038 pchars(p, length, FALSE, md);
3039 printf("\n");
3040 #endif
3042 /* Always fail if not enough characters left */
3044 if (length > md->end_subject - p) return FALSE;
3046 /* Separate the caseless case for speed */
3048 if (md->caseless)
3049 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
3050 else
3051 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3053 return TRUE;
3056 static int free_stack(match_data *md)
3058 /* Free any stack space that was allocated by the call to match(). */
3059 if (md->off_num) PyMem_DEL(md->off_num);
3060 if (md->offset_top) PyMem_DEL(md->offset_top);
3061 if (md->r1) PyMem_DEL(md->r1);
3062 if (md->r2) PyMem_DEL(md->r2);
3063 if (md->eptr) PyMem_DEL((char *)md->eptr);
3064 if (md->ecode) PyMem_DEL((char *)md->ecode);
3065 return 0;
3068 static int grow_stack(match_data *md)
3070 if (md->length != 0)
3072 md->length = md->length + md->length/2;
3074 else
3076 int string_len = md->end_subject - md->start_subject + 1;
3077 if (string_len < 80) {md->length = string_len; }
3078 else {md->length = 80;}
3080 PyMem_RESIZE(md->offset_top, int, md->length);
3081 PyMem_RESIZE(md->eptr, const uschar *, md->length);
3082 PyMem_RESIZE(md->ecode, const uschar *, md->length);
3083 PyMem_RESIZE(md->off_num, int, md->length);
3084 PyMem_RESIZE(md->r1, int, md->length);
3085 PyMem_RESIZE(md->r2, int, md->length);
3086 if (md->offset_top == NULL || md->eptr == NULL || md->ecode == NULL ||
3087 md->off_num == NULL || md->r1 == NULL || md->r2 == NULL)
3089 PyErr_NoMemory();
3090 longjmp(md->error_env, 1);
3092 return 0;
3096 /*************************************************
3097 * Match from current position *
3098 *************************************************/
3100 /* On entry ecode points to the first opcode, and eptr to the first character.
3102 Arguments:
3103 eptr pointer in subject
3104 ecode position in code
3105 offset_top current top pointer
3106 md pointer to "static" info for the match
3108 Returns: TRUE if matched
3111 static BOOL
3112 match(register const uschar *eptr, register const uschar *ecode, int offset_top,
3113 match_data *md)
3115 int save_stack_position = md->point;
3116 match_loop:
3118 #define SUCCEED goto succeed
3119 #define FAIL goto fail
3121 for (;;)
3123 int min, max, ctype;
3124 register int i;
3125 register int c;
3126 BOOL minimize = FALSE;
3128 /* Opening bracket. Check the alternative branches in turn, failing if none
3129 match. We have to set the start offset if required and there is space
3130 in the offset vector so that it is available for subsequent back references
3131 if the bracket matches. However, if the bracket fails, we must put back the
3132 previous value of both offsets in case they were set by a previous copy of
3133 the same bracket. Don't worry about setting the flag for the error case here;
3134 that is handled in the code for KET. */
3136 if ((int)*ecode >= OP_BRA)
3138 int number = (*ecode - OP_BRA) << 1;
3139 int save_offset1 = 0, save_offset2 = 0;
3141 DPRINTF(("start bracket %d\n", number/2));
3143 if (number > 0 && number < md->offset_end)
3145 save_offset1 = md->offset_vector[number];
3146 save_offset2 = md->offset_vector[number+1];
3147 md->offset_vector[number] = eptr - md->start_subject;
3149 DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
3152 /* Recurse for all the alternatives. */
3156 if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
3157 ecode += (ecode[1] << 8) + ecode[2];
3159 while (*ecode == OP_ALT);
3161 DPRINTF(("bracket %d failed\n", number/2));
3163 if (number > 0 && number < md->offset_end)
3165 md->offset_vector[number] = save_offset1;
3166 md->offset_vector[number+1] = save_offset2;
3169 FAIL;
3172 /* Other types of node can be handled by a switch */
3174 switch(*ecode)
3176 case OP_END:
3177 md->end_match_ptr = eptr; /* Record where we ended */
3178 md->end_offset_top = offset_top; /* and how many extracts were taken */
3179 SUCCEED;
3181 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3182 whole thing doesn't match, so we have to get out via a longjmp(). */
3184 case OP_CUT:
3185 if (match(eptr, ecode+1, offset_top, md)) SUCCEED;
3186 longjmp(md->fail_env, 1);
3188 /* Assertion brackets. Check the alternative branches in turn - the
3189 matching won't pass the KET for an assertion. If any one branch matches,
3190 the assertion is true. */
3192 case OP_ASSERT:
3195 if (match(eptr, ecode+3, offset_top, md)) break;
3196 ecode += (ecode[1] << 8) + ecode[2];
3198 while (*ecode == OP_ALT);
3199 if (*ecode == OP_KET) FAIL;
3201 /* Continue from after the assertion, updating the offsets high water
3202 mark, since extracts may have been taken during the assertion. */
3204 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3205 ecode += 3;
3206 offset_top = md->end_offset_top;
3207 continue;
3209 /* Negative assertion: all branches must fail to match */
3211 case OP_ASSERT_NOT:
3214 if (match(eptr, ecode+3, offset_top, md)) FAIL;
3215 ecode += (ecode[1] << 8) + ecode[2];
3217 while (*ecode == OP_ALT);
3218 ecode += 3;
3219 continue;
3221 /* "Once" brackets are like assertion brackets except that after a match,
3222 the point in the subject string is not moved back. Thus there can never be
3223 a move back into the brackets. Check the alternative branches in turn - the
3224 matching won't pass the KET for this kind of subpattern. If any one branch
3225 matches, we carry on, leaving the subject pointer. */
3227 case OP_ONCE:
3230 if (match(eptr, ecode+3, offset_top, md)) break;
3231 ecode += (ecode[1] << 8) + ecode[2];
3233 while (*ecode == OP_ALT);
3234 if (*ecode == OP_KET) FAIL;
3236 /* Continue as from after the assertion, updating the offsets high water
3237 mark, since extracts may have been taken. */
3239 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3240 ecode += 3;
3241 offset_top = md->end_offset_top;
3242 eptr = md->end_match_ptr;
3243 continue;
3245 /* An alternation is the end of a branch; scan along to find the end of the
3246 bracketed group and go to there. */
3248 case OP_ALT:
3249 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3250 break;
3252 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3253 that it may occur zero times. It may repeat infinitely, or not at all -
3254 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3255 repeat limits are compiled as a number of copies, with the optional ones
3256 preceded by BRAZERO or BRAMINZERO. */
3258 case OP_BRAZERO:
3260 const uschar *next = ecode+1;
3261 if (match(eptr, next, offset_top, md)) SUCCEED;
3262 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3263 ecode = next + 3;
3265 break;
3267 case OP_BRAMINZERO:
3269 const uschar *next = ecode+1;
3270 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3271 if (match(eptr, next+3, offset_top, md)) SUCCEED;
3272 ecode++;
3274 break;;
3276 /* End of a group, repeated or non-repeating. If we are at the end of
3277 an assertion "group", stop matching and SUCCEED, but record the
3278 current high water mark for use by positive assertions. */
3280 case OP_KET:
3281 case OP_KETRMIN:
3282 case OP_KETRMAX:
3284 int number;
3285 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3287 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
3289 md->end_match_ptr = eptr; /* For ONCE */
3290 md->end_offset_top = offset_top;
3291 SUCCEED;
3294 /* In all other cases we have to check the group number back at the
3295 start and if necessary complete handling an extraction by setting the
3296 final offset and bumping the high water mark. */
3298 number = (*prev - OP_BRA) << 1;
3300 DPRINTF(("end bracket %d\n", number/2));
3302 if (number > 0)
3304 if (number >= md->offset_end) md->offset_overflow = TRUE; else
3306 md->offset_vector[number+1] = eptr - md->start_subject;
3307 if (offset_top <= number) offset_top = number + 2;
3311 /* For a non-repeating ket, just advance to the next node and continue at
3312 this level. */
3314 if (*ecode == OP_KET)
3316 ecode += 3;
3317 break;
3320 /* The repeating kets try the rest of the pattern or restart from the
3321 preceding bracket, in the appropriate order. */
3323 if (*ecode == OP_KETRMIN)
3325 const uschar *ptr;
3326 if (match(eptr, ecode+3, offset_top, md)) goto succeed;
3327 /* Handle alternation inside the BRA...KET; push the additional
3328 alternatives onto the stack */
3329 ptr=prev;
3330 do {
3331 ptr += (ptr[1]<<8)+ ptr[2];
3332 if (*ptr==OP_ALT)
3334 if (md->length == md->point)
3336 grow_stack(md);
3338 md->offset_top[md->point] = offset_top;
3339 md->eptr[md->point] = eptr;
3340 md->ecode[md->point] = ptr+3;
3341 md->r1[md->point] = 0;
3342 md->r2[md->point] = 0;
3343 md->off_num[md->point] = 0;
3344 md->point++;
3346 } while (*ptr==OP_ALT);
3347 ecode=prev+3; goto match_loop;
3349 else /* OP_KETRMAX */
3351 const uschar *ptr;
3352 /*int points_pushed=0;*/
3354 /* Push one failure point, that will resume matching at the code after
3355 the KETRMAX opcode. */
3356 if (md->length == md->point)
3358 grow_stack(md);
3360 md->offset_top[md->point] = offset_top;
3361 md->eptr[md->point] = eptr;
3362 md->ecode[md->point] = ecode+3;
3363 md->r1[md->point] = md->offset_vector[number];
3364 md->r2[md->point] = md->offset_vector[number+1];
3365 md->off_num[md->point] = number;
3366 md->point++;
3368 md->offset_vector[number] = eptr - md->start_subject;
3369 /* Handle alternation inside the BRA...KET; push each of the
3370 additional alternatives onto the stack */
3371 ptr=prev;
3372 do {
3373 ptr += (ptr[1]<<8)+ ptr[2];
3374 if (*ptr==OP_ALT)
3376 if (md->length == md->point)
3377 if (md->length == md->point)
3379 grow_stack(md);
3381 md->offset_top[md->point] = offset_top;
3382 md->eptr[md->point] = eptr;
3383 md->ecode[md->point] = ptr+3;
3384 md->r1[md->point] = 0;
3385 md->r2[md->point] = 0;
3386 md->off_num[md->point] = 0;
3387 md->point++;
3388 /*points_pushed++;*/
3390 } while (*ptr==OP_ALT);
3391 /* Jump to the first (or only) alternative and resume trying to match */
3392 ecode=prev+3; goto match_loop;
3396 /* Start of subject unless notbol, or after internal newline if multiline */
3398 case OP_CIRC:
3399 if (md->notbol && eptr == md->start_subject) FAIL;
3400 if (md->multiline)
3402 if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
3403 ecode++;
3404 break;
3406 /* ... else fall through */
3408 /* Start of subject assertion */
3410 case OP_SOD:
3411 if (eptr != md->start_subject) FAIL;
3412 ecode++;
3413 break;
3415 /* Assert before internal newline if multiline, or before
3416 a terminating newline unless endonly is set, else end of subject unless
3417 noteol is set. */
3419 case OP_DOLL:
3420 if (md->noteol && eptr >= md->end_subject) FAIL;
3421 if (md->multiline)
3423 if (eptr < md->end_subject && *eptr != '\n') FAIL;
3424 ecode++;
3425 break;
3427 else if (!md->endonly)
3429 if (eptr < md->end_subject - 1 ||
3430 (eptr == md->end_subject - 1 && *eptr != '\n')) FAIL;
3431 ecode++;
3432 break;
3434 /* ... else fall through */
3436 /* End of subject assertion */
3438 case OP_EOD:
3439 if (eptr < md->end_subject) FAIL;
3440 ecode++;
3441 break;
3443 /* Word boundary assertions */
3445 case OP_NOT_WORD_BOUNDARY:
3446 case OP_WORD_BOUNDARY:
3448 BOOL prev_is_word = (eptr != md->start_subject) &&
3449 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3450 BOOL cur_is_word = (eptr < md->end_subject) &&
3451 ((pcre_ctypes[*eptr] & ctype_word) != 0);
3452 if ((*ecode++ == OP_WORD_BOUNDARY)?
3453 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3454 FAIL;
3456 break;
3458 case OP_NOT_WORD_BOUNDARY_L:
3459 case OP_WORD_BOUNDARY_L:
3461 BOOL prev_is_word = (eptr != md->start_subject) &&
3462 (isalnum(eptr[-1]) || eptr[-1]=='_');
3463 BOOL cur_is_word = (eptr < md->end_subject) &&
3464 (isalnum(*eptr) || *eptr=='_');
3465 if ((*ecode++ == OP_WORD_BOUNDARY_L)?
3466 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3467 FAIL;
3469 break;
3472 /* Match a single character type; inline for speed */
3474 case OP_ANY:
3475 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
3476 if (eptr++ >= md->end_subject) FAIL;
3477 ecode++;
3478 break;
3480 case OP_NOT_DIGIT:
3481 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3482 FAIL;
3483 ecode++;
3484 break;
3486 case OP_DIGIT:
3487 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3488 FAIL;
3489 ecode++;
3490 break;
3492 case OP_NOT_WHITESPACE:
3493 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3494 FAIL;
3495 ecode++;
3496 break;
3498 case OP_WHITESPACE:
3499 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3500 FAIL;
3501 ecode++;
3502 break;
3504 case OP_NOT_WORDCHAR:
3505 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3506 FAIL;
3507 ecode++;
3508 break;
3510 case OP_WORDCHAR:
3511 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3512 FAIL;
3513 ecode++;
3514 break;
3516 case OP_NOT_WORDCHAR_L:
3517 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ))
3518 FAIL;
3519 eptr++;
3520 ecode++;
3521 break;
3523 case OP_WORDCHAR_L:
3524 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ))
3525 FAIL;
3526 eptr++;
3527 ecode++;
3528 break;
3530 /* Match a back reference, possibly repeatedly. Look past the end of the
3531 item to see if there is repeat information following. The code is similar
3532 to that for character classes, but repeated for efficiency. Then obey
3533 similar code to character type repeats - written out again for speed.
3534 However, if the referenced string is the empty string, always treat
3535 it as matched, any number of times (otherwise there could be infinite
3536 loops). */
3538 case OP_REF:
3540 int length;
3541 int number = ecode[1] << 1; /* Doubled reference number */
3542 ecode += 2; /* Advance past the item */
3544 if (number >= offset_top || md->offset_vector[number] < 0)
3546 md->errorcode = PCRE_ERROR_BADREF;
3547 FAIL;
3550 length = md->offset_vector[number+1] - md->offset_vector[number];
3552 switch (*ecode)
3554 case OP_CRSTAR:
3555 case OP_CRMINSTAR:
3556 case OP_CRPLUS:
3557 case OP_CRMINPLUS:
3558 case OP_CRQUERY:
3559 case OP_CRMINQUERY:
3560 c = *ecode++ - OP_CRSTAR;
3561 minimize = (c & 1) != 0;
3562 min = rep_min[c]; /* Pick up values from tables; */
3563 max = rep_max[c]; /* zero for max => infinity */
3564 if (max == 0) max = INT_MAX;
3565 break;
3567 case OP_CRRANGE:
3568 case OP_CRMINRANGE:
3569 minimize = (*ecode == OP_CRMINRANGE);
3570 min = (ecode[1] << 8) + ecode[2];
3571 max = (ecode[3] << 8) + ecode[4];
3572 if (max == 0) max = INT_MAX;
3573 ecode += 5;
3574 break;
3576 default: /* No repeat follows */
3577 if (!match_ref(number, eptr, length, md)) FAIL;
3578 eptr += length;
3579 continue; /* With the main loop */
3582 /* If the length of the reference is zero, just continue with the
3583 main loop. */
3585 if (length == 0) continue;
3587 /* First, ensure the minimum number of matches are present. We get back
3588 the length of the reference string explicitly rather than passing the
3589 address of eptr, so that eptr can be a register variable. */
3591 for (i = 1; i <= min; i++)
3593 if (!match_ref(number, eptr, length, md)) FAIL;
3594 eptr += length;
3597 /* If min = max, continue at the same level without recursion.
3598 They are not both allowed to be zero. */
3600 if (min == max) continue;
3602 /* If minimizing, keep trying and advancing the pointer */
3604 if (minimize)
3606 for (i = min;; i++)
3608 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3609 if (i >= max || !match_ref(number, eptr, length, md))
3610 FAIL;
3611 eptr += length;
3613 /* Control never gets here */
3616 /* If maximizing, find the longest string and work backwards */
3618 else
3620 const uschar *pp = eptr;
3621 for (i = min; i < max; i++)
3623 if (!match_ref(number, eptr, length, md)) break;
3624 eptr += length;
3626 while (eptr >= pp)
3628 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3629 eptr -= length;
3631 FAIL;
3634 /* Control never gets here */
3636 /* Match a character class, possibly repeatedly. Look past the end of the
3637 item to see if there is repeat information following. Then obey similar
3638 code to character type repeats - written out again for speed. If caseless
3639 matching was set at runtime but not at compile time, we have to check both
3640 versions of a character, and we have to behave differently for positive and
3641 negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
3642 treated differently. */
3644 case OP_CLASS:
3645 case OP_NEGCLASS:
3647 BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
3648 const uschar *data = ecode + 1; /* Save for matching */
3649 ecode += 33; /* Advance past the item */
3651 switch (*ecode)
3653 case OP_CRSTAR:
3654 case OP_CRMINSTAR:
3655 case OP_CRPLUS:
3656 case OP_CRMINPLUS:
3657 case OP_CRQUERY:
3658 case OP_CRMINQUERY:
3659 c = *ecode++ - OP_CRSTAR;
3660 minimize = (c & 1) != 0;
3661 min = rep_min[c]; /* Pick up values from tables; */
3662 max = rep_max[c]; /* zero for max => infinity */
3663 if (max == 0) max = INT_MAX;
3664 break;
3666 case OP_CRRANGE:
3667 case OP_CRMINRANGE:
3668 minimize = (*ecode == OP_CRMINRANGE);
3669 min = (ecode[1] << 8) + ecode[2];
3670 max = (ecode[3] << 8) + ecode[4];
3671 if (max == 0) max = INT_MAX;
3672 ecode += 5;
3673 break;
3675 default: /* No repeat follows */
3676 min = max = 1;
3677 break;
3680 /* First, ensure the minimum number of matches are present. */
3682 for (i = 1; i <= min; i++)
3684 if (eptr >= md->end_subject) FAIL;
3685 c = *eptr++;
3687 /* Either not runtime caseless, or it was a positive class. For
3688 runtime caseless, continue if either case is in the map. */
3690 if (!nasty_case)
3692 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3693 if (md->runtime_caseless)
3695 c = pcre_fcc[c];
3696 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3700 /* Runtime caseless and it was a negative class. Continue only if
3701 both cases are in the map. */
3703 else
3705 if ((data[c/8] & (1 << (c&7))) == 0) FAIL;
3706 c = pcre_fcc[c];
3707 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3710 FAIL;
3713 /* If max == min we can continue with the main loop without the
3714 need to recurse. */
3716 if (min == max) continue;
3718 /* If minimizing, keep testing the rest of the expression and advancing
3719 the pointer while it matches the class. */
3721 if (minimize)
3723 for (i = min;; i++)
3725 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3726 if (i >= max || eptr >= md->end_subject) FAIL;
3727 c = *eptr++;
3729 /* Either not runtime caseless, or it was a positive class. For
3730 runtime caseless, continue if either case is in the map. */
3732 if (!nasty_case)
3734 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3735 if (md->runtime_caseless)
3737 c = pcre_fcc[c];
3738 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3742 /* Runtime caseless and it was a negative class. Continue only if
3743 both cases are in the map. */
3745 else
3747 if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
3748 c = pcre_fcc[c];
3749 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3752 FAIL;
3754 /* Control never gets here */
3757 /* If maximizing, find the longest possible run, then work backwards. */
3759 else
3761 const uschar *pp = eptr;
3762 for (i = min; i < max; eptr++, i++)
3764 if (eptr >= md->end_subject) break;
3765 c = *eptr;
3767 /* Either not runtime caseless, or it was a positive class. For
3768 runtime caseless, continue if either case is in the map. */
3770 if (!nasty_case)
3772 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3773 if (md->runtime_caseless)
3775 c = pcre_fcc[c];
3776 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3780 /* Runtime caseless and it was a negative class. Continue only if
3781 both cases are in the map. */
3783 else
3785 if ((data[c/8] & (1 << (c&7))) == 0) break;
3786 c = pcre_fcc[c];
3787 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3790 break;
3793 while (eptr >= pp)
3794 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3795 FAIL;
3798 /* Control never gets here */
3800 /* OP_CLASS_L opcode: handles localized character classes */
3802 case OP_CLASS_L:
3804 const uschar *data = ecode + 1; /* Save for matching */
3805 const uschar locale_flag = *data;
3806 ecode++; data++; /* The localization support adds an extra byte */
3808 ecode += 33; /* Advance past the item */
3810 switch (*ecode)
3812 case OP_CRSTAR:
3813 case OP_CRMINSTAR:
3814 case OP_CRPLUS:
3815 case OP_CRMINPLUS:
3816 case OP_CRQUERY:
3817 case OP_CRMINQUERY:
3818 c = *ecode++ - OP_CRSTAR;
3819 minimize = (c & 1) != 0;
3820 min = rep_min[c]; /* Pick up values from tables; */
3821 max = rep_max[c]; /* zero for max => infinity */
3822 if (max == 0) max = INT_MAX;
3823 break;
3825 case OP_CRRANGE:
3826 case OP_CRMINRANGE:
3827 minimize = (*ecode == OP_CRMINRANGE);
3828 min = (ecode[1] << 8) + ecode[2];
3829 max = (ecode[3] << 8) + ecode[4];
3830 if (max == 0) max = INT_MAX;
3831 ecode += 5;
3832 break;
3834 default: /* No repeat follows */
3835 if (eptr >= md->end_subject) FAIL;
3836 c = *eptr++;
3837 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3838 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3839 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3840 #if 0
3841 if ( (locale_flag & 4) && isdigit(c) ) continue; /* Locale \d */
3842 if ( (locale_flag & 8) && !isdigit(c) ) continue; /* Locale \D */
3843 if ( (locale_flag & 16) && isspace(c) ) continue; /* Locale \s */
3844 if ( (locale_flag & 32) && !isspace(c) ) continue; /* Locale \S */
3845 #endif
3847 if (md->runtime_caseless)
3849 c = pcre_fcc[c];
3850 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3852 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3853 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3855 FAIL;
3858 /* First, ensure the minimum number of matches are present. */
3860 for (i = 1; i <= min; i++)
3862 if (eptr >= md->end_subject) FAIL;
3863 c = *eptr++;
3864 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3865 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3866 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3868 if (md->runtime_caseless)
3870 c = pcre_fcc[c];
3871 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3872 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3873 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3875 FAIL;
3878 /* If max == min we can continue with the main loop without the
3879 need to recurse. */
3881 if (min == max) continue;
3883 /* If minimizing, keep testing the rest of the expression and advancing
3884 the pointer while it matches the class. */
3886 if (minimize)
3888 for (i = min;; i++)
3890 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3891 if (i >= max || eptr >= md->end_subject) FAIL;
3892 c = *eptr++;
3893 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3894 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3895 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3897 if (md->runtime_caseless)
3899 c = pcre_fcc[c];
3900 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3901 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3902 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3904 FAIL;
3906 /* Control never gets here */
3909 /* If maximizing, find the longest possible run, then work backwards. */
3911 else
3913 const uschar *pp = eptr;
3914 for (i = min; i < max; eptr++, i++)
3916 if (eptr >= md->end_subject) break;
3917 c = *eptr;
3918 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3919 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3920 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3921 if (md->runtime_caseless)
3923 c = pcre_fcc[c];
3924 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3925 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3926 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
3928 break;
3931 while (eptr >= pp)
3932 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3933 FAIL;
3936 /* Control never gets here */
3938 /* Match a run of characters */
3940 case OP_CHARS:
3942 register int length = ecode[1];
3943 ecode += 2;
3945 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3946 if (eptr >= md->end_subject)
3947 printf("matching subject <null> against pattern ");
3948 else
3950 printf("matching subject ");
3951 pchars(eptr, length, TRUE, md);
3952 printf(" against pattern ");
3954 pchars(ecode, length, FALSE, md);
3955 printf("\n");
3956 #endif
3958 if (length > md->end_subject - eptr) FAIL;
3959 if (md->caseless)
3961 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL;
3963 else
3965 while (length-- > 0) if (*ecode++ != *eptr++) FAIL;
3968 break;
3970 /* Match a single character repeatedly; different opcodes share code. */
3972 case OP_EXACT:
3973 min = max = (ecode[1] << 8) + ecode[2];
3974 ecode += 3;
3975 goto REPEATCHAR;
3977 case OP_UPTO:
3978 case OP_MINUPTO:
3979 min = 0;
3980 max = (ecode[1] << 8) + ecode[2];
3981 minimize = *ecode == OP_MINUPTO;
3982 ecode += 3;
3983 goto REPEATCHAR;
3985 case OP_STAR:
3986 case OP_MINSTAR:
3987 case OP_PLUS:
3988 case OP_MINPLUS:
3989 case OP_QUERY:
3990 case OP_MINQUERY:
3991 c = *ecode++ - OP_STAR;
3992 minimize = (c & 1) != 0;
3993 min = rep_min[c]; /* Pick up values from tables; */
3994 max = rep_max[c]; /* zero for max => infinity */
3995 if (max == 0) max = INT_MAX;
3997 /* Common code for all repeated single-character matches. We can give
3998 up quickly if there are fewer than the minimum number of characters left in
3999 the subject. */
4001 REPEATCHAR:
4002 if (min > md->end_subject - eptr) FAIL;
4003 c = *ecode++;
4005 /* The code is duplicated for the caseless and caseful cases, for speed,
4006 since matching characters is likely to be quite common. First, ensure the
4007 minimum number of matches are present. If min = max, continue at the same
4008 level without recursing. Otherwise, if minimizing, keep trying the rest of
4009 the expression and advancing one matching character if failing, up to the
4010 maximum. Alternatively, if maximizing, find the maximum number of
4011 characters and work backwards. */
4013 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4014 max, eptr));
4016 if (md->caseless)
4018 c = pcre_lcc[c];
4019 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
4020 if (min == max) continue;
4021 if (minimize)
4023 for (i = min;; i++)
4025 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4026 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
4027 FAIL;
4029 /* Control never gets here */
4031 else
4033 const uschar *pp = eptr;
4034 for (i = min; i < max; i++)
4036 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
4037 eptr++;
4039 while (eptr >= pp)
4040 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4041 FAIL;
4043 /* Control never gets here */
4046 /* Caseful comparisons */
4048 else
4050 for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
4051 if (min == max) continue;
4052 if (minimize)
4054 for (i = min;; i++)
4056 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4057 if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL;
4059 /* Control never gets here */
4061 else
4063 const uschar *pp = eptr;
4064 for (i = min; i < max; i++)
4066 if (eptr >= md->end_subject || c != *eptr) break;
4067 eptr++;
4069 while (eptr >= pp)
4070 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4071 FAIL;
4074 /* Control never gets here */
4076 /* Match a negated single character */
4078 case OP_NOT:
4079 if (eptr >= md->end_subject) FAIL;
4080 ecode++;
4081 if (md->caseless)
4083 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) FAIL;
4085 else
4087 if (*ecode++ == *eptr++) FAIL;
4089 break;
4091 /* Match a negated single character repeatedly. This is almost a repeat of
4092 the code for a repeated single character, but I haven't found a nice way of
4093 commoning these up that doesn't require a test of the positive/negative
4094 option for each character match. Maybe that wouldn't add very much to the
4095 time taken, but character matching *is* what this is all about... */
4097 case OP_NOTEXACT:
4098 min = max = (ecode[1] << 8) + ecode[2];
4099 ecode += 3;
4100 goto REPEATNOTCHAR;
4102 case OP_NOTUPTO:
4103 case OP_NOTMINUPTO:
4104 min = 0;
4105 max = (ecode[1] << 8) + ecode[2];
4106 minimize = *ecode == OP_NOTMINUPTO;
4107 ecode += 3;
4108 goto REPEATNOTCHAR;
4110 case OP_NOTSTAR:
4111 case OP_NOTMINSTAR:
4112 case OP_NOTPLUS:
4113 case OP_NOTMINPLUS:
4114 case OP_NOTQUERY:
4115 case OP_NOTMINQUERY:
4116 c = *ecode++ - OP_NOTSTAR;
4117 minimize = (c & 1) != 0;
4118 min = rep_min[c]; /* Pick up values from tables; */
4119 max = rep_max[c]; /* zero for max => infinity */
4120 if (max == 0) max = INT_MAX;
4122 /* Common code for all repeated negated single-character matches. We can give
4123 up quickly if there are fewer than the minimum number of characters left in
4124 the subject. */
4126 REPEATNOTCHAR:
4127 if (min > md->end_subject - eptr) FAIL;
4128 c = *ecode++;
4130 /* The code is duplicated for the caseless and caseful cases, for speed,
4131 since matching characters is likely to be quite common. First, ensure the
4132 minimum number of matches are present. If min = max, continue at the same
4133 level without recursing. Otherwise, if minimizing, keep trying the rest of
4134 the expression and advancing one matching character if failing, up to the
4135 maximum. Alternatively, if maximizing, find the maximum number of
4136 characters and work backwards. */
4138 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4139 max, eptr));
4141 if (md->caseless)
4143 c = pcre_lcc[c];
4144 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) FAIL;
4145 if (min == max) continue;
4146 if (minimize)
4148 for (i = min;; i++)
4150 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4151 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
4152 FAIL;
4154 /* Control never gets here */
4156 else
4158 const uschar *pp = eptr;
4159 for (i = min; i < max; i++)
4161 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
4162 eptr++;
4164 while (eptr >= pp)
4165 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4166 FAIL;
4168 /* Control never gets here */
4171 /* Caseful comparisons */
4173 else
4175 for (i = 1; i <= min; i++) if (c == *eptr++) FAIL;
4176 if (min == max) continue;
4177 if (minimize)
4179 for (i = min;; i++)
4181 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4182 if (i >= max || eptr >= md->end_subject || c == *eptr++) FAIL;
4184 /* Control never gets here */
4186 else
4188 const uschar *pp = eptr;
4189 for (i = min; i < max; i++)
4191 if (eptr >= md->end_subject || c == *eptr) break;
4192 eptr++;
4194 while (eptr >= pp)
4195 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4196 FAIL;
4199 /* Control never gets here */
4201 /* Match a single character type repeatedly; several different opcodes
4202 share code. This is very similar to the code for single characters, but we
4203 repeat it in the interests of efficiency. */
4205 case OP_TYPEEXACT:
4206 min = max = (ecode[1] << 8) + ecode[2];
4207 minimize = TRUE;
4208 ecode += 3;
4209 goto REPEATTYPE;
4211 case OP_TYPEUPTO:
4212 case OP_TYPEMINUPTO:
4213 min = 0;
4214 max = (ecode[1] << 8) + ecode[2];
4215 minimize = *ecode == OP_TYPEMINUPTO;
4216 ecode += 3;
4217 goto REPEATTYPE;
4219 case OP_TYPESTAR:
4220 case OP_TYPEMINSTAR:
4221 case OP_TYPEPLUS:
4222 case OP_TYPEMINPLUS:
4223 case OP_TYPEQUERY:
4224 case OP_TYPEMINQUERY:
4225 c = *ecode++ - OP_TYPESTAR;
4226 minimize = (c & 1) != 0;
4227 min = rep_min[c]; /* Pick up values from tables; */
4228 max = rep_max[c]; /* zero for max => infinity */
4229 if (max == 0) max = INT_MAX;
4231 /* Common code for all repeated single character type matches */
4233 REPEATTYPE:
4234 ctype = *ecode++; /* Code for the character type */
4236 /* First, ensure the minimum number of matches are present. Use inline
4237 code for maximizing the speed, and do the type test once at the start
4238 (i.e. keep it out of the loop). Also test that there are at least the
4239 minimum number of characters before we start. */
4241 if (min > md->end_subject - eptr) FAIL;
4242 if (min > 0) switch(ctype)
4244 case OP_ANY:
4245 if (!md->dotall)
4246 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
4247 else eptr += min;
4248 break;
4250 case OP_NOT_DIGIT:
4251 for (i = 1; i <= min; i++)
4252 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
4253 break;
4255 case OP_DIGIT:
4256 for (i = 1; i <= min; i++)
4257 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
4258 break;
4260 case OP_NOT_WHITESPACE:
4261 for (i = 1; i <= min; i++)
4262 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
4263 break;
4265 case OP_WHITESPACE:
4266 for (i = 1; i <= min; i++)
4267 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
4268 break;
4270 case OP_NOT_WORDCHAR:
4271 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
4272 FAIL;
4273 break;
4275 case OP_WORDCHAR:
4276 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
4277 FAIL;
4278 break;
4280 case OP_NOT_WORDCHAR_L:
4281 for (i = 1; i <= min; i++, eptr++) if (*eptr=='_' || isalnum(*eptr))
4282 FAIL;
4283 break;
4285 case OP_WORDCHAR_L:
4286 for (i = 1; i <= min; i++, eptr++) if (*eptr!='_' && !isalnum(*eptr))
4287 FAIL;
4288 break;
4291 /* If min = max, continue at the same level without recursing */
4293 if (min == max) continue;
4295 /* If minimizing, we have to test the rest of the pattern before each
4296 subsequent match, so inlining isn't much help; just use the function. */
4298 if (minimize)
4300 for (i = min;; i++)
4302 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4303 if (i >= max || eptr >= md->end_subject ||
4304 !match_type(ctype, *eptr++, md->dotall))
4305 FAIL;
4307 /* Control never gets here */
4310 /* If maximizing it is worth using inline code for speed, doing the type
4311 test once at the start (i.e. keep it out of the loop). */
4313 else
4315 const uschar *pp = eptr;
4316 switch(ctype)
4318 case OP_ANY:
4319 if (!md->dotall)
4321 for (i = min; i < max; i++)
4323 if (eptr >= md->end_subject || *eptr == '\n') break;
4324 eptr++;
4327 else
4329 c = max - min;
4330 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4331 eptr += c;
4333 break;
4335 case OP_NOT_DIGIT:
4336 for (i = min; i < max; i++)
4338 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
4339 break;
4340 eptr++;
4342 break;
4344 case OP_DIGIT:
4345 for (i = min; i < max; i++)
4347 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
4348 break;
4349 eptr++;
4351 break;
4353 case OP_NOT_WHITESPACE:
4354 for (i = min; i < max; i++)
4356 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
4357 break;
4358 eptr++;
4360 break;
4362 case OP_WHITESPACE:
4363 for (i = min; i < max; i++)
4365 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
4366 break;
4367 eptr++;
4369 break;
4371 case OP_NOT_WORDCHAR:
4372 for (i = min; i < max; i++)
4374 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
4375 break;
4376 eptr++;
4378 break;
4380 case OP_WORDCHAR:
4381 for (i = min; i < max; i++)
4383 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
4384 break;
4385 eptr++;
4387 break;
4388 case OP_NOT_WORDCHAR_L:
4389 for (i = min; i < max; i++)
4391 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ) )
4392 break;
4393 eptr++;
4395 break;
4397 case OP_WORDCHAR_L:
4398 for (i = min; i < max; i++)
4400 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ) )
4401 break;
4402 eptr++;
4404 break;
4407 while (eptr >= pp)
4408 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4409 FAIL;
4411 /* Control never gets here */
4413 /* There's been some horrible disaster. */
4415 default:
4416 DPRINTF(("Unknown opcode %d\n", *ecode));
4417 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4418 FAIL;
4421 /* Do not stick any code in here without much thought; it is assumed
4422 that "continue" in the code above comes out to here to repeat the main
4423 loop. */
4425 } /* End of main loop */
4426 /* Control never reaches here */
4428 fail:
4429 if (md->point > save_stack_position)
4431 /* If there are still points remaining on the stack, pop the next one off */
4432 int off_num;
4434 md->point--;
4435 offset_top = md->offset_top[md->point];
4436 eptr = md->eptr[md->point];
4437 ecode = md->ecode[md->point];
4438 off_num = md->off_num[md->point];
4439 md->offset_vector[off_num] = md->r1[md->point];
4440 md->offset_vector[off_num+1] = md->r2[md->point];
4441 goto match_loop;
4443 /* Failure, and nothing left on the stack, so end this function call */
4445 /* Restore the top of the stack to where it was before this function
4446 call. This lets us use one stack for everything; recursive calls
4447 can push and pop information, and may increase the stack. When
4448 the call returns, the parent function can resume pushing and
4449 popping wherever it was. */
4451 md->point = save_stack_position;
4452 return FALSE;
4454 succeed:
4455 return TRUE;
4460 /*************************************************
4461 * Segregate setjmp() *
4462 *************************************************/
4464 /* The -Wall option of gcc gives warnings for all local variables when setjmp()
4465 is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
4466 hide it in a separate function. This is called only when PCRE_EXTRA is set,
4467 since it's needed only for the extension \X option, and with any luck, a good
4468 compiler will spot the tail recursion and compile it efficiently.
4470 Arguments:
4471 eptr pointer in subject
4472 ecode position in code
4473 offset_top current top pointer
4474 md pointer to "static" info for the match
4476 Returns: TRUE if matched
4479 static BOOL
4480 match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
4481 match_data *match_block)
4483 return setjmp(match_block->fail_env) == 0 &&
4484 match(eptr, ecode, offset_top, match_block);
4489 /*************************************************
4490 * Execute a Regular Expression *
4491 *************************************************/
4493 /* This function applies a compiled re to a subject string and picks out
4494 portions of the string if it matches. Two elements in the vector are set for
4495 each substring: the offsets to the start and end of the substring.
4497 Arguments:
4498 external_re points to the compiled expression
4499 external_extra points to "hints" from pcre_study() or is NULL
4500 subject points to the subject string
4501 length length of subject string (may contain binary zeros)
4502 options option bits
4503 offsets points to a vector of ints to be filled in with offsets
4504 offsetcount the number of elements in the vector
4506 Returns: > 0 => success; value is the number of elements filled in
4507 = 0 => success, but offsets is not big enough
4508 -1 => failed to match
4509 < -1 => some kind of unexpected problem
4513 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4514 const char *subject, int length, int start_pos, int options,
4515 int *offsets, int offsetcount)
4517 /* The "volatile" directives are to make gcc -Wall stop complaining
4518 that these variables can be clobbered by the longjmp. Hopefully
4519 they won't cost too much performance. */
4520 volatile int resetcount, ocount;
4521 volatile int first_char = -1;
4522 const uschar * volatile start_bits = NULL;
4523 const uschar * volatile start_match = (const uschar *)subject + start_pos;
4524 match_data match_block;
4525 const uschar *end_subject;
4526 const real_pcre *re = (const real_pcre *)external_re;
4527 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4528 volatile BOOL using_temporary_offsets = FALSE;
4529 volatile BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4530 volatile BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4532 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4534 if (re == NULL || subject == NULL ||
4535 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4536 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4538 match_block.start_subject = (const uschar *)subject;
4539 match_block.end_subject = match_block.start_subject + length;
4540 end_subject = match_block.end_subject;
4542 match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
4543 match_block.runtime_caseless = match_block.caseless &&
4544 (re->options & PCRE_CASELESS) == 0;
4546 match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
4547 match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
4548 match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
4550 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4551 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4553 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4555 /* Set the stack state to empty */
4556 match_block.off_num = match_block.offset_top = NULL;
4557 match_block.r1 = match_block.r2 = NULL;
4558 match_block.eptr = match_block.ecode = NULL;
4559 match_block.point = match_block.length = 0;
4561 /* If the expression has got more back references than the offsets supplied can
4562 hold, we get a temporary bit of working store to use during the matching.
4563 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4564 of 2. */
4566 ocount = offsetcount & (-2);
4567 if (re->top_backref > 0 && re->top_backref >= ocount/2)
4569 ocount = re->top_backref * 2 + 2;
4570 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4571 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4572 using_temporary_offsets = TRUE;
4573 DPRINTF(("Got memory to hold back references\n"));
4575 else match_block.offset_vector = offsets;
4577 match_block.offset_end = ocount;
4578 match_block.offset_overflow = FALSE;
4580 /* Compute the minimum number of offsets that we need to reset each time. Doing
4581 this makes a huge difference to execution time when there aren't many brackets
4582 in the pattern. */
4584 resetcount = 2 + re->top_bracket * 2;
4585 if (resetcount > offsetcount) resetcount = ocount;
4587 /* If MULTILINE is set at exec time but was not set at compile time, and the
4588 anchored flag is set, we must re-check because a setting provoked by ^ in the
4589 pattern is not right in multi-line mode. Calling is_anchored() again here does
4590 the right check, because multiline is now set. If it now yields FALSE, the
4591 expression must have had ^ starting some of its branches. Check to see if
4592 that is true for *all* branches, and if so, set the startline flag. */
4594 if (match_block.multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
4595 !is_anchored(re->code, match_block.multiline))
4597 anchored = FALSE;
4598 if (is_startline(re->code)) startline = TRUE;
4601 /* Set up the first character to match, if available. The first_char value is
4602 never set for an anchored regular expression, but the anchoring may be forced
4603 at run time, so we have to test for anchoring. The first char may be unset for
4604 an unanchored pattern, of course. If there's no first char and the pattern was
4605 studied, there may be a bitmap of possible first characters. However, we can
4606 use this only if the caseless state of the studying was correct. */
4608 if (!anchored)
4610 if ((re->options & PCRE_FIRSTSET) != 0)
4612 first_char = re->first_char;
4613 if (match_block.caseless) first_char = pcre_lcc[first_char];
4615 else
4616 if (!startline && extra != NULL &&
4617 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
4618 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
4619 start_bits = extra->start_bits;
4622 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4626 int rc;
4627 register int *iptr = match_block.offset_vector;
4628 register int *iend = iptr + resetcount;
4630 /* Reset the maximum number of extractions we might see. */
4632 while (iptr < iend) *iptr++ = -1;
4634 /* Advance to a unique first char if possible */
4636 if (first_char >= 0)
4638 if (match_block.caseless)
4639 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4640 start_match++;
4641 else
4642 while (start_match < end_subject && *start_match != first_char)
4643 start_match++;
4646 /* Or to just after \n for a multiline match if possible */
4648 else if (startline)
4650 if (start_match > match_block.start_subject)
4652 while (start_match < end_subject && start_match[-1] != '\n')
4653 start_match++;
4657 /* Or to a non-unique first char */
4659 else if (start_bits != NULL)
4661 while (start_match < end_subject)
4663 register int c = *start_match;
4664 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4668 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4669 printf(">>>> Match against: ");
4670 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4671 printf("\n");
4672 #endif
4674 /* When a match occurs, substrings will be set for all internal extractions;
4675 we just need to set up the whole thing as substring 0 before returning. If
4676 there were too many extractions, set the return code to zero. In the case
4677 where we had to get some local store to hold offsets for backreferences, copy
4678 those back references that we can. In this case there need not be overflow
4679 if certain parts of the pattern were not used.
4681 Before starting the match, we have to set up a longjmp() target to enable
4682 the "cut" operation to fail a match completely without backtracking. This
4683 is done in a separate function to avoid compiler warnings. We need not do
4684 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
4685 enabled. */
4687 /* To handle errors such as running out of memory for the failure
4688 stack, we need to save this location via setjmp(), so
4689 error-handling code can call longjmp() to jump out of deeply-nested code. */
4690 if (setjmp(match_block.error_env)==0)
4693 if ((re->options & PCRE_EXTRA) != 0)
4695 if (!match_with_setjmp(start_match, re->code, 2, &match_block))
4696 continue;
4698 else if (!match(start_match, re->code, 2, &match_block)) continue;
4700 /* Copy the offset information from temporary store if necessary */
4702 if (using_temporary_offsets)
4704 if (offsetcount >= 4)
4706 memcpy(offsets + 2, match_block.offset_vector + 2,
4707 (offsetcount - 2) * sizeof(int));
4708 DPRINTF(("Copied offsets from temporary memory\n"));
4710 if (match_block.end_offset_top > offsetcount)
4711 match_block.offset_overflow = TRUE;
4713 DPRINTF(("Freeing temporary memory\n"));
4714 (pcre_free)(match_block.offset_vector);
4717 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4719 if (match_block.offset_end < 2) rc = 0; else
4721 offsets[0] = start_match - match_block.start_subject;
4722 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4725 DPRINTF((">>>> returning %d\n", rc));
4726 free_stack(&match_block);
4727 return rc;
4728 } /* End of (if setjmp(match_block.error_env)...) */
4729 free_stack(&match_block);
4731 /* Return an error code; pcremodule.c will preserve the exception */
4732 if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY;
4734 while (!anchored &&
4735 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4736 start_match++ < end_subject);
4738 if (using_temporary_offsets)
4740 DPRINTF(("Freeing temporary memory\n"));
4741 (pcre_free)(match_block.offset_vector);
4744 #ifdef DEBUG
4745 printf(">>>> returning %d\n", match_block.errorcode);
4746 #endif
4748 free_stack(&match_block);
4749 return match_block.errorcode;
4752 /* End of pcre.c */