Add gdbus-proxy-well-known-name to the ignore file
[glib.git] / glib / pcre / pcre_compile.c
blobae68fb56644938b2dafc129493de2f3cfc682bc0
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
53 #include "pcre_internal.h"
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
/* Macro for setting individual bits in class bitmaps: sets bit number b in the
byte array a (bit b lives in byte b/8, at position b%8). Both arguments and the
whole expansion are parenthesized so that expression arguments such as
SETBIT(map, c + 1) index and shift by the intended value, and so that the macro
behaves as a single expression wherever it is used. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
/* 20 below INT_MAX: per the comment above, the slack leaves room for the
group-terminating bytes that are added without an individual overflow check. */
73 #define OFLOW_MAX (INT_MAX - 20)
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
/* Size of the compile workspace described in the comment above. */
92 #define COMPILE_WORK_SIZE (4096)
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
/* Two alternative definitions of escapes[], selected by EBCDIC. check_escape()
indexes the ASCII table with (c - '0') and the EBCDIC table with (c - 0x48), so
the first entry of each table corresponds to '0' and 0x48 respectively. */
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
114 #else /* This is the "abnormal" table for EBCDIC systems */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
140 #endif
143 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144 searched linearly. Put all the names into a single string, in order to reduce
145 the number of relocations when a shared library is dynamically linked. */
/* One entry of the verbs[] table. */
147 typedef struct verbitem {
148 int len; /* length of the verb's name (e.g. 6 for "ACCEPT") */
149 int op; /* the OP_xxx value paired with that name in verbs[] */
150 } verbitem;
/* Verb names, NUL-separated, in the same order as the entries of verbs[]. The
last name is terminated by the string literal's implicit NUL. */
152 static const char verbnames[] =
153 "ACCEPT\0"
154 "COMMIT\0"
155 "F\0"
156 "FAIL\0"
157 "PRUNE\0"
158 "SKIP\0"
159 "THEN";
/* Name length and opcode for each name above; "F" and "FAIL" both map to
OP_FAIL. */
161 static const verbitem verbs[] = {
162 { 6, OP_ACCEPT },
163 { 6, OP_COMMIT },
164 { 1, OP_FAIL },
165 { 4, OP_FAIL },
166 { 5, OP_PRUNE },
167 { 4, OP_SKIP },
168 { 4, OP_THEN }
/* Number of entries in verbs[]. */
171 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
174 /* Tables of names of POSIX character classes and their lengths. The names are
175 now all in a single string, to reduce the number of relocations when a shared
176 library is dynamically loaded. The list of lengths is terminated by a zero
177 length entry. The first three must be alpha, lower, upper, as this is assumed
178 for handling case independence. */
/* Fourteen class names, NUL-separated. */
180 static const char posix_names[] =
181 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182 "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 "word\0" "xdigit";
/* Lengths of the names above, in the same order, followed by the terminating
zero entry. */
185 static const uschar posix_name_lengths[] = {
186 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
188 /* Table of class bit maps for each POSIX class. Each class is formed from a
189 base map, with an optional addition or removal of another map. Then, for some
190 classes, there is some additional tweaking: for [:blank:] the vertical space
191 characters are removed, and for [:alpha:] and [:alnum:] the underscore
192 character is removed. The triples in the table consist of the base map offset,
193 second map offset or -1 if no second map, and a non-negative value for map
194 addition or a negative value for map subtraction (if there are two maps). The
195 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196 remove vertical space characters, 2 => remove underscore. */
/* Three ints per class (base map, second map or -1, tweak/sign), one row per
class name as labelled in the row comments. */
198 static const int posix_class_maps[] = {
199 cbit_word, cbit_digit, -2, /* alpha */
200 cbit_lower, -1, 0, /* lower */
201 cbit_upper, -1, 0, /* upper */
202 cbit_word, -1, 2, /* alnum - word without underscore */
203 cbit_print, cbit_cntrl, 0, /* ascii */
204 cbit_space, -1, 1, /* blank - a GNU extension */
205 cbit_cntrl, -1, 0, /* cntrl */
206 cbit_digit, -1, 0, /* digit */
207 cbit_graph, -1, 0, /* graph */
208 cbit_print, -1, 0, /* print */
209 cbit_punct, -1, 0, /* punct */
210 cbit_space, -1, 0, /* space */
211 cbit_word, -1, 0, /* word - a Perl extension */
212 cbit_xdigit,-1, 0 /* xdigit */
/* Two-level stringification: XSTRING(s) expands the macro s first and then
turns the result into a string literal. Used below to embed the values of
MAX_NAME_SIZE and MAX_NAME_COUNT in the error texts. */
216 #define STRING(a) # a
217 #define XSTRING(s) STRING(s)
219 /* The texts of compile-time error messages. These are "char *" because they
220 are passed to the outside world. Do not ever re-use any error number, because
221 they are documented. Always add a new error instead. Messages marked DEAD below
222 are no longer used. This used to be a table of strings, but in order to reduce
223 the number of relocations needed when a shared library is loaded dynamically,
224 it is now one long string. We cannot use a table of offsets, because the
225 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226 simply count through to the one we want - this isn't a performance issue
227 because these strings are used only when there is a compilation error. */
/* Message n is the n-th NUL-separated segment of this string (counting from
zero); the last segment ends at the string literal's implicit NUL.
find_error_text() locates a message by counting through the segments. */
229 static const char error_texts[] =
230 "no error\0"
231 "\\ at end of pattern\0"
232 "\\c at end of pattern\0"
233 "unrecognized character follows \\\0"
234 "numbers out of order in {} quantifier\0"
235 /* 5 */
236 "number too big in {} quantifier\0"
237 "missing terminating ] for character class\0"
238 "invalid escape sequence in character class\0"
239 "range out of order in character class\0"
240 "nothing to repeat\0"
241 /* 10 */
242 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243 "internal error: unexpected repeat\0"
244 "unrecognized character after (? or (?-\0"
245 "POSIX named classes are supported only within a class\0"
246 "missing )\0"
247 /* 15 */
248 "reference to non-existent subpattern\0"
249 "erroffset passed as NULL\0"
250 "unknown option bit(s) set\0"
251 "missing ) after comment\0"
252 "parentheses nested too deeply\0" /** DEAD **/
253 /* 20 */
254 "regular expression is too large\0"
255 "failed to get memory\0"
256 "unmatched parentheses\0"
257 "internal error: code overflow\0"
258 "unrecognized character after (?<\0"
259 /* 25 */
260 "lookbehind assertion is not fixed length\0"
261 "malformed number or name after (?(\0"
262 "conditional group contains more than two branches\0"
263 "assertion expected after (?(\0"
264 "(?R or (?[+-]digits must be followed by )\0"
265 /* 30 */
266 "unknown POSIX class name\0"
267 "POSIX collating elements are not supported\0"
268 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269 "spare error\0" /** DEAD **/
270 "character value in \\x{...} sequence is too large\0"
271 /* 35 */
272 "invalid condition (?(0)\0"
273 "\\C not allowed in lookbehind assertion\0"
274 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275 "number after (?C is > 255\0"
276 "closing ) for (?C expected\0"
277 /* 40 */
278 "recursive call could loop indefinitely\0"
279 "unrecognized character after (?P\0"
280 "syntax error in subpattern name (missing terminator)\0"
281 "two named subpatterns have the same name\0"
282 "invalid UTF-8 string\0"
283 /* 45 */
284 "support for \\P, \\p, and \\X has not been compiled\0"
285 "malformed \\P or \\p sequence\0"
286 "unknown property name after \\P or \\p\0"
287 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 /* 50 */
290 "repeated subpattern is too long\0" /** DEAD **/
291 "octal value is greater than \\377 (not in UTF-8 mode)\0"
292 "internal error: overran compiling workspace\0"
293 "internal error: previously-checked referenced subpattern not found\0"
294 "DEFINE group contains more than one branch\0"
295 /* 55 */
296 "repeating a DEFINE group is not allowed\0"
297 "inconsistent NEWLINE options\0"
298 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
299 "a numbered reference must not be zero\0"
300 "(*VERB) with an argument is not supported\0"
301 /* 60 */
302 "(*VERB) not recognized\0"
303 "number is too big\0"
304 "subpattern name expected\0"
305 "digit expected after (?+\0"
306 "] is an invalid data character in JavaScript compatibility mode";
309 /* Definition to allow mutual recursion */
/* Forward declaration only: the definition is not in this part of the file,
and the prototype carries parameter types without names. */
311 static BOOL
312 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
313 int *, int *, branch_chain *, compile_data *, int *);
317 /*************************************************
318 * Find an error text *
319 *************************************************/
321 /* The error texts are now all in one long string, to save on relocations. As
322 some of the text is of unknown length, we can't use a table of offsets.
323 Instead, just count through the strings. This is not a performance issue
324 because it happens only when there has been a compilation error.
326 Argument: the error number
327 Returns: pointer to the error string
330 static const char *
331 find_error_text(int n)
333 const char *s = error_texts;
334 for (; n > 0; n--) while (*s++ != 0) {};
335 return s;
339 /*************************************************
340 * Handle escapes *
341 *************************************************/
343 /* This function is called when a \ has been encountered. It either returns a
344 positive value for a simple escape such as \n, or a negative value which
345 encodes one of the more complicated things such as \d. A backreference to group
346 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
347 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
348 ptr is pointing at the \. On exit, it is on the final character of the escape
349 sequence.
351 Arguments:
352 ptrptr points to the pattern position pointer
353 errorcodeptr points to the errorcode variable
354 bracount number of previous extracting brackets
355 options the options bits
356 isclass TRUE if inside a character class
358 Returns: zero or positive => a data character
359 negative => a special escape sequence
360 on error, errorcodeptr is set
363 static int
364 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
365 int options, BOOL isclass)
367 BOOL utf8 = (options & PCRE_UTF8) != 0;
368 const uschar *ptr = *ptrptr + 1;
369 int c, i;
371 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
372 ptr--; /* Set pointer back to the last byte */
374 /* If backslash is at the end of the pattern, it's an error. */
376 if (c == 0) *errorcodeptr = ERR1;
378 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
379 in a table. A non-zero result is something that can be returned immediately.
380 Otherwise further processing may be required. */
382 #ifndef EBCDIC /* ASCII coding */
383 else if (c < '0' || c > 'z') {} /* Not alphanumeric */
384 else if ((i = escapes[c - '0']) != 0) c = i;
386 #else /* EBCDIC coding */
387 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
388 else if ((i = escapes[c - 0x48]) != 0) c = i;
389 #endif
391 /* Escapes that need further processing, or are illegal. */
393 else
395 const uschar *oldptr;
396 BOOL braced, negated;
398 switch (c)
400 /* A number of Perl escapes are not handled by PCRE. We give an explicit
401 error. */
403 case 'l':
404 case 'L':
405 case 'N':
406 case 'u':
407 case 'U':
408 *errorcodeptr = ERR37;
409 break;
411 /* \g must be followed by one of a number of specific things:
413 (1) A number, either plain or braced. If positive, it is an absolute
414 backreference. If negative, it is a relative backreference. This is a Perl
415 5.10 feature.
417 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
418 is part of Perl's movement towards a unified syntax for back references. As
419 this is synonymous with \k{name}, we fudge it up by pretending it really
420 was \k.
422 (3) For Oniguruma compatibility we also support \g followed by a name or a
423 number either in angle brackets or in single quotes. However, these are
424 (possibly recursive) subroutine calls, _not_ backreferences. Just return
425 the -ESC_g code (cf \k). */
427 case 'g':
428 if (ptr[1] == '<' || ptr[1] == '\'')
430 c = -ESC_g;
431 break;
434 /* Handle the Perl-compatible cases */
436 if (ptr[1] == '{')
438 const uschar *p;
439 for (p = ptr+2; *p != 0 && *p != '}'; p++)
440 if (*p != '-' && g_ascii_isdigit (*p) == 0) break;
441 if (*p != 0 && *p != '}')
443 c = -ESC_k;
444 break;
446 braced = TRUE;
447 ptr++;
449 else braced = FALSE;
451 if (ptr[1] == '-')
453 negated = TRUE;
454 ptr++;
456 else negated = FALSE;
458 c = 0;
459 while (g_ascii_isdigit (ptr[1]) != 0)
460 c = c * 10 + *(++ptr) - '0';
462 if (c < 0) /* Integer overflow */
464 *errorcodeptr = ERR61;
465 break;
468 if (braced && *(++ptr) != '}')
470 *errorcodeptr = ERR57;
471 break;
474 if (c == 0)
476 *errorcodeptr = ERR58;
477 break;
480 if (negated)
482 if (c > bracount)
484 *errorcodeptr = ERR15;
485 break;
487 c = bracount - (c - 1);
490 c = -(ESC_REF + c);
491 break;
493 /* The handling of escape sequences consisting of a string of digits
494 starting with one that is not zero is not straightforward. By experiment,
495 the way Perl works seems to be as follows:
497 Outside a character class, the digits are read as a decimal number. If the
498 number is less than 10, or if there are that many previous extracting
499 left brackets, then it is a back reference. Otherwise, up to three octal
500 digits are read to form an escaped byte. Thus \123 is likely to be octal
501 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
502 value is greater than 377, the least significant 8 bits are taken. Inside a
503 character class, \ followed by a digit is always an octal number. */
505 case '1': case '2': case '3': case '4': case '5':
506 case '6': case '7': case '8': case '9':
508 if (!isclass)
510 oldptr = ptr;
511 c -= '0';
512 while (g_ascii_isdigit (ptr[1]))
513 c = c * 10 + *(++ptr) - '0';
514 if (c < 0) /* Integer overflow */
516 *errorcodeptr = ERR61;
517 break;
519 if (c < 10 || c <= bracount)
521 c = -(ESC_REF + c);
522 break;
524 ptr = oldptr; /* Put the pointer back and fall through */
527 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
528 generates a binary zero byte and treats the digit as a following literal.
529 Thus we have to pull back the pointer by one. */
531 if ((c = *ptr) >= '8')
533 ptr--;
534 c = 0;
535 break;
538 /* \0 always starts an octal number, but we may drop through to here with a
539 larger first octal digit. The original code used just to take the least
540 significant 8 bits of octal numbers (I think this is what early Perls used
541 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
542 than 3 octal digits. */
544 case '0':
545 c -= '0';
546 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
547 c = c * 8 + *(++ptr) - '0';
548 if (!utf8 && c > 255) *errorcodeptr = ERR51;
549 break;
551 /* \x is complicated. \x{ddd} is a character number which can be greater
552 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
553 treated as a data character. */
555 case 'x':
556 if (ptr[1] == '{')
558 const uschar *pt = ptr + 2;
559 int count = 0;
561 c = 0;
562 while (g_ascii_isxdigit (*pt) != 0)
564 register int cc = *pt++;
565 if (c == 0 && cc == '0') continue; /* Leading zeroes */
566 count++;
568 #ifndef EBCDIC /* ASCII coding */
569 if (cc >= 'a') cc -= 32; /* Convert to upper case */
570 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
571 #else /* EBCDIC coding */
572 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
573 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
574 #endif
577 if (*pt == '}')
579 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
580 ptr = pt;
581 break;
584 /* If the sequence of hex digits does not end with '}', then we don't
585 recognize this construct; fall through to the normal \x handling. */
588 /* Read just a single-byte hex-defined char */
590 c = 0;
591 while (i++ < 2 && g_ascii_isxdigit (ptr[1]) != 0)
593 int cc; /* Some compilers don't like ++ */
594 cc = *(++ptr); /* in initializers */
595 #ifndef EBCDIC /* ASCII coding */
596 if (cc >= 'a') cc -= 32; /* Convert to upper case */
597 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
598 #else /* EBCDIC coding */
599 if (cc <= 'z') cc += 64; /* Convert to upper case */
600 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
601 #endif
603 break;
605 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
606 This coding is ASCII-specific, but then the whole concept of \cx is
607 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
609 case 'c':
610 c = *(++ptr);
611 if (c == 0)
613 *errorcodeptr = ERR2;
614 break;
617 #ifndef EBCDIC /* ASCII coding */
618 if (c >= 'a' && c <= 'z') c -= 32;
619 c ^= 0x40;
620 #else /* EBCDIC coding */
621 if (c >= 'a' && c <= 'z') c += 64;
622 c ^= 0xC0;
623 #endif
624 break;
626 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
627 other alphanumeric following \ is an error if PCRE_EXTRA was set;
628 otherwise, for Perl compatibility, it is a literal. This code looks a bit
629 odd, but there used to be some cases other than the default, and there may
630 be again in future, so I haven't "optimized" it. */
632 default:
633 if ((options & PCRE_EXTRA) != 0) switch(c)
635 default:
636 *errorcodeptr = ERR3;
637 break;
639 break;
643 *ptrptr = ptr;
644 return c;
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Called after \P or \p has been seen, when PCRE is built with support for
Unicode properties. The property name is either the single character that
follows, or a name in braces which may be preceded by ^ for negation. The name
is looked up in the property table by binary chop. On entry, ptrptr is pointing
at the P or p. On exit, it is pointing at the final character of the escape
sequence (or at the character where an error was detected).

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         type value from ucp_type_table, or -1 for an invalid type
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto ERROR_RETURN;

*negptr = FALSE;

if (ch != '{')
  {
  /* A single-character property name */
  name[0] = ch;
  name[1] = 0;
  }
else
  {
  /* A braced name, optionally negated by a leading ^ */
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }
  len = 0;
  while (len < (int)sizeof(name) - 1)
    {
    ch = *(++ptr);
    if (ch == 0) goto ERROR_RETURN;
    if (ch == '}') break;
    name[len++] = ch;
    }
  if (ch != '}') goto ERROR_RETURN;      /* Name too long for the buffer */
  name[len] = 0;
  }

*ptrptr = ptr;

/* Binary chop over the table of property names */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);
  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  }

/* The name is well-formed but not a known property */

*errorcodeptr = ERR47;
return -1;

/* The sequence itself is malformed */

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
742 /*************************************************
743 * Check for counted repeat *
744 *************************************************/
746 /* This function is called when a '{' is encountered in a place where it might
747 start a quantifier. It looks ahead to see if it really is a quantifier or not.
748 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
749 where the ddds are digits.
751 Arguments:
752 p pointer to the first char after '{'
754 Returns: TRUE or FALSE
757 static BOOL
758 is_counted_repeat(const uschar *p)
760 if (g_ascii_isdigit (*p++) == 0) return FALSE;
761 while (g_ascii_isdigit (*p) != 0) p++;
762 if (*p == '}') return TRUE;
764 if (*p++ != ',') return FALSE;
765 if (*p == '}') return TRUE;
767 if (g_ascii_isdigit (*p++) == 0) return FALSE;
768 while (g_ascii_isdigit (*p) != 0) p++;
770 return (*p == '}');
775 /*************************************************
776 * Read repeat counts *
777 *************************************************/
779 /* Read an item of the form {n,m} and return the values. This is called only
780 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
781 so the syntax is guaranteed to be correct, but we need to check the values.
783 Arguments:
784 p pointer to first char after '{'
785 minp pointer to int for min
786 maxp pointer to int for max
787 returned as -1 if no max
788 errorcodeptr points to error code variable
790 Returns: pointer to '}' on success;
791 current ptr on error, with errorcodeptr set non-zero
794 static const uschar *
795 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
797 int min = 0;
798 int max = -1;
800 /* Read the minimum value and do a paranoid check: a negative value indicates
801 an integer overflow. */
803 while (g_ascii_isdigit (*p) != 0) min = min * 10 + *p++ - '0';
804 if (min < 0 || min > 65535)
806 *errorcodeptr = ERR5;
807 return p;
810 /* Read the maximum value if there is one, and again do a paranoid on its size.
811 Also, max must not be less than min. */
813 if (*p == '}') max = min; else
815 if (*(++p) != '}')
817 max = 0;
818 while(g_ascii_isdigit (*p) != 0) max = max * 10 + *p++ - '0';
819 if (max < 0 || max > 65535)
821 *errorcodeptr = ERR5;
822 return p;
824 if (max < min)
826 *errorcodeptr = ERR4;
827 return p;
832 /* Fill in the required variables, and pass back the pointer to the terminating
833 '}'. */
835 *minp = min;
836 *maxp = max;
837 return p;
842 /*************************************************
843 * Find forward referenced subpattern *
844 *************************************************/
846 /* This function scans along a pattern's text looking for capturing
847 subpatterns, and counting them. If it finds a named pattern that matches the
848 name it is given, it returns its number. Alternatively, if the name is NULL, it
849 returns when it reaches a given numbered subpattern. This is used for forward
850 references to subpatterns. We know that if (?P< is encountered, the name will
851 be terminated by '>' because that is checked in the first pass.
853 Arguments:
854 ptr current position in the pattern
855 cd compile background data
856 name name to seek, or NULL if seeking a numbered subpattern
857 lorn name length, or subpattern number if name is NULL
858 xmode TRUE if we are in /x mode
860 Returns: the number of the named subpattern, or -1 if not found
863 static int
864 find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,
865 BOOL xmode)
867 const uschar *thisname;
868 int count = cd->bracount;
870 for (; *ptr != 0; ptr++)
872 int term;
874 /* Skip over backslashed characters and also entire \Q...\E */
876 if (*ptr == '\\')
878 if (*(++ptr) == 0) return -1;
879 if (*ptr == 'Q') for (;;)
881 while (*(++ptr) != 0 && *ptr != '\\') {};
882 if (*ptr == 0) return -1;
883 if (*(++ptr) == 'E') break;
885 continue;
888 /* Skip over character classes; this logic must be similar to the way they
889 are handled for real. If the first character is '^', skip it. Also, if the
890 first few characters (either before or after ^) are \Q\E or \E we skip them
891 too. This makes for compatibility with Perl. */
893 if (*ptr == '[')
895 BOOL negate_class = FALSE;
896 for (;;)
898 int c = *(++ptr);
899 if (c == '\\')
901 if (ptr[1] == 'E') ptr++;
902 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
903 else break;
905 else if (!negate_class && c == '^')
906 negate_class = TRUE;
907 else break;
910 /* If the next character is ']', it is a data character that must be
911 skipped, except in JavaScript compatibility mode. */
913 if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
914 ptr++;
916 while (*(++ptr) != ']')
918 if (*ptr == 0) return -1;
919 if (*ptr == '\\')
921 if (*(++ptr) == 0) return -1;
922 if (*ptr == 'Q') for (;;)
924 while (*(++ptr) != 0 && *ptr != '\\') {};
925 if (*ptr == 0) return -1;
926 if (*(++ptr) == 'E') break;
928 continue;
931 continue;
934 /* Skip comments in /x mode */
936 if (xmode && *ptr == '#')
938 while (*(++ptr) != 0 && *ptr != '\n') {};
939 if (*ptr == 0) return -1;
940 continue;
943 /* An opening parens must now be a real metacharacter */
945 if (*ptr != '(') continue;
946 if (ptr[1] != '?' && ptr[1] != '*')
948 count++;
949 if (name == NULL && count == lorn) return count;
950 continue;
953 ptr += 2;
954 if (*ptr == 'P') ptr++; /* Allow optional P */
956 /* We have to disambiguate (?<! and (?<= from (?<name> */
958 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
959 *ptr != '\'')
960 continue;
962 count++;
964 if (name == NULL && count == lorn) return count;
965 term = *ptr++;
966 if (term == '<') term = '>';
967 thisname = ptr;
968 while (*ptr != term) ptr++;
969 if (name != NULL && lorn == ptr - thisname &&
970 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
971 return count;
974 return -1;
979 /*************************************************
980 * Find first significant op code *
981 *************************************************/
983 /* This is called by several functions that scan a compiled expression looking
984 for a fixed first character, or an anchoring op code etc. It skips over things
985 that do not influence this. For some calls, a change of option is important.
986 For some calls, it makes sense to skip negative forward and all backward
987 assertions, and also the \b assertion; for others it does not.
989 Arguments:
990 code pointer to the start of the group
991 options pointer to external options
992 optbit the option bit whose changing is significant, or
993 zero if none are
994 skipassert TRUE if certain assertions are to be skipped
996 Returns: pointer to the first significant opcode
999 static const uschar*
1000 first_significant_code(const uschar *code, int *options, int optbit,
1001 BOOL skipassert)
1003 for (;;)
1005 switch ((int)*code)
1007 case OP_OPT:
1008 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1009 *options = (int)code[1];
1010 code += 2;
1011 break;
1013 case OP_ASSERT_NOT:
1014 case OP_ASSERTBACK:
1015 case OP_ASSERTBACK_NOT:
1016 if (!skipassert) return code;
1017 do code += GET(code, 1); while (*code == OP_ALT);
1018 code += _pcre_OP_lengths[*code];
1019 break;
1021 case OP_WORD_BOUNDARY:
1022 case OP_NOT_WORD_BOUNDARY:
1023 if (!skipassert) return code;
1024 /* Fall through */
1026 case OP_CALLOUT:
1027 case OP_CREF:
1028 case OP_RREF:
1029 case OP_DEF:
1030 code += _pcre_OP_lengths[*code];
1031 break;
1033 default:
1034 return code;
1037 /* Control never reaches here */
1043 /*************************************************
1044 * Find the fixed length of a pattern *
1045 *************************************************/
1047 /* Scan a pattern and compute the fixed length of subject that will match it,
1048 if the length is fixed. This is needed for dealing with backward assertions.
1049 In UTF8 mode, the result is in characters rather than bytes.
1051 Arguments:
1052 code points to the start of the pattern (the bracket)
1053 options the compiling options
1055 Returns: the fixed length, or -1 if there is no fixed length,
1056 or -2 if \C was encountered
1059 static int
1060 find_fixedlength(uschar *code, int options)
1062 int length = -1;
1064 register int branchlength = 0;
1065 register uschar *cc = code + 1 + LINK_SIZE;
1067 /* Scan along the opcodes for this branch. If we get to the end of the
1068 branch, check the length against that of the other branches. */
1070 for (;;)
1072 int d;
1073 register int op = *cc;
1074 switch (op)
1076 case OP_CBRA:
1077 case OP_BRA:
1078 case OP_ONCE:
1079 case OP_COND:
1080 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1081 if (d < 0) return d;
1082 branchlength += d;
1083 do cc += GET(cc, 1); while (*cc == OP_ALT);
1084 cc += 1 + LINK_SIZE;
1085 break;
1087 /* Reached end of a branch; if it's a ket it is the end of a nested
1088 call. If it's ALT it is an alternation in a nested call. If it is
1089 END it's the end of the outer call. All can be handled by the same code. */
1091 case OP_ALT:
1092 case OP_KET:
1093 case OP_KETRMAX:
1094 case OP_KETRMIN:
1095 case OP_END:
1096 if (length < 0) length = branchlength;
1097 else if (length != branchlength) return -1;
1098 if (*cc != OP_ALT) return length;
1099 cc += 1 + LINK_SIZE;
1100 branchlength = 0;
1101 break;
1103 /* Skip over assertive subpatterns */
1105 case OP_ASSERT:
1106 case OP_ASSERT_NOT:
1107 case OP_ASSERTBACK:
1108 case OP_ASSERTBACK_NOT:
1109 do cc += GET(cc, 1); while (*cc == OP_ALT);
1110 /* Fall through */
1112 /* Skip over things that don't match chars */
1114 case OP_REVERSE:
1115 case OP_CREF:
1116 case OP_RREF:
1117 case OP_DEF:
1118 case OP_OPT:
1119 case OP_CALLOUT:
1120 case OP_SOD:
1121 case OP_SOM:
1122 case OP_EOD:
1123 case OP_EODN:
1124 case OP_CIRC:
1125 case OP_DOLL:
1126 case OP_NOT_WORD_BOUNDARY:
1127 case OP_WORD_BOUNDARY:
1128 cc += _pcre_OP_lengths[*cc];
1129 break;
1131 /* Handle literal characters */
1133 case OP_CHAR:
1134 case OP_CHARNC:
1135 case OP_NOT:
1136 branchlength++;
1137 cc += 2;
1138 #ifdef SUPPORT_UTF8
1139 if ((options & PCRE_UTF8) != 0)
1141 while ((*cc & 0xc0) == 0x80) cc++;
1143 #endif
1144 break;
1146 /* Handle exact repetitions. The count is already in characters, but we
1147 need to skip over a multibyte character in UTF8 mode. */
1149 case OP_EXACT:
1150 branchlength += GET2(cc,1);
1151 cc += 4;
1152 #ifdef SUPPORT_UTF8
1153 if ((options & PCRE_UTF8) != 0)
1155 while((*cc & 0x80) == 0x80) cc++;
1157 #endif
1158 break;
1160 case OP_TYPEEXACT:
1161 branchlength += GET2(cc,1);
1162 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1163 cc += 4;
1164 break;
1166 /* Handle single-char matchers */
1168 case OP_PROP:
1169 case OP_NOTPROP:
1170 cc += 2;
1171 /* Fall through */
1173 case OP_NOT_DIGIT:
1174 case OP_DIGIT:
1175 case OP_NOT_WHITESPACE:
1176 case OP_WHITESPACE:
1177 case OP_NOT_WORDCHAR:
1178 case OP_WORDCHAR:
1179 case OP_ANY:
1180 case OP_ALLANY:
1181 branchlength++;
1182 cc++;
1183 break;
1185 /* The single-byte matcher isn't allowed */
1187 case OP_ANYBYTE:
1188 return -2;
1190 /* Check a class for variable quantification */
1192 #ifdef SUPPORT_UTF8
1193 case OP_XCLASS:
1194 cc += GET(cc, 1) - 33;
1195 /* Fall through */
1196 #endif
1198 case OP_CLASS:
1199 case OP_NCLASS:
1200 cc += 33;
1202 switch (*cc)
1204 case OP_CRSTAR:
1205 case OP_CRMINSTAR:
1206 case OP_CRQUERY:
1207 case OP_CRMINQUERY:
1208 return -1;
1210 case OP_CRRANGE:
1211 case OP_CRMINRANGE:
1212 if (GET2(cc,1) != GET2(cc,3)) return -1;
1213 branchlength += GET2(cc,1);
1214 cc += 5;
1215 break;
1217 default:
1218 branchlength++;
1220 break;
1222 /* Anything else is variable length */
1224 default:
1225 return -1;
1228 /* Control never gets here */
1234 /*************************************************
1235 * Scan compiled regex for numbered bracket *
1236 *************************************************/
1238 /* This little function scans through a compiled pattern until it finds a
1239 capturing bracket with the given number.
1241 Arguments:
1242 code points to start of expression
1243 utf8 TRUE in UTF-8 mode
1244 number the required bracket number
1246 Returns: pointer to the opcode for the bracket, or NULL if not found
1249 static const uschar *
1250 find_bracket(const uschar *code, BOOL utf8, int number)
1252 for (;;)
1254 register int c = *code;
1255 if (c == OP_END) return NULL;
1257 /* XCLASS is used for classes that cannot be represented just by a bit
1258 map. This includes negated single high-valued characters. The length in
1259 the table is zero; the actual length is stored in the compiled code. */
1261 if (c == OP_XCLASS) code += GET(code, 1);
1263 /* Handle capturing bracket */
1265 else if (c == OP_CBRA)
1267 int n = GET2(code, 1+LINK_SIZE);
1268 if (n == number) return (uschar *)code;
1269 code += _pcre_OP_lengths[c];
1272 /* Otherwise, we can get the item's length from the table, except that for
1273 repeated character types, we have to test for \p and \P, which have an extra
1274 two bytes of parameters. */
1276 else
1278 switch(c)
1280 case OP_TYPESTAR:
1281 case OP_TYPEMINSTAR:
1282 case OP_TYPEPLUS:
1283 case OP_TYPEMINPLUS:
1284 case OP_TYPEQUERY:
1285 case OP_TYPEMINQUERY:
1286 case OP_TYPEPOSSTAR:
1287 case OP_TYPEPOSPLUS:
1288 case OP_TYPEPOSQUERY:
1289 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1290 break;
1292 case OP_TYPEUPTO:
1293 case OP_TYPEMINUPTO:
1294 case OP_TYPEEXACT:
1295 case OP_TYPEPOSUPTO:
1296 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1297 break;
1300 /* Add in the fixed length from the table */
1302 code += _pcre_OP_lengths[c];
1304 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1305 a multi-byte character. The length in the table is a minimum, so we have to
1306 arrange to skip the extra bytes. */
1308 #ifdef SUPPORT_UTF8
1309 if (utf8) switch(c)
1311 case OP_CHAR:
1312 case OP_CHARNC:
1313 case OP_EXACT:
1314 case OP_UPTO:
1315 case OP_MINUPTO:
1316 case OP_POSUPTO:
1317 case OP_STAR:
1318 case OP_MINSTAR:
1319 case OP_POSSTAR:
1320 case OP_PLUS:
1321 case OP_MINPLUS:
1322 case OP_POSPLUS:
1323 case OP_QUERY:
1324 case OP_MINQUERY:
1325 case OP_POSQUERY:
1326 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1327 break;
1329 #else
1330 (void)(utf8); /* Keep compiler happy by referencing function argument */
1331 #endif
1338 /*************************************************
1339 * Scan compiled regex for recursion reference *
1340 *************************************************/
1342 /* This little function scans through a compiled pattern until it finds an
1343 instance of OP_RECURSE.
1345 Arguments:
1346 code points to start of expression
1347 utf8 TRUE in UTF-8 mode
1349 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1352 static const uschar *
1353 find_recurse(const uschar *code, BOOL utf8)
1355 for (;;)
1357 register int c = *code;
1358 if (c == OP_END) return NULL;
1359 if (c == OP_RECURSE) return code;
1361 /* XCLASS is used for classes that cannot be represented just by a bit
1362 map. This includes negated single high-valued characters. The length in
1363 the table is zero; the actual length is stored in the compiled code. */
1365 if (c == OP_XCLASS) code += GET(code, 1);
1367 /* Otherwise, we can get the item's length from the table, except that for
1368 repeated character types, we have to test for \p and \P, which have an extra
1369 two bytes of parameters. */
1371 else
1373 switch(c)
1375 case OP_TYPESTAR:
1376 case OP_TYPEMINSTAR:
1377 case OP_TYPEPLUS:
1378 case OP_TYPEMINPLUS:
1379 case OP_TYPEQUERY:
1380 case OP_TYPEMINQUERY:
1381 case OP_TYPEPOSSTAR:
1382 case OP_TYPEPOSPLUS:
1383 case OP_TYPEPOSQUERY:
1384 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1385 break;
1387 case OP_TYPEPOSUPTO:
1388 case OP_TYPEUPTO:
1389 case OP_TYPEMINUPTO:
1390 case OP_TYPEEXACT:
1391 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1392 break;
1395 /* Add in the fixed length from the table */
1397 code += _pcre_OP_lengths[c];
1399 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1400 by a multi-byte character. The length in the table is a minimum, so we have
1401 to arrange to skip the extra bytes. */
1403 #ifdef SUPPORT_UTF8
1404 if (utf8) switch(c)
1406 case OP_CHAR:
1407 case OP_CHARNC:
1408 case OP_EXACT:
1409 case OP_UPTO:
1410 case OP_MINUPTO:
1411 case OP_POSUPTO:
1412 case OP_STAR:
1413 case OP_MINSTAR:
1414 case OP_POSSTAR:
1415 case OP_PLUS:
1416 case OP_MINPLUS:
1417 case OP_POSPLUS:
1418 case OP_QUERY:
1419 case OP_MINQUERY:
1420 case OP_POSQUERY:
1421 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1422 break;
1424 #else
1425 (void)(utf8); /* Keep compiler happy by referencing function argument */
1426 #endif
1433 /*************************************************
1434 * Scan compiled branch for non-emptiness *
1435 *************************************************/
1437 /* This function scans through a branch of a compiled pattern to see whether it
1438 can match the empty string or not. It is called from could_be_empty()
1439 below and from compile_branch() when checking for an unlimited repeat of a
1440 group that can match nothing. Note that first_significant_code() skips over
1441 backward and negative forward assertions when its final argument is TRUE. If we
1442 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1443 bracket whose current branch will already have been scanned.
1445 Arguments:
1446 code points to start of search
1447 endcode points to where to stop
1448 utf8 TRUE if in UTF8 mode
1450 Returns: TRUE if what is matched could be empty
1453 static BOOL
1454 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1456 register int c;
1457 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1458 code < endcode;
1459 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1461 const uschar *ccode;
1463 c = *code;
1465 /* Skip over forward assertions; the other assertions are skipped by
1466 first_significant_code() with a TRUE final argument. */
1468 if (c == OP_ASSERT)
1470 do code += GET(code, 1); while (*code == OP_ALT);
1471 c = *code;
1472 continue;
1475 /* Groups with zero repeats can of course be empty; skip them. */
1477 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1479 code += _pcre_OP_lengths[c];
1480 do code += GET(code, 1); while (*code == OP_ALT);
1481 c = *code;
1482 continue;
1485 /* For other groups, scan the branches. */
1487 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1489 BOOL empty_branch;
1490 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1492 /* Scan a closed bracket */
1494 empty_branch = FALSE;
1497 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1498 empty_branch = TRUE;
1499 code += GET(code, 1);
1501 while (*code == OP_ALT);
1502 if (!empty_branch) return FALSE; /* All branches are non-empty */
1503 c = *code;
1504 continue;
1507 /* Handle the other opcodes */
1509 switch (c)
1511 /* Check for quantifiers after a class. XCLASS is used for classes that
1512 cannot be represented just by a bit map. This includes negated single
1513 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1514 actual length is stored in the compiled code, so we must update "code"
1515 here. */
1517 #ifdef SUPPORT_UTF8
1518 case OP_XCLASS:
1519 ccode = code += GET(code, 1);
1520 goto CHECK_CLASS_REPEAT;
1521 #endif
1523 case OP_CLASS:
1524 case OP_NCLASS:
1525 ccode = code + 33;
1527 #ifdef SUPPORT_UTF8
1528 CHECK_CLASS_REPEAT:
1529 #endif
1531 switch (*ccode)
1533 case OP_CRSTAR: /* These could be empty; continue */
1534 case OP_CRMINSTAR:
1535 case OP_CRQUERY:
1536 case OP_CRMINQUERY:
1537 break;
1539 default: /* Non-repeat => class must match */
1540 case OP_CRPLUS: /* These repeats aren't empty */
1541 case OP_CRMINPLUS:
1542 return FALSE;
1544 case OP_CRRANGE:
1545 case OP_CRMINRANGE:
1546 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1547 break;
1549 break;
1551 /* Opcodes that must match a character */
1553 case OP_PROP:
1554 case OP_NOTPROP:
1555 case OP_EXTUNI:
1556 case OP_NOT_DIGIT:
1557 case OP_DIGIT:
1558 case OP_NOT_WHITESPACE:
1559 case OP_WHITESPACE:
1560 case OP_NOT_WORDCHAR:
1561 case OP_WORDCHAR:
1562 case OP_ANY:
1563 case OP_ALLANY:
1564 case OP_ANYBYTE:
1565 case OP_CHAR:
1566 case OP_CHARNC:
1567 case OP_NOT:
1568 case OP_PLUS:
1569 case OP_MINPLUS:
1570 case OP_POSPLUS:
1571 case OP_EXACT:
1572 case OP_NOTPLUS:
1573 case OP_NOTMINPLUS:
1574 case OP_NOTPOSPLUS:
1575 case OP_NOTEXACT:
1576 case OP_TYPEPLUS:
1577 case OP_TYPEMINPLUS:
1578 case OP_TYPEPOSPLUS:
1579 case OP_TYPEEXACT:
1580 return FALSE;
1582 /* These are going to continue, as they may be empty, but we have to
1583 fudge the length for the \p and \P cases. */
1585 case OP_TYPESTAR:
1586 case OP_TYPEMINSTAR:
1587 case OP_TYPEPOSSTAR:
1588 case OP_TYPEQUERY:
1589 case OP_TYPEMINQUERY:
1590 case OP_TYPEPOSQUERY:
1591 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1592 break;
1594 /* Same for these */
1596 case OP_TYPEUPTO:
1597 case OP_TYPEMINUPTO:
1598 case OP_TYPEPOSUPTO:
1599 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1600 break;
1602 /* End of branch */
1604 case OP_KET:
1605 case OP_KETRMAX:
1606 case OP_KETRMIN:
1607 case OP_ALT:
1608 return TRUE;
1610 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1611 MINUPTO, and POSUPTO may be followed by a multibyte character */
1613 #ifdef SUPPORT_UTF8
1614 case OP_STAR:
1615 case OP_MINSTAR:
1616 case OP_POSSTAR:
1617 case OP_QUERY:
1618 case OP_MINQUERY:
1619 case OP_POSQUERY:
1620 case OP_UPTO:
1621 case OP_MINUPTO:
1622 case OP_POSUPTO:
1623 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1624 break;
1625 #endif
1629 return TRUE;
1634 /*************************************************
1635 * Scan compiled regex for non-emptiness *
1636 *************************************************/
1638 /* This function is called to check for left recursive calls. We want to check
1639 the current branch of the current pattern to see if it could match the empty
1640 string. If it could, we must look outwards for branches at other levels,
1641 stopping when we pass beyond the bracket which is the subject of the recursion.
1643 Arguments:
1644 code points to start of the recursion
1645 endcode points to where to stop (current RECURSE item)
1646 bcptr points to the chain of current (unclosed) branch starts
1647 utf8 TRUE if in UTF-8 mode
1649 Returns: TRUE if what is matched could be empty
1652 static BOOL
1653 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1654 BOOL utf8)
1656 while (bcptr != NULL && bcptr->current >= code)
1658 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1659 bcptr = bcptr->outer;
1661 return TRUE;
1666 /*************************************************
1667 * Check for POSIX class syntax *
1668 *************************************************/
1670 /* This function is called when the sequence "[:" or "[." or "[=" is
1671 encountered in a character class. It checks whether this is followed by a
1672 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1673 reach an unescaped ']' without the special preceding character, return FALSE.
1675 Originally, this function only recognized a sequence of letters between the
1676 terminators, but it seems that Perl recognizes any sequence of characters,
1677 though of course unknown POSIX names are subsequently rejected. Perl gives an
1678 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1679 didn't consider this to be a POSIX class. Likewise for [:1234:].
1681 The problem in trying to be exactly like Perl is in the handling of escapes. We
1682 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1683 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1684 below handles the special case of \], but does not try to do any other escape
1685 processing. This makes it different from Perl for cases such as [:l\ower:]
1686 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1687 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
1688 I think.
1690 Arguments:
1691 ptr pointer to the initial [
1692 endptr where to return the end pointer
1694 Returns: TRUE or FALSE
1697 static BOOL
1698 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1700 int terminator; /* Don't combine these lines; the Solaris cc */
1701 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1702 for (++ptr; *ptr != 0; ptr++)
1704 if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1706 if (*ptr == ']') return FALSE;
1707 if (*ptr == terminator && ptr[1] == ']')
1709 *endptr = ptr;
1710 return TRUE;
1714 return FALSE;
1720 /*************************************************
1721 * Check POSIX class name *
1722 *************************************************/
1724 /* This function is called to check the name given in a POSIX-style class entry
1725 such as [:alnum:].
1727 Arguments:
1728 ptr points to the first letter
1729 len the length of the name
1731 Returns: a value representing the name, or -1 if unknown
1734 static int
1735 check_posix_name(const uschar *ptr, int len)
1737 const char *pn = posix_names;
1738 register int yield = 0;
1739 while (posix_name_lengths[yield] != 0)
1741 if (len == posix_name_lengths[yield] &&
1742 strncmp((const char *)ptr, pn, len) == 0) return yield;
1743 pn += posix_name_lengths[yield] + 1;
1744 yield++;
1746 return -1;
1750 /*************************************************
1751 * Adjust OP_RECURSE items in repeated group *
1752 *************************************************/
1754 /* OP_RECURSE items contain an offset from the start of the regex to the group
1755 that is referenced. This means that groups can be replicated for fixed
1756 repetition simply by copying (because the recursion is allowed to refer to
1757 earlier groups that are outside the current group). However, when a group is
1758 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1759 inserted before it, after it has been compiled. This means that any OP_RECURSE
1760 items within it that refer to the group itself or any contained groups have to
1761 have their offsets adjusted. That one of the jobs of this function. Before it
1762 is called, the partially compiled regex must be temporarily terminated with
1763 OP_END.
1765 This function has been extended with the possibility of forward references for
1766 recursions and subroutine calls. It must also check the list of such references
1767 for the group we are dealing with. If it finds that one of the recursions in
1768 the current group is on this list, it adjusts the offset in the list, not the
1769 value in the reference (which is a group number).
1771 Arguments:
1772 group points to the start of the group
1773 adjust the amount by which the group is to be moved
1774 utf8 TRUE in UTF-8 mode
1775 cd contains pointers to tables etc.
1776 save_hwm the hwm forward reference pointer at the start of the group
1778 Returns: nothing
1781 static void
1782 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1783 uschar *save_hwm)
1785 uschar *ptr = group;
1787 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1789 int offset;
1790 uschar *hc;
1792 /* See if this recursion is on the forward reference list. If so, adjust the
1793 reference. */
1795 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1797 offset = GET(hc, 0);
1798 if (cd->start_code + offset == ptr + 1)
1800 PUT(hc, 0, offset + adjust);
1801 break;
1805 /* Otherwise, adjust the recursion offset if it's after the start of this
1806 group. */
1808 if (hc >= cd->hwm)
1810 offset = GET(ptr, 1);
1811 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1814 ptr += 1 + LINK_SIZE;
1820 /*************************************************
1821 * Insert an automatic callout point *
1822 *************************************************/
1824 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1825 callout points before each pattern item.
1827 Arguments:
1828 code current code pointer
1829 ptr current pattern pointer
1830 cd pointers to tables etc
1832 Returns: new code pointer
1835 static uschar *
1836 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1838 *code++ = OP_CALLOUT;
1839 *code++ = 255;
1840 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1841 PUT(code, LINK_SIZE, 0); /* Default length */
1842 return code + 2*LINK_SIZE;
1847 /*************************************************
1848 * Complete a callout item *
1849 *************************************************/
1851 /* A callout item contains the length of the next item in the pattern, which
1852 we can't fill in till after we have reached the relevant point. This is used
1853 for both automatic and manual callouts.
1855 Arguments:
1856 previous_callout points to previous callout item
1857 ptr current pattern pointer
1858 cd pointers to tables etc
1860 Returns: nothing
1863 static void
1864 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1866 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1867 PUT(previous_callout, 2 + LINK_SIZE, length);
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for internal ranges of
characters in the "other" case. Each call returns the next one, updating the
start address. Nothing is written through any of the pointers when FALSE is
returned.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
  unsigned int ch = *cptr;
  unsigned int range_start = 0;
  unsigned int expected;

  /* Find the first character in the range that has a different other case */

  while (ch <= d)
  {
    range_start = UCD_OTHERCASE(ch);
    if (range_start != ch) break;
    ch++;
  }

  if (ch > d) return FALSE;

  *ocptr = range_start;
  expected = range_start + 1;

  /* Extend for as long as the other-case values stay consecutive */

  ch++;
  while (ch <= d && UCD_OTHERCASE(ch) == expected)
  {
    ch++;
    expected++;
  }

  *odptr = expected - 1;
  *cptr = ch;

  return TRUE;
}
#endif  /* SUPPORT_UCP */
1920 /*************************************************
1921 * Check if auto-possessifying is possible *
1922 *************************************************/
1924 /* This function is called for unlimited repeats of certain items, to see
1925 whether the next thing could possibly match the repeated item. If not, it makes
1926 sense to automatically possessify the repeated item.
1928 Arguments:
1929 op_code the repeated op code
1930 this data for this item, depends on the opcode
1931 utf8 TRUE in UTF-8 mode
1932 utf8_char used for utf8 character bytes, NULL if not relevant
1933 ptr next character in pattern
1934 options options bits
1935 cd contains pointers to tables etc.
1937 Returns: TRUE if possessifying is wanted
1940 static BOOL
1941 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1942 const uschar *ptr, int options, compile_data *cd)
1944 int next;
1946 /* Skip whitespace and comments in extended mode */
1948 if ((options & PCRE_EXTENDED) != 0)
1950 for (;;)
1952 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1953 if (*ptr == '#')
1955 while (*(++ptr) != 0)
1956 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1958 else break;
1962 /* If the next item is one that we can handle, get its value. A non-negative
1963 value is a character, a negative value is an escape value. */
1965 if (*ptr == '\\')
1967 int temperrorcode = 0;
1968 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1969 if (temperrorcode != 0) return FALSE;
1970 ptr++; /* Point after the escape sequence */
1973 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1975 #ifdef SUPPORT_UTF8
1976 if (utf8) { GETCHARINC(next, ptr); } else
1977 #endif
1978 next = *ptr++;
1981 else return FALSE;
1983 /* Skip whitespace and comments in extended mode */
1985 if ((options & PCRE_EXTENDED) != 0)
1987 for (;;)
1989 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1990 if (*ptr == '#')
1992 while (*(++ptr) != 0)
1993 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1995 else break;
1999 /* If the next thing is itself optional, we have to give up. */
2001 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2002 return FALSE;
2004 /* Now compare the next item with the previous opcode. If the previous is a
2005 positive single character match, "item" either contains the character or, if
2006 "item" is greater than 127 in utf8 mode, the character's bytes are in
2007 utf8_char. */
2010 /* Handle cases when the next item is a character. */
2012 if (next >= 0) switch(op_code)
2014 case OP_CHAR:
2015 #ifdef SUPPORT_UTF8
2016 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2017 #else
2018 (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2019 #endif
2020 return item != next;
2022 /* For CHARNC (caseless character) we must check the other case. If we have
2023 Unicode property support, we can use it to test the other case of
2024 high-valued characters. */
2026 case OP_CHARNC:
2027 #ifdef SUPPORT_UTF8
2028 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2029 #endif
2030 if (item == next) return FALSE;
2031 #ifdef SUPPORT_UTF8
2032 if (utf8)
2034 unsigned int othercase;
2035 if (next < 128) othercase = cd->fcc[next]; else
2036 #ifdef SUPPORT_UCP
2037 othercase = UCD_OTHERCASE((unsigned int)next);
2038 #else
2039 othercase = NOTACHAR;
2040 #endif
2041 return (unsigned int)item != othercase;
2043 else
2044 #endif /* SUPPORT_UTF8 */
2045 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2047 /* For OP_NOT, "item" must be a single-byte character. */
2049 case OP_NOT:
2050 if (item == next) return TRUE;
2051 if ((options & PCRE_CASELESS) == 0) return FALSE;
2052 #ifdef SUPPORT_UTF8
2053 if (utf8)
2055 unsigned int othercase;
2056 if (next < 128) othercase = cd->fcc[next]; else
2057 #ifdef SUPPORT_UCP
2058 othercase = UCD_OTHERCASE(next);
2059 #else
2060 othercase = NOTACHAR;
2061 #endif
2062 return (unsigned int)item == othercase;
2064 else
2065 #endif /* SUPPORT_UTF8 */
2066 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2068 case OP_DIGIT:
2069 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2071 case OP_NOT_DIGIT:
2072 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2074 case OP_WHITESPACE:
2075 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2077 case OP_NOT_WHITESPACE:
2078 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2080 case OP_WORDCHAR:
2081 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2083 case OP_NOT_WORDCHAR:
2084 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2086 case OP_HSPACE:
2087 case OP_NOT_HSPACE:
2088 switch(next)
2090 case 0x09:
2091 case 0x20:
2092 case 0xa0:
2093 case 0x1680:
2094 case 0x180e:
2095 case 0x2000:
2096 case 0x2001:
2097 case 0x2002:
2098 case 0x2003:
2099 case 0x2004:
2100 case 0x2005:
2101 case 0x2006:
2102 case 0x2007:
2103 case 0x2008:
2104 case 0x2009:
2105 case 0x200A:
2106 case 0x202f:
2107 case 0x205f:
2108 case 0x3000:
2109 return op_code != OP_HSPACE;
2110 default:
2111 return op_code == OP_HSPACE;
2114 case OP_VSPACE:
2115 case OP_NOT_VSPACE:
2116 switch(next)
2118 case 0x0a:
2119 case 0x0b:
2120 case 0x0c:
2121 case 0x0d:
2122 case 0x85:
2123 case 0x2028:
2124 case 0x2029:
2125 return op_code != OP_VSPACE;
2126 default:
2127 return op_code == OP_VSPACE;
2130 default:
2131 return FALSE;
2135 /* Handle the case when the next item is \d, \s, etc. */
2137 switch(op_code)
2139 case OP_CHAR:
2140 case OP_CHARNC:
2141 #ifdef SUPPORT_UTF8
2142 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2143 #endif
2144 switch(-next)
2146 case ESC_d:
2147 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2149 case ESC_D:
2150 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2152 case ESC_s:
2153 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2155 case ESC_S:
2156 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2158 case ESC_w:
2159 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2161 case ESC_W:
2162 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2164 case ESC_h:
2165 case ESC_H:
2166 switch(item)
2168 case 0x09:
2169 case 0x20:
2170 case 0xa0:
2171 case 0x1680:
2172 case 0x180e:
2173 case 0x2000:
2174 case 0x2001:
2175 case 0x2002:
2176 case 0x2003:
2177 case 0x2004:
2178 case 0x2005:
2179 case 0x2006:
2180 case 0x2007:
2181 case 0x2008:
2182 case 0x2009:
2183 case 0x200A:
2184 case 0x202f:
2185 case 0x205f:
2186 case 0x3000:
2187 return -next != ESC_h;
2188 default:
2189 return -next == ESC_h;
2192 case ESC_v:
2193 case ESC_V:
2194 switch(item)
2196 case 0x0a:
2197 case 0x0b:
2198 case 0x0c:
2199 case 0x0d:
2200 case 0x85:
2201 case 0x2028:
2202 case 0x2029:
2203 return -next != ESC_v;
2204 default:
2205 return -next == ESC_v;
2208 default:
2209 return FALSE;
2212 case OP_DIGIT:
2213 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2214 next == -ESC_h || next == -ESC_v;
2216 case OP_NOT_DIGIT:
2217 return next == -ESC_d;
2219 case OP_WHITESPACE:
2220 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2222 case OP_NOT_WHITESPACE:
2223 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2225 case OP_HSPACE:
2226 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2228 case OP_NOT_HSPACE:
2229 return next == -ESC_h;
2231 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2232 case OP_VSPACE:
2233 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2235 case OP_NOT_VSPACE:
2236 return next == -ESC_v;
2238 case OP_WORDCHAR:
2239 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2241 case OP_NOT_WORDCHAR:
2242 return next == -ESC_w || next == -ESC_d;
2244 default:
2245 return FALSE;
2248 /* Control does not reach here */
2253 /*************************************************
2254 * Compile one branch *
2255 *************************************************/
2257 /* Scan the pattern, compiling it into a vector. If the options are
2258 changed during the branch, the pointer is used to change the external options
2259 bits. This function is used during the pre-compile phase when we are trying
2260 to find out the amount of memory needed, as well as during the real compile
2261 phase. The value of lengthptr distinguishes the two phases.
2263 Arguments:
2264 optionsptr pointer to the option bits
2265 codeptr points to the pointer to the current code point
2266 ptrptr points to the current pattern pointer
2267 errorcodeptr points to error code variable
2268 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2269 reqbyteptr set to the last literal character required, else < 0
2270 bcptr points to current branch chain
2271 cd contains pointers to tables etc.
2272 lengthptr NULL during the real compile phase
2273 points to length accumulator during pre-compile phase
2275 Returns: TRUE on success
2276 FALSE, with *errorcodeptr set non-zero on error
2279 static BOOL
2280 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2281 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2282 compile_data *cd, int *lengthptr)
2284 int repeat_type, op_type;
2285 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2286 int bravalue = 0;
2287 int greedy_default, greedy_non_default;
2288 int firstbyte, reqbyte;
2289 int zeroreqbyte, zerofirstbyte;
2290 int req_caseopt, reqvary, tempreqvary;
2291 int options = *optionsptr;
2292 int after_manual_callout = 0;
2293 int length_prevgroup = 0;
2294 register int c;
2295 register uschar *code = *codeptr;
2296 uschar *last_code = code;
2297 uschar *orig_code = code;
2298 uschar *tempcode;
2299 BOOL inescq = FALSE;
2300 BOOL groupsetfirstbyte = FALSE;
2301 const uschar *ptr = *ptrptr;
2302 const uschar *tempptr;
2303 uschar *previous = NULL;
2304 uschar *previous_callout = NULL;
2305 uschar *save_hwm = NULL;
2306 uschar classbits[32];
2308 #ifdef SUPPORT_UTF8
2309 BOOL class_utf8;
2310 BOOL utf8 = (options & PCRE_UTF8) != 0;
2311 uschar *class_utf8data;
2312 uschar *class_utf8data_base;
2313 uschar utf8_char[6];
2314 #else
2315 BOOL utf8 = FALSE;
2316 uschar *utf8_char = NULL;
2317 #endif
2319 #ifdef DEBUG
2320 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2321 #endif
2323 /* Set up the default and non-default settings for greediness */
2325 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2326 greedy_non_default = greedy_default ^ 1;
2328 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2329 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2330 matches a non-fixed first char; reqbyte just remains unset if we never
2331 find one.
2333 When we hit a repeat whose minimum is zero, we may have to adjust these values
2334 to take the zero repeat into account. This is implemented by setting them to
2335 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2336 item types that can be repeated set these backoff variables appropriately. */
2338 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2340 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2341 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2342 value > 255. It is added into the firstbyte or reqbyte variables to record the
2343 case status of the value. This is used only for ASCII characters. */
2345 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2347 /* Switch on next character until the end of the branch */
2349 for (;; ptr++)
2351 BOOL negate_class;
2352 BOOL should_flip_negation;
2353 BOOL possessive_quantifier;
2354 BOOL is_quantifier;
2355 BOOL is_recurse;
2356 BOOL reset_bracount;
2357 int class_charcount;
2358 int class_lastchar;
2359 int newoptions;
2360 int recno;
2361 int refsign;
2362 int skipbytes;
2363 int subreqbyte;
2364 int subfirstbyte;
2365 int terminator;
2366 int mclength;
2367 uschar mcbuffer[8];
2369 /* Get next byte in the pattern */
2371 c = *ptr;
2373 /* If we are in the pre-compile phase, accumulate the length used for the
2374 previous cycle of this loop. */
2376 if (lengthptr != NULL)
2378 #ifdef DEBUG
2379 if (code > cd->hwm) cd->hwm = code; /* High water info */
2380 #endif
2381 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2383 *errorcodeptr = ERR52;
2384 goto FAILED;
2387 /* There is at least one situation where code goes backwards: this is the
2388 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2389 the class is simply eliminated. However, it is created first, so we have to
2390 allow memory for it. Therefore, don't ever reduce the length at this point.
2393 if (code < last_code) code = last_code;
2395 /* Paranoid check for integer overflow */
2397 if (OFLOW_MAX - *lengthptr < code - last_code)
2399 *errorcodeptr = ERR20;
2400 goto FAILED;
2403 *lengthptr += code - last_code;
2404 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2406 /* If "previous" is set and it is not at the start of the work space, move
2407 it back to there, in order to avoid filling up the work space. Otherwise,
2408 if "previous" is NULL, reset the current code pointer to the start. */
2410 if (previous != NULL)
2412 if (previous > orig_code)
2414 memmove(orig_code, previous, code - previous);
2415 code -= previous - orig_code;
2416 previous = orig_code;
2419 else code = orig_code;
2421 /* Remember where this code item starts so we can pick up the length
2422 next time round. */
2424 last_code = code;
2427 /* In the real compile phase, just check the workspace used by the forward
2428 reference list. */
2430 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2432 *errorcodeptr = ERR52;
2433 goto FAILED;
2436 /* If in \Q...\E, check for the end; if not, we have a literal */
2438 if (inescq && c != 0)
2440 if (c == '\\' && ptr[1] == 'E')
2442 inescq = FALSE;
2443 ptr++;
2444 continue;
2446 else
2448 if (previous_callout != NULL)
2450 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2451 complete_callout(previous_callout, ptr, cd);
2452 previous_callout = NULL;
2454 if ((options & PCRE_AUTO_CALLOUT) != 0)
2456 previous_callout = code;
2457 code = auto_callout(code, ptr, cd);
2459 goto NORMAL_CHAR;
2463 /* Fill in length of a previous callout, except when the next thing is
2464 a quantifier. */
2466 is_quantifier = c == '*' || c == '+' || c == '?' ||
2467 (c == '{' && is_counted_repeat(ptr+1));
2469 if (!is_quantifier && previous_callout != NULL &&
2470 after_manual_callout-- <= 0)
2472 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2473 complete_callout(previous_callout, ptr, cd);
2474 previous_callout = NULL;
2477 /* In extended mode, skip white space and comments */
2479 if ((options & PCRE_EXTENDED) != 0)
2481 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2482 if (c == '#')
2484 while (*(++ptr) != 0)
2486 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2488 if (*ptr != 0) continue;
2490 /* Else fall through to handle end of string */
2491 c = 0;
2495 /* No auto callout for quantifiers. */
2497 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2499 previous_callout = code;
2500 code = auto_callout(code, ptr, cd);
2503 switch(c)
2505 /* ===================================================================*/
2506 case 0: /* The branch terminates at string end */
2507 case '|': /* or | or ) */
2508 case ')':
2509 *firstbyteptr = firstbyte;
2510 *reqbyteptr = reqbyte;
2511 *codeptr = code;
2512 *ptrptr = ptr;
2513 if (lengthptr != NULL)
2515 if (OFLOW_MAX - *lengthptr < code - last_code)
2517 *errorcodeptr = ERR20;
2518 goto FAILED;
2520 *lengthptr += code - last_code; /* To include callout length */
2521 DPRINTF((">> end branch\n"));
2523 return TRUE;
2526 /* ===================================================================*/
2527 /* Handle single-character metacharacters. In multiline mode, ^ disables
2528 the setting of any following char as a first character. */
2530 case '^':
2531 if ((options & PCRE_MULTILINE) != 0)
2533 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2535 previous = NULL;
2536 *code++ = OP_CIRC;
2537 break;
2539 case '$':
2540 previous = NULL;
2541 *code++ = OP_DOLL;
2542 break;
2544 /* There can never be a first char if '.' is first, whatever happens about
2545 repeats. The value of reqbyte doesn't change either. */
2547 case '.':
2548 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2549 zerofirstbyte = firstbyte;
2550 zeroreqbyte = reqbyte;
2551 previous = code;
2552 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2553 break;
2556 /* ===================================================================*/
2557 /* Character classes. If the included characters are all < 256, we build a
2558 32-byte bitmap of the permitted characters, except in the special case
2559 where there is only one such character. For negated classes, we build the
2560 map as usual, then invert it at the end. However, we use a different opcode
2561 so that data characters > 255 can be handled correctly.
2563 If the class contains characters outside the 0-255 range, a different
2564 opcode is compiled. It may optionally have a bit map for characters < 256,
2565 but those above are explicitly listed afterwards. A flag byte tells
2566 whether the bitmap is present, and whether this is a negated class or not.
2568 In JavaScript compatibility mode, an isolated ']' causes an error. In
2569 default (Perl) mode, it is treated as a data character. */
2571 case ']':
2572 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2574 *errorcodeptr = ERR64;
2575 goto FAILED;
2577 goto NORMAL_CHAR;
2579 case '[':
2580 previous = code;
2582 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2583 they are encountered at the top level, so we'll do that too. */
2585 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2586 check_posix_syntax(ptr, &tempptr))
2588 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2589 goto FAILED;
2592 /* If the first character is '^', set the negation flag and skip it. Also,
2593 if the first few characters (either before or after ^) are \Q\E or \E we
2594 skip them too. This makes for compatibility with Perl. */
2596 negate_class = FALSE;
2597 for (;;)
2599 c = *(++ptr);
2600 if (c == '\\')
2602 if (ptr[1] == 'E') ptr++;
2603 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2604 else break;
2606 else if (!negate_class && c == '^')
2607 negate_class = TRUE;
2608 else break;
2611 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2612 an initial ']' is taken as a data character -- the code below handles
2613 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2614 [^] must match any character, so generate OP_ALLANY. */
2616 if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2618 *code++ = negate_class? OP_ALLANY : OP_FAIL;
2619 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2620 zerofirstbyte = firstbyte;
2621 break;
2624 /* If a class contains a negative special such as \S, we need to flip the
2625 negation flag at the end, so that support for characters > 255 works
2626 correctly (they are all included in the class). */
2628 should_flip_negation = FALSE;
2630 /* Keep a count of chars with values < 256 so that we can optimize the case
2631 of just a single character (as long as it's < 256). However, for higher
2632 valued UTF-8 characters, we don't yet do any optimization. */
2634 class_charcount = 0;
2635 class_lastchar = -1;
2637 /* Initialize the 32-char bit map to all zeros. We build the map in a
2638 temporary bit of memory, in case the class contains only 1 character (less
2639 than 256), because in that case the compiled code doesn't use the bit map.
2642 memset(classbits, 0, 32 * sizeof(uschar));
2644 #ifdef SUPPORT_UTF8
2645 class_utf8 = FALSE; /* No chars >= 256 */
2646 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2647 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2648 #endif
2650 /* Process characters until ] is reached. By writing this as a "do" it
2651 means that an initial ] is taken as a data character. At the start of the
2652 loop, c contains the first byte of the character. */
2654 if (c != 0) do
2656 const uschar *oldptr;
2658 #ifdef SUPPORT_UTF8
2659 if (utf8 && c > 127)
2660 { /* Braces are required because the */
2661 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2664 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2665 data and reset the pointer. This is so that very large classes that
2666 contain a zillion UTF-8 characters no longer overwrite the work space
2667 (which is on the stack). */
2669 if (lengthptr != NULL)
2671 *lengthptr += class_utf8data - class_utf8data_base;
2672 class_utf8data = class_utf8data_base;
2675 #endif
2677 /* Inside \Q...\E everything is literal except \E */
2679 if (inescq)
2681 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2683 inescq = FALSE; /* Reset literal state */
2684 ptr++; /* Skip the 'E' */
2685 continue; /* Carry on with next */
2687 goto CHECK_RANGE; /* Could be range if \E follows */
2690 /* Handle POSIX class names. Perl allows a negation extension of the
2691 form [:^name:]. A square bracket that doesn't match the syntax is
2692 treated as a literal. We also recognize the POSIX constructions
2693 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2694 5.6 and 5.8 do. */
2696 if (c == '[' &&
2697 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2698 check_posix_syntax(ptr, &tempptr))
2700 BOOL local_negate = FALSE;
2701 int posix_class, taboffset, tabopt;
2702 register const uschar *cbits = cd->cbits;
2703 uschar pbits[32];
2705 if (ptr[1] != ':')
2707 *errorcodeptr = ERR31;
2708 goto FAILED;
2711 ptr += 2;
2712 if (*ptr == '^')
2714 local_negate = TRUE;
2715 should_flip_negation = TRUE; /* Note negative special */
2716 ptr++;
2719 posix_class = check_posix_name(ptr, tempptr - ptr);
2720 if (posix_class < 0)
2722 *errorcodeptr = ERR30;
2723 goto FAILED;
2726 /* If matching is caseless, upper and lower are converted to
2727 alpha. This relies on the fact that the class table starts with
2728 alpha, lower, upper as the first 3 entries. */
2730 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2731 posix_class = 0;
2733 /* We build the bit map for the POSIX class in a chunk of local store
2734 because we may be adding and subtracting from it, and we don't want to
2735 subtract bits that may be in the main map already. At the end we or the
2736 result into the bit map that is being built. */
2738 posix_class *= 3;
2740 /* Copy in the first table (always present) */
2742 memcpy(pbits, cbits + posix_class_maps[posix_class],
2743 32 * sizeof(uschar));
2745 /* If there is a second table, add or remove it as required. */
2747 taboffset = posix_class_maps[posix_class + 1];
2748 tabopt = posix_class_maps[posix_class + 2];
2750 if (taboffset >= 0)
2752 if (tabopt >= 0)
2753 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2754 else
2755 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2758 /* Now see if we need to remove any special characters. An option
2759 value of 1 removes vertical space and 2 removes underscore. */
2761 if (tabopt < 0) tabopt = -tabopt;
2762 if (tabopt == 1) pbits[1] &= ~0x3c;
2763 else if (tabopt == 2) pbits[11] &= 0x7f;
2765 /* Add the POSIX table or its complement into the main table that is
2766 being built and we are done. */
2768 if (local_negate)
2769 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2770 else
2771 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2773 ptr = tempptr + 1;
2774 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2775 continue; /* End of POSIX syntax handling */
2778 /* Backslash may introduce a single character, or it may introduce one
2779 of the specials, which just set a flag. The sequence \b is a special
2780 case. Inside a class (and only there) it is treated as backspace.
2781 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2782 to 'or' into the one we are building. We assume they have more than one
2783 character in them, so set class_charcount bigger than one. */
2785 if (c == '\\')
2787 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2788 if (*errorcodeptr != 0) goto FAILED;
2790 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2791 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2792 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2793 else if (-c == ESC_Q) /* Handle start of quoted string */
2795 if (ptr[1] == '\\' && ptr[2] == 'E')
2797 ptr += 2; /* avoid empty string */
2799 else inescq = TRUE;
2800 continue;
2802 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2804 if (c < 0)
2806 register const uschar *cbits = cd->cbits;
2807 class_charcount += 2; /* Greater than 1 is what matters */
2809 /* Save time by not doing this in the pre-compile phase. */
2811 if (lengthptr == NULL) switch (-c)
2813 case ESC_d:
2814 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2815 continue;
2817 case ESC_D:
2818 should_flip_negation = TRUE;
2819 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2820 continue;
2822 case ESC_w:
2823 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2824 continue;
2826 case ESC_W:
2827 should_flip_negation = TRUE;
2828 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2829 continue;
2831 case ESC_s:
2832 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2833 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2834 continue;
2836 case ESC_S:
2837 should_flip_negation = TRUE;
2838 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2839 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2840 continue;
2842 default: /* Not recognized; fall through */
2843 break; /* Need "default" setting to stop compiler warning. */
2846 /* In the pre-compile phase, just do the recognition. */
2848 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2849 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2851 /* We need to deal with \H, \h, \V, and \v in both phases because
2852 they use extra memory. */
2854 if (-c == ESC_h)
2856 SETBIT(classbits, 0x09); /* VT */
2857 SETBIT(classbits, 0x20); /* SPACE */
2858 SETBIT(classbits, 0xa0); /* NSBP */
2859 #ifdef SUPPORT_UTF8
2860 if (utf8)
2862 class_utf8 = TRUE;
2863 *class_utf8data++ = XCL_SINGLE;
2864 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2865 *class_utf8data++ = XCL_SINGLE;
2866 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2867 *class_utf8data++ = XCL_RANGE;
2868 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2869 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2870 *class_utf8data++ = XCL_SINGLE;
2871 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2872 *class_utf8data++ = XCL_SINGLE;
2873 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2874 *class_utf8data++ = XCL_SINGLE;
2875 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2877 #endif
2878 continue;
2881 if (-c == ESC_H)
2883 for (c = 0; c < 32; c++)
2885 int x = 0xff;
2886 switch (c)
2888 case 0x09/8: x ^= 1 << (0x09%8); break;
2889 case 0x20/8: x ^= 1 << (0x20%8); break;
2890 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2891 default: break;
2893 classbits[c] |= x;
2896 #ifdef SUPPORT_UTF8
2897 if (utf8)
2899 class_utf8 = TRUE;
2900 *class_utf8data++ = XCL_RANGE;
2901 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2902 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2903 *class_utf8data++ = XCL_RANGE;
2904 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2905 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2906 *class_utf8data++ = XCL_RANGE;
2907 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2908 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2909 *class_utf8data++ = XCL_RANGE;
2910 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2911 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2912 *class_utf8data++ = XCL_RANGE;
2913 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2914 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2915 *class_utf8data++ = XCL_RANGE;
2916 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2917 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2918 *class_utf8data++ = XCL_RANGE;
2919 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2920 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2922 #endif
2923 continue;
2926 if (-c == ESC_v)
2928 SETBIT(classbits, 0x0a); /* LF */
2929 SETBIT(classbits, 0x0b); /* VT */
2930 SETBIT(classbits, 0x0c); /* FF */
2931 SETBIT(classbits, 0x0d); /* CR */
2932 SETBIT(classbits, 0x85); /* NEL */
2933 #ifdef SUPPORT_UTF8
2934 if (utf8)
2936 class_utf8 = TRUE;
2937 *class_utf8data++ = XCL_RANGE;
2938 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2939 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2941 #endif
2942 continue;
2945 if (-c == ESC_V)
2947 for (c = 0; c < 32; c++)
2949 int x = 0xff;
2950 switch (c)
2952 case 0x0a/8: x ^= 1 << (0x0a%8);
2953 x ^= 1 << (0x0b%8);
2954 x ^= 1 << (0x0c%8);
2955 x ^= 1 << (0x0d%8);
2956 break;
2957 case 0x85/8: x ^= 1 << (0x85%8); break;
2958 default: break;
2960 classbits[c] |= x;
2963 #ifdef SUPPORT_UTF8
2964 if (utf8)
2966 class_utf8 = TRUE;
2967 *class_utf8data++ = XCL_RANGE;
2968 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2969 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2970 *class_utf8data++ = XCL_RANGE;
2971 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2972 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2974 #endif
2975 continue;
2978 /* We need to deal with \P and \p in both phases. */
2980 #ifdef SUPPORT_UCP
2981 if (-c == ESC_p || -c == ESC_P)
2983 BOOL negated;
2984 int pdata;
2985 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2986 if (ptype < 0) goto FAILED;
2987 class_utf8 = TRUE;
2988 *class_utf8data++ = ((-c == ESC_p) != negated)?
2989 XCL_PROP : XCL_NOTPROP;
2990 *class_utf8data++ = ptype;
2991 *class_utf8data++ = pdata;
2992 class_charcount -= 2; /* Not a < 256 character */
2993 continue;
2995 #endif
2996 /* Unrecognized escapes are faulted if PCRE is running in its
2997 strict mode. By default, for compatibility with Perl, they are
2998 treated as literals. */
3000 if ((options & PCRE_EXTRA) != 0)
3002 *errorcodeptr = ERR7;
3003 goto FAILED;
3006 class_charcount -= 2; /* Undo the default count from above */
3007 c = *ptr; /* Get the final character and fall through */
3010 /* Fall through if we have a single character (c >= 0). This may be
3011 greater than 256 in UTF-8 mode. */
3013 } /* End of backslash handling */
3015 /* A single character may be followed by '-' to form a range. However,
3016 Perl does not permit ']' to be the end of the range. A '-' character
3017 at the end is treated as a literal. Perl ignores orphaned \E sequences
3018 entirely. The code for handling \Q and \E is messy. */
3020 CHECK_RANGE:
3021 while (ptr[1] == '\\' && ptr[2] == 'E')
3023 inescq = FALSE;
3024 ptr += 2;
3027 oldptr = ptr;
3029 /* Remember \r or \n */
3031 if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3033 /* Check for range */
3035 if (!inescq && ptr[1] == '-')
3037 int d;
3038 ptr += 2;
3039 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3041 /* If we hit \Q (not followed by \E) at this point, go into escaped
3042 mode. */
3044 while (*ptr == '\\' && ptr[1] == 'Q')
3046 ptr += 2;
3047 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3048 inescq = TRUE;
3049 break;
3052 if (*ptr == 0 || (!inescq && *ptr == ']'))
3054 ptr = oldptr;
3055 goto LONE_SINGLE_CHARACTER;
3058 #ifdef SUPPORT_UTF8
3059 if (utf8)
3060 { /* Braces are required because the */
3061 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3063 else
3064 #endif
3065 d = *ptr; /* Not UTF-8 mode */
3067 /* The second part of a range can be a single-character escape, but
3068 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3069 in such circumstances. */
3071 if (!inescq && d == '\\')
3073 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3074 if (*errorcodeptr != 0) goto FAILED;
3076 /* \b is backspace; \X is literal X; \R is literal R; any other
3077 special means the '-' was literal */
3079 if (d < 0)
3081 if (d == -ESC_b) d = '\b';
3082 else if (d == -ESC_X) d = 'X';
3083 else if (d == -ESC_R) d = 'R'; else
3085 ptr = oldptr;
3086 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3091 /* Check that the two values are in the correct order. Optimize
3092 one-character ranges */
3094 if (d < c)
3096 *errorcodeptr = ERR8;
3097 goto FAILED;
3100 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3102 /* Remember \r or \n */
3104 if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3106 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3107 matching, we have to use an XCLASS with extra data items. Caseless
3108 matching for characters > 127 is available only if UCP support is
3109 available. */
3111 #ifdef SUPPORT_UTF8
3112 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3114 class_utf8 = TRUE;
3116 /* With UCP support, we can find the other case equivalents of
3117 the relevant characters. There may be several ranges. Optimize how
3118 they fit with the basic range. */
3120 #ifdef SUPPORT_UCP
3121 if ((options & PCRE_CASELESS) != 0)
3123 unsigned int occ, ocd;
3124 unsigned int cc = c;
3125 unsigned int origd = d;
3126 while (get_othercase_range(&cc, origd, &occ, &ocd))
3128 if (occ >= (unsigned int)c &&
3129 ocd <= (unsigned int)d)
3130 continue; /* Skip embedded ranges */
3132 if (occ < (unsigned int)c &&
3133 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3134 { /* if there is overlap, */
3135 c = occ; /* noting that if occ < c */
3136 continue; /* we can't have ocd > d */
3137 } /* because a subrange is */
3138 if (ocd > (unsigned int)d &&
3139 occ <= (unsigned int)d + 1) /* always shorter than */
3140 { /* the basic range. */
3141 d = ocd;
3142 continue;
3145 if (occ == ocd)
3147 *class_utf8data++ = XCL_SINGLE;
3149 else
3151 *class_utf8data++ = XCL_RANGE;
3152 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3154 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3157 #endif /* SUPPORT_UCP */
3159 /* Now record the original range, possibly modified for UCP caseless
3160 overlapping ranges. */
3162 *class_utf8data++ = XCL_RANGE;
3163 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3164 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3166 /* With UCP support, we are done. Without UCP support, there is no
3167 caseless matching for UTF-8 characters > 127; we can use the bit map
3168 for the smaller ones. */
3170 #ifdef SUPPORT_UCP
3171 continue; /* With next character in the class */
3172 #else
3173 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3175 /* Adjust upper limit and fall through to set up the map */
3177 d = 127;
3179 #endif /* SUPPORT_UCP */
3181 #endif /* SUPPORT_UTF8 */
3183 /* We use the bit map for all cases when not in UTF-8 mode; else
3184 ranges that lie entirely within 0-127 when there is UCP support; else
3185 for partial ranges without UCP support. */
3187 class_charcount += d - c + 1;
3188 class_lastchar = d;
3190 /* We can save a bit of time by skipping this in the pre-compile. */
3192 if (lengthptr == NULL) for (; c <= d; c++)
3194 classbits[c/8] |= (1 << (c&7));
3195 if ((options & PCRE_CASELESS) != 0)
3197 int uc = cd->fcc[c]; /* flip case */
3198 classbits[uc/8] |= (1 << (uc&7));
3202 continue; /* Go get the next char in the class */
3205 /* Handle a lone single character - we can get here for a normal
3206 non-escape char, or after \ that introduces a single character or for an
3207 apparent range that isn't. */
3209 LONE_SINGLE_CHARACTER:
3211 /* Handle a character that cannot go in the bit map */
3213 #ifdef SUPPORT_UTF8
3214 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3216 class_utf8 = TRUE;
3217 *class_utf8data++ = XCL_SINGLE;
3218 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3220 #ifdef SUPPORT_UCP
3221 if ((options & PCRE_CASELESS) != 0)
3223 unsigned int othercase;
3224 if ((othercase = UCD_OTHERCASE(c)) != c)
3226 *class_utf8data++ = XCL_SINGLE;
3227 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3230 #endif /* SUPPORT_UCP */
3233 else
3234 #endif /* SUPPORT_UTF8 */
3236 /* Handle a single-byte character */
3238 classbits[c/8] |= (1 << (c&7));
3239 if ((options & PCRE_CASELESS) != 0)
3241 c = cd->fcc[c]; /* flip case */
3242 classbits[c/8] |= (1 << (c&7));
3244 class_charcount++;
3245 class_lastchar = c;
3249 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3251 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3253 if (c == 0) /* Missing terminating ']' */
3255 *errorcodeptr = ERR6;
3256 goto FAILED;
3260 /* This code has been disabled because it would mean that \s counts as
3261 an explicit \r or \n reference, and that's not really what is wanted. Now
3262 we set the flag only if there is a literal "\r" or "\n" in the class. */
3264 #if 0
3265 /* Remember whether \r or \n are in this class */
3267 if (negate_class)
3269 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3271 else
3273 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3275 #endif
3278 /* If class_charcount is 1, we saw precisely one character whose value is
3279 less than 256. As long as there were no characters >= 128 and there was no
3280 use of \p or \P, in other words, no use of any XCLASS features, we can
3281 optimize.
3283 In UTF-8 mode, we can optimize the negative case only if there were no
3284 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3285 operate on single-bytes only. This is an historical hangover. Maybe one day
3286 we can tidy these opcodes to handle multi-byte characters.
3288 The optimization throws away the bit map. We turn the item into a
3289 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3290 that OP_NOT does not support multibyte characters. In the positive case, it
3291 can cause firstbyte to be set. Otherwise, there can be no first char if
3292 this item is first, whatever repeat count may follow. In the case of
3293 reqbyte, save the previous value for reinstating. */
3295 #ifdef SUPPORT_UTF8
3296 if (class_charcount == 1 && !class_utf8 &&
3297 (!utf8 || !negate_class || class_lastchar < 128))
3298 #else
3299 if (class_charcount == 1)
3300 #endif
3302 zeroreqbyte = reqbyte;
3304 /* The OP_NOT opcode works on one-byte characters only. */
3306 if (negate_class)
3308 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3309 zerofirstbyte = firstbyte;
3310 *code++ = OP_NOT;
3311 *code++ = class_lastchar;
3312 break;
3315 /* For a single, positive character, get the value into mcbuffer, and
3316 then we can handle this with the normal one-character code. */
3318 #ifdef SUPPORT_UTF8
3319 if (utf8 && class_lastchar > 127)
3320 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3321 else
3322 #endif
3324 mcbuffer[0] = class_lastchar;
3325 mclength = 1;
3327 goto ONE_CHAR;
3328 } /* End of 1-char optimization */
3330 /* The general case - not the one-char optimization. If this is the first
3331 thing in the branch, there can be no first char setting, whatever the
3332 repeat count. Any reqbyte setting must remain unchanged after any kind of
3333 repeat. */
3335 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3336 zerofirstbyte = firstbyte;
3337 zeroreqbyte = reqbyte;
3339 /* If there are characters with values > 255, we have to compile an
3340 extended class, with its own opcode, unless there was a negated special
3341 such as \S in the class, because in that case all characters > 255 are in
3342 the class, so any that were explicitly given as well can be ignored. If
3343 (when there are explicit characters > 255 that must be listed) there are no
3344 characters < 256, we can omit the bitmap in the actual compiled code. */
3346 #ifdef SUPPORT_UTF8
3347 if (class_utf8 && !should_flip_negation)
3349 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3350 *code++ = OP_XCLASS;
3351 code += LINK_SIZE;
3352 *code = negate_class? XCL_NOT : 0;
3354 /* If the map is required, move up the extra data to make room for it;
3355 otherwise just move the code pointer to the end of the extra data. */
3357 if (class_charcount > 0)
3359 *code++ |= XCL_MAP;
3360 memmove(code + 32, code, class_utf8data - code);
3361 memcpy(code, classbits, 32);
3362 code = class_utf8data + 32;
3364 else code = class_utf8data;
3366 /* Now fill in the complete length of the item */
3368 PUT(previous, 1, code - previous);
3369 break; /* End of class handling */
3371 #endif
3373 /* If there are no characters > 255, set the opcode to OP_CLASS or
3374 OP_NCLASS, depending on whether the whole class was negated and whether
3375 there were negative specials such as \S in the class. Then copy the 32-byte
3376 map into the code vector, negating it if necessary. */
3378 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3379 if (negate_class)
3381 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3382 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3384 else
3386 memcpy(code, classbits, 32);
3388 code += 32;
3389 break;
3392 /* ===================================================================*/
3393 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3394 has been tested above. */
3396 case '{':
3397 if (!is_quantifier) goto NORMAL_CHAR;
3398 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3399 if (*errorcodeptr != 0) goto FAILED;
3400 goto REPEAT;
3402 case '*':
3403 repeat_min = 0;
3404 repeat_max = -1;
3405 goto REPEAT;
3407 case '+':
3408 repeat_min = 1;
3409 repeat_max = -1;
3410 goto REPEAT;
3412 case '?':
3413 repeat_min = 0;
3414 repeat_max = 1;
3416 REPEAT:
3417 if (previous == NULL)
3419 *errorcodeptr = ERR9;
3420 goto FAILED;
3423 if (repeat_min == 0)
3425 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3426 reqbyte = zeroreqbyte; /* Ditto */
3429 /* Remember whether this is a variable length repeat */
3431 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3433 op_type = 0; /* Default single-char op codes */
3434 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3436 /* Save start of previous item, in case we have to move it up to make space
3437 for an inserted OP_ONCE for the additional '+' extension. */
3439 tempcode = previous;
3441 /* If the next character is '+', we have a possessive quantifier. This
3442 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3443 If the next character is '?' this is a minimizing repeat, by default,
3444 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3445 repeat type to the non-default. */
3447 if (ptr[1] == '+')
3449 repeat_type = 0; /* Force greedy */
3450 possessive_quantifier = TRUE;
3451 ptr++;
3453 else if (ptr[1] == '?')
3455 repeat_type = greedy_non_default;
3456 ptr++;
3458 else repeat_type = greedy_default;
3460 /* If previous was a character match, abolish the item and generate a
3461 repeat item instead. If a char item has a minimum of more than one, ensure
3462 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3463 the first thing in a branch because the x will have gone into firstbyte
3464 instead. */
3466 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3468 /* Deal with UTF-8 characters that take up more than one byte. It's
3469 easier to write this out separately than try to macrify it. Use c to
3470 hold the length of the character in bytes, plus 0x80 to flag that it's a
3471 length rather than a small character. */
3473 #ifdef SUPPORT_UTF8
3474 if (utf8 && (code[-1] & 0x80) != 0)
3476 uschar *lastchar = code - 1;
3477 while((*lastchar & 0xc0) == 0x80) lastchar--;
3478 c = code - lastchar; /* Length of UTF-8 character */
3479 memcpy(utf8_char, lastchar, c); /* Save the char */
3480 c |= 0x80; /* Flag c as a length */
3482 else
3483 #endif
3485 /* Handle the case of a single byte - either with no UTF8 support, or
3486 with UTF-8 disabled, or for a UTF-8 character < 128. */
3489 c = code[-1];
3490 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3493 /* If the repetition is unlimited, it pays to see if the next thing on
3494 the line is something that cannot possibly match this character. If so,
3495 automatically possessifying this item gains some performance in the case
3496 where the match fails. */
3498 if (!possessive_quantifier &&
3499 repeat_max < 0 &&
3500 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3501 options, cd))
3503 repeat_type = 0; /* Force greedy */
3504 possessive_quantifier = TRUE;
3507 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3510 /* If previous was a single negated character ([^a] or similar), we use
3511 one of the special opcodes, replacing it. The code is shared with single-
3512 character repeats by setting op_type to add a suitable offset into
3513 repeat_type. We can also test for auto-possessification. OP_NOT is
3514 currently used only for single-byte chars. */
3516 else if (*previous == OP_NOT)
3518 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3519 c = previous[1];
3520 if (!possessive_quantifier &&
3521 repeat_max < 0 &&
3522 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3524 repeat_type = 0; /* Force greedy */
3525 possessive_quantifier = TRUE;
3527 goto OUTPUT_SINGLE_REPEAT;
3530 /* If previous was a character type match (\d or similar), abolish it and
3531 create a suitable repeat item. The code is shared with single-character
3532 repeats by setting op_type to add a suitable offset into repeat_type. Note
3533 that the Unicode property types will be present only when SUPPORT_UCP is
3534 defined, but we don't wrap the little bits of code here because it just
3535 makes it horribly messy. */
3537 else if (*previous < OP_EODN)
3539 uschar *oldcode;
3540 int prop_type, prop_value;
3541 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3542 c = *previous;
3544 if (!possessive_quantifier &&
3545 repeat_max < 0 &&
3546 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3548 repeat_type = 0; /* Force greedy */
3549 possessive_quantifier = TRUE;
3552 OUTPUT_SINGLE_REPEAT:
3553 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3555 prop_type = previous[1];
3556 prop_value = previous[2];
3558 else prop_type = prop_value = -1;
3560 oldcode = code;
3561 code = previous; /* Usually overwrite previous item */
3563 /* If the maximum is zero then the minimum must also be zero; Perl allows
3564 this case, so we do too - by simply omitting the item altogether. */
3566 if (repeat_max == 0) goto END_REPEAT;
3568 /* All real repeats make it impossible to handle partial matching (maybe
3569 one day we will be able to remove this restriction). */
3571 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3573 /* Combine the op_type with the repeat_type */
3575 repeat_type += op_type;
3577 /* A minimum of zero is handled either as the special case * or ?, or as
3578 an UPTO, with the maximum given. */
3580 if (repeat_min == 0)
3582 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3583 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3584 else
3586 *code++ = OP_UPTO + repeat_type;
3587 PUT2INC(code, 0, repeat_max);
3591 /* A repeat minimum of 1 is optimized into some special cases. If the
3592 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3593 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3594 one less than the maximum. */
3596 else if (repeat_min == 1)
3598 if (repeat_max == -1)
3599 *code++ = OP_PLUS + repeat_type;
3600 else
3602 code = oldcode; /* leave previous item in place */
3603 if (repeat_max == 1) goto END_REPEAT;
3604 *code++ = OP_UPTO + repeat_type;
3605 PUT2INC(code, 0, repeat_max - 1);
3609 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3610 handled as an EXACT followed by an UPTO. */
3612 else
3614 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3615 PUT2INC(code, 0, repeat_min);
3617 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3618 we have to insert the character for the previous code. For a repeated
3619 Unicode property match, there are two extra bytes that define the
3620 required property. In UTF-8 mode, long characters have their length in
3621 c, with the 0x80 bit as a flag. */
3623 if (repeat_max < 0)
3625 #ifdef SUPPORT_UTF8
3626 if (utf8 && c >= 128)
3628 memcpy(code, utf8_char, c & 7);
3629 code += c & 7;
3631 else
3632 #endif
3634 *code++ = c;
3635 if (prop_type >= 0)
3637 *code++ = prop_type;
3638 *code++ = prop_value;
3641 *code++ = OP_STAR + repeat_type;
3644 /* Else insert an UPTO if the max is greater than the min, again
3645 preceded by the character, for the previously inserted code. If the
3646 UPTO is just for 1 instance, we can use QUERY instead. */
3648 else if (repeat_max != repeat_min)
3650 #ifdef SUPPORT_UTF8
3651 if (utf8 && c >= 128)
3653 memcpy(code, utf8_char, c & 7);
3654 code += c & 7;
3656 else
3657 #endif
3658 *code++ = c;
3659 if (prop_type >= 0)
3661 *code++ = prop_type;
3662 *code++ = prop_value;
3664 repeat_max -= repeat_min;
3666 if (repeat_max == 1)
3668 *code++ = OP_QUERY + repeat_type;
3670 else
3672 *code++ = OP_UPTO + repeat_type;
3673 PUT2INC(code, 0, repeat_max);
3678 /* The character or character type itself comes last in all cases. */
3680 #ifdef SUPPORT_UTF8
3681 if (utf8 && c >= 128)
3683 memcpy(code, utf8_char, c & 7);
3684 code += c & 7;
3686 else
3687 #endif
3688 *code++ = c;
3690 /* For a repeated Unicode property match, there are two extra bytes that
3691 define the required property. */
3693 #ifdef SUPPORT_UCP
3694 if (prop_type >= 0)
3696 *code++ = prop_type;
3697 *code++ = prop_value;
3699 #endif
3702 /* If previous was a character class or a back reference, we put the repeat
3703 stuff after it, but just skip the item if the repeat was {0,0}. */
3705 else if (*previous == OP_CLASS ||
3706 *previous == OP_NCLASS ||
3707 #ifdef SUPPORT_UTF8
3708 *previous == OP_XCLASS ||
3709 #endif
3710 *previous == OP_REF)
3712 if (repeat_max == 0)
3714 code = previous;
3715 goto END_REPEAT;
3718 /* All real repeats make it impossible to handle partial matching (maybe
3719 one day we will be able to remove this restriction). */
3721 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3723 if (repeat_min == 0 && repeat_max == -1)
3724 *code++ = OP_CRSTAR + repeat_type;
3725 else if (repeat_min == 1 && repeat_max == -1)
3726 *code++ = OP_CRPLUS + repeat_type;
3727 else if (repeat_min == 0 && repeat_max == 1)
3728 *code++ = OP_CRQUERY + repeat_type;
3729 else
3731 *code++ = OP_CRRANGE + repeat_type;
3732 PUT2INC(code, 0, repeat_min);
3733 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3734 PUT2INC(code, 0, repeat_max);
3738 /* If previous was a bracket group, we may have to replicate it in certain
3739 cases. */
3741 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3742 *previous == OP_ONCE || *previous == OP_COND)
3744 register int i;
3745 int ketoffset = 0;
3746 int len = code - previous;
3747 uschar *bralink = NULL;
3749 /* Repeating a DEFINE group is pointless */
3751 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3753 *errorcodeptr = ERR55;
3754 goto FAILED;
3757 /* If the maximum repeat count is unlimited, find the end of the bracket
3758 by scanning through from the start, and compute the offset back to it
3759 from the current code pointer. There may be an OP_OPT setting following
3760 the final KET, so we can't find the end just by going back from the code
3761 pointer. */
3763 if (repeat_max == -1)
3765 register uschar *ket = previous;
3766 do ket += GET(ket, 1); while (*ket != OP_KET);
3767 ketoffset = code - ket;
3770 /* The case of a zero minimum is special because of the need to stick
3771 OP_BRAZERO in front of it, and because the group appears once in the
3772 data, whereas in other cases it appears the minimum number of times. For
3773 this reason, it is simplest to treat this case separately, as otherwise
3774 the code gets far too messy. There are several special subcases when the
3775 minimum is zero. */
3777 if (repeat_min == 0)
3779 /* If the maximum is also zero, we used to just omit the group from the
3780 output altogether, like this:
3782 ** if (repeat_max == 0)
3783 ** {
3784 ** code = previous;
3785 ** goto END_REPEAT;
3786 ** }
3788 However, that fails when a group is referenced as a subroutine from
3789 elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3790 so that it is skipped on execution. As we don't have a list of which
3791 groups are referenced, we cannot do this selectively.
3793 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3794 and do no more at this point. However, we do need to adjust any
3795 OP_RECURSE calls inside the group that refer to the group itself or any
3796 internal or forward referenced group, because the offset is from the
3797 start of the whole regex. Temporarily terminate the pattern while doing
3798 this. */
3800 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
3802 *code = OP_END;
3803 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3804 memmove(previous+1, previous, len);
3805 code++;
3806 if (repeat_max == 0)
3808 *previous++ = OP_SKIPZERO;
3809 goto END_REPEAT;
3811 *previous++ = OP_BRAZERO + repeat_type;
3814 /* If the maximum is greater than 1 and limited, we have to replicate
3815 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3816 The first one has to be handled carefully because it's the original
3817 copy, which has to be moved up. The remainder can be handled by code
3818 that is common with the non-zero minimum case below. We have to
3819 adjust the value of repeat_max, since one less copy is required. Once
3820 again, we may have to adjust any OP_RECURSE calls inside the group. */
3822 else
3824 int offset;
3825 *code = OP_END;
3826 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3827 memmove(previous + 2 + LINK_SIZE, previous, len);
3828 code += 2 + LINK_SIZE;
3829 *previous++ = OP_BRAZERO + repeat_type;
3830 *previous++ = OP_BRA;
3832 /* We chain together the bracket offset fields that have to be
3833 filled in later when the ends of the brackets are reached. */
3835 offset = (bralink == NULL)? 0 : previous - bralink;
3836 bralink = previous;
3837 PUTINC(previous, 0, offset);
3840 repeat_max--;
3843 /* If the minimum is greater than zero, replicate the group as many
3844 times as necessary, and adjust the maximum to the number of subsequent
3845 copies that we need. If we set a first char from the group, and didn't
3846 set a required char, copy the latter from the former. If there are any
3847 forward reference subroutine calls in the group, there will be entries on
3848 the workspace list; replicate these with an appropriate increment. */
3850 else
3852 if (repeat_min > 1)
3854 /* In the pre-compile phase, we don't actually do the replication. We
3855 just adjust the length as if we had. Do some paranoid checks for
3856 potential integer overflow. */
3858 if (lengthptr != NULL)
3860 int delta = (repeat_min - 1)*length_prevgroup;
3861 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3862 (double)INT_MAX ||
3863 OFLOW_MAX - *lengthptr < delta)
3865 *errorcodeptr = ERR20;
3866 goto FAILED;
3868 *lengthptr += delta;
3871 /* This is compiling for real */
3873 else
3875 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3876 for (i = 1; i < repeat_min; i++)
3878 uschar *hc;
3879 uschar *this_hwm = cd->hwm;
3880 memcpy(code, previous, len);
3881 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3883 PUT(cd->hwm, 0, GET(hc, 0) + len);
3884 cd->hwm += LINK_SIZE;
3886 save_hwm = this_hwm;
3887 code += len;
3892 if (repeat_max > 0) repeat_max -= repeat_min;
3895 /* This code is common to both the zero and non-zero minimum cases. If
3896 the maximum is limited, it replicates the group in a nested fashion,
3897 remembering the bracket starts on a stack. In the case of a zero minimum,
3898 the first one was set up above. In all cases the repeat_max now specifies
3899 the number of additional copies needed. Again, we must remember to
3900 replicate entries on the forward reference list. */
3902 if (repeat_max >= 0)
3904 /* In the pre-compile phase, we don't actually do the replication. We
3905 just adjust the length as if we had. For each repetition we must add 1
3906 to the length for BRAZERO and for all but the last repetition we must
3907 add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
3908 paranoid checks to avoid integer overflow. */
3910 if (lengthptr != NULL && repeat_max > 0)
3912 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3913 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3914 if ((double)repeat_max *
3915 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3916 > (double)INT_MAX ||
3917 OFLOW_MAX - *lengthptr < delta)
3919 *errorcodeptr = ERR20;
3920 goto FAILED;
3922 *lengthptr += delta;
3925 /* This is compiling for real */
3927 else for (i = repeat_max - 1; i >= 0; i--)
3929 uschar *hc;
3930 uschar *this_hwm = cd->hwm;
3932 *code++ = OP_BRAZERO + repeat_type;
3934 /* All but the final copy start a new nesting, maintaining the
3935 chain of brackets outstanding. */
3937 if (i != 0)
3939 int offset;
3940 *code++ = OP_BRA;
3941 offset = (bralink == NULL)? 0 : code - bralink;
3942 bralink = code;
3943 PUTINC(code, 0, offset);
3946 memcpy(code, previous, len);
3947 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3949 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3950 cd->hwm += LINK_SIZE;
3952 save_hwm = this_hwm;
3953 code += len;
3956 /* Now chain through the pending brackets, and fill in their length
3957 fields (which are holding the chain links pro tem). */
3959 while (bralink != NULL)
3961 int oldlinkoffset;
3962 int offset = code - bralink + 1;
3963 uschar *bra = code - offset;
3964 oldlinkoffset = GET(bra, 1);
3965 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3966 *code++ = OP_KET;
3967 PUTINC(code, 0, offset);
3968 PUT(bra, 1, offset);
3972 /* If the maximum is unlimited, set a repeater in the final copy. We
3973 can't just offset backwards from the current code point, because we
3974 don't know if there's been an options resetting after the ket. The
3975 correct offset was computed above.
3977 Then, when we are doing the actual compile phase, check to see whether
3978 this group is a non-atomic one that could match an empty string. If so,
3979 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3980 that runtime checking can be done. [This check is also applied to
3981 atomic groups at runtime, but in a different way.] */
3983 else
3985 uschar *ketcode = code - ketoffset;
3986 uschar *bracode = ketcode - GET(ketcode, 1);
3987 *ketcode = OP_KETRMAX + repeat_type;
3988 if (lengthptr == NULL && *bracode != OP_ONCE)
3990 uschar *scode = bracode;
3993 if (could_be_empty_branch(scode, ketcode, utf8))
3995 *bracode += OP_SBRA - OP_BRA;
3996 break;
3998 scode += GET(scode, 1);
4000 while (*scode == OP_ALT);
4005 /* If previous is OP_FAIL, it was generated by an empty class [] in
4006 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4007 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4008 error above. We can just ignore the repeat in JS case. */
4010 else if (*previous == OP_FAIL) goto END_REPEAT;
4012 /* Else there's some kind of shambles */
4014 else
4016 *errorcodeptr = ERR11;
4017 goto FAILED;
4020 /* If the character following a repeat is '+', or if certain optimization
4021 tests above succeeded, possessive_quantifier is TRUE. For some of the
4022 simpler opcodes, there is a special alternative opcode for this. For
4023 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4024 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4025 but the special opcodes can optimize it a bit. The repeated item starts at
4026 tempcode, not at previous, which might be the first part of a string whose
4027 (former) last char we repeated.
4029 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4030 an 'upto' may follow. We skip over an 'exact' item, and then test the
4031 length of what remains before proceeding. */
4033 if (possessive_quantifier)
4035 int len;
4036 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4037 *tempcode == OP_NOTEXACT)
4038 tempcode += _pcre_OP_lengths[*tempcode] +
4039 ((*tempcode == OP_TYPEEXACT &&
4040 (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4041 len = code - tempcode;
4042 if (len > 0) switch (*tempcode)
4044 case OP_STAR: *tempcode = OP_POSSTAR; break;
4045 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4046 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4047 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4049 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4050 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4051 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4052 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4054 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4055 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4056 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4057 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4059 default:
4060 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4061 code += 1 + LINK_SIZE;
4062 len += 1 + LINK_SIZE;
4063 tempcode[0] = OP_ONCE;
4064 *code++ = OP_KET;
4065 PUTINC(code, 0, len);
4066 PUT(tempcode, 1, len);
4067 break;
4071 /* In all cases we no longer have a previous item. We also set the
4072 "follows varying string" flag for subsequently encountered reqbytes if
4073 it isn't already set and we have just passed a varying length item. */
4075 END_REPEAT:
4076 previous = NULL;
4077 cd->req_varyopt |= reqvary;
4078 break;
4081 /* ===================================================================*/
4082 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4083 lookbehind or option setting or condition or all the other extended
4084 parenthesis forms. */
4086 case '(':
4087 newoptions = options;
4088 skipbytes = 0;
4089 bravalue = OP_CBRA;
4090 save_hwm = cd->hwm;
4091 reset_bracount = FALSE;
4093 /* First deal with various "verbs" that can be introduced by '*'. */
4095 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4097 int i, namelen;
4098 const char *vn = verbnames;
4099 const uschar *name = ++ptr;
4100 previous = NULL;
4101 while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4102 if (*ptr == ':')
4104 *errorcodeptr = ERR59; /* Not supported */
4105 goto FAILED;
4107 if (*ptr != ')')
4109 *errorcodeptr = ERR60;
4110 goto FAILED;
4112 namelen = ptr - name;
4113 for (i = 0; i < verbcount; i++)
4115 if (namelen == verbs[i].len &&
4116 strncmp((char *)name, vn, namelen) == 0)
4118 *code = verbs[i].op;
4119 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4120 break;
4122 vn += verbs[i].len + 1;
4124 if (i < verbcount) continue;
4125 *errorcodeptr = ERR60;
4126 goto FAILED;
4129 /* Deal with the extended parentheses; all are introduced by '?', and the
4130 appearance of any of them means that this is not a capturing group. */
4132 else if (*ptr == '?')
4134 int i, set, unset, namelen;
4135 int *optset;
4136 const uschar *name;
4137 uschar *slot;
4139 switch (*(++ptr))
4141 case '#': /* Comment; skip to ket */
4142 ptr++;
4143 while (*ptr != 0 && *ptr != ')') ptr++;
4144 if (*ptr == 0)
4146 *errorcodeptr = ERR18;
4147 goto FAILED;
4149 continue;
4152 /* ------------------------------------------------------------ */
4153 case '|': /* Reset capture count for each branch */
4154 reset_bracount = TRUE;
4155 /* Fall through */
4157 /* ------------------------------------------------------------ */
4158 case ':': /* Non-capturing bracket */
4159 bravalue = OP_BRA;
4160 ptr++;
4161 break;
4164 /* ------------------------------------------------------------ */
4165 case '(':
4166 bravalue = OP_COND; /* Conditional group */
4168 /* A condition can be an assertion, a number (referring to a numbered
4169 group), a name (referring to a named group), or 'R', referring to
4170 recursion. R<digits> and R&name are also permitted for recursion tests.
4172 There are several syntaxes for testing a named group: (?(name)) is used
4173 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4175 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4176 be the recursive thing or the name 'R' (and similarly for 'R' followed
4177 by digits), and (b) a number could be a name that consists of digits.
4178 In both cases, we look for a name first; if not found, we try the other
4179 cases. */
4181 /* For conditions that are assertions, check the syntax, and then exit
4182 the switch. This will take control down to where bracketed groups,
4183 including assertions, are processed. */
4185 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4186 break;
4188 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4189 below), and all need to skip 3 bytes at the start of the group. */
4191 code[1+LINK_SIZE] = OP_CREF;
4192 skipbytes = 3;
4193 refsign = -1;
4195 /* Check for a test for recursion in a named group. */
4197 if (ptr[1] == 'R' && ptr[2] == '&')
4199 terminator = -1;
4200 ptr += 2;
4201 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4204 /* Check for a test for a named group's having been set, using the Perl
4205 syntax (?(<name>) or (?('name') */
4207 else if (ptr[1] == '<')
4209 terminator = '>';
4210 ptr++;
4212 else if (ptr[1] == '\'')
4214 terminator = '\'';
4215 ptr++;
4217 else
4219 terminator = 0;
4220 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4223 /* We now expect to read a name; anything else is an error */
4225 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4227 ptr += 1; /* To get the right offset */
4228 *errorcodeptr = ERR28;
4229 goto FAILED;
4232 /* Read the name, but also get it as a number if it's all digits */
4234 recno = 0;
4235 name = ++ptr;
4236 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4238 if (recno >= 0)
4239 recno = (g_ascii_isdigit (*ptr) != 0)?
4240 recno * 10 + *ptr - '0' : -1;
4241 ptr++;
4243 namelen = ptr - name;
4245 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4247 ptr--; /* Error offset */
4248 *errorcodeptr = ERR26;
4249 goto FAILED;
4252 /* Do no further checking in the pre-compile phase. */
4254 if (lengthptr != NULL) break;
4256 /* In the real compile we do the work of looking for the actual
4257 reference. If the string started with "+" or "-" we require the rest to
4258 be digits, in which case recno will be set. */
4260 if (refsign > 0)
4262 if (recno <= 0)
4264 *errorcodeptr = ERR58;
4265 goto FAILED;
4267 recno = (refsign == '-')?
4268 cd->bracount - recno + 1 : recno +cd->bracount;
4269 if (recno <= 0 || recno > cd->final_bracount)
4271 *errorcodeptr = ERR15;
4272 goto FAILED;
4274 PUT2(code, 2+LINK_SIZE, recno);
4275 break;
4278 /* Otherwise (did not start with "+" or "-"), start by looking for the
4279 name. */
4281 slot = cd->name_table;
4282 for (i = 0; i < cd->names_found; i++)
4284 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4285 slot += cd->name_entry_size;
4288 /* Found a previous named subpattern */
4290 if (i < cd->names_found)
4292 recno = GET2(slot, 0);
4293 PUT2(code, 2+LINK_SIZE, recno);
4296 /* Search the pattern for a forward reference */
4298 else if ((i = find_parens(ptr, cd, name, namelen,
4299 (options & PCRE_EXTENDED) != 0)) > 0)
4301 PUT2(code, 2+LINK_SIZE, i);
4304 /* If terminator == 0 it means that the name followed directly after
4305 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4306 some further alternatives to try. For the cases where terminator != 0
4307 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4308 now checked all the possibilities, so give an error. */
4310 else if (terminator != 0)
4312 *errorcodeptr = ERR15;
4313 goto FAILED;
4316 /* Check for (?(R) for recursion. Allow digits after R to specify a
4317 specific group number. */
4319 else if (*name == 'R')
4321 recno = 0;
4322 for (i = 1; i < namelen; i++)
4324 if (g_ascii_isdigit (name[i]) == 0)
4326 *errorcodeptr = ERR15;
4327 goto FAILED;
4329 recno = recno * 10 + name[i] - '0';
4331 if (recno == 0) recno = RREF_ANY;
4332 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4333 PUT2(code, 2+LINK_SIZE, recno);
4336 /* Similarly, check for the (?(DEFINE) "condition", which is always
4337 false. */
4339 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4341 code[1+LINK_SIZE] = OP_DEF;
4342 skipbytes = 1;
4345 /* Check for the "name" actually being a subpattern number. We are
4346 in the second pass here, so final_bracount is set. */
4348 else if (recno > 0 && recno <= cd->final_bracount)
4350 PUT2(code, 2+LINK_SIZE, recno);
4353 /* Either an unidentified subpattern, or a reference to (?(0) */
4355 else
4357 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4358 goto FAILED;
4360 break;
4363 /* ------------------------------------------------------------ */
4364 case '=': /* Positive lookahead */
4365 bravalue = OP_ASSERT;
4366 ptr++;
4367 break;
4370 /* ------------------------------------------------------------ */
4371 case '!': /* Negative lookahead */
4372 ptr++;
4373 if (*ptr == ')') /* Optimize (?!) */
4375 *code++ = OP_FAIL;
4376 previous = NULL;
4377 continue;
4379 bravalue = OP_ASSERT_NOT;
4380 break;
4383 /* ------------------------------------------------------------ */
4384 case '<': /* Lookbehind or named define */
4385 switch (ptr[1])
4387 case '=': /* Positive lookbehind */
4388 bravalue = OP_ASSERTBACK;
4389 ptr += 2;
4390 break;
4392 case '!': /* Negative lookbehind */
4393 bravalue = OP_ASSERTBACK_NOT;
4394 ptr += 2;
4395 break;
4397 default: /* Could be name define, else bad */
4398 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4399 ptr++; /* Correct offset for error */
4400 *errorcodeptr = ERR24;
4401 goto FAILED;
4403 break;
4406 /* ------------------------------------------------------------ */
4407 case '>': /* One-time brackets */
4408 bravalue = OP_ONCE;
4409 ptr++;
4410 break;
4413 /* ------------------------------------------------------------ */
4414 case 'C': /* Callout - may be followed by digits; */
4415 previous_callout = code; /* Save for later completion */
4416 after_manual_callout = 1; /* Skip one item before completing */
4417 *code++ = OP_CALLOUT;
4419 int n = 0;
4420 while (g_ascii_isdigit (*(++ptr)) != 0)
4421 n = n * 10 + *ptr - '0';
4422 if (*ptr != ')')
4424 *errorcodeptr = ERR39;
4425 goto FAILED;
4427 if (n > 255)
4429 *errorcodeptr = ERR38;
4430 goto FAILED;
4432 *code++ = n;
4433 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4434 PUT(code, LINK_SIZE, 0); /* Default length */
4435 code += 2 * LINK_SIZE;
4437 previous = NULL;
4438 continue;
4441 /* ------------------------------------------------------------ */
4442 case 'P': /* Python-style named subpattern handling */
4443 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4445 is_recurse = *ptr == '>';
4446 terminator = ')';
4447 goto NAMED_REF_OR_RECURSE;
4449 else if (*ptr != '<') /* Test for Python-style definition */
4451 *errorcodeptr = ERR41;
4452 goto FAILED;
4454 /* Fall through to handle (?P< as (?< is handled */
4457 /* ------------------------------------------------------------ */
4458 DEFINE_NAME: /* Come here from (?< handling */
4459 case '\'':
4461 terminator = (*ptr == '<')? '>' : '\'';
4462 name = ++ptr;
4464 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4465 namelen = ptr - name;
4467 /* In the pre-compile phase, just do a syntax check. */
4469 if (lengthptr != NULL)
4471 if (*ptr != terminator)
4473 *errorcodeptr = ERR42;
4474 goto FAILED;
4476 if (cd->names_found >= MAX_NAME_COUNT)
4478 *errorcodeptr = ERR49;
4479 goto FAILED;
4481 if (namelen + 3 > cd->name_entry_size)
4483 cd->name_entry_size = namelen + 3;
4484 if (namelen > MAX_NAME_SIZE)
4486 *errorcodeptr = ERR48;
4487 goto FAILED;
4492 /* In the real compile, create the entry in the table */
4494 else
4496 slot = cd->name_table;
4497 for (i = 0; i < cd->names_found; i++)
4499 int crc = memcmp(name, slot+2, namelen);
4500 if (crc == 0)
4502 if (slot[2+namelen] == 0)
4504 if ((options & PCRE_DUPNAMES) == 0)
4506 *errorcodeptr = ERR43;
4507 goto FAILED;
4510 else crc = -1; /* Current name is substring */
4512 if (crc < 0)
4514 memmove(slot + cd->name_entry_size, slot,
4515 (cd->names_found - i) * cd->name_entry_size);
4516 break;
4518 slot += cd->name_entry_size;
4521 PUT2(slot, 0, cd->bracount + 1);
4522 memcpy(slot + 2, name, namelen);
4523 slot[2+namelen] = 0;
4527 /* In both cases, count the number of names we've encountered. */
4529 ptr++; /* Move past > or ' */
4530 cd->names_found++;
4531 goto NUMBERED_GROUP;
4534 /* ------------------------------------------------------------ */
4535 case '&': /* Perl recursion/subroutine syntax */
4536 terminator = ')';
4537 is_recurse = TRUE;
4538 /* Fall through */
4540 /* We come here from the Python syntax above that handles both
4541 references (?P=name) and recursion (?P>name), as well as falling
4542 through from the Perl recursion syntax (?&name). We also come here from
4543 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4544 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4546 NAMED_REF_OR_RECURSE:
4547 name = ++ptr;
4548 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4549 namelen = ptr - name;
4551 /* In the pre-compile phase, do a syntax check and set a dummy
4552 reference number. */
4554 if (lengthptr != NULL)
4556 if (namelen == 0)
4558 *errorcodeptr = ERR62;
4559 goto FAILED;
4561 if (*ptr != terminator)
4563 *errorcodeptr = ERR42;
4564 goto FAILED;
4566 if (namelen > MAX_NAME_SIZE)
4568 *errorcodeptr = ERR48;
4569 goto FAILED;
4571 recno = 0;
4574 /* In the real compile, seek the name in the table. We check the name
4575 first, and then check that we have reached the end of the name in the
4576 table. That way, if the name is longer than any in the table,
4577 the comparison will fail without reading beyond the table entry. */
4579 else
4581 slot = cd->name_table;
4582 for (i = 0; i < cd->names_found; i++)
4584 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4585 slot[2+namelen] == 0)
4586 break;
4587 slot += cd->name_entry_size;
4590 if (i < cd->names_found) /* Back reference */
4592 recno = GET2(slot, 0);
4594 else if ((recno = /* Forward back reference */
4595 find_parens(ptr, cd, name, namelen,
4596 (options & PCRE_EXTENDED) != 0)) <= 0)
4598 *errorcodeptr = ERR15;
4599 goto FAILED;
4603 /* In both phases, we can now go to the code that handles numerical
4604 recursion or backreferences. */
4606 if (is_recurse) goto HANDLE_RECURSION;
4607 else goto HANDLE_REFERENCE;
4610 /* ------------------------------------------------------------ */
4611 case 'R': /* Recursion */
4612 ptr++; /* Same as (?0) */
4613 /* Fall through */
4616 /* ------------------------------------------------------------ */
4617 case '-': case '+':
4618 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4619 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4621 const uschar *called;
4622 terminator = ')';
4624 /* Come here from the \g<...> and \g'...' code (Oniguruma
4625 compatibility). However, the syntax has been checked to ensure that
4626 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4627 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4628 ever be taken. */
4630 HANDLE_NUMERICAL_RECURSION:
4632 if ((refsign = *ptr) == '+')
4634 ptr++;
4635 if (g_ascii_isdigit (*ptr) == 0)
4637 *errorcodeptr = ERR63;
4638 goto FAILED;
4641 else if (refsign == '-')
4643 if (g_ascii_isdigit (ptr[1]) == 0)
4644 goto OTHER_CHAR_AFTER_QUERY;
4645 ptr++;
4648 recno = 0;
4649 while(g_ascii_isdigit (*ptr) != 0)
4650 recno = recno * 10 + *ptr++ - '0';
4652 if (*ptr != terminator)
4654 *errorcodeptr = ERR29;
4655 goto FAILED;
4658 if (refsign == '-')
4660 if (recno == 0)
4662 *errorcodeptr = ERR58;
4663 goto FAILED;
4665 recno = cd->bracount - recno + 1;
4666 if (recno <= 0)
4668 *errorcodeptr = ERR15;
4669 goto FAILED;
4672 else if (refsign == '+')
4674 if (recno == 0)
4676 *errorcodeptr = ERR58;
4677 goto FAILED;
4679 recno += cd->bracount;
4682 /* Come here from code above that handles a named recursion */
4684 HANDLE_RECURSION:
4686 previous = code;
4687 called = cd->start_code;
4689 /* When we are actually compiling, find the bracket that is being
4690 referenced. Temporarily end the regex in case it doesn't exist before
4691 this point. If we end up with a forward reference, first check that
4692 the bracket does occur later so we can give the error (and position)
4693 now. Then remember this forward reference in the workspace so it can
4694 be filled in at the end. */
4696 if (lengthptr == NULL)
4698 *code = OP_END;
4699 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4701 /* Forward reference */
4703 if (called == NULL)
4705 if (find_parens(ptr, cd, NULL, recno,
4706 (options & PCRE_EXTENDED) != 0) < 0)
4708 *errorcodeptr = ERR15;
4709 goto FAILED;
4711 called = cd->start_code + recno;
4712 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4715 /* If not a forward reference, and the subpattern is still open,
4716 this is a recursive call. We check to see if this is a left
4717 recursion that could loop for ever, and diagnose that case. */
4719 else if (GET(called, 1) == 0 &&
4720 could_be_empty(called, code, bcptr, utf8))
4722 *errorcodeptr = ERR40;
4723 goto FAILED;
4727 /* Insert the recursion/subroutine item, automatically wrapped inside
4728 "once" brackets. Set up a "previous group" length so that a
4729 subsequent quantifier will work. */
4731 *code = OP_ONCE;
4732 PUT(code, 1, 2 + 2*LINK_SIZE);
4733 code += 1 + LINK_SIZE;
4735 *code = OP_RECURSE;
4736 PUT(code, 1, called - cd->start_code);
4737 code += 1 + LINK_SIZE;
4739 *code = OP_KET;
4740 PUT(code, 1, 2 + 2*LINK_SIZE);
4741 code += 1 + LINK_SIZE;
4743 length_prevgroup = 3 + 3*LINK_SIZE;
4746 /* Can't determine a first byte now */
4748 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4749 continue;
4752 /* ------------------------------------------------------------ */
4753 default: /* Other characters: check option setting */
4754 OTHER_CHAR_AFTER_QUERY:
4755 set = unset = 0;
4756 optset = &set;
4758 while (*ptr != ')' && *ptr != ':')
4760 switch (*ptr++)
4762 case '-': optset = &unset; break;
4764 case 'J': /* Record that it changed in the external options */
4765 *optset |= PCRE_DUPNAMES;
4766 cd->external_flags |= PCRE_JCHANGED;
4767 break;
4769 case 'i': *optset |= PCRE_CASELESS; break;
4770 case 'm': *optset |= PCRE_MULTILINE; break;
4771 case 's': *optset |= PCRE_DOTALL; break;
4772 case 'x': *optset |= PCRE_EXTENDED; break;
4773 case 'U': *optset |= PCRE_UNGREEDY; break;
4774 case 'X': *optset |= PCRE_EXTRA; break;
4776 default: *errorcodeptr = ERR12;
4777 ptr--; /* Correct the offset */
4778 goto FAILED;
4782 /* Set up the changed option bits, but don't change anything yet. */
4784 newoptions = (options | set) & (~unset);
4786 /* If the options ended with ')' this is not the start of a nested
4787 group with option changes, so the options change at this level. If this
4788 item is right at the start of the pattern, the options can be
4789 abstracted and made external in the pre-compile phase, and ignored in
4790 the compile phase. This can be helpful when matching -- for instance in
4791 caseless checking of required bytes.
4793 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4794 definitely *not* at the start of the pattern because something has been
4795 compiled. In the pre-compile phase, however, the code pointer can have
4796 that value after the start, because it gets reset as code is discarded
4797 during the pre-compile. However, this can happen only at top level - if
4798 we are within parentheses, the starting BRA will still be present. At
4799 any parenthesis level, the length value can be used to test if anything
4800 has been compiled at that level. Thus, a test for both these conditions
4801 is necessary to ensure we correctly detect the start of the pattern in
4802 both phases.
4804 If we are not at the pattern start, compile code to change the ims
4805 options if this setting actually changes any of them, and reset the
4806 greedy defaults and the case value for firstbyte and reqbyte. */
4808 if (*ptr == ')')
4810 if (code == cd->start_code + 1 + LINK_SIZE &&
4811 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4813 cd->external_options = newoptions;
4815 else
4817 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4819 *code++ = OP_OPT;
4820 *code++ = newoptions & PCRE_IMS;
4822 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4823 greedy_non_default = greedy_default ^ 1;
4824 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4827 /* Change options at this level, and pass them back for use
4828 in subsequent branches. When not at the start of the pattern, this
4829 information is also necessary so that a resetting item can be
4830 compiled at the end of a group (if we are in a group). */
4832 *optionsptr = options = newoptions;
4833 previous = NULL; /* This item can't be repeated */
4834 continue; /* It is complete */
4837 /* If the options ended with ':' we are heading into a nested group
4838 with possible change of options. Such groups are non-capturing and are
4839 not assertions of any kind. All we need to do is skip over the ':';
4840 the newoptions value is handled below. */
4842 bravalue = OP_BRA;
4843 ptr++;
4844 } /* End of switch for character following (? */
4845 } /* End of (? handling */
4847 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4848 all unadorned brackets become non-capturing and behave like (?:...)
4849 brackets. */
4851 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4853 bravalue = OP_BRA;
4856 /* Else we have a capturing group. */
4858 else
4860 NUMBERED_GROUP:
4861 cd->bracount += 1;
4862 PUT2(code, 1+LINK_SIZE, cd->bracount);
4863 skipbytes = 2;
4866 /* Process nested bracketed regex. Assertions may not be repeated, but
4867 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4868 non-register variable in order to be able to pass its address because some
4869 compilers complain otherwise. Pass in a new setting for the ims options if
4870 they have changed. */
4872 previous = (bravalue >= OP_ONCE)? code : NULL;
4873 *code = bravalue;
4874 tempcode = code;
4875 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4876 length_prevgroup = 0; /* Initialize for pre-compile phase */
4878 if (!compile_regex(
4879 newoptions, /* The complete new option state */
4880 options & PCRE_IMS, /* The previous ims option state */
4881 &tempcode, /* Where to put code (updated) */
4882 &ptr, /* Input pointer (updated) */
4883 errorcodeptr, /* Where to put an error message */
4884 (bravalue == OP_ASSERTBACK ||
4885 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4886 reset_bracount, /* True if (?| group */
4887 skipbytes, /* Skip over bracket number */
4888 &subfirstbyte, /* For possible first char */
4889 &subreqbyte, /* For possible last char */
4890 bcptr, /* Current branch chain */
4891 cd, /* Tables block */
4892 (lengthptr == NULL)? NULL : /* Actual compile phase */
4893 &length_prevgroup /* Pre-compile phase */
4895 goto FAILED;
4897 /* At the end of compiling, code is still pointing to the start of the
4898 group, while tempcode has been updated to point past the end of the group
4899 and any option resetting that may follow it. The pattern pointer (ptr)
4900 is on the bracket. */
4902 /* If this is a conditional bracket, check that there are no more than
4903 two branches in the group, or just one if it's a DEFINE group. We do this
4904 in the real compile phase, not in the pre-pass, where the whole group may
4905 not be available. */
4907 if (bravalue == OP_COND && lengthptr == NULL)
4909 uschar *tc = code;
4910 int condcount = 0;
4912 do {
4913 condcount++;
4914 tc += GET(tc,1);
4916 while (*tc != OP_KET);
4918 /* A DEFINE group is never obeyed inline (the "condition" is always
4919 false). It must have only one branch. */
4921 if (code[LINK_SIZE+1] == OP_DEF)
4923 if (condcount > 1)
4925 *errorcodeptr = ERR54;
4926 goto FAILED;
4928 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4931 /* A "normal" conditional group. If there is just one branch, we must not
4932 make use of its firstbyte or reqbyte, because this is equivalent to an
4933 empty second branch. */
4935 else
4937 if (condcount > 2)
4939 *errorcodeptr = ERR27;
4940 goto FAILED;
4942 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4946 /* Error if hit end of pattern */
4948 if (*ptr != ')')
4950 *errorcodeptr = ERR14;
4951 goto FAILED;
4954 /* In the pre-compile phase, update the length by the length of the group,
4955 less the brackets at either end. Then reduce the compiled code to just a
4956 set of non-capturing brackets so that it doesn't use much memory if it is
4957 duplicated by a quantifier.*/
4959 if (lengthptr != NULL)
4961 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4963 *errorcodeptr = ERR20;
4964 goto FAILED;
4966 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4967 *code++ = OP_BRA;
4968 PUTINC(code, 0, 1 + LINK_SIZE);
4969 *code++ = OP_KET;
4970 PUTINC(code, 0, 1 + LINK_SIZE);
4971 break; /* No need to waste time with special character handling */
4974 /* Otherwise update the main code pointer to the end of the group. */
4976 code = tempcode;
4978 /* For a DEFINE group, required and first character settings are not
4979 relevant. */
4981 if (bravalue == OP_DEF) break;
4983 /* Handle updating of the required and first characters for other types of
4984 group. Update for normal brackets of all kinds, and conditions with two
4985 branches (see code above). If the bracket is followed by a quantifier with
4986 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4987 zerofirstbyte outside the main loop so that they can be accessed for the
4988 back off. */
4990 zeroreqbyte = reqbyte;
4991 zerofirstbyte = firstbyte;
4992 groupsetfirstbyte = FALSE;
4994 if (bravalue >= OP_ONCE)
4996 /* If we have not yet set a firstbyte in this branch, take it from the
4997 subpattern, remembering that it was set here so that a repeat of more
4998 than one can replicate it as reqbyte if necessary. If the subpattern has
4999 no firstbyte, set "none" for the whole branch. In both cases, a zero
5000 repeat forces firstbyte to "none". */
5002 if (firstbyte == REQ_UNSET)
5004 if (subfirstbyte >= 0)
5006 firstbyte = subfirstbyte;
5007 groupsetfirstbyte = TRUE;
5009 else firstbyte = REQ_NONE;
5010 zerofirstbyte = REQ_NONE;
5013 /* If firstbyte was previously set, convert the subpattern's firstbyte
5014 into reqbyte if there wasn't one, using the vary flag that was in
5015 existence beforehand. */
5017 else if (subfirstbyte >= 0 && subreqbyte < 0)
5018 subreqbyte = subfirstbyte | tempreqvary;
5020 /* If the subpattern set a required byte (or set a first byte that isn't
5021 really the first byte - see above), set it. */
5023 if (subreqbyte >= 0) reqbyte = subreqbyte;
5026 /* For a forward assertion, we take the reqbyte, if set. This can be
5027 helpful if the pattern that follows the assertion doesn't set a different
5028 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5029 for an assertion, however because it leads to incorrect effect for patterns
5030 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5031 of a firstbyte. This is overcome by a scan at the end if there's no
5032 firstbyte, looking for an asserted first char. */
5034 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5035 break; /* End of processing '(' */
5038 /* ===================================================================*/
5039 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5040 are arranged to be the negation of the corresponding OP_values. For the
5041 back references, the values are ESC_REF plus the reference number. Only
5042 back references and those types that consume a character may be repeated.
5043 We can test for values between ESC_b and ESC_Z for the latter; this may
5044 have to change if any new ones are ever created. */
5046 case '\\':
5047 tempptr = ptr;
5048 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5049 if (*errorcodeptr != 0) goto FAILED;
5051 if (c < 0)
5053 if (-c == ESC_Q) /* Handle start of quoted string */
5055 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5056 else inescq = TRUE;
5057 continue;
5060 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5062 /* For metasequences that actually match a character, we disable the
5063 setting of a first character if it hasn't already been set. */
5065 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5066 firstbyte = REQ_NONE;
5068 /* Set values to reset to if this is followed by a zero repeat. */
5070 zerofirstbyte = firstbyte;
5071 zeroreqbyte = reqbyte;
5073 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5074 is a subroutine call by number (Oniguruma syntax). In fact, the value
5075 -ESC_g is returned only for these cases. So we don't need to check for <
5076 or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5077 -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5078 that is a synonym for a named back reference). */
5080 if (-c == ESC_g)
5082 const uschar *p;
5083 save_hwm = cd->hwm; /* Normally this is set when '(' is read */
5084 terminator = (*(++ptr) == '<')? '>' : '\'';
5086 /* These two statements stop the compiler from warning about possibly
5087 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5088 fact, because we actually check for a number below, the paths that
5089 would actually be in error are never taken. */
5091 skipbytes = 0;
5092 reset_bracount = FALSE;
5094 /* Test for a name */
5096 if (ptr[1] != '+' && ptr[1] != '-')
5098 BOOL isnumber = TRUE;
5099 for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5101 if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5102 if ((cd->ctypes[*p] & ctype_word) == 0) break;
5104 if (*p != terminator)
5106 *errorcodeptr = ERR57;
5107 break;
5109 if (isnumber)
5111 ptr++;
5112 goto HANDLE_NUMERICAL_RECURSION;
5114 is_recurse = TRUE;
5115 goto NAMED_REF_OR_RECURSE;
5118 /* Test a signed number in angle brackets or quotes. */
5120 p = ptr + 2;
5121 while (g_ascii_isdigit (*p) != 0) p++;
5122 if (*p != terminator)
5124 *errorcodeptr = ERR57;
5125 break;
5127 ptr++;
5128 goto HANDLE_NUMERICAL_RECURSION;
5131 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5132 We also support \k{name} (.NET syntax) */
5134 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5136 is_recurse = FALSE;
5137 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5138 goto NAMED_REF_OR_RECURSE;
5141 /* Back references are handled specially; must disable firstbyte if
5142 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5143 ':' later. */
5145 if (-c >= ESC_REF)
5147 recno = -c - ESC_REF;
5149 HANDLE_REFERENCE: /* Come here from named backref handling */
5150 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5151 previous = code;
5152 *code++ = OP_REF;
5153 PUT2INC(code, 0, recno);
5154 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5155 if (recno > cd->top_backref) cd->top_backref = recno;
5158 /* So are Unicode property matches, if supported. */
5160 #ifdef SUPPORT_UCP
5161 else if (-c == ESC_P || -c == ESC_p)
5163 BOOL negated;
5164 int pdata;
5165 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5166 if (ptype < 0) goto FAILED;
5167 previous = code;
5168 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5169 *code++ = ptype;
5170 *code++ = pdata;
5172 #else
5174 /* If Unicode properties are not supported, \X, \P, and \p are not
5175 allowed. */
5177 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5179 *errorcodeptr = ERR45;
5180 goto FAILED;
5182 #endif
5184 /* For the rest (including \X when Unicode properties are supported), we
5185 can obtain the OP value by negating the escape value. */
5187 else
5189 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5190 *code++ = -c;
5192 continue;
5195 /* We have a data character whose value is in c. In UTF-8 mode it may have
5196 a value > 127. We set its representation in the length/buffer, and then
5197 handle it as a data character. */
5199 #ifdef SUPPORT_UTF8
5200 if (utf8 && c > 127)
5201 mclength = _pcre_ord2utf8(c, mcbuffer);
5202 else
5203 #endif
5206 mcbuffer[0] = c;
5207 mclength = 1;
5209 goto ONE_CHAR;
5212 /* ===================================================================*/
5213 /* Handle a literal character. It is guaranteed not to be whitespace or #
5214 when the extended flag is set. If we are in UTF-8 mode, it may be a
5215 multi-byte literal character. */
5217 default:
5218 NORMAL_CHAR:
5219 mclength = 1;
5220 mcbuffer[0] = c;
5222 #ifdef SUPPORT_UTF8
5223 if (utf8 && c >= 0xc0)
5225 while ((ptr[1] & 0xc0) == 0x80)
5226 mcbuffer[mclength++] = *(++ptr);
5228 #endif
5230 /* At this point we have the character's bytes in mcbuffer, and the length
5231 in mclength. When not in UTF-8 mode, the length is always 1. */
5233 ONE_CHAR:
5234 previous = code;
5235 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5236 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5238 /* Remember if \r or \n were seen */
5240 if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5241 cd->external_flags |= PCRE_HASCRORLF;
5243 /* Set the first and required bytes appropriately. If no previous first
5244 byte, set it from this character, but revert to none on a zero repeat.
5245 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5246 repeat. */
5248 if (firstbyte == REQ_UNSET)
5250 zerofirstbyte = REQ_NONE;
5251 zeroreqbyte = reqbyte;
5253 /* If the character is more than one byte long, we can set firstbyte
5254 only if it is not to be matched caselessly. */
5256 if (mclength == 1 || req_caseopt == 0)
5258 firstbyte = mcbuffer[0] | req_caseopt;
5259 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5261 else firstbyte = reqbyte = REQ_NONE;
5264 /* firstbyte was previously set; we can set reqbyte only if the length is
5265 1 or the matching is caseful. */
5267 else
5269 zerofirstbyte = firstbyte;
5270 zeroreqbyte = reqbyte;
5271 if (mclength == 1 || req_caseopt == 0)
5272 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5275 break; /* End of literal character handling */
5277 } /* end of big loop */
5280 /* Control never reaches here by falling through, only by a goto for all the
5281 error states. Pass back the position in the pattern so that it can be displayed
5282 to the user for diagnosing the error. */
5284 FAILED:
5285 *ptrptr = ptr;
5286 return FALSE;
5292 /*************************************************
5293 * Compile sequence of alternatives *
5294 *************************************************/
5296 /* On entry, ptr is pointing past the bracket character, but on return it
5297 points to the closing bracket, or vertical bar, or end of string. The code
5298 variable is pointing at the byte into which the BRA operator has been stored.
5299 If the ims options are changed at the start (for a (?ims: group) or during any
5300 branch, we need to insert an OP_OPT item at the start of every following branch
5301 to ensure they get set correctly at run time, and also pass the new options
5302 into every subsequent branch compile.
5304 This function is used during the pre-compile phase when we are trying to find
5305 out the amount of memory needed, as well as during the real compile phase. The
5306 value of lengthptr distinguishes the two phases.
5308 Arguments:
5309 options option bits, including any changes for this subpattern
5310 oldims previous settings of ims option bits
5311 codeptr -> the address of the current code pointer
5312 ptrptr -> the address of the current pattern pointer
5313 errorcodeptr -> pointer to error code variable
5314 lookbehind TRUE if this is a lookbehind assertion
5315 reset_bracount TRUE to reset the count for each branch
5316 skipbytes skip this many bytes at start (for brackets and OP_COND)
5317 firstbyteptr place to put the first required character, or a negative number
5318 reqbyteptr place to put the last required character, or a negative number
5319 bcptr pointer to the chain of currently open branches
5320 cd points to the data block with tables pointers etc.
5321 lengthptr NULL during the real compile phase
5322 points to length accumulator during pre-compile phase
5324 Returns: TRUE on success
5327 static BOOL
5328 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5329 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5330 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5331 int *lengthptr)
5333 const uschar *ptr = *ptrptr;
5334 uschar *code = *codeptr;
5335 uschar *last_branch = code;
5336 uschar *start_bracket = code;
5337 uschar *reverse_count = NULL;
5338 int firstbyte, reqbyte;
5339 int branchfirstbyte, branchreqbyte;
5340 int length;
5341 int orig_bracount;
5342 int max_bracount;
5343 branch_chain bc;
5345 bc.outer = bcptr;
5346 bc.current = code;
5348 firstbyte = reqbyte = REQ_UNSET;
5350 /* Accumulate the length for use in the pre-compile phase. Start with the
5351 length of the BRA and KET and any extra bytes that are required at the
5352 beginning. We accumulate in a local variable to save frequent testing of
5353 lenthptr for NULL. We cannot do this by looking at the value of code at the
5354 start and end of each alternative, because compiled items are discarded during
5355 the pre-compile phase so that the work space is not exceeded. */
5357 length = 2 + 2*LINK_SIZE + skipbytes;
5359 /* WARNING: If the above line is changed for any reason, you must also change
5360 the code that abstracts option settings at the start of the pattern and makes
5361 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5362 pre-compile phase to find out whether anything has yet been compiled or not. */
5364 /* Offset is set zero to mark that this bracket is still open */
5366 PUT(code, 1, 0);
5367 code += 1 + LINK_SIZE + skipbytes;
5369 /* Loop for each alternative branch */
5371 orig_bracount = max_bracount = cd->bracount;
5372 for (;;)
5374 /* For a (?| group, reset the capturing bracket count so that each branch
5375 uses the same numbers. */
5377 if (reset_bracount) cd->bracount = orig_bracount;
5379 /* Handle a change of ims options at the start of the branch */
5381 if ((options & PCRE_IMS) != oldims)
5383 *code++ = OP_OPT;
5384 *code++ = options & PCRE_IMS;
5385 length += 2;
5388 /* Set up dummy OP_REVERSE if lookbehind assertion */
5390 if (lookbehind)
5392 *code++ = OP_REVERSE;
5393 reverse_count = code;
5394 PUTINC(code, 0, 0);
5395 length += 1 + LINK_SIZE;
5398 /* Now compile the branch; in the pre-compile phase its length gets added
5399 into the length. */
5401 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5402 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5404 *ptrptr = ptr;
5405 return FALSE;
5408 /* Keep the highest bracket count in case (?| was used and some branch
5409 has fewer than the rest. */
5411 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5413 /* In the real compile phase, there is some post-processing to be done. */
5415 if (lengthptr == NULL)
5417 /* If this is the first branch, the firstbyte and reqbyte values for the
5418 branch become the values for the regex. */
5420 if (*last_branch != OP_ALT)
5422 firstbyte = branchfirstbyte;
5423 reqbyte = branchreqbyte;
5426 /* If this is not the first branch, the first char and reqbyte have to
5427 match the values from all the previous branches, except that if the
5428 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5429 and we set REQ_VARY for the regex. */
5431 else
5433 /* If we previously had a firstbyte, but it doesn't match the new branch,
5434 we have to abandon the firstbyte for the regex, but if there was
5435 previously no reqbyte, it takes on the value of the old firstbyte. */
5437 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5439 if (reqbyte < 0) reqbyte = firstbyte;
5440 firstbyte = REQ_NONE;
5443 /* If we (now or from before) have no firstbyte, a firstbyte from the
5444 branch becomes a reqbyte if there isn't a branch reqbyte. */
5446 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5447 branchreqbyte = branchfirstbyte;
5449 /* Now ensure that the reqbytes match */
5451 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5452 reqbyte = REQ_NONE;
5453 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5456 /* If lookbehind, check that this branch matches a fixed-length string, and
5457 put the length into the OP_REVERSE item. Temporarily mark the end of the
5458 branch with OP_END. */
5460 if (lookbehind)
5462 int fixed_length;
5463 *code = OP_END;
5464 fixed_length = find_fixedlength(last_branch, options);
5465 DPRINTF(("fixed length = %d\n", fixed_length));
5466 if (fixed_length < 0)
5468 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5469 *ptrptr = ptr;
5470 return FALSE;
5472 PUT(reverse_count, 0, fixed_length);
5476 /* Reached end of expression, either ')' or end of pattern. In the real
5477 compile phase, go back through the alternative branches and reverse the chain
5478 of offsets, with the field in the BRA item now becoming an offset to the
5479 first alternative. If there are no alternatives, it points to the end of the
5480 group. The length in the terminating ket is always the length of the whole
5481 bracketed item. If any of the ims options were changed inside the group,
5482 compile a resetting op-code following, except at the very end of the pattern.
5483 Return leaving the pointer at the terminating char. */
5485 if (*ptr != '|')
5487 if (lengthptr == NULL)
5489 int branch_length = code - last_branch;
5492 int prev_length = GET(last_branch, 1);
5493 PUT(last_branch, 1, branch_length);
5494 branch_length = prev_length;
5495 last_branch -= branch_length;
5497 while (branch_length > 0);
5500 /* Fill in the ket */
5502 *code = OP_KET;
5503 PUT(code, 1, code - start_bracket);
5504 code += 1 + LINK_SIZE;
5506 /* Resetting option if needed */
5508 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5510 *code++ = OP_OPT;
5511 *code++ = oldims;
5512 length += 2;
5515 /* Retain the highest bracket number, in case resetting was used. */
5517 cd->bracount = max_bracount;
5519 /* Set values to pass back */
5521 *codeptr = code;
5522 *ptrptr = ptr;
5523 *firstbyteptr = firstbyte;
5524 *reqbyteptr = reqbyte;
5525 if (lengthptr != NULL)
5527 if (OFLOW_MAX - *lengthptr < length)
5529 *errorcodeptr = ERR20;
5530 return FALSE;
5532 *lengthptr += length;
5534 return TRUE;
5537 /* Another branch follows. In the pre-compile phase, we can move the code
5538 pointer back to where it was for the start of the first branch. (That is,
5539 pretend that each branch is the only one.)
5541 In the real compile phase, insert an ALT node. Its length field points back
5542 to the previous branch while the bracket remains open. At the end the chain
5543 is reversed. It's done like this so that the start of the bracket has a
5544 zero offset until it is closed, making it possible to detect recursion. */
5546 if (lengthptr != NULL)
5548 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5549 length += 1 + LINK_SIZE;
5551 else
5553 *code = OP_ALT;
5554 PUT(code, 1, code - last_branch);
5555 bc.current = last_branch = code;
5556 code += 1 + LINK_SIZE;
5559 ptr++;
5561 /* Control never reaches here */
5567 /*************************************************
5568 * Check for anchored expression *
5569 *************************************************/
5571 /* Try to find out if this is an anchored regular expression. Consider each
5572 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5573 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5574 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5575 counts, since OP_CIRC can match in the middle.
5577 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5578 This is the code for \G, which means "match at start of match position, taking
5579 into account the match offset".
5581 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5582 because that will try the rest of the pattern at all possible matching points,
5583 so there is no point trying again.... er ....
5585 .... except when the .* appears inside capturing parentheses, and there is a
5586 subsequent back reference to those parentheses. We haven't enough information
5587 to catch that case precisely.
5589 At first, the best we could do was to detect when .* was in capturing brackets
5590 and the highest back reference was greater than or equal to that level.
5591 However, by keeping a bitmap of the first 31 back references, we can catch some
5592 of the more common cases more precisely.
5594 Arguments:
5595 code points to start of expression (the bracket)
5596 options points to the options setting
5597 bracket_map a bitmap of which brackets we are inside while testing; this
5598 handles up to substring 31; after that we just have to take
5599 the less precise approach
5600 backref_map the back reference bitmap
5602 Returns: TRUE or FALSE
5605 static BOOL
5606 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5607 unsigned int backref_map)
5609 do {
5610 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5611 options, PCRE_MULTILINE, FALSE);
5612 register int op = *scode;
5614 /* Non-capturing brackets */
5616 if (op == OP_BRA)
5618 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5621 /* Capturing brackets */
5623 else if (op == OP_CBRA)
5625 int n = GET2(scode, 1+LINK_SIZE);
5626 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5627 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5630 /* Other brackets */
5632 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5634 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5637 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
5638 it isn't in brackets that are or may be referenced. */
5640 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5641 op == OP_TYPEPOSSTAR))
5643 if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
5644 return FALSE;
5647 /* Check for explicit anchoring */
5649 else if (op != OP_SOD && op != OP_SOM &&
5650 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5651 return FALSE;
5652 code += GET(code, 1);
5654 while (*code == OP_ALT); /* Loop for each alternative */
5655 return TRUE;
5660 /*************************************************
5661 * Check for starting with ^ or .* *
5662 *************************************************/
5664 /* This is called to find out if every branch starts with ^ or .* so that
5665 "first char" processing can be done to speed things up in multiline
5666 matching and for non-DOTALL patterns that start with .* (which must start at
5667 the beginning or after \n). As in the case of is_anchored() (see above), we
5668 have to take account of back references to capturing brackets that contain .*
5669 because in that case we can't make the assumption.
5671 Arguments:
5672 code points to start of expression (the bracket)
5673 bracket_map a bitmap of which brackets we are inside while testing; this
5674 handles up to substring 31; after that we just have to take
5675 the less precise approach
5676 backref_map the back reference bitmap
5678 Returns: TRUE or FALSE
5681 static BOOL
5682 is_startline(const uschar *code, unsigned int bracket_map,
5683 unsigned int backref_map)
5685 do {
5686 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5687 NULL, 0, FALSE);
5688 register int op = *scode;
5690 /* Non-capturing brackets */
5692 if (op == OP_BRA)
5694 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5697 /* Capturing brackets */
5699 else if (op == OP_CBRA)
5701 int n = GET2(scode, 1+LINK_SIZE);
5702 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5703 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5706 /* Other brackets */
5708 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5709 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5711 /* .* means "start at start or after \n" if it isn't in brackets that
5712 may be referenced. */
5714 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5716 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5719 /* Check for explicit circumflex */
5721 else if (op != OP_CIRC) return FALSE;
5723 /* Move on to the next alternative */
5725 code += GET(code, 1);
5727 while (*code == OP_ALT); /* Loop for each alternative */
5728 return TRUE;
5733 /*************************************************
5734 * Check for asserted fixed first char *
5735 *************************************************/
5737 /* During compilation, the "first char" settings from forward assertions are
5738 discarded, because they can cause conflicts with actual literals that follow.
5739 However, if we end up without a first char setting for an unanchored pattern,
5740 it is worth scanning the regex to see if there is an initial asserted first
5741 char. If all branches start with the same asserted char, or with a bracket all
5742 of whose alternatives start with the same asserted char (recurse ad lib), then
5743 we return that char, otherwise -1.
5745 Arguments:
5746 code points to start of expression (the bracket)
5747 options pointer to the options (used to check casing changes)
5748 inassert TRUE if in an assertion
5750 Returns: -1 or the fixed first char
5753 static int
5754 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5756 register int c = -1;
5757 do {
5758 int d;
5759 const uschar *scode =
5760 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5761 register int op = *scode;
5763 switch(op)
5765 default:
5766 return -1;
5768 case OP_BRA:
5769 case OP_CBRA:
5770 case OP_ASSERT:
5771 case OP_ONCE:
5772 case OP_COND:
5773 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5774 return -1;
5775 if (c < 0) c = d; else if (c != d) return -1;
5776 break;
5778 case OP_EXACT: /* Fall through */
5779 scode += 2;
5781 case OP_CHAR:
5782 case OP_CHARNC:
5783 case OP_PLUS:
5784 case OP_MINPLUS:
5785 case OP_POSPLUS:
5786 if (!inassert) return -1;
5787 if (c < 0)
5789 c = scode[1];
5790 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5792 else if (c != scode[1]) return -1;
5793 break;
5796 code += GET(code, 1);
5798 while (*code == OP_ALT);
5799 return c;
5804 /*************************************************
5805 * Compile a Regular Expression *
5806 *************************************************/
5808 /* This function takes a string and returns a pointer to a block of store
5809 holding a compiled version of the expression. The original API for this
5810 function had no error code return variable; it is retained for backwards
5811 compatibility. The new function is given a new name.
5813 Arguments:
5814 pattern the regular expression
5815 options various option bits
5816 errorcodeptr pointer to error code variable (pcre_compile2() only)
5817 can be NULL if you don't want a code value
5818 errorptr pointer to pointer to error text
5819 erroroffset ptr offset in pattern where error was detected
5820 tables pointer to character tables or NULL
5822 Returns: pointer to compiled data block, or NULL on error,
5823 with errorptr and erroroffset set
5826 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
5827 pcre_compile(const char *pattern, int options, const char **errorptr,
5828 int *erroroffset, const unsigned char *tables)
5830 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5834 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
5835 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5836 const char **errorptr, int *erroroffset, const unsigned char *tables)
5838 real_pcre *re;
5839 int length = 1; /* For final END opcode */
5840 int firstbyte, reqbyte, newline;
5841 int errorcode = 0;
5842 int skipatstart = 0;
5843 #ifdef SUPPORT_UTF8
5844 BOOL utf8;
5845 #endif
5846 size_t size;
5847 uschar *code;
5848 const uschar *codestart;
5849 const uschar *ptr;
5850 compile_data compile_block;
5851 compile_data *cd = &compile_block;
5853 /* This space is used for "compiling" into during the first phase, when we are
5854 computing the amount of memory that is needed. Compiled items are thrown away
5855 as soon as possible, so that a fairly large buffer should be sufficient for
5856 this purpose. The same space is used in the second phase for remembering where
5857 to fill in forward references to subpatterns. */
5859 uschar cworkspace[COMPILE_WORK_SIZE];
5861 /* Set this early so that early errors get offset 0. */
5863 ptr = (const uschar *)pattern;
5865 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5866 can do is just return NULL, but we can set a code value if there is a code
5867 pointer. */
5869 if (errorptr == NULL)
5871 if (errorcodeptr != NULL) *errorcodeptr = 99;
5872 return NULL;
5875 *errorptr = NULL;
5876 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5878 /* However, we can give a message for this error */
5880 if (erroroffset == NULL)
5882 errorcode = ERR16;
5883 goto PCRE_EARLY_ERROR_RETURN2;
5886 *erroroffset = 0;
5888 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5890 #ifdef SUPPORT_UTF8
5891 utf8 = (options & PCRE_UTF8) != 0;
5892 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5893 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5895 errorcode = ERR44;
5896 goto PCRE_EARLY_ERROR_RETURN2;
5898 #else
5899 if ((options & PCRE_UTF8) != 0)
5901 errorcode = ERR32;
5902 goto PCRE_EARLY_ERROR_RETURN;
5904 #endif
5906 if ((options & ~PUBLIC_OPTIONS) != 0)
5908 errorcode = ERR17;
5909 goto PCRE_EARLY_ERROR_RETURN;
5912 /* Set up pointers to the individual character tables */
5914 if (tables == NULL) tables = _pcre_default_tables;
5915 cd->lcc = tables + lcc_offset;
5916 cd->fcc = tables + fcc_offset;
5917 cd->cbits = tables + cbits_offset;
5918 cd->ctypes = tables + ctypes_offset;
5920 /* Check for global one-time settings at the start of the pattern, and remember
5921 the offset for later. */
5923 while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
5925 int newnl = 0;
5926 int newbsr = 0;
5928 if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
5929 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
5930 else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0)
5931 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
5932 else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0)
5933 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
5934 else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
5935 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
5936 else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0)
5937 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
5939 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
5940 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
5941 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
5942 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
5944 if (newnl != 0)
5945 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
5946 else if (newbsr != 0)
5947 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
5948 else break;
5951 /* Check validity of \R options. */
5953 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5955 case 0:
5956 case PCRE_BSR_ANYCRLF:
5957 case PCRE_BSR_UNICODE:
5958 break;
5959 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5962 /* Handle different types of newline. The three bits give seven cases. The
5963 current code allows for fixed one- or two-byte sequences, plus "any" and
5964 "anycrlf". */
5966 switch (options & PCRE_NEWLINE_BITS)
5968 case 0: newline = NEWLINE; break; /* Build-time default */
5969 case PCRE_NEWLINE_CR: newline = '\r'; break;
5970 case PCRE_NEWLINE_LF: newline = '\n'; break;
5971 case PCRE_NEWLINE_CR+
5972 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5973 case PCRE_NEWLINE_ANY: newline = -1; break;
5974 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5975 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5978 if (newline == -2)
5980 cd->nltype = NLTYPE_ANYCRLF;
5982 else if (newline < 0)
5984 cd->nltype = NLTYPE_ANY;
5986 else
5988 cd->nltype = NLTYPE_FIXED;
5989 if (newline > 255)
5991 cd->nllen = 2;
5992 cd->nl[0] = (newline >> 8) & 255;
5993 cd->nl[1] = newline & 255;
5995 else
5997 cd->nllen = 1;
5998 cd->nl[0] = newline;
6002 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6003 references to help in deciding whether (.*) can be treated as anchored or not.
6006 cd->top_backref = 0;
6007 cd->backref_map = 0;
6009 /* Reflect pattern for debugging output */
6011 DPRINTF(("------------------------------------------------------------------\n"));
6012 DPRINTF(("%s\n", pattern));
6014 /* Pretend to compile the pattern while actually just accumulating the length
6015 of memory required. This behaviour is triggered by passing a non-NULL final
6016 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6017 to compile parts of the pattern into; the compiled code is discarded when it is
6018 no longer needed, so hopefully this workspace will never overflow, though there
6019 is a test for its doing so. */
6021 cd->bracount = cd->final_bracount = 0;
6022 cd->names_found = 0;
6023 cd->name_entry_size = 0;
6024 cd->name_table = NULL;
6025 cd->start_workspace = cworkspace;
6026 cd->start_code = cworkspace;
6027 cd->hwm = cworkspace;
6028 cd->start_pattern = (const uschar *)pattern;
6029 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6030 cd->req_varyopt = 0;
6031 cd->external_options = options;
6032 cd->external_flags = 0;
6034 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6035 don't need to look at the result of the function here. The initial options have
6036 been put into the cd block so that they can be changed if an option setting is
6037 found within the regex right at the beginning. Bringing initial option settings
6038 outside can help speed up starting point checks. */
6040 ptr += skipatstart;
6041 code = cworkspace;
6042 *code = OP_BRA;
6043 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6044 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6045 &length);
6046 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6048 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6049 cd->hwm - cworkspace));
6051 if (length > MAX_PATTERN_SIZE)
6053 errorcode = ERR20;
6054 goto PCRE_EARLY_ERROR_RETURN;
6057 /* Compute the size of data block needed and get it, either from malloc or
6058 externally provided function. Integer overflow should no longer be possible
6059 because nowadays we limit the maximum value of cd->names_found and
6060 cd->name_entry_size. */
6062 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6063 re = (real_pcre *)(pcre_malloc)(size);
6065 if (re == NULL)
6067 errorcode = ERR21;
6068 goto PCRE_EARLY_ERROR_RETURN;
6071 /* Put in the magic number, and save the sizes, initial options, internal
6072 flags, and character table pointer. NULL is used for the default character
6073 tables. The nullpad field is at the end; it's there to help in the case when a
6074 regex compiled on a system with 4-byte pointers is run on another with 8-byte
6075 pointers. */
6077 re->magic_number = MAGIC_NUMBER;
6078 re->size = size;
6079 re->options = cd->external_options;
6080 re->flags = cd->external_flags;
6081 re->dummy1 = 0;
6082 re->first_byte = 0;
6083 re->req_byte = 0;
6084 re->name_table_offset = sizeof(real_pcre);
6085 re->name_entry_size = cd->name_entry_size;
6086 re->name_count = cd->names_found;
6087 re->ref_count = 0;
6088 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6089 re->nullpad = NULL;
6091 /* The starting points of the name/number translation table and of the code are
6092 passed around in the compile data block. The start/end pattern and initial
6093 options are already set from the pre-compile phase, as is the name_entry_size
6094 field. Reset the bracket count and the names_found field. Also reset the hwm
6095 field; this time it's used for remembering forward references to subpatterns.
6098 cd->final_bracount = cd->bracount; /* Save for checking forward references */
6099 cd->bracount = 0;
6100 cd->names_found = 0;
6101 cd->name_table = (uschar *)re + re->name_table_offset;
6102 codestart = cd->name_table + re->name_entry_size * re->name_count;
6103 cd->start_code = codestart;
6104 cd->hwm = cworkspace;
6105 cd->req_varyopt = 0;
6106 cd->had_accept = FALSE;
6108 /* Set up a starting, non-extracting bracket, then compile the expression. On
6109 error, errorcode will be set non-zero, so we don't need to look at the result
6110 of the function here. */
6112 ptr = (const uschar *)pattern + skipatstart;
6113 code = (uschar *)codestart;
6114 *code = OP_BRA;
6115 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6116 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6117 re->top_bracket = cd->bracount;
6118 re->top_backref = cd->top_backref;
6119 re->flags = cd->external_flags;
6121 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6123 /* If not reached end of pattern on success, there's an excess bracket. */
6125 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6127 /* Fill in the terminating state and check for disastrous overflow, but
6128 if debugging, leave the test till after things are printed out. */
6130 *code++ = OP_END;
6132 #ifndef DEBUG
6133 if (code - codestart > length) errorcode = ERR23;
6134 #endif
6136 /* Fill in any forward references that are required. */
6138 while (errorcode == 0 && cd->hwm > cworkspace)
6140 int offset, recno;
6141 const uschar *groupptr;
6142 cd->hwm -= LINK_SIZE;
6143 offset = GET(cd->hwm, 0);
6144 recno = GET(codestart, offset);
6145 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6146 if (groupptr == NULL) errorcode = ERR53;
6147 else PUT(((uschar *)codestart), offset, groupptr - codestart);
6150 /* Give an error if there's back reference to a non-existent capturing
6151 subpattern. */
6153 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6155 /* Failed to compile, or error while post-processing */
6157 if (errorcode != 0)
6159 (pcre_free)(re);
6160 PCRE_EARLY_ERROR_RETURN:
6161 *erroroffset = ptr - (const uschar *)pattern;
6162 PCRE_EARLY_ERROR_RETURN2:
6163 *errorptr = find_error_text(errorcode);
6164 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6165 return NULL;
6168 /* If the anchored option was not passed, set the flag if we can determine that
6169 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6170 as starting with .* when DOTALL is set).
6172 Otherwise, if we know what the first byte has to be, save it, because that
6173 speeds up unanchored matches no end. If not, see if we can set the
6174 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6175 start with ^. and also when all branches start with .* for non-DOTALL matches.
6178 if ((re->options & PCRE_ANCHORED) == 0)
6180 int temp_options = re->options; /* May get changed during these scans */
6181 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6182 re->options |= PCRE_ANCHORED;
6183 else
6185 if (firstbyte < 0)
6186 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6187 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6189 int ch = firstbyte & 255;
6190 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6191 cd->fcc[ch] == ch)? ch : firstbyte;
6192 re->flags |= PCRE_FIRSTSET;
6194 else if (is_startline(codestart, 0, cd->backref_map))
6195 re->flags |= PCRE_STARTLINE;
6199 /* For an anchored pattern, we use the "required byte" only if it follows a
6200 variable length item in the regex. Remove the caseless flag for non-caseable
6201 bytes. */
6203 if (reqbyte >= 0 &&
6204 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6206 int ch = reqbyte & 255;
6207 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6208 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6209 re->flags |= PCRE_REQCHSET;
6212 /* Print out the compiled data if debugging is enabled. This is never the
6213 case when building a production library. */
6215 #ifdef DEBUG
6217 printf("Length = %d top_bracket = %d top_backref = %d\n",
6218 length, re->top_bracket, re->top_backref);
6220 printf("Options=%08x\n", re->options);
6222 if ((re->flags & PCRE_FIRSTSET) != 0)
6224 int ch = re->first_byte & 255;
6225 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6226 "" : " (caseless)";
6227 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6228 else printf("First char = \\x%02x%s\n", ch, caseless);
6231 if ((re->flags & PCRE_REQCHSET) != 0)
6233 int ch = re->req_byte & 255;
6234 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6235 "" : " (caseless)";
6236 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6237 else printf("Req char = \\x%02x%s\n", ch, caseless);
6240 pcre_printint(re, stdout, TRUE);
6242 /* This check is done here in the debugging case so that the code that
6243 was compiled can be seen. */
6245 if (code - codestart > length)
6247 (pcre_free)(re);
6248 *errorptr = find_error_text(ERR23);
6249 *erroroffset = ptr - (uschar *)pattern;
6250 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6251 return NULL;
6253 #endif /* DEBUG */
6255 return (pcre *)re;
6258 /* End of pcre_compile.c */