glib/pcre/pcre_compile.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language.
   7
   8                        Written by Philip Hazel
   9            Copyright (c) 1997-2008 University of Cambridge
  10
  11 -----------------------------------------------------------------------------
  12 Redistribution and use in source and binary forms, with or without
  13 modification, are permitted provided that the following conditions are met:
  14
  15     * Redistributions of source code must retain the above copyright notice,
  16       this list of conditions and the following disclaimer.
  17
  18     * Redistributions in binary form must reproduce the above copyright
  19       notice, this list of conditions and the following disclaimer in the
  20       documentation and/or other materials provided with the distribution.
  21
  22     * Neither the name of the University of Cambridge nor the names of its
  23       contributors may be used to endorse or promote products derived from
  24       this software without specific prior written permission.
  25
  26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36 POSSIBILITY OF SUCH DAMAGE.
  37 -----------------------------------------------------------------------------
  38 */
  39
  40
  41 /* This module contains the external function pcre_compile(), along with
  42 supporting internal functions that are not used by other modules. */
  43
  44
  45 #ifdef HAVE_CONFIG_H
  46 #include "config.h"
  47 #endif
  48
  49 #define NLBLOCK cd             /* Block containing newline information */
  50 #define PSSTART start_pattern  /* Field containing processed string start */
  51 #define PSEND   end_pattern    /* Field containing processed string end */
  52
  53 #include "pcre_internal.h"
  54
  55
  56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
  57 used by pcretest. DEBUG is not defined when building a production library. */
  58
  59 #ifdef DEBUG
  60 #include "pcre_printint.src"
  61 #endif
  62
  63
  64 /* Macro for setting individual bits in class bitmaps. */
  65
  66 #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
  67
  68 /* Maximum length value to check against when making sure that the integer that
  69 holds the compiled pattern length does not overflow. We make it a bit less than
  70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
  71 to check them every time. */
  72
  73 #define OFLOW_MAX (INT_MAX - 20)
  74
  75
  76 /*************************************************
  77 *      Code parameters and static tables         *
  78 *************************************************/
  79
  80 /* This value specifies the size of stack workspace that is used during the
  81 first pre-compile phase that determines how much memory is required. The regex
  82 is partly compiled into this space, but the compiled parts are discarded as
  83 soon as they can be, so that hopefully there will never be an overrun. The code
  84 does, however, check for an overrun. The largest amount I've seen used is 218,
  85 so this number is very generous.
  86
  87 The same workspace is used during the second, actual compile phase for
  88 remembering forward references to groups so that they can be filled in at the
  89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
  90 is 4 there is plenty of room. */
  91
  92 #define COMPILE_WORK_SIZE (4096)
  93
  94
  95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
  96 are simple data values; negative values are for special things like \d and so
  97 on. Zero means further processing is needed (for things like \x), or the escape
  98 is invalid. */
  99
 100 #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
 101 static const short int escapes[] = {
 102      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
 103      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
 104    '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
 105 -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
 106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
 107 -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
 108    '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
 109 -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
 110 -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
 111      0,      0, -ESC_z                                            /* x - z */
 112 };
 113
 114 #else           /* This is the "abnormal" table for EBCDIC systems */
 115 static const short int escapes[] = {
 116 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
 117 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
 118 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
 119 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
 120 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
 121 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
 122 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
 123 /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
 124 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
 125 /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
 126 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
 127 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
 128 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
 129 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
 130 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
 131 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
 132 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
 133 /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
 134 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
 135 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
 136 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
 137 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
 138 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
 139 };
 140 #endif
 141
 142
 143 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
 144 searched linearly. Put all the names into a single string, in order to reduce
 145 the number of relocations when a shared library is dynamically linked. */
 146
 147 typedef struct verbitem {
 148   int   len;
 149   int   op;
 150 } verbitem;
 151
 152 static const char verbnames[] =
 153   "ACCEPT\0"
 154   "COMMIT\0"
 155   "F\0"
 156   "FAIL\0"
 157   "PRUNE\0"
 158   "SKIP\0"
 159   "THEN";
 160
 161 static const verbitem verbs[] = {
 162   { 6, OP_ACCEPT },
 163   { 6, OP_COMMIT },
 164   { 1, OP_FAIL },
 165   { 4, OP_FAIL },
 166   { 5, OP_PRUNE },
 167   { 4, OP_SKIP  },
 168   { 4, OP_THEN  }
 169 };
 170
 171 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
 172
 173
 174 /* Tables of names of POSIX character classes and their lengths. The names are
 175 now all in a single string, to reduce the number of relocations when a shared
 176 library is dynamically loaded. The list of lengths is terminated by a zero
 177 length entry. The first three must be alpha, lower, upper, as this is assumed
 178 for handling case independence. */
 179
 180 static const char posix_names[] =
 181   "alpha\0"  "lower\0"  "upper\0"  "alnum\0"  "ascii\0"  "blank\0"
 182   "cntrl\0"  "digit\0"  "graph\0"  "print\0"  "punct\0"  "space\0"
 183   "word\0"   "xdigit";
 184
 185 static const uschar posix_name_lengths[] = {
 186   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
 187
 188 /* Table of class bit maps for each POSIX class. Each class is formed from a
 189 base map, with an optional addition or removal of another map. Then, for some
 190 classes, there is some additional tweaking: for [:blank:] the vertical space
 191 characters are removed, and for [:alpha:] and [:alnum:] the underscore
 192 character is removed. The triples in the table consist of the base map offset,
 193 second map offset or -1 if no second map, and a non-negative value for map
 194 addition or a negative value for map subtraction (if there are two maps). The
 195 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
 196 remove vertical space characters, 2 => remove underscore. */
 197
 198 static const int posix_class_maps[] = {
 199   cbit_word,  cbit_digit, -2,             /* alpha */
 200   cbit_lower, -1,          0,             /* lower */
 201   cbit_upper, -1,          0,             /* upper */
 202   cbit_word,  -1,          2,             /* alnum - word without underscore */
 203   cbit_print, cbit_cntrl,  0,             /* ascii */
 204   cbit_space, -1,          1,             /* blank - a GNU extension */
 205   cbit_cntrl, -1,          0,             /* cntrl */
 206   cbit_digit, -1,          0,             /* digit */
 207   cbit_graph, -1,          0,             /* graph */
 208   cbit_print, -1,          0,             /* print */
 209   cbit_punct, -1,          0,             /* punct */
 210   cbit_space, -1,          0,             /* space */
 211   cbit_word,  -1,          0,             /* word - a Perl extension */
 212   cbit_xdigit,-1,          0              /* xdigit */
 213 };
 214
 215
 216 #define STRING(a)  # a
 217 #define XSTRING(s) STRING(s)
 218
 219 /* The texts of compile-time error messages. These are "char *" because they
 220 are passed to the outside world. Do not ever re-use any error number, because
 221 they are documented. Always add a new error instead. Messages marked DEAD below
 222 are no longer used. This used to be a table of strings, but in order to reduce
 223 the number of relocations needed when a shared library is loaded dynamically,
 224 it is now one long string. We cannot use a table of offsets, because the
 225 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
 226 simply count through to the one we want - this isn't a performance issue
 227 because these strings are used only when there is a compilation error. */
 228
 229 static const char error_texts[] =
 230   "no error\0"
 231   "\\ at end of pattern\0"
 232   "\\c at end of pattern\0"
 233   "unrecognized character follows \\\0"
 234   "numbers out of order in {} quantifier\0"
 235   /* 5 */
 236   "number too big in {} quantifier\0"
 237   "missing terminating ] for character class\0"
 238   "invalid escape sequence in character class\0"
 239   "range out of order in character class\0"
 240   "nothing to repeat\0"
 241   /* 10 */
 242   "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
 243   "internal error: unexpected repeat\0"
 244   "unrecognized character after (? or (?-\0"
 245   "POSIX named classes are supported only within a class\0"
 246   "missing )\0"
 247   /* 15 */
 248   "reference to non-existent subpattern\0"
 249   "erroffset passed as NULL\0"
 250   "unknown option bit(s) set\0"
 251   "missing ) after comment\0"
 252   "parentheses nested too deeply\0"  /** DEAD **/
 253   /* 20 */
 254   "regular expression is too large\0"
 255   "failed to get memory\0"
 256   "unmatched parentheses\0"
 257   "internal error: code overflow\0"
 258   "unrecognized character after (?<\0"
 259   /* 25 */
 260   "lookbehind assertion is not fixed length\0"
 261   "malformed number or name after (?(\0"
 262   "conditional group contains more than two branches\0"
 263   "assertion expected after (?(\0"
 264   "(?R or (?[+-]digits must be followed by )\0"
 265   /* 30 */
 266   "unknown POSIX class name\0"
 267   "POSIX collating elements are not supported\0"
 268   "this version of PCRE is not compiled with PCRE_UTF8 support\0"
 269   "spare error\0"  /** DEAD **/
 270   "character value in \\x{...} sequence is too large\0"
 271   /* 35 */
 272   "invalid condition (?(0)\0"
 273   "\\C not allowed in lookbehind assertion\0"
 274   "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
 275   "number after (?C is > 255\0"
 276   "closing ) for (?C expected\0"
 277   /* 40 */
 278   "recursive call could loop indefinitely\0"
 279   "unrecognized character after (?P\0"
 280   "syntax error in subpattern name (missing terminator)\0"
 281   "two named subpatterns have the same name\0"
 282   "invalid UTF-8 string\0"
 283   /* 45 */
 284   "support for \\P, \\p, and \\X has not been compiled\0"
 285   "malformed \\P or \\p sequence\0"
 286   "unknown property name after \\P or \\p\0"
 287   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
 288   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
 289   /* 50 */
 290   "repeated subpattern is too long\0"    /** DEAD **/
 291   "octal value is greater than \\377 (not in UTF-8 mode)\0"
 292   "internal error: overran compiling workspace\0"
 293   "internal error: previously-checked referenced subpattern not found\0"
 294   "DEFINE group contains more than one branch\0"
 295   /* 55 */
 296   "repeating a DEFINE group is not allowed\0"
 297   "inconsistent NEWLINE options\0"
 298   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
 299   "a numbered reference must not be zero\0"
 300   "(*VERB) with an argument is not supported\0"
 301   /* 60 */
 302   "(*VERB) not recognized\0"
 303   "number is too big\0"
 304   "subpattern name expected\0"
 305   "digit expected after (?+\0"
 306   "] is an invalid data character in JavaScript compatibility mode";
 307
 308
/* Forward declaration to allow mutual recursion */
 310
 311 static BOOL
 312   compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
 313     int *, int *, branch_chain *, compile_data *, int *);
 314
 315
 316
 317 /*************************************************
 318 *            Find an error text                  *
 319 *************************************************/
 320
 321 /* The error texts are now all in one long string, to save on relocations. As
 322 some of the text is of unknown length, we can't use a table of offsets.
 323 Instead, just count through the strings. This is not a performance issue
 324 because it happens only when there has been a compilation error.
 325
 326 Argument:   the error number
 327 Returns:    pointer to the error string
 328 */
 329
 330 static const char *
 331 find_error_text(int n)
 332 {
 333 const char *s = error_texts;
 334 for (; n > 0; n--) while (*s++ != 0) {};
 335 return s;
 336 }
 337
 338
 339 /*************************************************
 340 *            Handle escapes                      *
 341 *************************************************/
 342
 343 /* This function is called when a \ has been encountered. It either returns a
 344 positive value for a simple escape such as \n, or a negative value which
 345 encodes one of the more complicated things such as \d. A backreference to group
 346 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
 347 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
 348 ptr is pointing at the \. On exit, it is on the final character of the escape
 349 sequence.
 350
 351 Arguments:
 352   ptrptr         points to the pattern position pointer
 353   errorcodeptr   points to the errorcode variable
 354   bracount       number of previous extracting brackets
 355   options        the options bits
 356   isclass        TRUE if inside a character class
 357
 358 Returns:         zero or positive => a data character
 359                  negative => a special escape sequence
 360                  on error, errorcodeptr is set
 361 */
 362
 363 static int
 364 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
 365   int options, BOOL isclass)
 366 {
 367 BOOL utf8 = (options & PCRE_UTF8) != 0;
 368 const uschar *ptr = *ptrptr + 1;
 369 int c, i;
 370
 371 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
 372 ptr--;                            /* Set pointer back to the last byte */
 373
 374 /* If backslash is at the end of the pattern, it's an error. */
 375
 376 if (c == 0) *errorcodeptr = ERR1;
 377
 378 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
 379 in a table. A non-zero result is something that can be returned immediately.
 380 Otherwise further processing may be required. */
 381
 382 #ifndef EBCDIC  /* ASCII coding */
 383 else if (c < '0' || c > 'z') {}                           /* Not alphanumeric */
 384 else if ((i = escapes[c - '0']) != 0) c = i;
 385
 386 #else           /* EBCDIC coding */
 387 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
 388 else if ((i = escapes[c - 0x48]) != 0)  c = i;
 389 #endif
 390
 391 /* Escapes that need further processing, or are illegal. */
 392
 393 else
 394   {
 395   const uschar *oldptr;
 396   BOOL braced, negated;
 397
 398   switch (c)
 399     {
 400     /* A number of Perl escapes are not handled by PCRE. We give an explicit
 401     error. */
 402
 403     case 'l':
 404     case 'L':
 405     case 'N':
 406     case 'u':
 407     case 'U':
 408     *errorcodeptr = ERR37;
 409     break;
 410
 411     /* \g must be followed by one of a number of specific things:
 412
 413     (1) A number, either plain or braced. If positive, it is an absolute
 414     backreference. If negative, it is a relative backreference. This is a Perl
 415     5.10 feature.
 416
 417     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
 418     is part of Perl's movement towards a unified syntax for back references. As
 419     this is synonymous with \k{name}, we fudge it up by pretending it really
 420     was \k.
 421
 422     (3) For Oniguruma compatibility we also support \g followed by a name or a
 423     number either in angle brackets or in single quotes. However, these are
 424     (possibly recursive) subroutine calls, _not_ backreferences. Just return
 425     the -ESC_g code (cf \k). */
 426
 427     case 'g':
 428     if (ptr[1] == '<' || ptr[1] == '\'')
 429       {
 430       c = -ESC_g;
 431       break;
 432       }
 433
 434     /* Handle the Perl-compatible cases */
 435
 436     if (ptr[1] == '{')
 437       {
 438       const uschar *p;
 439       for (p = ptr+2; *p != 0 && *p != '}'; p++)
 440         if (*p != '-' && g_ascii_isdigit (*p) == 0) break;
 441       if (*p != 0 && *p != '}')
 442         {
 443         c = -ESC_k;
 444         break;
 445         }
 446       braced = TRUE;
 447       ptr++;
 448       }
 449     else braced = FALSE;
 450
 451     if (ptr[1] == '-')
 452       {
 453       negated = TRUE;
 454       ptr++;
 455       }
 456     else negated = FALSE;
 457
 458     c = 0;
 459     while (g_ascii_isdigit (ptr[1]) != 0)
 460       c = c * 10 + *(++ptr) - '0';
 461
 462     if (c < 0)   /* Integer overflow */
 463       {
 464       *errorcodeptr = ERR61;
 465       break;
 466       }
 467
 468     if (braced && *(++ptr) != '}')
 469       {
 470       *errorcodeptr = ERR57;
 471       break;
 472       }
 473
 474     if (c == 0)
 475       {
 476       *errorcodeptr = ERR58;
 477       break;
 478       }
 479
 480     if (negated)
 481       {
 482       if (c > bracount)
 483         {
 484         *errorcodeptr = ERR15;
 485         break;
 486         }
 487       c = bracount - (c - 1);
 488       }
 489
 490     c = -(ESC_REF + c);
 491     break;
 492
 493     /* The handling of escape sequences consisting of a string of digits
 494     starting with one that is not zero is not straightforward. By experiment,
 495     the way Perl works seems to be as follows:
 496
 497     Outside a character class, the digits are read as a decimal number. If the
 498     number is less than 10, or if there are that many previous extracting
 499     left brackets, then it is a back reference. Otherwise, up to three octal
 500     digits are read to form an escaped byte. Thus \123 is likely to be octal
 501     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
 502     value is greater than 377, the least significant 8 bits are taken. Inside a
 503     character class, \ followed by a digit is always an octal number. */
 504
 505     case '1': case '2': case '3': case '4': case '5':
 506     case '6': case '7': case '8': case '9':
 507
 508     if (!isclass)
 509       {
 510       oldptr = ptr;
 511       c -= '0';
 512       while (g_ascii_isdigit (ptr[1]))
 513         c = c * 10 + *(++ptr) - '0';
 514       if (c < 0)    /* Integer overflow */
 515         {
 516         *errorcodeptr = ERR61;
 517         break;
 518         }
 519       if (c < 10 || c <= bracount)
 520         {
 521         c = -(ESC_REF + c);
 522         break;
 523         }
 524       ptr = oldptr;      /* Put the pointer back and fall through */
 525       }
 526
 527     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
 528     generates a binary zero byte and treats the digit as a following literal.
 529     Thus we have to pull back the pointer by one. */
 530
 531     if ((c = *ptr) >= '8')
 532       {
 533       ptr--;
 534       c = 0;
 535       break;
 536       }
 537
 538     /* \0 always starts an octal number, but we may drop through to here with a
 539     larger first octal digit. The original code used just to take the least
 540     significant 8 bits of octal numbers (I think this is what early Perls used
 541     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
 542     than 3 octal digits. */
 543
 544     case '0':
 545     c -= '0';
 546     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
 547         c = c * 8 + *(++ptr) - '0';
 548     if (!utf8 && c > 255) *errorcodeptr = ERR51;
 549     break;
 550
 551     /* \x is complicated. \x{ddd} is a character number which can be greater
 552     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
 553     treated as a data character. */
 554
 555     case 'x':
 556     if (ptr[1] == '{')
 557       {
 558       const uschar *pt = ptr + 2;
 559       int count = 0;
 560
 561       c = 0;
 562       while (g_ascii_isxdigit (*pt) != 0)
 563         {
 564         register int cc = *pt++;
 565         if (c == 0 && cc == '0') continue;     /* Leading zeroes */
 566         count++;
 567
 568 #ifndef EBCDIC  /* ASCII coding */
 569         if (cc >= 'a') cc -= 32;               /* Convert to upper case */
 570         c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
 571 #else           /* EBCDIC coding */
 572         if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
 573         c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
 574 #endif
 575         }
 576
 577       if (*pt == '}')
 578         {
 579         if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
 580         ptr = pt;
 581         break;
 582         }
 583
 584       /* If the sequence of hex digits does not end with '}', then we don't
 585       recognize this construct; fall through to the normal \x handling. */
 586       }
 587
 588     /* Read just a single-byte hex-defined char */
 589
 590     c = 0;
 591     while (i++ < 2 && g_ascii_isxdigit (ptr[1]) != 0)
 592       {
 593       int cc;                               /* Some compilers don't like ++ */
 594       cc = *(++ptr);                        /* in initializers */
 595 #ifndef EBCDIC  /* ASCII coding */
 596       if (cc >= 'a') cc -= 32;              /* Convert to upper case */
 597       c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
 598 #else           /* EBCDIC coding */
 599       if (cc <= 'z') cc += 64;              /* Convert to upper case */
 600       c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
 601 #endif
 602       }
 603     break;
 604
 605     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
 606     This coding is ASCII-specific, but then the whole concept of \cx is
 607     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
 608
 609     case 'c':
 610     c = *(++ptr);
 611     if (c == 0)
 612       {
 613       *errorcodeptr = ERR2;
 614       break;
 615       }
 616
 617 #ifndef EBCDIC  /* ASCII coding */
 618     if (c >= 'a' && c <= 'z') c -= 32;
 619     c ^= 0x40;
 620 #else           /* EBCDIC coding */
 621     if (c >= 'a' && c <= 'z') c += 64;
 622     c ^= 0xC0;
 623 #endif
 624     break;
 625
 626     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
 627     other alphanumeric following \ is an error if PCRE_EXTRA was set;
 628     otherwise, for Perl compatibility, it is a literal. This code looks a bit
 629     odd, but there used to be some cases other than the default, and there may
 630     be again in future, so I haven't "optimized" it. */
 631
 632     default:
 633     if ((options & PCRE_EXTRA) != 0) switch(c)
 634       {
 635       default:
 636       *errorcodeptr = ERR3;
 637       break;
 638       }
 639     break;
 640     }
 641   }
 642
 643 *ptrptr = ptr;
 644 return c;
 645 }
 646
 647
 648
 649 #ifdef SUPPORT_UCP
 650 /*************************************************
 651 *               Handle \P and \p                 *
 652 *************************************************/
 653
 654 /* This function is called after \P or \p has been encountered, provided that
 655 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
 656 pointing at the P or p. On exit, it is pointing at the final character of the
 657 escape sequence.
 658
 659 Argument:
 660   ptrptr         points to the pattern position pointer
 661   negptr         points to a boolean that is set TRUE for negation else FALSE
 662   dptr           points to an int that is set to the detailed property value
 663   errorcodeptr   points to the error code variable
 664
 665 Returns:         type value from ucp_type_table, or -1 for an invalid type
 666 */
 667
 668 static int
 669 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
 670 {
 671 int c, i, bot, top;
 672 const uschar *ptr = *ptrptr;
 673 char name[32];
 674
 675 c = *(++ptr);
 676 if (c == 0) goto ERROR_RETURN;
 677
 678 *negptr = FALSE;
 679
 680 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
 681 negation. */
 682
 683 if (c == '{')
 684   {
 685   if (ptr[1] == '^')
 686     {
 687     *negptr = TRUE;
 688     ptr++;
 689     }
 690   for (i = 0; i < (int)sizeof(name) - 1; i++)
 691     {
 692     c = *(++ptr);
 693     if (c == 0) goto ERROR_RETURN;
 694     if (c == '}') break;
 695     name[i] = c;
 696     }
 697   if (c !='}') goto ERROR_RETURN;
 698   name[i] = 0;
 699   }
 700
 701 /* Otherwise there is just one following character */
 702
 703 else
 704   {
 705   name[0] = c;
 706   name[1] = 0;
 707   }
 708
 709 *ptrptr = ptr;
 710
 711 /* Search for a recognized property name using binary chop */
 712
 713 bot = 0;
 714 top = _pcre_utt_size;
 715
 716 while (bot < top)
 717   {
 718   i = (bot + top) >> 1;
 719   c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
 720   if (c == 0)
 721     {
 722     *dptr = _pcre_utt[i].value;
 723     return _pcre_utt[i].type;
 724     }
 725   if (c > 0) bot = i + 1; else top = i;
 726   }
 727
 728 *errorcodeptr = ERR47;
 729 *ptrptr = ptr;
 730 return -1;
 731
 732 ERROR_RETURN:
 733 *errorcodeptr = ERR46;
 734 *ptrptr = ptr;
 735 return -1;
 736 }
 737 #endif
 738
 739
 740
 741
 742 /*************************************************
 743 *            Check for counted repeat            *
 744 *************************************************/
 745
 746 /* This function is called when a '{' is encountered in a place where it might
 747 start a quantifier. It looks ahead to see if it really is a quantifier or not.
 748 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
 749 where the ddds are digits.
 750
 751 Arguments:
 752   p         pointer to the first char after '{'
 753
 754 Returns:    TRUE or FALSE
 755 */
 756
 757 static BOOL
 758 is_counted_repeat(const uschar *p)
 759 {
 760 if (g_ascii_isdigit (*p++) == 0) return FALSE;
 761 while (g_ascii_isdigit (*p) != 0) p++;
 762 if (*p == '}') return TRUE;
 763
 764 if (*p++ != ',') return FALSE;
 765 if (*p == '}') return TRUE;
 766
 767 if (g_ascii_isdigit (*p++) == 0) return FALSE;
 768 while (g_ascii_isdigit (*p) != 0) p++;
 769
 770 return (*p == '}');
 771 }
 772
 773
 774
 775 /*************************************************
 776 *         Read repeat counts                     *
 777 *************************************************/
 778
 779 /* Read an item of the form {n,m} and return the values. This is called only
 780 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
 781 so the syntax is guaranteed to be correct, but we need to check the values.
 782
 783 Arguments:
 784   p              pointer to first char after '{'
 785   minp           pointer to int for min
 786   maxp           pointer to int for max
 787                  returned as -1 if no max
 788   errorcodeptr   points to error code variable
 789
 790 Returns:         pointer to '}' on success;
 791                  current ptr on error, with errorcodeptr set non-zero
 792 */
 793
 794 static const uschar *
 795 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
 796 {
 797 int min = 0;
 798 int max = -1;
 799
 800 /* Read the minimum value and do a paranoid check: a negative value indicates
 801 an integer overflow. */
 802
 803 while (g_ascii_isdigit (*p) != 0) min = min * 10 + *p++ - '0';
 804 if (min < 0 || min > 65535)
 805   {
 806   *errorcodeptr = ERR5;
 807   return p;
 808   }
 809
 810 /* Read the maximum value if there is one, and again do a paranoid on its size.
 811 Also, max must not be less than min. */
 812
 813 if (*p == '}') max = min; else
 814   {
 815   if (*(++p) != '}')
 816     {
 817     max = 0;
 818     while(g_ascii_isdigit (*p) != 0) max = max * 10 + *p++ - '0';
 819     if (max < 0 || max > 65535)
 820       {
 821       *errorcodeptr = ERR5;
 822       return p;
 823       }
 824     if (max < min)
 825       {
 826       *errorcodeptr = ERR4;
 827       return p;
 828       }
 829     }
 830   }
 831
 832 /* Fill in the required variables, and pass back the pointer to the terminating
 833 '}'. */
 834
 835 *minp = min;
 836 *maxp = max;
 837 return p;
 838 }
 839
 840
 841
 842 /*************************************************
 843 *       Find forward referenced subpattern       *
 844 *************************************************/
 845
 846 /* This function scans along a pattern's text looking for capturing
 847 subpatterns, and counting them. If it finds a named pattern that matches the
 848 name it is given, it returns its number. Alternatively, if the name is NULL, it
 849 returns when it reaches a given numbered subpattern. This is used for forward
 850 references to subpatterns. We know that if (?P< is encountered, the name will
 851 be terminated by '>' because that is checked in the first pass.
 852
 853 Arguments:
 854   ptr          current position in the pattern
 855   cd           compile background data
 856   name         name to seek, or NULL if seeking a numbered subpattern
 857   lorn         name length, or subpattern number if name is NULL
 858   xmode        TRUE if we are in /x mode
 859
 860 Returns:       the number of the named subpattern, or -1 if not found
 861 */
 862
 863 static int
 864 find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,
 865   BOOL xmode)
 866 {
 867 const uschar *thisname;
 868 int count = cd->bracount;
 869
 870 for (; *ptr != 0; ptr++)
 871   {
 872   int term;
 873
 874   /* Skip over backslashed characters and also entire \Q...\E */
 875
 876   if (*ptr == '\\')
 877     {
 878     if (*(++ptr) == 0) return -1;
 879     if (*ptr == 'Q') for (;;)
 880       {
 881       while (*(++ptr) != 0 && *ptr != '\\') {};
 882       if (*ptr == 0) return -1;
 883       if (*(++ptr) == 'E') break;
 884       }
 885     continue;
 886     }
 887
 888   /* Skip over character classes; this logic must be similar to the way they
 889   are handled for real. If the first character is '^', skip it. Also, if the
 890   first few characters (either before or after ^) are \Q\E or \E we skip them
 891   too. This makes for compatibility with Perl. */
 892
 893   if (*ptr == '[')
 894     {
 895     BOOL negate_class = FALSE;
 896     for (;;)
 897       {
 898       int c = *(++ptr);
 899       if (c == '\\')
 900         {
 901         if (ptr[1] == 'E') ptr++;
 902           else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
 903             else break;
 904         }
 905       else if (!negate_class && c == '^')
 906         negate_class = TRUE;
 907       else break;
 908       }
 909
 910     /* If the next character is ']', it is a data character that must be
 911     skipped, except in JavaScript compatibility mode. */
 912
 913     if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
 914       ptr++;
 915
 916     while (*(++ptr) != ']')
 917       {
 918       if (*ptr == 0) return -1;
 919       if (*ptr == '\\')
 920         {
 921         if (*(++ptr) == 0) return -1;
 922         if (*ptr == 'Q') for (;;)
 923           {
 924           while (*(++ptr) != 0 && *ptr != '\\') {};
 925           if (*ptr == 0) return -1;
 926           if (*(++ptr) == 'E') break;
 927           }
 928         continue;
 929         }
 930       }
 931     continue;
 932     }
 933
 934   /* Skip comments in /x mode */
 935
 936   if (xmode && *ptr == '#')
 937     {
 938     while (*(++ptr) != 0 && *ptr != '\n') {};
 939     if (*ptr == 0) return -1;
 940     continue;
 941     }
 942
 943   /* An opening parens must now be a real metacharacter */
 944
 945   if (*ptr != '(') continue;
 946   if (ptr[1] != '?' && ptr[1] != '*')
 947     {
 948     count++;
 949     if (name == NULL && count == lorn) return count;
 950     continue;
 951     }
 952
 953   ptr += 2;
 954   if (*ptr == 'P') ptr++;                      /* Allow optional P */
 955
 956   /* We have to disambiguate (?<! and (?<= from (?<name> */
 957
 958   if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
 959        *ptr != '\'')
 960     continue;
 961
 962   count++;
 963
 964   if (name == NULL && count == lorn) return count;
 965   term = *ptr++;
 966   if (term == '<') term = '>';
 967   thisname = ptr;
 968   while (*ptr != term) ptr++;
 969   if (name != NULL && lorn == ptr - thisname &&
 970       strncmp((const char *)name, (const char *)thisname, lorn) == 0)
 971     return count;
 972   }
 973
 974 return -1;
 975 }
 976
 977
 978
 979 /*************************************************
 980 *      Find first significant op code            *
 981 *************************************************/
 982
 983 /* This is called by several functions that scan a compiled expression looking
 984 for a fixed first character, or an anchoring op code etc. It skips over things
 985 that do not influence this. For some calls, a change of option is important.
 986 For some calls, it makes sense to skip negative forward and all backward
 987 assertions, and also the \b assertion; for others it does not.
 988
 989 Arguments:
 990   code         pointer to the start of the group
 991   options      pointer to external options
 992   optbit       the option bit whose changing is significant, or
 993                  zero if none are
 994   skipassert   TRUE if certain assertions are to be skipped
 995
 996 Returns:       pointer to the first significant opcode
 997 */
 998
 999 static const uschar*
1000 first_significant_code(const uschar *code, int *options, int optbit,
1001   BOOL skipassert)
1002 {
1003 for (;;)
1004   {
1005   switch ((int)*code)
1006     {
1007     case OP_OPT:
1008     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1009       *options = (int)code[1];
1010     code += 2;
1011     break;
1012
1013     case OP_ASSERT_NOT:
1014     case OP_ASSERTBACK:
1015     case OP_ASSERTBACK_NOT:
1016     if (!skipassert) return code;
1017     do code += GET(code, 1); while (*code == OP_ALT);
1018     code += _pcre_OP_lengths[*code];
1019     break;
1020
1021     case OP_WORD_BOUNDARY:
1022     case OP_NOT_WORD_BOUNDARY:
1023     if (!skipassert) return code;
1024     /* Fall through */
1025
1026     case OP_CALLOUT:
1027     case OP_CREF:
1028     case OP_RREF:
1029     case OP_DEF:
1030     code += _pcre_OP_lengths[*code];
1031     break;
1032
1033     default:
1034     return code;
1035     }
1036   }
1037 /* Control never reaches here */
1038 }
1039
1040
1041
1042
1043 /*************************************************
1044 *        Find the fixed length of a pattern      *
1045 *************************************************/
1046
1047 /* Scan a pattern and compute the fixed length of subject that will match it,
1048 if the length is fixed. This is needed for dealing with backward assertions.
1049 In UTF8 mode, the result is in characters rather than bytes.
1050
1051 Arguments:
1052   code     points to the start of the pattern (the bracket)
1053   options  the compiling options
1054
1055 Returns:   the fixed length, or -1 if there is no fixed length,
1056              or -2 if \C was encountered
1057 */
1058
1059 static int
1060 find_fixedlength(uschar *code, int options)
1061 {
1062 int length = -1;
1063
1064 register int branchlength = 0;
1065 register uschar *cc = code + 1 + LINK_SIZE;
1066
1067 /* Scan along the opcodes for this branch. If we get to the end of the
1068 branch, check the length against that of the other branches. */
1069
1070 for (;;)
1071   {
1072   int d;
1073   register int op = *cc;
1074   switch (op)
1075     {
1076     case OP_CBRA:
1077     case OP_BRA:
1078     case OP_ONCE:
1079     case OP_COND:
1080     d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1081     if (d < 0) return d;
1082     branchlength += d;
1083     do cc += GET(cc, 1); while (*cc == OP_ALT);
1084     cc += 1 + LINK_SIZE;
1085     break;
1086
1087     /* Reached end of a branch; if it's a ket it is the end of a nested
1088     call. If it's ALT it is an alternation in a nested call. If it is
1089     END it's the end of the outer call. All can be handled by the same code. */
1090
1091     case OP_ALT:
1092     case OP_KET:
1093     case OP_KETRMAX:
1094     case OP_KETRMIN:
1095     case OP_END:
1096     if (length < 0) length = branchlength;
1097       else if (length != branchlength) return -1;
1098     if (*cc != OP_ALT) return length;
1099     cc += 1 + LINK_SIZE;
1100     branchlength = 0;
1101     break;
1102
1103     /* Skip over assertive subpatterns */
1104
1105     case OP_ASSERT:
1106     case OP_ASSERT_NOT:
1107     case OP_ASSERTBACK:
1108     case OP_ASSERTBACK_NOT:
1109     do cc += GET(cc, 1); while (*cc == OP_ALT);
1110     /* Fall through */
1111
1112     /* Skip over things that don't match chars */
1113
1114     case OP_REVERSE:
1115     case OP_CREF:
1116     case OP_RREF:
1117     case OP_DEF:
1118     case OP_OPT:
1119     case OP_CALLOUT:
1120     case OP_SOD:
1121     case OP_SOM:
1122     case OP_EOD:
1123     case OP_EODN:
1124     case OP_CIRC:
1125     case OP_DOLL:
1126     case OP_NOT_WORD_BOUNDARY:
1127     case OP_WORD_BOUNDARY:
1128     cc += _pcre_OP_lengths[*cc];
1129     break;
1130
1131     /* Handle literal characters */
1132
1133     case OP_CHAR:
1134     case OP_CHARNC:
1135     case OP_NOT:
1136     branchlength++;
1137     cc += 2;
1138 #ifdef SUPPORT_UTF8
1139     if ((options & PCRE_UTF8) != 0)
1140       {
1141       while ((*cc & 0xc0) == 0x80) cc++;
1142       }
1143 #endif
1144     break;
1145
1146     /* Handle exact repetitions. The count is already in characters, but we
1147     need to skip over a multibyte character in UTF8 mode.  */
1148
1149     case OP_EXACT:
1150     branchlength += GET2(cc,1);
1151     cc += 4;
1152 #ifdef SUPPORT_UTF8
1153     if ((options & PCRE_UTF8) != 0)
1154       {
1155       while((*cc & 0x80) == 0x80) cc++;
1156       }
1157 #endif
1158     break;
1159
1160     case OP_TYPEEXACT:
1161     branchlength += GET2(cc,1);
1162     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1163     cc += 4;
1164     break;
1165
1166     /* Handle single-char matchers */
1167
1168     case OP_PROP:
1169     case OP_NOTPROP:
1170     cc += 2;
1171     /* Fall through */
1172
1173     case OP_NOT_DIGIT:
1174     case OP_DIGIT:
1175     case OP_NOT_WHITESPACE:
1176     case OP_WHITESPACE:
1177     case OP_NOT_WORDCHAR:
1178     case OP_WORDCHAR:
1179     case OP_ANY:
1180     case OP_ALLANY:
1181     branchlength++;
1182     cc++;
1183     break;
1184
1185     /* The single-byte matcher isn't allowed */
1186
1187     case OP_ANYBYTE:
1188     return -2;
1189
1190     /* Check a class for variable quantification */
1191
1192 #ifdef SUPPORT_UTF8
1193     case OP_XCLASS:
1194     cc += GET(cc, 1) - 33;
1195     /* Fall through */
1196 #endif
1197
1198     case OP_CLASS:
1199     case OP_NCLASS:
1200     cc += 33;
1201
1202     switch (*cc)
1203       {
1204       case OP_CRSTAR:
1205       case OP_CRMINSTAR:
1206       case OP_CRQUERY:
1207       case OP_CRMINQUERY:
1208       return -1;
1209
1210       case OP_CRRANGE:
1211       case OP_CRMINRANGE:
1212       if (GET2(cc,1) != GET2(cc,3)) return -1;
1213       branchlength += GET2(cc,1);
1214       cc += 5;
1215       break;
1216
1217       default:
1218       branchlength++;
1219       }
1220     break;
1221
1222     /* Anything else is variable length */
1223
1224     default:
1225     return -1;
1226     }
1227   }
1228 /* Control never gets here */
1229 }
1230
1231
1232
1233
1234 /*************************************************
1235 *    Scan compiled regex for numbered bracket    *
1236 *************************************************/
1237
1238 /* This little function scans through a compiled pattern until it finds a
1239 capturing bracket with the given number.
1240
1241 Arguments:
1242   code        points to start of expression
1243   utf8        TRUE in UTF-8 mode
1244   number      the required bracket number
1245
1246 Returns:      pointer to the opcode for the bracket, or NULL if not found
1247 */
1248
1249 static const uschar *
1250 find_bracket(const uschar *code, BOOL utf8, int number)
1251 {
1252 for (;;)
1253   {
1254   register int c = *code;
1255   if (c == OP_END) return NULL;
1256
1257   /* XCLASS is used for classes that cannot be represented just by a bit
1258   map. This includes negated single high-valued characters. The length in
1259   the table is zero; the actual length is stored in the compiled code. */
1260
1261   if (c == OP_XCLASS) code += GET(code, 1);
1262
1263   /* Handle capturing bracket */
1264
1265   else if (c == OP_CBRA)
1266     {
1267     int n = GET2(code, 1+LINK_SIZE);
1268     if (n == number) return (uschar *)code;
1269     code += _pcre_OP_lengths[c];
1270     }
1271
1272   /* Otherwise, we can get the item's length from the table, except that for
1273   repeated character types, we have to test for \p and \P, which have an extra
1274   two bytes of parameters. */
1275
1276   else
1277     {
1278     switch(c)
1279       {
1280       case OP_TYPESTAR:
1281       case OP_TYPEMINSTAR:
1282       case OP_TYPEPLUS:
1283       case OP_TYPEMINPLUS:
1284       case OP_TYPEQUERY:
1285       case OP_TYPEMINQUERY:
1286       case OP_TYPEPOSSTAR:
1287       case OP_TYPEPOSPLUS:
1288       case OP_TYPEPOSQUERY:
1289       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1290       break;
1291
1292       case OP_TYPEUPTO:
1293       case OP_TYPEMINUPTO:
1294       case OP_TYPEEXACT:
1295       case OP_TYPEPOSUPTO:
1296       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1297       break;
1298       }
1299
1300     /* Add in the fixed length from the table */
1301
1302     code += _pcre_OP_lengths[c];
1303
1304   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1305   a multi-byte character. The length in the table is a minimum, so we have to
1306   arrange to skip the extra bytes. */
1307
1308 #ifdef SUPPORT_UTF8
1309     if (utf8) switch(c)
1310       {
1311       case OP_CHAR:
1312       case OP_CHARNC:
1313       case OP_EXACT:
1314       case OP_UPTO:
1315       case OP_MINUPTO:
1316       case OP_POSUPTO:
1317       case OP_STAR:
1318       case OP_MINSTAR:
1319       case OP_POSSTAR:
1320       case OP_PLUS:
1321       case OP_MINPLUS:
1322       case OP_POSPLUS:
1323       case OP_QUERY:
1324       case OP_MINQUERY:
1325       case OP_POSQUERY:
1326       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1327       break;
1328       }
1329 #else
1330     (void)(utf8);  /* Keep compiler happy by referencing function argument */
1331 #endif
1332     }
1333   }
1334 }
1335
1336
1337
1338 /*************************************************
1339 *   Scan compiled regex for recursion reference  *
1340 *************************************************/
1341
1342 /* This little function scans through a compiled pattern until it finds an
1343 instance of OP_RECURSE.
1344
1345 Arguments:
1346   code        points to start of expression
1347   utf8        TRUE in UTF-8 mode
1348
1349 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
1350 */
1351
1352 static const uschar *
1353 find_recurse(const uschar *code, BOOL utf8)
1354 {
1355 for (;;)
1356   {
1357   register int c = *code;
1358   if (c == OP_END) return NULL;
1359   if (c == OP_RECURSE) return code;
1360
1361   /* XCLASS is used for classes that cannot be represented just by a bit
1362   map. This includes negated single high-valued characters. The length in
1363   the table is zero; the actual length is stored in the compiled code. */
1364
1365   if (c == OP_XCLASS) code += GET(code, 1);
1366
1367   /* Otherwise, we can get the item's length from the table, except that for
1368   repeated character types, we have to test for \p and \P, which have an extra
1369   two bytes of parameters. */
1370
1371   else
1372     {
1373     switch(c)
1374       {
1375       case OP_TYPESTAR:
1376       case OP_TYPEMINSTAR:
1377       case OP_TYPEPLUS:
1378       case OP_TYPEMINPLUS:
1379       case OP_TYPEQUERY:
1380       case OP_TYPEMINQUERY:
1381       case OP_TYPEPOSSTAR:
1382       case OP_TYPEPOSPLUS:
1383       case OP_TYPEPOSQUERY:
1384       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1385       break;
1386
1387       case OP_TYPEPOSUPTO:
1388       case OP_TYPEUPTO:
1389       case OP_TYPEMINUPTO:
1390       case OP_TYPEEXACT:
1391       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1392       break;
1393       }
1394
1395     /* Add in the fixed length from the table */
1396
1397     code += _pcre_OP_lengths[c];
1398
1399     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1400     by a multi-byte character. The length in the table is a minimum, so we have
1401     to arrange to skip the extra bytes. */
1402
1403 #ifdef SUPPORT_UTF8
1404     if (utf8) switch(c)
1405       {
1406       case OP_CHAR:
1407       case OP_CHARNC:
1408       case OP_EXACT:
1409       case OP_UPTO:
1410       case OP_MINUPTO:
1411       case OP_POSUPTO:
1412       case OP_STAR:
1413       case OP_MINSTAR:
1414       case OP_POSSTAR:
1415       case OP_PLUS:
1416       case OP_MINPLUS:
1417       case OP_POSPLUS:
1418       case OP_QUERY:
1419       case OP_MINQUERY:
1420       case OP_POSQUERY:
1421       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1422       break;
1423       }
1424 #else
1425     (void)(utf8);  /* Keep compiler happy by referencing function argument */
1426 #endif
1427     }
1428   }
1429 }
1430
1431
1432
1433 /*************************************************
1434 *    Scan compiled branch for non-emptiness      *
1435 *************************************************/
1436
1437 /* This function scans through a branch of a compiled pattern to see whether it
1438 can match the empty string or not. It is called from could_be_empty()
1439 below and from compile_branch() when checking for an unlimited repeat of a
1440 group that can match nothing. Note that first_significant_code() skips over
1441 backward and negative forward assertions when its final argument is TRUE. If we
1442 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1443 bracket whose current branch will already have been scanned.
1444
1445 Arguments:
1446   code        points to start of search
1447   endcode     points to where to stop
1448   utf8        TRUE if in UTF8 mode
1449
1450 Returns:      TRUE if what is matched could be empty
1451 */
1452
1453 static BOOL
1454 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1455 {
1456 register int c;
1457 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1458      code < endcode;
1459      code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1460   {
1461   const uschar *ccode;
1462
1463   c = *code;
1464
1465   /* Skip over forward assertions; the other assertions are skipped by
1466   first_significant_code() with a TRUE final argument. */
1467
1468   if (c == OP_ASSERT)
1469     {
1470     do code += GET(code, 1); while (*code == OP_ALT);
1471     c = *code;
1472     continue;
1473     }
1474
1475   /* Groups with zero repeats can of course be empty; skip them. */
1476
1477   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1478     {
1479     code += _pcre_OP_lengths[c];
1480     do code += GET(code, 1); while (*code == OP_ALT);
1481     c = *code;
1482     continue;
1483     }
1484
1485   /* For other groups, scan the branches. */
1486
1487   if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1488     {
1489     BOOL empty_branch;
1490     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1491
1492     /* Scan a closed bracket */
1493
1494     empty_branch = FALSE;
1495     do
1496       {
1497       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1498         empty_branch = TRUE;
1499       code += GET(code, 1);
1500       }
1501     while (*code == OP_ALT);
1502     if (!empty_branch) return FALSE;   /* All branches are non-empty */
1503     c = *code;
1504     continue;
1505     }
1506
1507   /* Handle the other opcodes */
1508
1509   switch (c)
1510     {
1511     /* Check for quantifiers after a class. XCLASS is used for classes that
1512     cannot be represented just by a bit map. This includes negated single
1513     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1514     actual length is stored in the compiled code, so we must update "code"
1515     here. */
1516
1517 #ifdef SUPPORT_UTF8
1518     case OP_XCLASS:
1519     ccode = code += GET(code, 1);
1520     goto CHECK_CLASS_REPEAT;
1521 #endif
1522
1523     case OP_CLASS:
1524     case OP_NCLASS:
1525     ccode = code + 33;
1526
1527 #ifdef SUPPORT_UTF8
1528     CHECK_CLASS_REPEAT:
1529 #endif
1530
1531     switch (*ccode)
1532       {
1533       case OP_CRSTAR:            /* These could be empty; continue */
1534       case OP_CRMINSTAR:
1535       case OP_CRQUERY:
1536       case OP_CRMINQUERY:
1537       break;
1538
1539       default:                   /* Non-repeat => class must match */
1540       case OP_CRPLUS:            /* These repeats aren't empty */
1541       case OP_CRMINPLUS:
1542       return FALSE;
1543
1544       case OP_CRRANGE:
1545       case OP_CRMINRANGE:
1546       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1547       break;
1548       }
1549     break;
1550
1551     /* Opcodes that must match a character */
1552
1553     case OP_PROP:
1554     case OP_NOTPROP:
1555     case OP_EXTUNI:
1556     case OP_NOT_DIGIT:
1557     case OP_DIGIT:
1558     case OP_NOT_WHITESPACE:
1559     case OP_WHITESPACE:
1560     case OP_NOT_WORDCHAR:
1561     case OP_WORDCHAR:
1562     case OP_ANY:
1563     case OP_ALLANY:
1564     case OP_ANYBYTE:
1565     case OP_CHAR:
1566     case OP_CHARNC:
1567     case OP_NOT:
1568     case OP_PLUS:
1569     case OP_MINPLUS:
1570     case OP_POSPLUS:
1571     case OP_EXACT:
1572     case OP_NOTPLUS:
1573     case OP_NOTMINPLUS:
1574     case OP_NOTPOSPLUS:
1575     case OP_NOTEXACT:
1576     case OP_TYPEPLUS:
1577     case OP_TYPEMINPLUS:
1578     case OP_TYPEPOSPLUS:
1579     case OP_TYPEEXACT:
1580     return FALSE;
1581
1582     /* These are going to continue, as they may be empty, but we have to
1583     fudge the length for the \p and \P cases. */
1584
1585     case OP_TYPESTAR:
1586     case OP_TYPEMINSTAR:
1587     case OP_TYPEPOSSTAR:
1588     case OP_TYPEQUERY:
1589     case OP_TYPEMINQUERY:
1590     case OP_TYPEPOSQUERY:
1591     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1592     break;
1593
1594     /* Same for these */
1595
1596     case OP_TYPEUPTO:
1597     case OP_TYPEMINUPTO:
1598     case OP_TYPEPOSUPTO:
1599     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1600     break;
1601
1602     /* End of branch */
1603
1604     case OP_KET:
1605     case OP_KETRMAX:
1606     case OP_KETRMIN:
1607     case OP_ALT:
1608     return TRUE;
1609
1610     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1611     MINUPTO, and POSUPTO may be followed by a multibyte character */
1612
1613 #ifdef SUPPORT_UTF8
1614     case OP_STAR:
1615     case OP_MINSTAR:
1616     case OP_POSSTAR:
1617     case OP_QUERY:
1618     case OP_MINQUERY:
1619     case OP_POSQUERY:
1620     case OP_UPTO:
1621     case OP_MINUPTO:
1622     case OP_POSUPTO:
1623     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1624     break;
1625 #endif
1626     }
1627   }
1628
1629 return TRUE;
1630 }
1631
1632
1633
1634 /*************************************************
1635 *    Scan compiled regex for non-emptiness       *
1636 *************************************************/
1637
1638 /* This function is called to check for left recursive calls. We want to check
1639 the current branch of the current pattern to see if it could match the empty
1640 string. If it could, we must look outwards for branches at other levels,
1641 stopping when we pass beyond the bracket which is the subject of the recursion.
1642
1643 Arguments:
1644   code        points to start of the recursion
1645   endcode     points to where to stop (current RECURSE item)
1646   bcptr       points to the chain of current (unclosed) branch starts
1647   utf8        TRUE if in UTF-8 mode
1648
1649 Returns:      TRUE if what is matched could be empty
1650 */
1651
1652 static BOOL
1653 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1654   BOOL utf8)
1655 {
1656 while (bcptr != NULL && bcptr->current >= code)
1657   {
1658   if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1659   bcptr = bcptr->outer;
1660   }
1661 return TRUE;
1662 }
1663
1664
1665
1666 /*************************************************
1667 *           Check for POSIX class syntax         *
1668 *************************************************/
1669
1670 /* This function is called when the sequence "[:" or "[." or "[=" is
1671 encountered in a character class. It checks whether this is followed by a
1672 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1673 reach an unescaped ']' without the special preceding character, return FALSE.
1674
1675 Originally, this function only recognized a sequence of letters between the
1676 terminators, but it seems that Perl recognizes any sequence of characters,
1677 though of course unknown POSIX names are subsequently rejected. Perl gives an
1678 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1679 didn't consider this to be a POSIX class. Likewise for [:1234:].
1680
1681 The problem in trying to be exactly like Perl is in the handling of escapes. We
1682 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1683 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1684 below handles the special case of \], but does not try to do any other escape
1685 processing. This makes it different from Perl for cases such as [:l\ower:]
1686 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1687 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
1688 I think.
1689
1690 Arguments:
1691   ptr      pointer to the initial [
1692   endptr   where to return the end pointer
1693
1694 Returns:   TRUE or FALSE
1695 */
1696
1697 static BOOL
1698 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1699 {
1700 int terminator;          /* Don't combine these lines; the Solaris cc */
1701 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1702 for (++ptr; *ptr != 0; ptr++)
1703   {
1704   if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1705     {
1706     if (*ptr == ']') return FALSE;
1707     if (*ptr == terminator && ptr[1] == ']')
1708       {
1709       *endptr = ptr;
1710       return TRUE;
1711       }
1712     }
1713   }
1714 return FALSE;
1715 }
1716
1717
1718
1719
1720 /*************************************************
1721 *          Check POSIX class name                *
1722 *************************************************/
1723
1724 /* This function is called to check the name given in a POSIX-style class entry
1725 such as [:alnum:].
1726
1727 Arguments:
1728   ptr        points to the first letter
1729   len        the length of the name
1730
1731 Returns:     a value representing the name, or -1 if unknown
1732 */
1733
1734 static int
1735 check_posix_name(const uschar *ptr, int len)
1736 {
1737 const char *pn = posix_names;
1738 register int yield = 0;
1739 while (posix_name_lengths[yield] != 0)
1740   {
1741   if (len == posix_name_lengths[yield] &&
1742     strncmp((const char *)ptr, pn, len) == 0) return yield;
1743   pn += posix_name_lengths[yield] + 1;
1744   yield++;
1745   }
1746 return -1;
1747 }
1748
1749
1750 /*************************************************
1751 *    Adjust OP_RECURSE items in repeated group   *
1752 *************************************************/
1753
1754 /* OP_RECURSE items contain an offset from the start of the regex to the group
1755 that is referenced. This means that groups can be replicated for fixed
1756 repetition simply by copying (because the recursion is allowed to refer to
1757 earlier groups that are outside the current group). However, when a group is
1758 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1759 inserted before it, after it has been compiled. This means that any OP_RECURSE
1760 items within it that refer to the group itself or any contained groups have to
1761 have their offsets adjusted. That one of the jobs of this function. Before it
1762 is called, the partially compiled regex must be temporarily terminated with
1763 OP_END.
1764
1765 This function has been extended with the possibility of forward references for
1766 recursions and subroutine calls. It must also check the list of such references
1767 for the group we are dealing with. If it finds that one of the recursions in
1768 the current group is on this list, it adjusts the offset in the list, not the
1769 value in the reference (which is a group number).
1770
1771 Arguments:
1772   group      points to the start of the group
1773   adjust     the amount by which the group is to be moved
1774   utf8       TRUE in UTF-8 mode
1775   cd         contains pointers to tables etc.
1776   save_hwm   the hwm forward reference pointer at the start of the group
1777
1778 Returns:     nothing
1779 */
1780
1781 static void
1782 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1783   uschar *save_hwm)
1784 {
1785 uschar *ptr = group;
1786
1787 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1788   {
1789   int offset;
1790   uschar *hc;
1791
1792   /* See if this recursion is on the forward reference list. If so, adjust the
1793   reference. */
1794
1795   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1796     {
1797     offset = GET(hc, 0);
1798     if (cd->start_code + offset == ptr + 1)
1799       {
1800       PUT(hc, 0, offset + adjust);
1801       break;
1802       }
1803     }
1804
1805   /* Otherwise, adjust the recursion offset if it's after the start of this
1806   group. */
1807
1808   if (hc >= cd->hwm)
1809     {
1810     offset = GET(ptr, 1);
1811     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1812     }
1813
1814   ptr += 1 + LINK_SIZE;
1815   }
1816 }
1817
1818
1819
1820 /*************************************************
1821 *        Insert an automatic callout point       *
1822 *************************************************/
1823
1824 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1825 callout points before each pattern item.
1826
1827 Arguments:
1828   code           current code pointer
1829   ptr            current pattern pointer
1830   cd             pointers to tables etc
1831
1832 Returns:         new code pointer
1833 */
1834
1835 static uschar *
1836 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1837 {
1838 *code++ = OP_CALLOUT;
1839 *code++ = 255;
1840 PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
1841 PUT(code, LINK_SIZE, 0);                /* Default length */
1842 return code + 2*LINK_SIZE;
1843 }
1844
1845
1846
1847 /*************************************************
1848 *         Complete a callout item                *
1849 *************************************************/
1850
1851 /* A callout item contains the length of the next item in the pattern, which
1852 we can't fill in till after we have reached the relevant point. This is used
1853 for both automatic and manual callouts.
1854
1855 Arguments:
1856   previous_callout   points to previous callout item
1857   ptr                current pattern pointer
1858   cd                 pointers to tables etc
1859
1860 Returns:             nothing
1861 */
1862
1863 static void
1864 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1865 {
1866 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1867 PUT(previous_callout, 2 + LINK_SIZE, length);
1868 }
1869
1870
1871
1872 #ifdef SUPPORT_UCP
1873 /*************************************************
1874 *           Get othercase range                  *
1875 *************************************************/
1876
1877 /* This function is passed the start and end of a class range, in UTF-8 mode
1878 with UCP support. It searches up the characters, looking for internal ranges of
1879 characters in the "other" case. Each call returns the next one, updating the
1880 start address.
1881
1882 Arguments:
1883   cptr        points to starting character value; updated
1884   d           end value
1885   ocptr       where to put start of othercase range
1886   odptr       where to put end of othercase range
1887
1888 Yield:        TRUE when range returned; FALSE when no more
1889 */
1890
1891 static BOOL
1892 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1893   unsigned int *odptr)
1894 {
1895 unsigned int c, othercase, next;
1896
1897 for (c = *cptr; c <= d; c++)
1898   { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
1899
1900 if (c > d) return FALSE;
1901
1902 *ocptr = othercase;
1903 next = othercase + 1;
1904
1905 for (++c; c <= d; c++)
1906   {
1907   if (UCD_OTHERCASE(c) != next) break;
1908   next++;
1909   }
1910
1911 *odptr = next - 1;
1912 *cptr = c;
1913
1914 return TRUE;
1915 }
1916 #endif  /* SUPPORT_UCP */
1917
1918
1919
1920 /*************************************************
1921 *     Check if auto-possessifying is possible    *
1922 *************************************************/
1923
1924 /* This function is called for unlimited repeats of certain items, to see
1925 whether the next thing could possibly match the repeated item. If not, it makes
1926 sense to automatically possessify the repeated item.
1927
1928 Arguments:
1929   op_code       the repeated op code
1930   this          data for this item, depends on the opcode
1931   utf8          TRUE in UTF-8 mode
1932   utf8_char     used for utf8 character bytes, NULL if not relevant
1933   ptr           next character in pattern
1934   options       options bits
1935   cd            contains pointers to tables etc.
1936
1937 Returns:        TRUE if possessifying is wanted
1938 */
1939
1940 static BOOL
1941 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1942   const uschar *ptr, int options, compile_data *cd)
1943 {
1944 int next;
1945
1946 /* Skip whitespace and comments in extended mode */
1947
1948 if ((options & PCRE_EXTENDED) != 0)
1949   {
1950   for (;;)
1951     {
1952     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1953     if (*ptr == '#')
1954       {
1955       while (*(++ptr) != 0)
1956         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1957       }
1958     else break;
1959     }
1960   }
1961
1962 /* If the next item is one that we can handle, get its value. A non-negative
1963 value is a character, a negative value is an escape value. */
1964
1965 if (*ptr == '\\')
1966   {
1967   int temperrorcode = 0;
1968   next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1969   if (temperrorcode != 0) return FALSE;
1970   ptr++;    /* Point after the escape sequence */
1971   }
1972
1973 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1974   {
1975 #ifdef SUPPORT_UTF8
1976   if (utf8) { GETCHARINC(next, ptr); } else
1977 #endif
1978   next = *ptr++;
1979   }
1980
1981 else return FALSE;
1982
1983 /* Skip whitespace and comments in extended mode */
1984
1985 if ((options & PCRE_EXTENDED) != 0)
1986   {
1987   for (;;)
1988     {
1989     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1990     if (*ptr == '#')
1991       {
1992       while (*(++ptr) != 0)
1993         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1994       }
1995     else break;
1996     }
1997   }
1998
1999 /* If the next thing is itself optional, we have to give up. */
2000
2001 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2002   return FALSE;
2003
2004 /* Now compare the next item with the previous opcode. If the previous is a
2005 positive single character match, "item" either contains the character or, if
2006 "item" is greater than 127 in utf8 mode, the character's bytes are in
2007 utf8_char. */
2008
2009
2010 /* Handle cases when the next item is a character. */
2011
2012 if (next >= 0) switch(op_code)
2013   {
2014   case OP_CHAR:
2015 #ifdef SUPPORT_UTF8
2016   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2017 #else
2018   (void)(utf8_char);  /* Keep compiler happy by referencing function argument */
2019 #endif
2020   return item != next;
2021
2022   /* For CHARNC (caseless character) we must check the other case. If we have
2023   Unicode property support, we can use it to test the other case of
2024   high-valued characters. */
2025
2026   case OP_CHARNC:
2027 #ifdef SUPPORT_UTF8
2028   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2029 #endif
2030   if (item == next) return FALSE;
2031 #ifdef SUPPORT_UTF8
2032   if (utf8)
2033     {
2034     unsigned int othercase;
2035     if (next < 128) othercase = cd->fcc[next]; else
2036 #ifdef SUPPORT_UCP
2037     othercase = UCD_OTHERCASE((unsigned int)next);
2038 #else
2039     othercase = NOTACHAR;
2040 #endif
2041     return (unsigned int)item != othercase;
2042     }
2043   else
2044 #endif  /* SUPPORT_UTF8 */
2045   return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
2046
2047   /* For OP_NOT, "item" must be a single-byte character. */
2048
2049   case OP_NOT:
2050   if (item == next) return TRUE;
2051   if ((options & PCRE_CASELESS) == 0) return FALSE;
2052 #ifdef SUPPORT_UTF8
2053   if (utf8)
2054     {
2055     unsigned int othercase;
2056     if (next < 128) othercase = cd->fcc[next]; else
2057 #ifdef SUPPORT_UCP
2058     othercase = UCD_OTHERCASE(next);
2059 #else
2060     othercase = NOTACHAR;
2061 #endif
2062     return (unsigned int)item == othercase;
2063     }
2064   else
2065 #endif  /* SUPPORT_UTF8 */
2066   return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
2067
2068   case OP_DIGIT:
2069   return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2070
2071   case OP_NOT_DIGIT:
2072   return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2073
2074   case OP_WHITESPACE:
2075   return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2076
2077   case OP_NOT_WHITESPACE:
2078   return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2079
2080   case OP_WORDCHAR:
2081   return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2082
2083   case OP_NOT_WORDCHAR:
2084   return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2085
2086   case OP_HSPACE:
2087   case OP_NOT_HSPACE:
2088   switch(next)
2089     {
2090     case 0x09:
2091     case 0x20:
2092     case 0xa0:
2093     case 0x1680:
2094     case 0x180e:
2095     case 0x2000:
2096     case 0x2001:
2097     case 0x2002:
2098     case 0x2003:
2099     case 0x2004:
2100     case 0x2005:
2101     case 0x2006:
2102     case 0x2007:
2103     case 0x2008:
2104     case 0x2009:
2105     case 0x200A:
2106     case 0x202f:
2107     case 0x205f:
2108     case 0x3000:
2109     return op_code != OP_HSPACE;
2110     default:
2111     return op_code == OP_HSPACE;
2112     }
2113
2114   case OP_VSPACE:
2115   case OP_NOT_VSPACE:
2116   switch(next)
2117     {
2118     case 0x0a:
2119     case 0x0b:
2120     case 0x0c:
2121     case 0x0d:
2122     case 0x85:
2123     case 0x2028:
2124     case 0x2029:
2125     return op_code != OP_VSPACE;
2126     default:
2127     return op_code == OP_VSPACE;
2128     }
2129
2130   default:
2131   return FALSE;
2132   }
2133
2134
2135 /* Handle the case when the next item is \d, \s, etc. */
2136
2137 switch(op_code)
2138   {
2139   case OP_CHAR:
2140   case OP_CHARNC:
2141 #ifdef SUPPORT_UTF8
2142   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2143 #endif
2144   switch(-next)
2145     {
2146     case ESC_d:
2147     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2148
2149     case ESC_D:
2150     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2151
2152     case ESC_s:
2153     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2154
2155     case ESC_S:
2156     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2157
2158     case ESC_w:
2159     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2160
2161     case ESC_W:
2162     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2163
2164     case ESC_h:
2165     case ESC_H:
2166     switch(item)
2167       {
2168       case 0x09:
2169       case 0x20:
2170       case 0xa0:
2171       case 0x1680:
2172       case 0x180e:
2173       case 0x2000:
2174       case 0x2001:
2175       case 0x2002:
2176       case 0x2003:
2177       case 0x2004:
2178       case 0x2005:
2179       case 0x2006:
2180       case 0x2007:
2181       case 0x2008:
2182       case 0x2009:
2183       case 0x200A:
2184       case 0x202f:
2185       case 0x205f:
2186       case 0x3000:
2187       return -next != ESC_h;
2188       default:
2189       return -next == ESC_h;
2190       }
2191
2192     case ESC_v:
2193     case ESC_V:
2194     switch(item)
2195       {
2196       case 0x0a:
2197       case 0x0b:
2198       case 0x0c:
2199       case 0x0d:
2200       case 0x85:
2201       case 0x2028:
2202       case 0x2029:
2203       return -next != ESC_v;
2204       default:
2205       return -next == ESC_v;
2206       }
2207
2208     default:
2209     return FALSE;
2210     }
2211
2212   case OP_DIGIT:
2213   return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2214          next == -ESC_h || next == -ESC_v;
2215
2216   case OP_NOT_DIGIT:
2217   return next == -ESC_d;
2218
2219   case OP_WHITESPACE:
2220   return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2221
2222   case OP_NOT_WHITESPACE:
2223   return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2224
2225   case OP_HSPACE:
2226   return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2227
2228   case OP_NOT_HSPACE:
2229   return next == -ESC_h;
2230
2231   /* Can't have \S in here because VT matches \S (Perl anomaly) */
2232   case OP_VSPACE:
2233   return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2234
2235   case OP_NOT_VSPACE:
2236   return next == -ESC_v;
2237
2238   case OP_WORDCHAR:
2239   return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2240
2241   case OP_NOT_WORDCHAR:
2242   return next == -ESC_w || next == -ESC_d;
2243
2244   default:
2245   return FALSE;
2246   }
2247
2248 /* Control does not reach here */
2249 }
2250
2251
2252
2253 /*************************************************
2254 *           Compile one branch                   *
2255 *************************************************/
2256
2257 /* Scan the pattern, compiling it into a vector. If the options are
2258 changed during the branch, the pointer is used to change the external options
2259 bits. This function is used during the pre-compile phase when we are trying
2260 to find out the amount of memory needed, as well as during the real compile
2261 phase. The value of lengthptr distinguishes the two phases.
2262
2263 Arguments:
2264   optionsptr     pointer to the option bits
2265   codeptr        points to the pointer to the current code point
2266   ptrptr         points to the current pattern pointer
2267   errorcodeptr   points to error code variable
2268   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2269   reqbyteptr     set to the last literal character required, else < 0
2270   bcptr          points to current branch chain
2271   cd             contains pointers to tables etc.
2272   lengthptr      NULL during the real compile phase
2273                  points to length accumulator during pre-compile phase
2274
2275 Returns:         TRUE on success
2276                  FALSE, with *errorcodeptr set non-zero on error
2277 */
2278
2279 static BOOL
2280 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2281   int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2282   compile_data *cd, int *lengthptr)
2283 {
2284 int repeat_type, op_type;
2285 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2286 int bravalue = 0;
2287 int greedy_default, greedy_non_default;
2288 int firstbyte, reqbyte;
2289 int zeroreqbyte, zerofirstbyte;
2290 int req_caseopt, reqvary, tempreqvary;
2291 int options = *optionsptr;
2292 int after_manual_callout = 0;
2293 int length_prevgroup = 0;
2294 register int c;
2295 register uschar *code = *codeptr;
2296 uschar *last_code = code;
2297 uschar *orig_code = code;
2298 uschar *tempcode;
2299 BOOL inescq = FALSE;
2300 BOOL groupsetfirstbyte = FALSE;
2301 const uschar *ptr = *ptrptr;
2302 const uschar *tempptr;
2303 uschar *previous = NULL;
2304 uschar *previous_callout = NULL;
2305 uschar *save_hwm = NULL;
2306 uschar classbits[32];
2307
2308 #ifdef SUPPORT_UTF8
2309 BOOL class_utf8;
2310 BOOL utf8 = (options & PCRE_UTF8) != 0;
2311 uschar *class_utf8data;
2312 uschar *class_utf8data_base;
2313 uschar utf8_char[6];
2314 #else
2315 BOOL utf8 = FALSE;
2316 uschar *utf8_char = NULL;
2317 #endif
2318
2319 #ifdef DEBUG
2320 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2321 #endif
2322
2323 /* Set up the default and non-default settings for greediness */
2324
2325 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2326 greedy_non_default = greedy_default ^ 1;
2327
2328 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2329 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2330 matches a non-fixed char first char; reqbyte just remains unset if we never
2331 find one.
2332
2333 When we hit a repeat whose minimum is zero, we may have to adjust these values
2334 to take the zero repeat into account. This is implemented by setting them to
2335 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2336 item types that can be repeated set these backoff variables appropriately. */
2337
2338 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2339
2340 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2341 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2342 value > 255. It is added into the firstbyte or reqbyte variables to record the
2343 case status of the value. This is used only for ASCII characters. */
2344
2345 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2346
2347 /* Switch on next character until the end of the branch */
2348
2349 for (;; ptr++)
2350   {
2351   BOOL negate_class;
2352   BOOL should_flip_negation;
2353   BOOL possessive_quantifier;
2354   BOOL is_quantifier;
2355   BOOL is_recurse;
2356   BOOL reset_bracount;
2357   int class_charcount;
2358   int class_lastchar;
2359   int newoptions;
2360   int recno;
2361   int refsign;
2362   int skipbytes;
2363   int subreqbyte;
2364   int subfirstbyte;
2365   int terminator;
2366   int mclength;
2367   uschar mcbuffer[8];
2368
2369   /* Get next byte in the pattern */
2370
2371   c = *ptr;
2372
2373   /* If we are in the pre-compile phase, accumulate the length used for the
2374   previous cycle of this loop. */
2375
2376   if (lengthptr != NULL)
2377     {
2378 #ifdef DEBUG
2379     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2380 #endif
2381     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2382       {
2383       *errorcodeptr = ERR52;
2384       goto FAILED;
2385       }
2386
2387     /* There is at least one situation where code goes backwards: this is the
2388     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2389     the class is simply eliminated. However, it is created first, so we have to
2390     allow memory for it. Therefore, don't ever reduce the length at this point.
2391     */
2392
2393     if (code < last_code) code = last_code;
2394
2395     /* Paranoid check for integer overflow */
2396
2397     if (OFLOW_MAX - *lengthptr < code - last_code)
2398       {
2399       *errorcodeptr = ERR20;
2400       goto FAILED;
2401       }
2402
2403     *lengthptr += code - last_code;
2404     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2405
2406     /* If "previous" is set and it is not at the start of the work space, move
2407     it back to there, in order to avoid filling up the work space. Otherwise,
2408     if "previous" is NULL, reset the current code pointer to the start. */
2409
2410     if (previous != NULL)
2411       {
2412       if (previous > orig_code)
2413         {
2414         memmove(orig_code, previous, code - previous);
2415         code -= previous - orig_code;
2416         previous = orig_code;
2417         }
2418       }
2419     else code = orig_code;
2420
2421     /* Remember where this code item starts so we can pick up the length
2422     next time round. */
2423
2424     last_code = code;
2425     }
2426
2427   /* In the real compile phase, just check the workspace used by the forward
2428   reference list. */
2429
2430   else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2431     {
2432     *errorcodeptr = ERR52;
2433     goto FAILED;
2434     }
2435
2436   /* If in \Q...\E, check for the end; if not, we have a literal */
2437
2438   if (inescq && c != 0)
2439     {
2440     if (c == '\\' && ptr[1] == 'E')
2441       {
2442       inescq = FALSE;
2443       ptr++;
2444       continue;
2445       }
2446     else
2447       {
2448       if (previous_callout != NULL)
2449         {
2450         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2451           complete_callout(previous_callout, ptr, cd);
2452         previous_callout = NULL;
2453         }
2454       if ((options & PCRE_AUTO_CALLOUT) != 0)
2455         {
2456         previous_callout = code;
2457         code = auto_callout(code, ptr, cd);
2458         }
2459       goto NORMAL_CHAR;
2460       }
2461     }
2462
2463   /* Fill in length of a previous callout, except when the next thing is
2464   a quantifier. */
2465
2466   is_quantifier = c == '*' || c == '+' || c == '?' ||
2467     (c == '{' && is_counted_repeat(ptr+1));
2468
2469   if (!is_quantifier && previous_callout != NULL &&
2470        after_manual_callout-- <= 0)
2471     {
2472     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2473       complete_callout(previous_callout, ptr, cd);
2474     previous_callout = NULL;
2475     }
2476
2477   /* In extended mode, skip white space and comments */
2478
2479   if ((options & PCRE_EXTENDED) != 0)
2480     {
2481     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2482     if (c == '#')
2483       {
2484       while (*(++ptr) != 0)
2485         {
2486         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2487         }
2488       if (*ptr != 0) continue;
2489
2490       /* Else fall through to handle end of string */
2491       c = 0;
2492       }
2493     }
2494
2495   /* No auto callout for quantifiers. */
2496
2497   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2498     {
2499     previous_callout = code;
2500     code = auto_callout(code, ptr, cd);
2501     }
2502
2503   switch(c)
2504     {
2505     /* ===================================================================*/
2506     case 0:                        /* The branch terminates at string end */
2507     case '|':                      /* or | or ) */
2508     case ')':
2509     *firstbyteptr = firstbyte;
2510     *reqbyteptr = reqbyte;
2511     *codeptr = code;
2512     *ptrptr = ptr;
2513     if (lengthptr != NULL)
2514       {
2515       if (OFLOW_MAX - *lengthptr < code - last_code)
2516         {
2517         *errorcodeptr = ERR20;
2518         goto FAILED;
2519         }
2520       *lengthptr += code - last_code;   /* To include callout length */
2521       DPRINTF((">> end branch\n"));
2522       }
2523     return TRUE;
2524
2525
2526     /* ===================================================================*/
2527     /* Handle single-character metacharacters. In multiline mode, ^ disables
2528     the setting of any following char as a first character. */
2529
2530     case '^':
2531     if ((options & PCRE_MULTILINE) != 0)
2532       {
2533       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2534       }
2535     previous = NULL;
2536     *code++ = OP_CIRC;
2537     break;
2538
2539     case '$':
2540     previous = NULL;
2541     *code++ = OP_DOLL;
2542     break;
2543
2544     /* There can never be a first char if '.' is first, whatever happens about
2545     repeats. The value of reqbyte doesn't change either. */
2546
2547     case '.':
2548     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2549     zerofirstbyte = firstbyte;
2550     zeroreqbyte = reqbyte;
2551     previous = code;
2552     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2553     break;
2554
2555
2556     /* ===================================================================*/
2557     /* Character classes. If the included characters are all < 256, we build a
2558     32-byte bitmap of the permitted characters, except in the special case
2559     where there is only one such character. For negated classes, we build the
2560     map as usual, then invert it at the end. However, we use a different opcode
2561     so that data characters > 255 can be handled correctly.
2562
2563     If the class contains characters outside the 0-255 range, a different
2564     opcode is compiled. It may optionally have a bit map for characters < 256,
2565     but those above are explicitly listed afterwards. A flag byte tells
2566     whether the bitmap is present, and whether this is a negated class or not.
2567
2568     In JavaScript compatibility mode, an isolated ']' causes an error. In
2569     default (Perl) mode, it is treated as a data character. */
2570
2571     case ']':
2572     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2573       {
2574       *errorcodeptr = ERR64;
2575       goto FAILED;
2576       }
2577     goto NORMAL_CHAR;
2578
2579     case '[':
2580     previous = code;
2581
2582     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2583     they are encountered at the top level, so we'll do that too. */
2584
2585     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2586         check_posix_syntax(ptr, &tempptr))
2587       {
2588       *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2589       goto FAILED;
2590       }
2591
2592     /* If the first character is '^', set the negation flag and skip it. Also,
2593     if the first few characters (either before or after ^) are \Q\E or \E we
2594     skip them too. This makes for compatibility with Perl. */
2595
2596     negate_class = FALSE;
2597     for (;;)
2598       {
2599       c = *(++ptr);
2600       if (c == '\\')
2601         {
2602         if (ptr[1] == 'E') ptr++;
2603           else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2604             else break;
2605         }
2606       else if (!negate_class && c == '^')
2607         negate_class = TRUE;
2608       else break;
2609       }
2610
2611     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2612     an initial ']' is taken as a data character -- the code below handles
2613     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2614     [^] must match any character, so generate OP_ALLANY. */
2615
2616     if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2617       {
2618       *code++ = negate_class? OP_ALLANY : OP_FAIL;
2619       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2620       zerofirstbyte = firstbyte;
2621       break;
2622       }
2623
2624     /* If a class contains a negative special such as \S, we need to flip the
2625     negation flag at the end, so that support for characters > 255 works
2626     correctly (they are all included in the class). */
2627
2628     should_flip_negation = FALSE;
2629
2630     /* Keep a count of chars with values < 256 so that we can optimize the case
2631     of just a single character (as long as it's < 256). However, For higher
2632     valued UTF-8 characters, we don't yet do any optimization. */
2633
2634     class_charcount = 0;
2635     class_lastchar = -1;
2636
2637     /* Initialize the 32-char bit map to all zeros. We build the map in a
2638     temporary bit of memory, in case the class contains only 1 character (less
2639     than 256), because in that case the compiled code doesn't use the bit map.
2640     */
2641
2642     memset(classbits, 0, 32 * sizeof(uschar));
2643
2644 #ifdef SUPPORT_UTF8
2645     class_utf8 = FALSE;                       /* No chars >= 256 */
2646     class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2647     class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2648 #endif
2649
2650     /* Process characters until ] is reached. By writing this as a "do" it
2651     means that an initial ] is taken as a data character. At the start of the
2652     loop, c contains the first byte of the character. */
2653
2654     if (c != 0) do
2655       {
2656       const uschar *oldptr;
2657
2658 #ifdef SUPPORT_UTF8
2659       if (utf8 && c > 127)
2660         {                           /* Braces are required because the */
2661         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2662         }
2663
2664       /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2665       data and reset the pointer. This is so that very large classes that
2666       contain a zillion UTF-8 characters no longer overwrite the work space
2667       (which is on the stack). */
2668
2669       if (lengthptr != NULL)
2670         {
2671         *lengthptr += class_utf8data - class_utf8data_base;
2672         class_utf8data = class_utf8data_base;
2673         }
2674
2675 #endif
2676
2677       /* Inside \Q...\E everything is literal except \E */
2678
2679       if (inescq)
2680         {
2681         if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2682           {
2683           inescq = FALSE;                   /* Reset literal state */
2684           ptr++;                            /* Skip the 'E' */
2685           continue;                         /* Carry on with next */
2686           }
2687         goto CHECK_RANGE;                   /* Could be range if \E follows */
2688         }
2689
2690       /* Handle POSIX class names. Perl allows a negation extension of the
2691       form [:^name:]. A square bracket that doesn't match the syntax is
2692       treated as a literal. We also recognize the POSIX constructions
2693       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2694       5.6 and 5.8 do. */
2695
2696       if (c == '[' &&
2697           (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2698           check_posix_syntax(ptr, &tempptr))
2699         {
2700         BOOL local_negate = FALSE;
2701         int posix_class, taboffset, tabopt;
2702         register const uschar *cbits = cd->cbits;
2703         uschar pbits[32];
2704
2705         if (ptr[1] != ':')
2706           {
2707           *errorcodeptr = ERR31;
2708           goto FAILED;
2709           }
2710
2711         ptr += 2;
2712         if (*ptr == '^')
2713           {
2714           local_negate = TRUE;
2715           should_flip_negation = TRUE;  /* Note negative special */
2716           ptr++;
2717           }
2718
2719         posix_class = check_posix_name(ptr, tempptr - ptr);
2720         if (posix_class < 0)
2721           {
2722           *errorcodeptr = ERR30;
2723           goto FAILED;
2724           }
2725
2726         /* If matching is caseless, upper and lower are converted to
2727         alpha. This relies on the fact that the class table starts with
2728         alpha, lower, upper as the first 3 entries. */
2729
2730         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2731           posix_class = 0;
2732
2733         /* We build the bit map for the POSIX class in a chunk of local store
2734         because we may be adding and subtracting from it, and we don't want to
2735         subtract bits that may be in the main map already. At the end we or the
2736         result into the bit map that is being built. */
2737
2738         posix_class *= 3;
2739
2740         /* Copy in the first table (always present) */
2741
2742         memcpy(pbits, cbits + posix_class_maps[posix_class],
2743           32 * sizeof(uschar));
2744
2745         /* If there is a second table, add or remove it as required. */
2746
2747         taboffset = posix_class_maps[posix_class + 1];
2748         tabopt = posix_class_maps[posix_class + 2];
2749
2750         if (taboffset >= 0)
2751           {
2752           if (tabopt >= 0)
2753             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2754           else
2755             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2756           }
2757
2758         /* Now see if we need to remove any special characters. An option
2759         value of 1 removes vertical space and 2 removes underscore. */
2760
2761         if (tabopt < 0) tabopt = -tabopt;
2762         if (tabopt == 1) pbits[1] &= ~0x3c;
2763           else if (tabopt == 2) pbits[11] &= 0x7f;
2764
2765         /* Add the POSIX table or its complement into the main table that is
2766         being built and we are done. */
2767
2768         if (local_negate)
2769           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2770         else
2771           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2772
2773         ptr = tempptr + 1;
2774         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2775         continue;    /* End of POSIX syntax handling */
2776         }
2777
2778       /* Backslash may introduce a single character, or it may introduce one
2779       of the specials, which just set a flag. The sequence \b is a special
2780       case. Inside a class (and only there) it is treated as backspace.
2781       Elsewhere it marks a word boundary. Other escapes have preset maps ready
2782       to 'or' into the one we are building. We assume they have more than one
2783       character in them, so set class_charcount bigger than one. */
2784
2785       if (c == '\\')
2786         {
2787         c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2788         if (*errorcodeptr != 0) goto FAILED;
2789
2790         if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2791         else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2792         else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2793         else if (-c == ESC_Q)            /* Handle start of quoted string */
2794           {
2795           if (ptr[1] == '\\' && ptr[2] == 'E')
2796             {
2797             ptr += 2; /* avoid empty string */
2798             }
2799           else inescq = TRUE;
2800           continue;
2801           }
2802         else if (-c == ESC_E) continue;  /* Ignore orphan \E */
2803
2804         if (c < 0)
2805           {
2806           register const uschar *cbits = cd->cbits;
2807           class_charcount += 2;     /* Greater than 1 is what matters */
2808
2809           /* Save time by not doing this in the pre-compile phase. */
2810
2811           if (lengthptr == NULL) switch (-c)
2812             {
2813             case ESC_d:
2814             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2815             continue;
2816
2817             case ESC_D:
2818             should_flip_negation = TRUE;
2819             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2820             continue;
2821
2822             case ESC_w:
2823             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2824             continue;
2825
2826             case ESC_W:
2827             should_flip_negation = TRUE;
2828             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2829             continue;
2830
2831             case ESC_s:
2832             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2833             classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
2834             continue;
2835
2836             case ESC_S:
2837             should_flip_negation = TRUE;
2838             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2839             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2840             continue;
2841
2842             default:    /* Not recognized; fall through */
2843             break;      /* Need "default" setting to stop compiler warning. */
2844             }
2845
2846           /* In the pre-compile phase, just do the recognition. */
2847
2848           else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2849                    c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2850
2851           /* We need to deal with \H, \h, \V, and \v in both phases because
2852           they use extra memory. */
2853
2854           if (-c == ESC_h)
2855             {
2856             SETBIT(classbits, 0x09); /* HT */
2857             SETBIT(classbits, 0x20); /* SPACE */
2858             SETBIT(classbits, 0xa0); /* NBSP */
2859 #ifdef SUPPORT_UTF8
2860             if (utf8)
2861               {
2862               class_utf8 = TRUE;
2863               *class_utf8data++ = XCL_SINGLE;
2864               class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2865               *class_utf8data++ = XCL_SINGLE;
2866               class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2867               *class_utf8data++ = XCL_RANGE;
2868               class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2869               class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2870               *class_utf8data++ = XCL_SINGLE;
2871               class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2872               *class_utf8data++ = XCL_SINGLE;
2873               class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2874               *class_utf8data++ = XCL_SINGLE;
2875               class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2876               }
2877 #endif
2878             continue;
2879             }
2880
2881           if (-c == ESC_H)
2882             {
2883             for (c = 0; c < 32; c++)
2884               {
2885               int x = 0xff;
2886               switch (c)
2887                 {
2888                 case 0x09/8: x ^= 1 << (0x09%8); break;
2889                 case 0x20/8: x ^= 1 << (0x20%8); break;
2890                 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2891                 default: break;
2892                 }
2893               classbits[c] |= x;
2894               }
2895
2896 #ifdef SUPPORT_UTF8
2897             if (utf8)
2898               {
2899               class_utf8 = TRUE;
2900               *class_utf8data++ = XCL_RANGE;
2901               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2902               class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2903               *class_utf8data++ = XCL_RANGE;
2904               class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2905               class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2906               *class_utf8data++ = XCL_RANGE;
2907               class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2908               class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2909               *class_utf8data++ = XCL_RANGE;
2910               class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2911               class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2912               *class_utf8data++ = XCL_RANGE;
2913               class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2914               class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2915               *class_utf8data++ = XCL_RANGE;
2916               class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2917               class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2918               *class_utf8data++ = XCL_RANGE;
2919               class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2920               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2921               }
2922 #endif
2923             continue;
2924             }
2925
2926           if (-c == ESC_v)
2927             {
2928             SETBIT(classbits, 0x0a); /* LF */
2929             SETBIT(classbits, 0x0b); /* VT */
2930             SETBIT(classbits, 0x0c); /* FF */
2931             SETBIT(classbits, 0x0d); /* CR */
2932             SETBIT(classbits, 0x85); /* NEL */
2933 #ifdef SUPPORT_UTF8
2934             if (utf8)
2935               {
2936               class_utf8 = TRUE;
2937               *class_utf8data++ = XCL_RANGE;
2938               class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2939               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2940               }
2941 #endif
2942             continue;
2943             }
2944
2945           if (-c == ESC_V)
2946             {
2947             for (c = 0; c < 32; c++)
2948               {
2949               int x = 0xff;
2950               switch (c)
2951                 {
2952                 case 0x0a/8: x ^= 1 << (0x0a%8);
2953                              x ^= 1 << (0x0b%8);
2954                              x ^= 1 << (0x0c%8);
2955                              x ^= 1 << (0x0d%8);
2956                              break;
2957                 case 0x85/8: x ^= 1 << (0x85%8); break;
2958                 default: break;
2959                 }
2960               classbits[c] |= x;
2961               }
2962
2963 #ifdef SUPPORT_UTF8
2964             if (utf8)
2965               {
2966               class_utf8 = TRUE;
2967               *class_utf8data++ = XCL_RANGE;
2968               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2969               class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2970               *class_utf8data++ = XCL_RANGE;
2971               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2972               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2973               }
2974 #endif
2975             continue;
2976             }
2977
2978           /* We need to deal with \P and \p in both phases. */
2979
2980 #ifdef SUPPORT_UCP
2981           if (-c == ESC_p || -c == ESC_P)
2982             {
2983             BOOL negated;
2984             int pdata;
2985             int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2986             if (ptype < 0) goto FAILED;
2987             class_utf8 = TRUE;
2988             *class_utf8data++ = ((-c == ESC_p) != negated)?
2989               XCL_PROP : XCL_NOTPROP;
2990             *class_utf8data++ = ptype;
2991             *class_utf8data++ = pdata;
2992             class_charcount -= 2;   /* Not a < 256 character */
2993             continue;
2994             }
2995 #endif
2996           /* Unrecognized escapes are faulted if PCRE is running in its
2997           strict mode. By default, for compatibility with Perl, they are
2998           treated as literals. */
2999
3000           if ((options & PCRE_EXTRA) != 0)
3001             {
3002             *errorcodeptr = ERR7;
3003             goto FAILED;
3004             }
3005
3006           class_charcount -= 2;  /* Undo the default count from above */
3007           c = *ptr;              /* Get the final character and fall through */
3008           }
3009
3010         /* Fall through if we have a single character (c >= 0). This may be
3011         greater than 256 in UTF-8 mode. */
3012
3013         }   /* End of backslash handling */
3014
3015       /* A single character may be followed by '-' to form a range. However,
3016       Perl does not permit ']' to be the end of the range. A '-' character
3017       at the end is treated as a literal. Perl ignores orphaned \E sequences
3018       entirely. The code for handling \Q and \E is messy. */
3019
3020       CHECK_RANGE:
3021       while (ptr[1] == '\\' && ptr[2] == 'E')
3022         {
3023         inescq = FALSE;
3024         ptr += 2;
3025         }
3026
3027       oldptr = ptr;
3028
3029       /* Remember \r or \n */
3030
3031       if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3032
3033       /* Check for range */
3034
3035       if (!inescq && ptr[1] == '-')
3036         {
3037         int d;
3038         ptr += 2;
3039         while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3040
3041         /* If we hit \Q (not followed by \E) at this point, go into escaped
3042         mode. */
3043
3044         while (*ptr == '\\' && ptr[1] == 'Q')
3045           {
3046           ptr += 2;
3047           if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3048           inescq = TRUE;
3049           break;
3050           }
3051
3052         if (*ptr == 0 || (!inescq && *ptr == ']'))
3053           {
3054           ptr = oldptr;
3055           goto LONE_SINGLE_CHARACTER;
3056           }
3057
3058 #ifdef SUPPORT_UTF8
3059         if (utf8)
3060           {                           /* Braces are required because the */
3061           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
3062           }
3063         else
3064 #endif
3065         d = *ptr;  /* Not UTF-8 mode */
3066
3067         /* The second part of a range can be a single-character escape, but
3068         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3069         in such circumstances. */
3070
3071         if (!inescq && d == '\\')
3072           {
3073           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3074           if (*errorcodeptr != 0) goto FAILED;
3075
3076           /* \b is backspace; \X is literal X; \R is literal R; any other
3077           special means the '-' was literal */
3078
3079           if (d < 0)
3080             {
3081             if (d == -ESC_b) d = '\b';
3082             else if (d == -ESC_X) d = 'X';
3083             else if (d == -ESC_R) d = 'R'; else
3084               {
3085               ptr = oldptr;
3086               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3087               }
3088             }
3089           }
3090
3091         /* Check that the two values are in the correct order. Optimize
3092         one-character ranges */
3093
3094         if (d < c)
3095           {
3096           *errorcodeptr = ERR8;
3097           goto FAILED;
3098           }
3099
3100         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3101
3102         /* Remember \r or \n */
3103
3104         if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3105
3106         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3107         matching, we have to use an XCLASS with extra data items. Caseless
3108         matching for characters > 127 is available only if UCP support is
3109         available. */
3110
3111 #ifdef SUPPORT_UTF8
3112         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3113           {
3114           class_utf8 = TRUE;
3115
3116           /* With UCP support, we can find the other case equivalents of
3117           the relevant characters. There may be several ranges. Optimize how
3118           they fit with the basic range. */
3119
3120 #ifdef SUPPORT_UCP
3121           if ((options & PCRE_CASELESS) != 0)
3122             {
3123             unsigned int occ, ocd;
3124             unsigned int cc = c;
3125             unsigned int origd = d;
3126             while (get_othercase_range(&cc, origd, &occ, &ocd))
3127               {
3128               if (occ >= (unsigned int)c &&
3129                   ocd <= (unsigned int)d)
3130                 continue;                          /* Skip embedded ranges */
3131
3132               if (occ < (unsigned int)c  &&
3133                   ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3134                 {                                  /* if there is overlap,   */
3135                 c = occ;                           /* noting that if occ < c */
3136                 continue;                          /* we can't have ocd > d  */
3137                 }                                  /* because a subrange is  */
3138               if (ocd > (unsigned int)d &&
3139                   occ <= (unsigned int)d + 1)      /* always shorter than    */
3140                 {                                  /* the basic range.       */
3141                 d = ocd;
3142                 continue;
3143                 }
3144
3145               if (occ == ocd)
3146                 {
3147                 *class_utf8data++ = XCL_SINGLE;
3148                 }
3149               else
3150                 {
3151                 *class_utf8data++ = XCL_RANGE;
3152                 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3153                 }
3154               class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3155               }
3156             }
3157 #endif  /* SUPPORT_UCP */
3158
3159           /* Now record the original range, possibly modified for UCP caseless
3160           overlapping ranges. */
3161
3162           *class_utf8data++ = XCL_RANGE;
3163           class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3164           class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3165
3166           /* With UCP support, we are done. Without UCP support, there is no
3167           caseless matching for UTF-8 characters > 127; we can use the bit map
3168           for the smaller ones. */
3169
3170 #ifdef SUPPORT_UCP
3171           continue;    /* With next character in the class */
3172 #else
3173           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3174
3175           /* Adjust upper limit and fall through to set up the map */
3176
3177           d = 127;
3178
3179 #endif  /* SUPPORT_UCP */
3180           }
3181 #endif  /* SUPPORT_UTF8 */
3182
3183         /* We use the bit map for all cases when not in UTF-8 mode; else
3184         ranges that lie entirely within 0-127 when there is UCP support; else
3185         for partial ranges without UCP support. */
3186
3187         class_charcount += d - c + 1;
3188         class_lastchar = d;
3189
3190         /* We can save a bit of time by skipping this in the pre-compile. */
3191
3192         if (lengthptr == NULL) for (; c <= d; c++)
3193           {
3194           classbits[c/8] |= (1 << (c&7));
3195           if ((options & PCRE_CASELESS) != 0)
3196             {
3197             int uc = cd->fcc[c];           /* flip case */
3198             classbits[uc/8] |= (1 << (uc&7));
3199             }
3200           }
3201
3202         continue;   /* Go get the next char in the class */
3203         }
3204
3205       /* Handle a lone single character - we can get here for a normal
3206       non-escape char, or after \ that introduces a single character or for an
3207       apparent range that isn't. */
3208
3209       LONE_SINGLE_CHARACTER:
3210
3211       /* Handle a character that cannot go in the bit map */
3212
3213 #ifdef SUPPORT_UTF8
3214       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3215         {
3216         class_utf8 = TRUE;
3217         *class_utf8data++ = XCL_SINGLE;
3218         class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3219
3220 #ifdef SUPPORT_UCP
3221         if ((options & PCRE_CASELESS) != 0)
3222           {
3223           unsigned int othercase;
3224           if ((othercase = UCD_OTHERCASE(c)) != c)
3225             {
3226             *class_utf8data++ = XCL_SINGLE;
3227             class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3228             }
3229           }
3230 #endif  /* SUPPORT_UCP */
3231
3232         }
3233       else
3234 #endif  /* SUPPORT_UTF8 */
3235
3236       /* Handle a single-byte character */
3237         {
3238         classbits[c/8] |= (1 << (c&7));
3239         if ((options & PCRE_CASELESS) != 0)
3240           {
3241           c = cd->fcc[c];   /* flip case */
3242           classbits[c/8] |= (1 << (c&7));
3243           }
3244         class_charcount++;
3245         class_lastchar = c;
3246         }
3247       }
3248
3249     /* Loop until ']' reached. This "while" is the end of the "do" above. */
3250
3251     while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3252
3253     if (c == 0)                          /* Missing terminating ']' */
3254       {
3255       *errorcodeptr = ERR6;
3256       goto FAILED;
3257       }
3258
3259
3260 /* This code has been disabled because it would mean that \s counts as
3261 an explicit \r or \n reference, and that's not really what is wanted. Now
3262 we set the flag only if there is a literal "\r" or "\n" in the class. */
3263
3264 #if 0
3265     /* Remember whether \r or \n are in this class */
3266
3267     if (negate_class)
3268       {
3269       if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3270       }
3271     else
3272       {
3273       if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3274       }
3275 #endif
3276
3277
3278     /* If class_charcount is 1, we saw precisely one character whose value is
3279     less than 256. As long as there were no characters >= 128 and there was no
3280     use of \p or \P, in other words, no use of any XCLASS features, we can
3281     optimize.
3282
3283     In UTF-8 mode, we can optimize the negative case only if there were no
3284     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3285     operate on single-bytes only. This is an historical hangover. Maybe one day
3286     we can tidy these opcodes to handle multi-byte characters.
3287
3288     The optimization throws away the bit map. We turn the item into a
3289     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3290     that OP_NOT does not support multibyte characters. In the positive case, it
3291     can cause firstbyte to be set. Otherwise, there can be no first char if
3292     this item is first, whatever repeat count may follow. In the case of
3293     reqbyte, save the previous value for reinstating. */
3294
3295 #ifdef SUPPORT_UTF8
3296     if (class_charcount == 1 && !class_utf8 &&
3297       (!utf8 || !negate_class || class_lastchar < 128))
3298 #else
3299     if (class_charcount == 1)
3300 #endif
3301       {
3302       zeroreqbyte = reqbyte;
3303
3304       /* The OP_NOT opcode works on one-byte characters only. */
3305
3306       if (negate_class)
3307         {
3308         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3309         zerofirstbyte = firstbyte;
3310         *code++ = OP_NOT;
3311         *code++ = class_lastchar;
3312         break;
3313         }
3314
3315       /* For a single, positive character, get the value into mcbuffer, and
3316       then we can handle this with the normal one-character code. */
3317
3318 #ifdef SUPPORT_UTF8
3319       if (utf8 && class_lastchar > 127)
3320         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3321       else
3322 #endif
3323         {
3324         mcbuffer[0] = class_lastchar;
3325         mclength = 1;
3326         }
3327       goto ONE_CHAR;
3328       }       /* End of 1-char optimization */
3329
3330     /* The general case - not the one-char optimization. If this is the first
3331     thing in the branch, there can be no first char setting, whatever the
3332     repeat count. Any reqbyte setting must remain unchanged after any kind of
3333     repeat. */
3334
3335     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3336     zerofirstbyte = firstbyte;
3337     zeroreqbyte = reqbyte;
3338
3339     /* If there are characters with values > 255, we have to compile an
3340     extended class, with its own opcode, unless there was a negated special
3341     such as \S in the class, because in that case all characters > 255 are in
3342     the class, so any that were explicitly given as well can be ignored. If
3343     (when there are explicit characters > 255 that must be listed) there are no
3344     characters < 256, we can omit the bitmap in the actual compiled code. */
3345
3346 #ifdef SUPPORT_UTF8
3347     if (class_utf8 && !should_flip_negation)
3348       {
3349       *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
3350       *code++ = OP_XCLASS;
3351       code += LINK_SIZE;
3352       *code = negate_class? XCL_NOT : 0;
3353
3354       /* If the map is required, move up the extra data to make room for it;
3355       otherwise just move the code pointer to the end of the extra data. */
3356
3357       if (class_charcount > 0)
3358         {
3359         *code++ |= XCL_MAP;
3360         memmove(code + 32, code, class_utf8data - code);
3361         memcpy(code, classbits, 32);
3362         code = class_utf8data + 32;
3363         }
3364       else code = class_utf8data;
3365
3366       /* Now fill in the complete length of the item */
3367
3368       PUT(previous, 1, code - previous);
3369       break;   /* End of class handling */
3370       }
3371 #endif
3372
3373     /* If there are no characters > 255, set the opcode to OP_CLASS or
3374     OP_NCLASS, depending on whether the whole class was negated and whether
3375     there were negative specials such as \S in the class. Then copy the 32-byte
3376     map into the code vector, negating it if necessary. */
3377
3378     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3379     if (negate_class)
3380       {
3381       if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3382         for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3383       }
3384     else
3385       {
3386       memcpy(code, classbits, 32);
3387       }
3388     code += 32;
3389     break;
3390
3391
3392     /* ===================================================================*/
3393     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3394     has been tested above. */
3395
3396     case '{':
3397     if (!is_quantifier) goto NORMAL_CHAR;
3398     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3399     if (*errorcodeptr != 0) goto FAILED;
3400     goto REPEAT;
3401
3402     case '*':
3403     repeat_min = 0;
3404     repeat_max = -1;
3405     goto REPEAT;
3406
3407     case '+':
3408     repeat_min = 1;
3409     repeat_max = -1;
3410     goto REPEAT;
3411
3412     case '?':
3413     repeat_min = 0;
3414     repeat_max = 1;
3415
3416     REPEAT:
3417     if (previous == NULL)
3418       {
3419       *errorcodeptr = ERR9;
3420       goto FAILED;
3421       }
3422
3423     if (repeat_min == 0)
3424       {
3425       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
3426       reqbyte = zeroreqbyte;        /* Ditto */
3427       }
3428
3429     /* Remember whether this is a variable length repeat */
3430
3431     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3432
3433     op_type = 0;                    /* Default single-char op codes */
3434     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
3435
3436     /* Save start of previous item, in case we have to move it up to make space
3437     for an inserted OP_ONCE for the additional '+' extension. */
3438
3439     tempcode = previous;
3440
3441     /* If the next character is '+', we have a possessive quantifier. This
3442     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3443     If the next character is '?' this is a minimizing repeat, by default,
3444     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3445     repeat type to the non-default. */
3446
3447     if (ptr[1] == '+')
3448       {
3449       repeat_type = 0;                  /* Force greedy */
3450       possessive_quantifier = TRUE;
3451       ptr++;
3452       }
3453     else if (ptr[1] == '?')
3454       {
3455       repeat_type = greedy_non_default;
3456       ptr++;
3457       }
3458     else repeat_type = greedy_default;
3459
3460     /* If previous was a character match, abolish the item and generate a
3461     repeat item instead. If a char item has a minimum of more than one, ensure
3462     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3463     the first thing in a branch because the x will have gone into firstbyte
3464     instead.  */
3465
3466     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3467       {
3468       /* Deal with UTF-8 characters that take up more than one byte. It's
3469       easier to write this out separately than try to macrify it. Use c to
3470       hold the length of the character in bytes, plus 0x80 to flag that it's a
3471       length rather than a small character. */
3472
3473 #ifdef SUPPORT_UTF8
3474       if (utf8 && (code[-1] & 0x80) != 0)
3475         {
3476         uschar *lastchar = code - 1;
3477         while((*lastchar & 0xc0) == 0x80) lastchar--;
3478         c = code - lastchar;            /* Length of UTF-8 character */
3479         memcpy(utf8_char, lastchar, c); /* Save the char */
3480         c |= 0x80;                      /* Flag c as a length */
3481         }
3482       else
3483 #endif
3484
3485       /* Handle the case of a single byte - either with no UTF8 support, or
3486       with UTF-8 disabled, or for a UTF-8 character < 128. */
3487
3488         {
3489         c = code[-1];
3490         if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3491         }
3492
3493       /* If the repetition is unlimited, it pays to see if the next thing on
3494       the line is something that cannot possibly match this character. If so,
3495       automatically possessifying this item gains some performance in the case
3496       where the match fails. */
3497
3498       if (!possessive_quantifier &&
3499           repeat_max < 0 &&
3500           check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3501             options, cd))
3502         {
3503         repeat_type = 0;    /* Force greedy */
3504         possessive_quantifier = TRUE;
3505         }
3506
3507       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3508       }
3509
3510     /* If previous was a single negated character ([^a] or similar), we use
3511     one of the special opcodes, replacing it. The code is shared with single-
3512     character repeats by setting op_type to add a suitable offset into
3513     repeat_type. We can also test for auto-possessification. OP_NOT is
3514     currently used only for single-byte chars. */
3515
3516     else if (*previous == OP_NOT)
3517       {
3518       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3519       c = previous[1];
3520       if (!possessive_quantifier &&
3521           repeat_max < 0 &&
3522           check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3523         {
3524         repeat_type = 0;    /* Force greedy */
3525         possessive_quantifier = TRUE;
3526         }
3527       goto OUTPUT_SINGLE_REPEAT;
3528       }
3529
3530     /* If previous was a character type match (\d or similar), abolish it and
3531     create a suitable repeat item. The code is shared with single-character
3532     repeats by setting op_type to add a suitable offset into repeat_type. Note
3533     that the Unicode property types will be present only when SUPPORT_UCP is
3534     defined, but we don't wrap the little bits of code here because it just
3535     makes it horribly messy. */
3536
3537     else if (*previous < OP_EODN)
3538       {
3539       uschar *oldcode;
3540       int prop_type, prop_value;
3541       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3542       c = *previous;
3543
3544       if (!possessive_quantifier &&
3545           repeat_max < 0 &&
3546           check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3547         {
3548         repeat_type = 0;    /* Force greedy */
3549         possessive_quantifier = TRUE;
3550         }
3551
3552       OUTPUT_SINGLE_REPEAT:
3553       if (*previous == OP_PROP || *previous == OP_NOTPROP)
3554         {
3555         prop_type = previous[1];
3556         prop_value = previous[2];
3557         }
3558       else prop_type = prop_value = -1;
3559
3560       oldcode = code;
3561       code = previous;                  /* Usually overwrite previous item */
3562
3563       /* If the maximum is zero then the minimum must also be zero; Perl allows
3564       this case, so we do too - by simply omitting the item altogether. */
3565
3566       if (repeat_max == 0) goto END_REPEAT;
3567
3568       /* All real repeats make it impossible to handle partial matching (maybe
3569       one day we will be able to remove this restriction). */
3570
3571       if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3572
3573       /* Combine the op_type with the repeat_type */
3574
3575       repeat_type += op_type;
3576
3577       /* A minimum of zero is handled either as the special case * or ?, or as
3578       an UPTO, with the maximum given. */
3579
3580       if (repeat_min == 0)
3581         {
3582         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3583           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3584         else
3585           {
3586           *code++ = OP_UPTO + repeat_type;
3587           PUT2INC(code, 0, repeat_max);
3588           }
3589         }
3590
3591       /* A repeat minimum of 1 is optimized into some special cases. If the
3592       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3593       left in place and, if the maximum is greater than 1, we use OP_UPTO with
3594       one less than the maximum. */
3595
3596       else if (repeat_min == 1)
3597         {
3598         if (repeat_max == -1)
3599           *code++ = OP_PLUS + repeat_type;
3600         else
3601           {
3602           code = oldcode;                 /* leave previous item in place */
3603           if (repeat_max == 1) goto END_REPEAT;
3604           *code++ = OP_UPTO + repeat_type;
3605           PUT2INC(code, 0, repeat_max - 1);
3606           }
3607         }
3608
3609       /* The case {n,n} is just an EXACT, while the general case {n,m} is
3610       handled as an EXACT followed by an UPTO. */
3611
3612       else
3613         {
3614         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
3615         PUT2INC(code, 0, repeat_min);
3616
3617         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3618         we have to insert the character for the previous code. For a repeated
3619         Unicode property match, there are two extra bytes that define the
3620         required property. In UTF-8 mode, long characters have their length in
3621         c, with the 0x80 bit as a flag. */
3622
3623         if (repeat_max < 0)
3624           {
3625 #ifdef SUPPORT_UTF8
3626           if (utf8 && c >= 128)
3627             {
3628             memcpy(code, utf8_char, c & 7);
3629             code += c & 7;
3630             }
3631           else
3632 #endif
3633             {
3634             *code++ = c;
3635             if (prop_type >= 0)
3636               {
3637               *code++ = prop_type;
3638               *code++ = prop_value;
3639               }
3640             }
3641           *code++ = OP_STAR + repeat_type;
3642           }
3643
3644         /* Else insert an UPTO if the max is greater than the min, again
3645         preceded by the character, for the previously inserted code. If the
3646         UPTO is just for 1 instance, we can use QUERY instead. */
3647
3648         else if (repeat_max != repeat_min)
3649           {
3650 #ifdef SUPPORT_UTF8
3651           if (utf8 && c >= 128)
3652             {
3653             memcpy(code, utf8_char, c & 7);
3654             code += c & 7;
3655             }
3656           else
3657 #endif
3658           *code++ = c;
3659           if (prop_type >= 0)
3660             {
3661             *code++ = prop_type;
3662             *code++ = prop_value;
3663             }
3664           repeat_max -= repeat_min;
3665
3666           if (repeat_max == 1)
3667             {
3668             *code++ = OP_QUERY + repeat_type;
3669             }
3670           else
3671             {
3672             *code++ = OP_UPTO + repeat_type;
3673             PUT2INC(code, 0, repeat_max);
3674             }
3675           }
3676         }
3677
3678       /* The character or character type itself comes last in all cases. */
3679
3680 #ifdef SUPPORT_UTF8
3681       if (utf8 && c >= 128)
3682         {
3683         memcpy(code, utf8_char, c & 7);
3684         code += c & 7;
3685         }
3686       else
3687 #endif
3688       *code++ = c;
3689
3690       /* For a repeated Unicode property match, there are two extra bytes that
3691       define the required property. */
3692
3693 #ifdef SUPPORT_UCP
3694       if (prop_type >= 0)
3695         {
3696         *code++ = prop_type;
3697         *code++ = prop_value;
3698         }
3699 #endif
3700       }
3701
3702     /* If previous was a character class or a back reference, we put the repeat
3703     stuff after it, but just skip the item if the repeat was {0,0}. */
3704
3705     else if (*previous == OP_CLASS ||
3706              *previous == OP_NCLASS ||
3707 #ifdef SUPPORT_UTF8
3708              *previous == OP_XCLASS ||
3709 #endif
3710              *previous == OP_REF)
3711       {
3712       if (repeat_max == 0)
3713         {
3714         code = previous;
3715         goto END_REPEAT;
3716         }
3717
3718       /* All real repeats make it impossible to handle partial matching (maybe
3719       one day we will be able to remove this restriction). */
3720
3721       if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3722
3723       if (repeat_min == 0 && repeat_max == -1)
3724         *code++ = OP_CRSTAR + repeat_type;
3725       else if (repeat_min == 1 && repeat_max == -1)
3726         *code++ = OP_CRPLUS + repeat_type;
3727       else if (repeat_min == 0 && repeat_max == 1)
3728         *code++ = OP_CRQUERY + repeat_type;
3729       else
3730         {
3731         *code++ = OP_CRRANGE + repeat_type;
3732         PUT2INC(code, 0, repeat_min);
3733         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
3734         PUT2INC(code, 0, repeat_max);
3735         }
3736       }
3737
3738     /* If previous was a bracket group, we may have to replicate it in certain
3739     cases. */
3740
3741     else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3742              *previous == OP_ONCE || *previous == OP_COND)
3743       {
3744       register int i;
3745       int ketoffset = 0;
3746       int len = code - previous;
3747       uschar *bralink = NULL;
3748
3749       /* Repeating a DEFINE group is pointless */
3750
3751       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3752         {
3753         *errorcodeptr = ERR55;
3754         goto FAILED;
3755         }
3756
3757       /* If the maximum repeat count is unlimited, find the end of the bracket
3758       by scanning through from the start, and compute the offset back to it
3759       from the current code pointer. There may be an OP_OPT setting following
3760       the final KET, so we can't find the end just by going back from the code
3761       pointer. */
3762
3763       if (repeat_max == -1)
3764         {
3765         register uschar *ket = previous;
3766         do ket += GET(ket, 1); while (*ket != OP_KET);
3767         ketoffset = code - ket;
3768         }
3769
3770       /* The case of a zero minimum is special because of the need to stick
3771       OP_BRAZERO in front of it, and because the group appears once in the
3772       data, whereas in other cases it appears the minimum number of times. For
3773       this reason, it is simplest to treat this case separately, as otherwise
3774       the code gets far too messy. There are several special subcases when the
3775       minimum is zero. */
3776
3777       if (repeat_min == 0)
3778         {
3779         /* If the maximum is also zero, we used to just omit the group from the
3780         output altogether, like this:
3781
3782         ** if (repeat_max == 0)
3783         **   {
3784         **   code = previous;
3785         **   goto END_REPEAT;
3786         **   }
3787
3788         However, that fails when a group is referenced as a subroutine from
3789         elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3790         so that it is skipped on execution. As we don't have a list of which
3791         groups are referenced, we cannot do this selectively.
3792
3793         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3794         and do no more at this point. However, we do need to adjust any
3795         OP_RECURSE calls inside the group that refer to the group itself or any
3796         internal or forward referenced group, because the offset is from the
3797         start of the whole regex. Temporarily terminate the pattern while doing
3798         this. */
3799
3800         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
3801           {
3802           *code = OP_END;
3803           adjust_recurse(previous, 1, utf8, cd, save_hwm);
3804           memmove(previous+1, previous, len);
3805           code++;
3806           if (repeat_max == 0)
3807             {
3808             *previous++ = OP_SKIPZERO;
3809             goto END_REPEAT;
3810             }
3811           *previous++ = OP_BRAZERO + repeat_type;
3812           }
3813
3814         /* If the maximum is greater than 1 and limited, we have to replicate
3815         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3816         The first one has to be handled carefully because it's the original
3817         copy, which has to be moved up. The remainder can be handled by code
3818         that is common with the non-zero minimum case below. We have to
3819         adjust the value of repeat_max, since one less copy is required. Once
3820         again, we may have to adjust any OP_RECURSE calls inside the group. */
3821
3822         else
3823           {
3824           int offset;
3825           *code = OP_END;
3826           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3827           memmove(previous + 2 + LINK_SIZE, previous, len);
3828           code += 2 + LINK_SIZE;
3829           *previous++ = OP_BRAZERO + repeat_type;
3830           *previous++ = OP_BRA;
3831
3832           /* We chain together the bracket offset fields that have to be
3833           filled in later when the ends of the brackets are reached. */
3834
3835           offset = (bralink == NULL)? 0 : previous - bralink;
3836           bralink = previous;
3837           PUTINC(previous, 0, offset);
3838           }
3839
3840         repeat_max--;
3841         }
3842
3843       /* If the minimum is greater than zero, replicate the group as many
3844       times as necessary, and adjust the maximum to the number of subsequent
3845       copies that we need. If we set a first char from the group, and didn't
3846       set a required char, copy the latter from the former. If there are any
3847       forward reference subroutine calls in the group, there will be entries on
3848       the workspace list; replicate these with an appropriate increment. */
3849
3850       else
3851         {
3852         if (repeat_min > 1)
3853           {
3854           /* In the pre-compile phase, we don't actually do the replication. We
3855           just adjust the length as if we had. Do some paranoid checks for
3856           potential integer overflow. */
3857
3858           if (lengthptr != NULL)
3859             {
3860             int delta = (repeat_min - 1)*length_prevgroup;
3861             if ((double)(repeat_min - 1)*(double)length_prevgroup >
3862                                                             (double)INT_MAX ||
3863                 OFLOW_MAX - *lengthptr < delta)
3864               {
3865               *errorcodeptr = ERR20;
3866               goto FAILED;
3867               }
3868             *lengthptr += delta;
3869             }
3870
3871           /* This is compiling for real */
3872
3873           else
3874             {
3875             if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3876             for (i = 1; i < repeat_min; i++)
3877               {
3878               uschar *hc;
3879               uschar *this_hwm = cd->hwm;
3880               memcpy(code, previous, len);
3881               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3882                 {
3883                 PUT(cd->hwm, 0, GET(hc, 0) + len);
3884                 cd->hwm += LINK_SIZE;
3885                 }
3886               save_hwm = this_hwm;
3887               code += len;
3888               }
3889             }
3890           }
3891
3892         if (repeat_max > 0) repeat_max -= repeat_min;
3893         }
3894
3895       /* This code is common to both the zero and non-zero minimum cases. If
3896       the maximum is limited, it replicates the group in a nested fashion,
3897       remembering the bracket starts on a stack. In the case of a zero minimum,
3898       the first one was set up above. In all cases the repeat_max now specifies
3899       the number of additional copies needed. Again, we must remember to
3900       replicate entries on the forward reference list. */
3901
3902       if (repeat_max >= 0)
3903         {
3904         /* In the pre-compile phase, we don't actually do the replication. We
3905         just adjust the length as if we had. For each repetition we must add 1
3906         to the length for BRAZERO and for all but the last repetition we must
3907         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3908         paranoid checks to avoid integer overflow. */
3909
3910         if (lengthptr != NULL && repeat_max > 0)
3911           {
3912           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3913                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
3914           if ((double)repeat_max *
3915                 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3916                   > (double)INT_MAX ||
3917               OFLOW_MAX - *lengthptr < delta)
3918             {
3919             *errorcodeptr = ERR20;
3920             goto FAILED;
3921             }
3922           *lengthptr += delta;
3923           }
3924
3925         /* This is compiling for real */
3926
3927         else for (i = repeat_max - 1; i >= 0; i--)
3928           {
3929           uschar *hc;
3930           uschar *this_hwm = cd->hwm;
3931
3932           *code++ = OP_BRAZERO + repeat_type;
3933
3934           /* All but the final copy start a new nesting, maintaining the
3935           chain of brackets outstanding. */
3936
3937           if (i != 0)
3938             {
3939             int offset;
3940             *code++ = OP_BRA;
3941             offset = (bralink == NULL)? 0 : code - bralink;
3942             bralink = code;
3943             PUTINC(code, 0, offset);
3944             }
3945
3946           memcpy(code, previous, len);
3947           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3948             {
3949             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3950             cd->hwm += LINK_SIZE;
3951             }
3952           save_hwm = this_hwm;
3953           code += len;
3954           }
3955
3956         /* Now chain through the pending brackets, and fill in their length
3957         fields (which are holding the chain links pro tem). */
3958
3959         while (bralink != NULL)
3960           {
3961           int oldlinkoffset;
3962           int offset = code - bralink + 1;
3963           uschar *bra = code - offset;
3964           oldlinkoffset = GET(bra, 1);
3965           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3966           *code++ = OP_KET;
3967           PUTINC(code, 0, offset);
3968           PUT(bra, 1, offset);
3969           }
3970         }
3971
3972       /* If the maximum is unlimited, set a repeater in the final copy. We
3973       can't just offset backwards from the current code point, because we
3974       don't know if there's been an options resetting after the ket. The
3975       correct offset was computed above.
3976
3977       Then, when we are doing the actual compile phase, check to see whether
3978       this group is a non-atomic one that could match an empty string. If so,
3979       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3980       that runtime checking can be done. [This check is also applied to
3981       atomic groups at runtime, but in a different way.] */
3982
3983       else
3984         {
3985         uschar *ketcode = code - ketoffset;
3986         uschar *bracode = ketcode - GET(ketcode, 1);
3987         *ketcode = OP_KETRMAX + repeat_type;
3988         if (lengthptr == NULL && *bracode != OP_ONCE)
3989           {
3990           uschar *scode = bracode;
3991           do
3992             {
3993             if (could_be_empty_branch(scode, ketcode, utf8))
3994               {
3995               *bracode += OP_SBRA - OP_BRA;
3996               break;
3997               }
3998             scode += GET(scode, 1);
3999             }
4000           while (*scode == OP_ALT);
4001           }
4002         }
4003       }
4004
4005     /* If previous is OP_FAIL, it was generated by an empty class [] in
4006     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4007     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4008     error above. We can just ignore the repeat in JS case. */
4009
4010     else if (*previous == OP_FAIL) goto END_REPEAT;
4011
4012     /* Else there's some kind of shambles */
4013
4014     else
4015       {
4016       *errorcodeptr = ERR11;
4017       goto FAILED;
4018       }
4019
4020     /* If the character following a repeat is '+', or if certain optimization
4021     tests above succeeded, possessive_quantifier is TRUE. For some of the
4022     simpler opcodes, there is a special alternative opcode for this. For
4023     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4024     The '+' notation is just syntactic sugar, taken from Sun's Java package,
4025     but the special opcodes can optimize it a bit. The repeated item starts at
4026     tempcode, not at previous, which might be the first part of a string whose
4027     (former) last char we repeated.
4028
4029     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4030     an 'upto' may follow. We skip over an 'exact' item, and then test the
4031     length of what remains before proceeding. */
4032
4033     if (possessive_quantifier)
4034       {
4035       int len;
4036       if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4037           *tempcode == OP_NOTEXACT)
4038         tempcode += _pcre_OP_lengths[*tempcode] +
4039           ((*tempcode == OP_TYPEEXACT &&
4040              (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4041       len = code - tempcode;
4042       if (len > 0) switch (*tempcode)
4043         {
4044         case OP_STAR:  *tempcode = OP_POSSTAR; break;
4045         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
4046         case OP_QUERY: *tempcode = OP_POSQUERY; break;
4047         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
4048
4049         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
4050         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
4051         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4052         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
4053
4054         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
4055         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
4056         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4057         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4058
4059         default:
4060         memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4061         code += 1 + LINK_SIZE;
4062         len += 1 + LINK_SIZE;
4063         tempcode[0] = OP_ONCE;
4064         *code++ = OP_KET;
4065         PUTINC(code, 0, len);
4066         PUT(tempcode, 1, len);
4067         break;
4068         }
4069       }
4070
4071     /* In all cases we no longer have a previous item. We also set the
4072     "follows varying string" flag for subsequently encountered reqbytes if
4073     it isn't already set and we have just passed a varying length item. */
4074
4075     END_REPEAT:
4076     previous = NULL;
4077     cd->req_varyopt |= reqvary;
4078     break;
4079
4080
4081     /* ===================================================================*/
4082     /* Start of nested parenthesized sub-expression, or comment or lookahead or
4083     lookbehind or option setting or condition or all the other extended
4084     parenthesis forms.  */
4085
4086     case '(':
4087     newoptions = options;
4088     skipbytes = 0;
4089     bravalue = OP_CBRA;
4090     save_hwm = cd->hwm;
4091     reset_bracount = FALSE;
4092
4093     /* First deal with various "verbs" that can be introduced by '*'. */
4094
4095     if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4096       {
4097       int i, namelen;
4098       const char *vn = verbnames;
4099       const uschar *name = ++ptr;
4100       previous = NULL;
4101       while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4102       if (*ptr == ':')
4103         {
4104         *errorcodeptr = ERR59;   /* Not supported */
4105         goto FAILED;
4106         }
4107       if (*ptr != ')')
4108         {
4109         *errorcodeptr = ERR60;
4110         goto FAILED;
4111         }
4112       namelen = ptr - name;
4113       for (i = 0; i < verbcount; i++)
4114         {
4115         if (namelen == verbs[i].len &&
4116             strncmp((char *)name, vn, namelen) == 0)
4117           {
4118           *code = verbs[i].op;
4119           if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4120           break;
4121           }
4122         vn += verbs[i].len + 1;
4123         }
4124       if (i < verbcount) continue;
4125       *errorcodeptr = ERR60;
4126       goto FAILED;
4127       }
4128
4129     /* Deal with the extended parentheses; all are introduced by '?', and the
4130     appearance of any of them means that this is not a capturing group. */
4131
4132     else if (*ptr == '?')
4133       {
4134       int i, set, unset, namelen;
4135       int *optset;
4136       const uschar *name;
4137       uschar *slot;
4138
4139       switch (*(++ptr))
4140         {
4141         case '#':                 /* Comment; skip to ket */
4142         ptr++;
4143         while (*ptr != 0 && *ptr != ')') ptr++;
4144         if (*ptr == 0)
4145           {
4146           *errorcodeptr = ERR18;
4147           goto FAILED;
4148           }
4149         continue;
4150
4151
4152         /* ------------------------------------------------------------ */
4153         case '|':                 /* Reset capture count for each branch */
4154         reset_bracount = TRUE;
4155         /* Fall through */
4156
4157         /* ------------------------------------------------------------ */
4158         case ':':                 /* Non-capturing bracket */
4159         bravalue = OP_BRA;
4160         ptr++;
4161         break;
4162
4163
4164         /* ------------------------------------------------------------ */
4165         case '(':
4166         bravalue = OP_COND;       /* Conditional group */
4167
4168         /* A condition can be an assertion, a number (referring to a numbered
4169         group), a name (referring to a named group), or 'R', referring to
4170         recursion. R<digits> and R&name are also permitted for recursion tests.
4171
4172         There are several syntaxes for testing a named group: (?(name)) is used
4173         by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4174
4175         There are two unfortunate ambiguities, caused by history. (a) 'R' can
4176         be the recursive thing or the name 'R' (and similarly for 'R' followed
4177         by digits), and (b) a number could be a name that consists of digits.
4178         In both cases, we look for a name first; if not found, we try the other
4179         cases. */
4180
4181         /* For conditions that are assertions, check the syntax, and then exit
4182         the switch. This will take control down to where bracketed groups,
4183         including assertions, are processed. */
4184
4185         if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4186           break;
4187
4188         /* Most other conditions use OP_CREF (a couple change to OP_RREF
4189         below), and all need to skip 3 bytes at the start of the group. */
4190
4191         code[1+LINK_SIZE] = OP_CREF;
4192         skipbytes = 3;
4193         refsign = -1;
4194
4195         /* Check for a test for recursion in a named group. */
4196
4197         if (ptr[1] == 'R' && ptr[2] == '&')
4198           {
4199           terminator = -1;
4200           ptr += 2;
4201           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
4202           }
4203
4204         /* Check for a test for a named group's having been set, using the Perl
4205         syntax (?(<name>) or (?('name') */
4206
4207         else if (ptr[1] == '<')
4208           {
4209           terminator = '>';
4210           ptr++;
4211           }
4212         else if (ptr[1] == '\'')
4213           {
4214           terminator = '\'';
4215           ptr++;
4216           }
4217         else
4218           {
4219           terminator = 0;
4220           if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4221           }
4222
4223         /* We now expect to read a name; anything else is an error */
4224
4225         if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4226           {
4227           ptr += 1;  /* To get the right offset */
4228           *errorcodeptr = ERR28;
4229           goto FAILED;
4230           }
4231
4232         /* Read the name, but also get it as a number if it's all digits */
4233
4234         recno = 0;
4235         name = ++ptr;
4236         while ((cd->ctypes[*ptr] & ctype_word) != 0)
4237           {
4238           if (recno >= 0)
4239             recno = (g_ascii_isdigit (*ptr) != 0)?
4240               recno * 10 + *ptr - '0' : -1;
4241           ptr++;
4242           }
4243         namelen = ptr - name;
4244
4245         if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4246           {
4247           ptr--;      /* Error offset */
4248           *errorcodeptr = ERR26;
4249           goto FAILED;
4250           }
4251
4252         /* Do no further checking in the pre-compile phase. */
4253
4254         if (lengthptr != NULL) break;
4255
4256         /* In the real compile we do the work of looking for the actual
4257         reference. If the string started with "+" or "-" we require the rest to
4258         be digits, in which case recno will be set. */
4259
4260         if (refsign > 0)
4261           {
4262           if (recno <= 0)
4263             {
4264             *errorcodeptr = ERR58;
4265             goto FAILED;
4266             }
4267           recno = (refsign == '-')?
4268             cd->bracount - recno + 1 : recno +cd->bracount;
4269           if (recno <= 0 || recno > cd->final_bracount)
4270             {
4271             *errorcodeptr = ERR15;
4272             goto FAILED;
4273             }
4274           PUT2(code, 2+LINK_SIZE, recno);
4275           break;
4276           }
4277
4278         /* Otherwise (did not start with "+" or "-"), start by looking for the
4279         name. */
4280
4281         slot = cd->name_table;
4282         for (i = 0; i < cd->names_found; i++)
4283           {
4284           if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4285           slot += cd->name_entry_size;
4286           }
4287
4288         /* Found a previous named subpattern */
4289
4290         if (i < cd->names_found)
4291           {
4292           recno = GET2(slot, 0);
4293           PUT2(code, 2+LINK_SIZE, recno);
4294           }
4295
4296         /* Search the pattern for a forward reference */
4297
4298         else if ((i = find_parens(ptr, cd, name, namelen,
4299                         (options & PCRE_EXTENDED) != 0)) > 0)
4300           {
4301           PUT2(code, 2+LINK_SIZE, i);
4302           }
4303
4304         /* If terminator == 0 it means that the name followed directly after
4305         the opening parenthesis [e.g. (?(abc)...] and in this case there are
4306         some further alternatives to try. For the cases where terminator != 0
4307         [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4308         now checked all the possibilities, so give an error. */
4309
4310         else if (terminator != 0)
4311           {
4312           *errorcodeptr = ERR15;
4313           goto FAILED;
4314           }
4315
4316         /* Check for (?(R) for recursion. Allow digits after R to specify a
4317         specific group number. */
4318
4319         else if (*name == 'R')
4320           {
4321           recno = 0;
4322           for (i = 1; i < namelen; i++)
4323             {
4324             if (g_ascii_isdigit (name[i]) == 0)
4325               {
4326               *errorcodeptr = ERR15;
4327               goto FAILED;
4328               }
4329             recno = recno * 10 + name[i] - '0';
4330             }
4331           if (recno == 0) recno = RREF_ANY;
4332           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4333           PUT2(code, 2+LINK_SIZE, recno);
4334           }
4335
4336         /* Similarly, check for the (?(DEFINE) "condition", which is always
4337         false. */
4338
4339         else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4340           {
4341           code[1+LINK_SIZE] = OP_DEF;
4342           skipbytes = 1;
4343           }
4344
4345         /* Check for the "name" actually being a subpattern number. We are
4346         in the second pass here, so final_bracount is set. */
4347
4348         else if (recno > 0 && recno <= cd->final_bracount)
4349           {
4350           PUT2(code, 2+LINK_SIZE, recno);
4351           }
4352
4353         /* Either an unidentified subpattern, or a reference to (?(0) */
4354
4355         else
4356           {
4357           *errorcodeptr = (recno == 0)? ERR35: ERR15;
4358           goto FAILED;
4359           }
4360         break;
4361
4362
4363         /* ------------------------------------------------------------ */
4364         case '=':                 /* Positive lookahead */
4365         bravalue = OP_ASSERT;
4366         ptr++;
4367         break;
4368
4369
4370         /* ------------------------------------------------------------ */
4371         case '!':                 /* Negative lookahead */
4372         ptr++;
4373         if (*ptr == ')')          /* Optimize (?!) */
4374           {
4375           *code++ = OP_FAIL;
4376           previous = NULL;
4377           continue;
4378           }
4379         bravalue = OP_ASSERT_NOT;
4380         break;
4381
4382
4383         /* ------------------------------------------------------------ */
4384         case '<':                 /* Lookbehind or named define */
4385         switch (ptr[1])
4386           {
4387           case '=':               /* Positive lookbehind */
4388           bravalue = OP_ASSERTBACK;
4389           ptr += 2;
4390           break;
4391
4392           case '!':               /* Negative lookbehind */
4393           bravalue = OP_ASSERTBACK_NOT;
4394           ptr += 2;
4395           break;
4396
4397           default:                /* Could be name define, else bad */
4398           if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4399           ptr++;                  /* Correct offset for error */
4400           *errorcodeptr = ERR24;
4401           goto FAILED;
4402           }
4403         break;
4404
4405
4406         /* ------------------------------------------------------------ */
4407         case '>':                 /* One-time brackets */
4408         bravalue = OP_ONCE;
4409         ptr++;
4410         break;
4411
4412
4413         /* ------------------------------------------------------------ */
4414         case 'C':                 /* Callout - may be followed by digits; */
4415         previous_callout = code;  /* Save for later completion */
4416         after_manual_callout = 1; /* Skip one item before completing */
4417         *code++ = OP_CALLOUT;
4418           {
4419           int n = 0;
4420           while (g_ascii_isdigit (*(++ptr)) != 0)
4421             n = n * 10 + *ptr - '0';
4422           if (*ptr != ')')
4423             {
4424             *errorcodeptr = ERR39;
4425             goto FAILED;
4426             }
4427           if (n > 255)
4428             {
4429             *errorcodeptr = ERR38;
4430             goto FAILED;
4431             }
4432           *code++ = n;
4433           PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
4434           PUT(code, LINK_SIZE, 0);                    /* Default length */
4435           code += 2 * LINK_SIZE;
4436           }
4437         previous = NULL;
4438         continue;
4439
4440
4441         /* ------------------------------------------------------------ */
4442         case 'P':                 /* Python-style named subpattern handling */
4443         if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4444           {
4445           is_recurse = *ptr == '>';
4446           terminator = ')';
4447           goto NAMED_REF_OR_RECURSE;
4448           }
4449         else if (*ptr != '<')    /* Test for Python-style definition */
4450           {
4451           *errorcodeptr = ERR41;
4452           goto FAILED;
4453           }
4454         /* Fall through to handle (?P< as (?< is handled */
4455
4456
4457         /* ------------------------------------------------------------ */
4458         DEFINE_NAME:    /* Come here from (?< handling */
4459         case '\'':
4460           {
4461           terminator = (*ptr == '<')? '>' : '\'';
4462           name = ++ptr;
4463
4464           while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4465           namelen = ptr - name;
4466
4467           /* In the pre-compile phase, just do a syntax check. */
4468
4469           if (lengthptr != NULL)
4470             {
4471             if (*ptr != terminator)
4472               {
4473               *errorcodeptr = ERR42;
4474               goto FAILED;
4475               }
4476             if (cd->names_found >= MAX_NAME_COUNT)
4477               {
4478               *errorcodeptr = ERR49;
4479               goto FAILED;
4480               }
4481             if (namelen + 3 > cd->name_entry_size)
4482               {
4483               cd->name_entry_size = namelen + 3;
4484               if (namelen > MAX_NAME_SIZE)
4485                 {
4486                 *errorcodeptr = ERR48;
4487                 goto FAILED;
4488                 }
4489               }
4490             }
4491
4492           /* In the real compile, create the entry in the table */
4493
4494           else
4495             {
4496             slot = cd->name_table;
4497             for (i = 0; i < cd->names_found; i++)
4498               {
4499               int crc = memcmp(name, slot+2, namelen);
4500               if (crc == 0)
4501                 {
4502                 if (slot[2+namelen] == 0)
4503                   {
4504                   if ((options & PCRE_DUPNAMES) == 0)
4505                     {
4506                     *errorcodeptr = ERR43;
4507                     goto FAILED;
4508                     }
4509                   }
4510                 else crc = -1;      /* Current name is substring */
4511                 }
4512               if (crc < 0)
4513                 {
4514                 memmove(slot + cd->name_entry_size, slot,
4515                   (cd->names_found - i) * cd->name_entry_size);
4516                 break;
4517                 }
4518               slot += cd->name_entry_size;
4519               }
4520
4521             PUT2(slot, 0, cd->bracount + 1);
4522             memcpy(slot + 2, name, namelen);
4523             slot[2+namelen] = 0;
4524             }
4525           }
4526
4527         /* In both cases, count the number of names we've encountered. */
4528
4529         ptr++;                    /* Move past > or ' */
4530         cd->names_found++;
4531         goto NUMBERED_GROUP;
4532
4533
4534         /* ------------------------------------------------------------ */
4535         case '&':                 /* Perl recursion/subroutine syntax */
4536         terminator = ')';
4537         is_recurse = TRUE;
4538         /* Fall through */
4539
4540         /* We come here from the Python syntax above that handles both
4541         references (?P=name) and recursion (?P>name), as well as falling
4542         through from the Perl recursion syntax (?&name). We also come here from
4543         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4544         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4545
4546         NAMED_REF_OR_RECURSE:
4547         name = ++ptr;
4548         while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4549         namelen = ptr - name;
4550
4551         /* In the pre-compile phase, do a syntax check and set a dummy
4552         reference number. */
4553
4554         if (lengthptr != NULL)
4555           {
4556           if (namelen == 0)
4557             {
4558             *errorcodeptr = ERR62;
4559             goto FAILED;
4560             }
4561           if (*ptr != terminator)
4562             {
4563             *errorcodeptr = ERR42;
4564             goto FAILED;
4565             }
4566           if (namelen > MAX_NAME_SIZE)
4567             {
4568             *errorcodeptr = ERR48;
4569             goto FAILED;
4570             }
4571           recno = 0;
4572           }
4573
4574         /* In the real compile, seek the name in the table. We check the name
4575         first, and then check that we have reached the end of the name in the
4576         table. That way, if the name is longer than any in the table,
4577         the comparison will fail without reading beyond the table entry. */
4578
4579         else
4580           {
4581           slot = cd->name_table;
4582           for (i = 0; i < cd->names_found; i++)
4583             {
4584             if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4585                 slot[2+namelen] == 0)
4586               break;
4587             slot += cd->name_entry_size;
4588             }
4589
4590           if (i < cd->names_found)         /* Back reference */
4591             {
4592             recno = GET2(slot, 0);
4593             }
4594           else if ((recno =                /* Forward back reference */
4595                     find_parens(ptr, cd, name, namelen,
4596                       (options & PCRE_EXTENDED) != 0)) <= 0)
4597             {
4598             *errorcodeptr = ERR15;
4599             goto FAILED;
4600             }
4601           }
4602
4603         /* In both phases, we can now go to the code that handles numerical
4604         recursion or backreferences. */
4605
4606         if (is_recurse) goto HANDLE_RECURSION;
4607           else goto HANDLE_REFERENCE;
4608
4609
4610         /* ------------------------------------------------------------ */
4611         case 'R':                 /* Recursion */
4612         ptr++;                    /* Same as (?0)      */
4613         /* Fall through */
4614
4615
4616         /* ------------------------------------------------------------ */
4617         case '-': case '+':
4618         case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4619         case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4620           {
4621           const uschar *called;
4622           terminator = ')';
4623
4624           /* Come here from the \g<...> and \g'...' code (Oniguruma
4625           compatibility). However, the syntax has been checked to ensure that
4626           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4627           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4628           ever be taken. */
4629
4630           HANDLE_NUMERICAL_RECURSION:
4631
4632           if ((refsign = *ptr) == '+')
4633             {
4634             ptr++;
4635             if (g_ascii_isdigit (*ptr) == 0)
4636               {
4637               *errorcodeptr = ERR63;
4638               goto FAILED;
4639               }
4640             }
4641           else if (refsign == '-')
4642             {
4643             if (g_ascii_isdigit (ptr[1]) == 0)
4644               goto OTHER_CHAR_AFTER_QUERY;
4645             ptr++;
4646             }
4647
4648           recno = 0;
4649           while(g_ascii_isdigit (*ptr) != 0)
4650             recno = recno * 10 + *ptr++ - '0';
4651
4652           if (*ptr != terminator)
4653             {
4654             *errorcodeptr = ERR29;
4655             goto FAILED;
4656             }
4657
4658           if (refsign == '-')
4659             {
4660             if (recno == 0)
4661               {
4662               *errorcodeptr = ERR58;
4663               goto FAILED;
4664               }
4665             recno = cd->bracount - recno + 1;
4666             if (recno <= 0)
4667               {
4668               *errorcodeptr = ERR15;
4669               goto FAILED;
4670               }
4671             }
4672           else if (refsign == '+')
4673             {
4674             if (recno == 0)
4675               {
4676               *errorcodeptr = ERR58;
4677               goto FAILED;
4678               }
4679             recno += cd->bracount;
4680             }
4681
4682           /* Come here from code above that handles a named recursion */
4683
4684           HANDLE_RECURSION:
4685
4686           previous = code;
4687           called = cd->start_code;
4688
4689           /* When we are actually compiling, find the bracket that is being
4690           referenced. Temporarily end the regex in case it doesn't exist before
4691           this point. If we end up with a forward reference, first check that
4692           the bracket does occur later so we can give the error (and position)
4693           now. Then remember this forward reference in the workspace so it can
4694           be filled in at the end. */
4695
4696           if (lengthptr == NULL)
4697             {
4698             *code = OP_END;
4699             if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4700
4701             /* Forward reference */
4702
4703             if (called == NULL)
4704               {
4705               if (find_parens(ptr, cd, NULL, recno,
4706                     (options & PCRE_EXTENDED) != 0) < 0)
4707                 {
4708                 *errorcodeptr = ERR15;
4709                 goto FAILED;
4710                 }
4711               called = cd->start_code + recno;
4712               PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4713               }
4714
4715             /* If not a forward reference, and the subpattern is still open,
4716             this is a recursive call. We check to see if this is a left
4717             recursion that could loop for ever, and diagnose that case. */
4718
4719             else if (GET(called, 1) == 0 &&
4720                      could_be_empty(called, code, bcptr, utf8))
4721               {
4722               *errorcodeptr = ERR40;
4723               goto FAILED;
4724               }
4725             }
4726
4727           /* Insert the recursion/subroutine item, automatically wrapped inside
4728           "once" brackets. Set up a "previous group" length so that a
4729           subsequent quantifier will work. */
4730
4731           *code = OP_ONCE;
4732           PUT(code, 1, 2 + 2*LINK_SIZE);
4733           code += 1 + LINK_SIZE;
4734
4735           *code = OP_RECURSE;
4736           PUT(code, 1, called - cd->start_code);
4737           code += 1 + LINK_SIZE;
4738
4739           *code = OP_KET;
4740           PUT(code, 1, 2 + 2*LINK_SIZE);
4741           code += 1 + LINK_SIZE;
4742
4743           length_prevgroup = 3 + 3*LINK_SIZE;
4744           }
4745
4746         /* Can't determine a first byte now */
4747
4748         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4749         continue;
4750
4751
4752         /* ------------------------------------------------------------ */
4753         default:              /* Other characters: check option setting */
4754         OTHER_CHAR_AFTER_QUERY:
4755         set = unset = 0;
4756         optset = &set;
4757
4758         while (*ptr != ')' && *ptr != ':')
4759           {
4760           switch (*ptr++)
4761             {
4762             case '-': optset = &unset; break;
4763
4764             case 'J':    /* Record that it changed in the external options */
4765             *optset |= PCRE_DUPNAMES;
4766             cd->external_flags |= PCRE_JCHANGED;
4767             break;
4768
4769             case 'i': *optset |= PCRE_CASELESS; break;
4770             case 'm': *optset |= PCRE_MULTILINE; break;
4771             case 's': *optset |= PCRE_DOTALL; break;
4772             case 'x': *optset |= PCRE_EXTENDED; break;
4773             case 'U': *optset |= PCRE_UNGREEDY; break;
4774             case 'X': *optset |= PCRE_EXTRA; break;
4775
4776             default:  *errorcodeptr = ERR12;
4777                       ptr--;    /* Correct the offset */
4778                       goto FAILED;
4779             }
4780           }
4781
4782         /* Set up the changed option bits, but don't change anything yet. */
4783
4784         newoptions = (options | set) & (~unset);
4785
4786         /* If the options ended with ')' this is not the start of a nested
4787         group with option changes, so the options change at this level. If this
4788         item is right at the start of the pattern, the options can be
4789         abstracted and made external in the pre-compile phase, and ignored in
4790         the compile phase. This can be helpful when matching -- for instance in
4791         caseless checking of required bytes.
4792
4793         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4794         definitely *not* at the start of the pattern because something has been
4795         compiled. In the pre-compile phase, however, the code pointer can have
4796         that value after the start, because it gets reset as code is discarded
4797         during the pre-compile. However, this can happen only at top level - if
4798         we are within parentheses, the starting BRA will still be present. At
4799         any parenthesis level, the length value can be used to test if anything
4800         has been compiled at that level. Thus, a test for both these conditions
4801         is necessary to ensure we correctly detect the start of the pattern in
4802         both phases.
4803
4804         If we are not at the pattern start, compile code to change the ims
4805         options if this setting actually changes any of them, and reset the
4806         greedy defaults and the case value for firstbyte and reqbyte. */
4807
4808         if (*ptr == ')')
4809           {
4810           if (code == cd->start_code + 1 + LINK_SIZE &&
4811                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4812             {
4813             cd->external_options = newoptions;
4814             }
4815          else
4816             {
4817             if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4818               {
4819               *code++ = OP_OPT;
4820               *code++ = newoptions & PCRE_IMS;
4821               }
4822             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4823             greedy_non_default = greedy_default ^ 1;
4824             req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4825             }
4826
4827           /* Change options at this level, and pass them back for use
4828           in subsequent branches. When not at the start of the pattern, this
4829           information is also necessary so that a resetting item can be
4830           compiled at the end of a group (if we are in a group). */
4831
4832           *optionsptr = options = newoptions;
4833           previous = NULL;       /* This item can't be repeated */
4834           continue;              /* It is complete */
4835           }
4836
4837         /* If the options ended with ':' we are heading into a nested group
4838         with possible change of options. Such groups are non-capturing and are
4839         not assertions of any kind. All we need to do is skip over the ':';
4840         the newoptions value is handled below. */
4841
4842         bravalue = OP_BRA;
4843         ptr++;
4844         }     /* End of switch for character following (? */
4845       }       /* End of (? handling */
4846
4847     /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4848     all unadorned brackets become non-capturing and behave like (?:...)
4849     brackets. */
4850
4851     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4852       {
4853       bravalue = OP_BRA;
4854       }
4855
4856     /* Else we have a capturing group. */
4857
4858     else
4859       {
4860       NUMBERED_GROUP:
4861       cd->bracount += 1;
4862       PUT2(code, 1+LINK_SIZE, cd->bracount);
4863       skipbytes = 2;
4864       }
4865
4866     /* Process nested bracketed regex. Assertions may not be repeated, but
4867     other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4868     non-register variable in order to be able to pass its address because some
4869     compilers complain otherwise. Pass in a new setting for the ims options if
4870     they have changed. */
4871
4872     previous = (bravalue >= OP_ONCE)? code : NULL;
4873     *code = bravalue;
4874     tempcode = code;
4875     tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4876     length_prevgroup = 0;              /* Initialize for pre-compile phase */
4877
4878     if (!compile_regex(
4879          newoptions,                   /* The complete new option state */
4880          options & PCRE_IMS,           /* The previous ims option state */
4881          &tempcode,                    /* Where to put code (updated) */
4882          &ptr,                         /* Input pointer (updated) */
4883          errorcodeptr,                 /* Where to put an error message */
4884          (bravalue == OP_ASSERTBACK ||
4885           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4886          reset_bracount,               /* True if (?| group */
4887          skipbytes,                    /* Skip over bracket number */
4888          &subfirstbyte,                /* For possible first char */
4889          &subreqbyte,                  /* For possible last char */
4890          bcptr,                        /* Current branch chain */
4891          cd,                           /* Tables block */
4892          (lengthptr == NULL)? NULL :   /* Actual compile phase */
4893            &length_prevgroup           /* Pre-compile phase */
4894          ))
4895       goto FAILED;
4896
4897     /* At the end of compiling, code is still pointing to the start of the
4898     group, while tempcode has been updated to point past the end of the group
4899     and any option resetting that may follow it. The pattern pointer (ptr)
4900     is on the bracket. */
4901
4902     /* If this is a conditional bracket, check that there are no more than
4903     two branches in the group, or just one if it's a DEFINE group. We do this
4904     in the real compile phase, not in the pre-pass, where the whole group may
4905     not be available. */
4906
4907     if (bravalue == OP_COND && lengthptr == NULL)
4908       {
4909       uschar *tc = code;
4910       int condcount = 0;
4911
4912       do {
4913          condcount++;
4914          tc += GET(tc,1);
4915          }
4916       while (*tc != OP_KET);
4917
4918       /* A DEFINE group is never obeyed inline (the "condition" is always
4919       false). It must have only one branch. */
4920
4921       if (code[LINK_SIZE+1] == OP_DEF)
4922         {
4923         if (condcount > 1)
4924           {
4925           *errorcodeptr = ERR54;
4926           goto FAILED;
4927           }
4928         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
4929         }
4930
4931       /* A "normal" conditional group. If there is just one branch, we must not
4932       make use of its firstbyte or reqbyte, because this is equivalent to an
4933       empty second branch. */
4934
4935       else
4936         {
4937         if (condcount > 2)
4938           {
4939           *errorcodeptr = ERR27;
4940           goto FAILED;
4941           }
4942         if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4943         }
4944       }
4945
4946     /* Error if hit end of pattern */
4947
4948     if (*ptr != ')')
4949       {
4950       *errorcodeptr = ERR14;
4951       goto FAILED;
4952       }
4953
4954     /* In the pre-compile phase, update the length by the length of the group,
4955     less the brackets at either end. Then reduce the compiled code to just a
4956     set of non-capturing brackets so that it doesn't use much memory if it is
4957     duplicated by a quantifier.*/
4958
4959     if (lengthptr != NULL)
4960       {
4961       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4962         {
4963         *errorcodeptr = ERR20;
4964         goto FAILED;
4965         }
4966       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4967       *code++ = OP_BRA;
4968       PUTINC(code, 0, 1 + LINK_SIZE);
4969       *code++ = OP_KET;
4970       PUTINC(code, 0, 1 + LINK_SIZE);
4971       break;    /* No need to waste time with special character handling */
4972       }
4973
4974     /* Otherwise update the main code pointer to the end of the group. */
4975
4976     code = tempcode;
4977
4978     /* For a DEFINE group, required and first character settings are not
4979     relevant. */
4980
4981     if (bravalue == OP_DEF) break;
4982
4983     /* Handle updating of the required and first characters for other types of
4984     group. Update for normal brackets of all kinds, and conditions with two
4985     branches (see code above). If the bracket is followed by a quantifier with
4986     zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4987     zerofirstbyte outside the main loop so that they can be accessed for the
4988     back off. */
4989
4990     zeroreqbyte = reqbyte;
4991     zerofirstbyte = firstbyte;
4992     groupsetfirstbyte = FALSE;
4993
4994     if (bravalue >= OP_ONCE)
4995       {
4996       /* If we have not yet set a firstbyte in this branch, take it from the
4997       subpattern, remembering that it was set here so that a repeat of more
4998       than one can replicate it as reqbyte if necessary. If the subpattern has
4999       no firstbyte, set "none" for the whole branch. In both cases, a zero
5000       repeat forces firstbyte to "none". */
5001
5002       if (firstbyte == REQ_UNSET)
5003         {
5004         if (subfirstbyte >= 0)
5005           {
5006           firstbyte = subfirstbyte;
5007           groupsetfirstbyte = TRUE;
5008           }
5009         else firstbyte = REQ_NONE;
5010         zerofirstbyte = REQ_NONE;
5011         }
5012
5013       /* If firstbyte was previously set, convert the subpattern's firstbyte
5014       into reqbyte if there wasn't one, using the vary flag that was in
5015       existence beforehand. */
5016
5017       else if (subfirstbyte >= 0 && subreqbyte < 0)
5018         subreqbyte = subfirstbyte | tempreqvary;
5019
5020       /* If the subpattern set a required byte (or set a first byte that isn't
5021       really the first byte - see above), set it. */
5022
5023       if (subreqbyte >= 0) reqbyte = subreqbyte;
5024       }
5025
5026     /* For a forward assertion, we take the reqbyte, if set. This can be
5027     helpful if the pattern that follows the assertion doesn't set a different
5028     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5029     for an assertion, however because it leads to incorrect effect for patterns
5030     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5031     of a firstbyte. This is overcome by a scan at the end if there's no
5032     firstbyte, looking for an asserted first char. */
5033
5034     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5035     break;     /* End of processing '(' */
5036
5037
5038     /* ===================================================================*/
5039     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5040     are arranged to be the negation of the corresponding OP_values. For the
5041     back references, the values are ESC_REF plus the reference number. Only
5042     back references and those types that consume a character may be repeated.
5043     We can test for values between ESC_b and ESC_Z for the latter; this may
5044     have to change if any new ones are ever created. */
5045
5046     case '\\':
5047     tempptr = ptr;
5048     c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5049     if (*errorcodeptr != 0) goto FAILED;
5050
5051     if (c < 0)
5052       {
5053       if (-c == ESC_Q)            /* Handle start of quoted string */
5054         {
5055         if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5056           else inescq = TRUE;
5057         continue;
5058         }
5059
5060       if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
5061
5062       /* For metasequences that actually match a character, we disable the
5063       setting of a first character if it hasn't already been set. */
5064
5065       if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5066         firstbyte = REQ_NONE;
5067
5068       /* Set values to reset to if this is followed by a zero repeat. */
5069
5070       zerofirstbyte = firstbyte;
5071       zeroreqbyte = reqbyte;
5072
5073       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5074       is a subroutine call by number (Oniguruma syntax). In fact, the value
5075       -ESC_g is returned only for these cases. So we don't need to check for <
5076       or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5077       -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5078       that is a synonym for a named back reference). */
5079
5080       if (-c == ESC_g)
5081         {
5082         const uschar *p;
5083         save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
5084         terminator = (*(++ptr) == '<')? '>' : '\'';
5085
5086         /* These two statements stop the compiler from warning about possibly
5087         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5088         fact, because we actually check for a number below, the paths that
5089         would actually be in error are never taken. */
5090
5091         skipbytes = 0;
5092         reset_bracount = FALSE;
5093
5094         /* Test for a name */
5095
5096         if (ptr[1] != '+' && ptr[1] != '-')
5097           {
5098           BOOL isnumber = TRUE;
5099           for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5100             {
5101             if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5102             if ((cd->ctypes[*p] & ctype_word) == 0) break;
5103             }
5104           if (*p != terminator)
5105             {
5106             *errorcodeptr = ERR57;
5107             break;
5108             }
5109           if (isnumber)
5110             {
5111             ptr++;
5112             goto HANDLE_NUMERICAL_RECURSION;
5113             }
5114           is_recurse = TRUE;
5115           goto NAMED_REF_OR_RECURSE;
5116           }
5117
5118         /* Test a signed number in angle brackets or quotes. */
5119
5120         p = ptr + 2;
5121         while (g_ascii_isdigit (*p) != 0) p++;
5122         if (*p != terminator)
5123           {
5124           *errorcodeptr = ERR57;
5125           break;
5126           }
5127         ptr++;
5128         goto HANDLE_NUMERICAL_RECURSION;
5129         }
5130
5131       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5132       We also support \k{name} (.NET syntax) */
5133
5134       if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5135         {
5136         is_recurse = FALSE;
5137         terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5138         goto NAMED_REF_OR_RECURSE;
5139         }
5140
5141       /* Back references are handled specially; must disable firstbyte if
5142       not set to cope with cases like (?=(\w+))\1: which would otherwise set
5143       ':' later. */
5144
5145       if (-c >= ESC_REF)
5146         {
5147         recno = -c - ESC_REF;
5148
5149         HANDLE_REFERENCE:    /* Come here from named backref handling */
5150         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5151         previous = code;
5152         *code++ = OP_REF;
5153         PUT2INC(code, 0, recno);
5154         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5155         if (recno > cd->top_backref) cd->top_backref = recno;
5156         }
5157
5158       /* So are Unicode property matches, if supported. */
5159
5160 #ifdef SUPPORT_UCP
5161       else if (-c == ESC_P || -c == ESC_p)
5162         {
5163         BOOL negated;
5164         int pdata;
5165         int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5166         if (ptype < 0) goto FAILED;
5167         previous = code;
5168         *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5169         *code++ = ptype;
5170         *code++ = pdata;
5171         }
5172 #else
5173
5174       /* If Unicode properties are not supported, \X, \P, and \p are not
5175       allowed. */
5176
5177       else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5178         {
5179         *errorcodeptr = ERR45;
5180         goto FAILED;
5181         }
5182 #endif
5183
5184       /* For the rest (including \X when Unicode properties are supported), we
5185       can obtain the OP value by negating the escape value. */
5186
5187       else
5188         {
5189         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5190         *code++ = -c;
5191         }
5192       continue;
5193       }
5194
5195     /* We have a data character whose value is in c. In UTF-8 mode it may have
5196     a value > 127. We set its representation in the length/buffer, and then
5197     handle it as a data character. */
5198
5199 #ifdef SUPPORT_UTF8
5200     if (utf8 && c > 127)
5201       mclength = _pcre_ord2utf8(c, mcbuffer);
5202     else
5203 #endif
5204
5205      {
5206      mcbuffer[0] = c;
5207      mclength = 1;
5208      }
5209     goto ONE_CHAR;
5210
5211
5212     /* ===================================================================*/
5213     /* Handle a literal character. It is guaranteed not to be whitespace or #
5214     when the extended flag is set. If we are in UTF-8 mode, it may be a
5215     multi-byte literal character. */
5216
5217     default:
5218     NORMAL_CHAR:
5219     mclength = 1;
5220     mcbuffer[0] = c;
5221
5222 #ifdef SUPPORT_UTF8
5223     if (utf8 && c >= 0xc0)
5224       {
5225       while ((ptr[1] & 0xc0) == 0x80)
5226         mcbuffer[mclength++] = *(++ptr);
5227       }
5228 #endif
5229
5230     /* At this point we have the character's bytes in mcbuffer, and the length
5231     in mclength. When not in UTF-8 mode, the length is always 1. */
5232
5233     ONE_CHAR:
5234     previous = code;
5235     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5236     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5237
5238     /* Remember if \r or \n were seen */
5239
5240     if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5241       cd->external_flags |= PCRE_HASCRORLF;
5242
5243     /* Set the first and required bytes appropriately. If no previous first
5244     byte, set it from this character, but revert to none on a zero repeat.
5245     Otherwise, leave the firstbyte value alone, and don't change it on a zero
5246     repeat. */
5247
5248     if (firstbyte == REQ_UNSET)
5249       {
5250       zerofirstbyte = REQ_NONE;
5251       zeroreqbyte = reqbyte;
5252
5253       /* If the character is more than one byte long, we can set firstbyte
5254       only if it is not to be matched caselessly. */
5255
5256       if (mclength == 1 || req_caseopt == 0)
5257         {
5258         firstbyte = mcbuffer[0] | req_caseopt;
5259         if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5260         }
5261       else firstbyte = reqbyte = REQ_NONE;
5262       }
5263
5264     /* firstbyte was previously set; we can set reqbyte only if the length is
5265     1 or the matching is caseful. */
5266
5267     else
5268       {
5269       zerofirstbyte = firstbyte;
5270       zeroreqbyte = reqbyte;
5271       if (mclength == 1 || req_caseopt == 0)
5272         reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5273       }
5274
5275     break;            /* End of literal character handling */
5276     }
5277   }                   /* end of big loop */
5278
5279
5280 /* Control never reaches here by falling through, only by a goto for all the
5281 error states. Pass back the position in the pattern so that it can be displayed
5282 to the user for diagnosing the error. */
5283
5284 FAILED:
5285 *ptrptr = ptr;
5286 return FALSE;
5287 }
5288
5289
5290
5291
5292 /*************************************************
5293 *     Compile sequence of alternatives           *
5294 *************************************************/
5295
5296 /* On entry, ptr is pointing past the bracket character, but on return it
5297 points to the closing bracket, or vertical bar, or end of string. The code
5298 variable is pointing at the byte into which the BRA operator has been stored.
5299 If the ims options are changed at the start (for a (?ims: group) or during any
5300 branch, we need to insert an OP_OPT item at the start of every following branch
5301 to ensure they get set correctly at run time, and also pass the new options
5302 into every subsequent branch compile.
5303
5304 This function is used during the pre-compile phase when we are trying to find
5305 out the amount of memory needed, as well as during the real compile phase. The
5306 value of lengthptr distinguishes the two phases.
5307
5308 Arguments:
5309   options        option bits, including any changes for this subpattern
5310   oldims         previous settings of ims option bits
5311   codeptr        -> the address of the current code pointer
5312   ptrptr         -> the address of the current pattern pointer
5313   errorcodeptr   -> pointer to error code variable
5314   lookbehind     TRUE if this is a lookbehind assertion
5315   reset_bracount TRUE to reset the count for each branch
5316   skipbytes      skip this many bytes at start (for brackets and OP_COND)
5317   firstbyteptr   place to put the first required character, or a negative number
5318   reqbyteptr     place to put the last required character, or a negative number
5319   bcptr          pointer to the chain of currently open branches
5320   cd             points to the data block with tables pointers etc.
5321   lengthptr      NULL during the real compile phase
5322                  points to length accumulator during pre-compile phase
5323
5324 Returns:         TRUE on success
5325 */
5326
5327 static BOOL
5328 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5329   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5330   int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5331   int *lengthptr)
5332 {
5333 const uschar *ptr = *ptrptr;
5334 uschar *code = *codeptr;
5335 uschar *last_branch = code;
5336 uschar *start_bracket = code;
5337 uschar *reverse_count = NULL;
5338 int firstbyte, reqbyte;
5339 int branchfirstbyte, branchreqbyte;
5340 int length;
5341 int orig_bracount;
5342 int max_bracount;
5343 branch_chain bc;
5344
5345 bc.outer = bcptr;
5346 bc.current = code;
5347
5348 firstbyte = reqbyte = REQ_UNSET;
5349
5350 /* Accumulate the length for use in the pre-compile phase. Start with the
5351 length of the BRA and KET and any extra bytes that are required at the
5352 beginning. We accumulate in a local variable to save frequent testing of
5353 lenthptr for NULL. We cannot do this by looking at the value of code at the
5354 start and end of each alternative, because compiled items are discarded during
5355 the pre-compile phase so that the work space is not exceeded. */
5356
5357 length = 2 + 2*LINK_SIZE + skipbytes;
5358
5359 /* WARNING: If the above line is changed for any reason, you must also change
5360 the code that abstracts option settings at the start of the pattern and makes
5361 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5362 pre-compile phase to find out whether anything has yet been compiled or not. */
5363
5364 /* Offset is set zero to mark that this bracket is still open */
5365
5366 PUT(code, 1, 0);
5367 code += 1 + LINK_SIZE + skipbytes;
5368
5369 /* Loop for each alternative branch */
5370
5371 orig_bracount = max_bracount = cd->bracount;
5372 for (;;)
5373   {
5374   /* For a (?| group, reset the capturing bracket count so that each branch
5375   uses the same numbers. */
5376
5377   if (reset_bracount) cd->bracount = orig_bracount;
5378
5379   /* Handle a change of ims options at the start of the branch */
5380
5381   if ((options & PCRE_IMS) != oldims)
5382     {
5383     *code++ = OP_OPT;
5384     *code++ = options & PCRE_IMS;
5385     length += 2;
5386     }
5387
5388   /* Set up dummy OP_REVERSE if lookbehind assertion */
5389
5390   if (lookbehind)
5391     {
5392     *code++ = OP_REVERSE;
5393     reverse_count = code;
5394     PUTINC(code, 0, 0);
5395     length += 1 + LINK_SIZE;
5396     }
5397
5398   /* Now compile the branch; in the pre-compile phase its length gets added
5399   into the length. */
5400
5401   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5402         &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5403     {
5404     *ptrptr = ptr;
5405     return FALSE;
5406     }
5407
5408   /* Keep the highest bracket count in case (?| was used and some branch
5409   has fewer than the rest. */
5410
5411   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5412
5413   /* In the real compile phase, there is some post-processing to be done. */
5414
5415   if (lengthptr == NULL)
5416     {
5417     /* If this is the first branch, the firstbyte and reqbyte values for the
5418     branch become the values for the regex. */
5419
5420     if (*last_branch != OP_ALT)
5421       {
5422       firstbyte = branchfirstbyte;
5423       reqbyte = branchreqbyte;
5424       }
5425
5426     /* If this is not the first branch, the first char and reqbyte have to
5427     match the values from all the previous branches, except that if the
5428     previous value for reqbyte didn't have REQ_VARY set, it can still match,
5429     and we set REQ_VARY for the regex. */
5430
5431     else
5432       {
5433       /* If we previously had a firstbyte, but it doesn't match the new branch,
5434       we have to abandon the firstbyte for the regex, but if there was
5435       previously no reqbyte, it takes on the value of the old firstbyte. */
5436
5437       if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5438         {
5439         if (reqbyte < 0) reqbyte = firstbyte;
5440         firstbyte = REQ_NONE;
5441         }
5442
5443       /* If we (now or from before) have no firstbyte, a firstbyte from the
5444       branch becomes a reqbyte if there isn't a branch reqbyte. */
5445
5446       if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5447           branchreqbyte = branchfirstbyte;
5448
5449       /* Now ensure that the reqbytes match */
5450
5451       if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5452         reqbyte = REQ_NONE;
5453       else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
5454       }
5455
5456     /* If lookbehind, check that this branch matches a fixed-length string, and
5457     put the length into the OP_REVERSE item. Temporarily mark the end of the
5458     branch with OP_END. */
5459
5460     if (lookbehind)
5461       {
5462       int fixed_length;
5463       *code = OP_END;
5464       fixed_length = find_fixedlength(last_branch, options);
5465       DPRINTF(("fixed length = %d\n", fixed_length));
5466       if (fixed_length < 0)
5467         {
5468         *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5469         *ptrptr = ptr;
5470         return FALSE;
5471         }
5472       PUT(reverse_count, 0, fixed_length);
5473       }
5474     }
5475
5476   /* Reached end of expression, either ')' or end of pattern. In the real
5477   compile phase, go back through the alternative branches and reverse the chain
5478   of offsets, with the field in the BRA item now becoming an offset to the
5479   first alternative. If there are no alternatives, it points to the end of the
5480   group. The length in the terminating ket is always the length of the whole
5481   bracketed item. If any of the ims options were changed inside the group,
5482   compile a resetting op-code following, except at the very end of the pattern.
5483   Return leaving the pointer at the terminating char. */
5484
5485   if (*ptr != '|')
5486     {
5487     if (lengthptr == NULL)
5488       {
5489       int branch_length = code - last_branch;
5490       do
5491         {
5492         int prev_length = GET(last_branch, 1);
5493         PUT(last_branch, 1, branch_length);
5494         branch_length = prev_length;
5495         last_branch -= branch_length;
5496         }
5497       while (branch_length > 0);
5498       }
5499
5500     /* Fill in the ket */
5501
5502     *code = OP_KET;
5503     PUT(code, 1, code - start_bracket);
5504     code += 1 + LINK_SIZE;
5505
5506     /* Resetting option if needed */
5507
5508     if ((options & PCRE_IMS) != oldims && *ptr == ')')
5509       {
5510       *code++ = OP_OPT;
5511       *code++ = oldims;
5512       length += 2;
5513       }
5514
5515     /* Retain the highest bracket number, in case resetting was used. */
5516
5517     cd->bracount = max_bracount;
5518
5519     /* Set values to pass back */
5520
5521     *codeptr = code;
5522     *ptrptr = ptr;
5523     *firstbyteptr = firstbyte;
5524     *reqbyteptr = reqbyte;
5525     if (lengthptr != NULL)
5526       {
5527       if (OFLOW_MAX - *lengthptr < length)
5528         {
5529         *errorcodeptr = ERR20;
5530         return FALSE;
5531         }
5532       *lengthptr += length;
5533       }
5534     return TRUE;
5535     }
5536
5537   /* Another branch follows. In the pre-compile phase, we can move the code
5538   pointer back to where it was for the start of the first branch. (That is,
5539   pretend that each branch is the only one.)
5540
5541   In the real compile phase, insert an ALT node. Its length field points back
5542   to the previous branch while the bracket remains open. At the end the chain
5543   is reversed. It's done like this so that the start of the bracket has a
5544   zero offset until it is closed, making it possible to detect recursion. */
5545
5546   if (lengthptr != NULL)
5547     {
5548     code = *codeptr + 1 + LINK_SIZE + skipbytes;
5549     length += 1 + LINK_SIZE;
5550     }
5551   else
5552     {
5553     *code = OP_ALT;
5554     PUT(code, 1, code - last_branch);
5555     bc.current = last_branch = code;
5556     code += 1 + LINK_SIZE;
5557     }
5558
5559   ptr++;
5560   }
5561 /* Control never reaches here */
5562 }
5563
5564
5565
5566
5567 /*************************************************
5568 *          Check for anchored expression         *
5569 *************************************************/
5570
5571 /* Try to find out if this is an anchored regular expression. Consider each
5572 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5573 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5574 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5575 counts, since OP_CIRC can match in the middle.
5576
5577 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5578 This is the code for \G, which means "match at start of match position, taking
5579 into account the match offset".
5580
5581 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5582 because that will try the rest of the pattern at all possible matching points,
5583 so there is no point trying again.... er ....
5584
5585 .... except when the .* appears inside capturing parentheses, and there is a
5586 subsequent back reference to those parentheses. We haven't enough information
5587 to catch that case precisely.
5588
5589 At first, the best we could do was to detect when .* was in capturing brackets
5590 and the highest back reference was greater than or equal to that level.
5591 However, by keeping a bitmap of the first 31 back references, we can catch some
5592 of the more common cases more precisely.
5593
5594 Arguments:
5595   code           points to start of expression (the bracket)
5596   options        points to the options setting
5597   bracket_map    a bitmap of which brackets we are inside while testing; this
5598                   handles up to substring 31; after that we just have to take
5599                   the less precise approach
5600   backref_map    the back reference bitmap
5601
5602 Returns:     TRUE or FALSE
5603 */
5604
5605 static BOOL
5606 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5607   unsigned int backref_map)
5608 {
5609 do {
5610    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5611      options, PCRE_MULTILINE, FALSE);
5612    register int op = *scode;
5613
5614    /* Non-capturing brackets */
5615
5616    if (op == OP_BRA)
5617      {
5618      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5619      }
5620
5621    /* Capturing brackets */
5622
5623    else if (op == OP_CBRA)
5624      {
5625      int n = GET2(scode, 1+LINK_SIZE);
5626      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5627      if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5628      }
5629
5630    /* Other brackets */
5631
5632    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5633      {
5634      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5635      }
5636
5637    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
5638    it isn't in brackets that are or may be referenced. */
5639
5640    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5641              op == OP_TYPEPOSSTAR))
5642      {
5643      if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
5644        return FALSE;
5645      }
5646
5647    /* Check for explicit anchoring */
5648
5649    else if (op != OP_SOD && op != OP_SOM &&
5650            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5651      return FALSE;
5652    code += GET(code, 1);
5653    }
5654 while (*code == OP_ALT);   /* Loop for each alternative */
5655 return TRUE;
5656 }
5657
5658
5659
5660 /*************************************************
5661 *         Check for starting with ^ or .*        *
5662 *************************************************/
5663
5664 /* This is called to find out if every branch starts with ^ or .* so that
5665 "first char" processing can be done to speed things up in multiline
5666 matching and for non-DOTALL patterns that start with .* (which must start at
5667 the beginning or after \n). As in the case of is_anchored() (see above), we
5668 have to take account of back references to capturing brackets that contain .*
5669 because in that case we can't make the assumption.
5670
5671 Arguments:
5672   code           points to start of expression (the bracket)
5673   bracket_map    a bitmap of which brackets we are inside while testing; this
5674                   handles up to substring 31; after that we just have to take
5675                   the less precise approach
5676   backref_map    the back reference bitmap
5677
5678 Returns:         TRUE or FALSE
5679 */
5680
5681 static BOOL
5682 is_startline(const uschar *code, unsigned int bracket_map,
5683   unsigned int backref_map)
5684 {
5685 do {
5686    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5687      NULL, 0, FALSE);
5688    register int op = *scode;
5689
5690    /* Non-capturing brackets */
5691
5692    if (op == OP_BRA)
5693      {
5694      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5695      }
5696
5697    /* Capturing brackets */
5698
5699    else if (op == OP_CBRA)
5700      {
5701      int n = GET2(scode, 1+LINK_SIZE);
5702      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5703      if (!is_startline(scode, new_map, backref_map)) return FALSE;
5704      }
5705
5706    /* Other brackets */
5707
5708    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5709      { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5710
5711    /* .* means "start at start or after \n" if it isn't in brackets that
5712    may be referenced. */
5713
5714    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5715      {
5716      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5717      }
5718
5719    /* Check for explicit circumflex */
5720
5721    else if (op != OP_CIRC) return FALSE;
5722
5723    /* Move on to the next alternative */
5724
5725    code += GET(code, 1);
5726    }
5727 while (*code == OP_ALT);  /* Loop for each alternative */
5728 return TRUE;
5729 }
5730
5731
5732
5733 /*************************************************
5734 *       Check for asserted fixed first char      *
5735 *************************************************/
5736
5737 /* During compilation, the "first char" settings from forward assertions are
5738 discarded, because they can cause conflicts with actual literals that follow.
5739 However, if we end up without a first char setting for an unanchored pattern,
5740 it is worth scanning the regex to see if there is an initial asserted first
5741 char. If all branches start with the same asserted char, or with a bracket all
5742 of whose alternatives start with the same asserted char (recurse ad lib), then
5743 we return that char, otherwise -1.
5744
5745 Arguments:
5746   code       points to start of expression (the bracket)
5747   options    pointer to the options (used to check casing changes)
5748   inassert   TRUE if in an assertion
5749
5750 Returns:     -1 or the fixed first char
5751 */
5752
5753 static int
5754 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5755 {
5756 register int c = -1;
5757 do {
5758    int d;
5759    const uschar *scode =
5760      first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5761    register int op = *scode;
5762
5763    switch(op)
5764      {
5765      default:
5766      return -1;
5767
5768      case OP_BRA:
5769      case OP_CBRA:
5770      case OP_ASSERT:
5771      case OP_ONCE:
5772      case OP_COND:
5773      if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5774        return -1;
5775      if (c < 0) c = d; else if (c != d) return -1;
5776      break;
5777
5778      case OP_EXACT:       /* Fall through */
5779      scode += 2;
5780
5781      case OP_CHAR:
5782      case OP_CHARNC:
5783      case OP_PLUS:
5784      case OP_MINPLUS:
5785      case OP_POSPLUS:
5786      if (!inassert) return -1;
5787      if (c < 0)
5788        {
5789        c = scode[1];
5790        if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5791        }
5792      else if (c != scode[1]) return -1;
5793      break;
5794      }
5795
5796    code += GET(code, 1);
5797    }
5798 while (*code == OP_ALT);
5799 return c;
5800 }
5801
5802
5803
5804 /*************************************************
5805 *        Compile a Regular Expression            *
5806 *************************************************/
5807
5808 /* This function takes a string and returns a pointer to a block of store
5809 holding a compiled version of the expression. The original API for this
5810 function had no error code return variable; it is retained for backwards
5811 compatibility. The new function is given a new name.
5812
5813 Arguments:
5814   pattern       the regular expression
5815   options       various option bits
5816   errorcodeptr  pointer to error code variable (pcre_compile2() only)
5817                   can be NULL if you don't want a code value
5818   errorptr      pointer to pointer to error text
5819   erroroffset   ptr offset in pattern where error was detected
5820   tables        pointer to character tables or NULL
5821
5822 Returns:        pointer to compiled data block, or NULL on error,
5823                 with errorptr and erroroffset set
5824 */
5825
5826 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
5827 pcre_compile(const char *pattern, int options, const char **errorptr,
5828   int *erroroffset, const unsigned char *tables)
5829 {
5830 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5831 }
5832
5833
5834 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
5835 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5836   const char **errorptr, int *erroroffset, const unsigned char *tables)
5837 {
5838 real_pcre *re;
5839 int length = 1;  /* For final END opcode */
5840 int firstbyte, reqbyte, newline;
5841 int errorcode = 0;
5842 int skipatstart = 0;
5843 #ifdef SUPPORT_UTF8
5844 BOOL utf8;
5845 #endif
5846 size_t size;
5847 uschar *code;
5848 const uschar *codestart;
5849 const uschar *ptr;
5850 compile_data compile_block;
5851 compile_data *cd = &compile_block;
5852
5853 /* This space is used for "compiling" into during the first phase, when we are
5854 computing the amount of memory that is needed. Compiled items are thrown away
5855 as soon as possible, so that a fairly large buffer should be sufficient for
5856 this purpose. The same space is used in the second phase for remembering where
5857 to fill in forward references to subpatterns. */
5858
5859 uschar cworkspace[COMPILE_WORK_SIZE];
5860
5861 /* Set this early so that early errors get offset 0. */
5862
5863 ptr = (const uschar *)pattern;
5864
5865 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5866 can do is just return NULL, but we can set a code value if there is a code
5867 pointer. */
5868
5869 if (errorptr == NULL)
5870   {
5871   if (errorcodeptr != NULL) *errorcodeptr = 99;
5872   return NULL;
5873   }
5874
5875 *errorptr = NULL;
5876 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5877
5878 /* However, we can give a message for this error */
5879
5880 if (erroroffset == NULL)
5881   {
5882   errorcode = ERR16;
5883   goto PCRE_EARLY_ERROR_RETURN2;
5884   }
5885
5886 *erroroffset = 0;
5887
5888 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5889
5890 #ifdef SUPPORT_UTF8
5891 utf8 = (options & PCRE_UTF8) != 0;
5892 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5893      (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5894   {
5895   errorcode = ERR44;
5896   goto PCRE_EARLY_ERROR_RETURN2;
5897   }
5898 #else
5899 if ((options & PCRE_UTF8) != 0)
5900   {
5901   errorcode = ERR32;
5902   goto PCRE_EARLY_ERROR_RETURN;
5903   }
5904 #endif
5905
5906 if ((options & ~PUBLIC_OPTIONS) != 0)
5907   {
5908   errorcode = ERR17;
5909   goto PCRE_EARLY_ERROR_RETURN;
5910   }
5911
5912 /* Set up pointers to the individual character tables */
5913
5914 if (tables == NULL) tables = _pcre_default_tables;
5915 cd->lcc = tables + lcc_offset;
5916 cd->fcc = tables + fcc_offset;
5917 cd->cbits = tables + cbits_offset;
5918 cd->ctypes = tables + ctypes_offset;
5919
5920 /* Check for global one-time settings at the start of the pattern, and remember
5921 the offset for later. */
5922
5923 while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
5924   {
5925   int newnl = 0;
5926   int newbsr = 0;
5927
5928   if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
5929     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
5930   else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3)  == 0)
5931     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
5932   else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5)  == 0)
5933     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
5934   else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
5935     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
5936   else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8)  == 0)
5937     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
5938
5939   else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
5940     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
5941   else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
5942     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
5943
5944   if (newnl != 0)
5945     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
5946   else if (newbsr != 0)
5947     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
5948   else break;
5949   }
5950
5951 /* Check validity of \R options. */
5952
5953 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5954   {
5955   case 0:
5956   case PCRE_BSR_ANYCRLF:
5957   case PCRE_BSR_UNICODE:
5958   break;
5959   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5960   }
5961
5962 /* Handle different types of newline. The three bits give seven cases. The
5963 current code allows for fixed one- or two-byte sequences, plus "any" and
5964 "anycrlf". */
5965
5966 switch (options & PCRE_NEWLINE_BITS)
5967   {
5968   case 0: newline = NEWLINE; break;   /* Build-time default */
5969   case PCRE_NEWLINE_CR: newline = '\r'; break;
5970   case PCRE_NEWLINE_LF: newline = '\n'; break;
5971   case PCRE_NEWLINE_CR+
5972        PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5973   case PCRE_NEWLINE_ANY: newline = -1; break;
5974   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5975   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5976   }
5977
5978 if (newline == -2)
5979   {
5980   cd->nltype = NLTYPE_ANYCRLF;
5981   }
5982 else if (newline < 0)
5983   {
5984   cd->nltype = NLTYPE_ANY;
5985   }
5986 else
5987   {
5988   cd->nltype = NLTYPE_FIXED;
5989   if (newline > 255)
5990     {
5991     cd->nllen = 2;
5992     cd->nl[0] = (newline >> 8) & 255;
5993     cd->nl[1] = newline & 255;
5994     }
5995   else
5996     {
5997     cd->nllen = 1;
5998     cd->nl[0] = newline;
5999     }
6000   }
6001
6002 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6003 references to help in deciding whether (.*) can be treated as anchored or not.
6004 */
6005
6006 cd->top_backref = 0;
6007 cd->backref_map = 0;
6008
6009 /* Reflect pattern for debugging output */
6010
6011 DPRINTF(("------------------------------------------------------------------\n"));
6012 DPRINTF(("%s\n", pattern));
6013
6014 /* Pretend to compile the pattern while actually just accumulating the length
6015 of memory required. This behaviour is triggered by passing a non-NULL final
6016 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6017 to compile parts of the pattern into; the compiled code is discarded when it is
6018 no longer needed, so hopefully this workspace will never overflow, though there
6019 is a test for its doing so. */
6020
6021 cd->bracount = cd->final_bracount = 0;
6022 cd->names_found = 0;
6023 cd->name_entry_size = 0;
6024 cd->name_table = NULL;
6025 cd->start_workspace = cworkspace;
6026 cd->start_code = cworkspace;
6027 cd->hwm = cworkspace;
6028 cd->start_pattern = (const uschar *)pattern;
6029 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6030 cd->req_varyopt = 0;
6031 cd->external_options = options;
6032 cd->external_flags = 0;
6033
6034 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6035 don't need to look at the result of the function here. The initial options have
6036 been put into the cd block so that they can be changed if an option setting is
6037 found within the regex right at the beginning. Bringing initial option settings
6038 outside can help speed up starting point checks. */
6039
6040 ptr += skipatstart;
6041 code = cworkspace;
6042 *code = OP_BRA;
6043 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6044   &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6045   &length);
6046 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6047
6048 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6049   cd->hwm - cworkspace));
6050
6051 if (length > MAX_PATTERN_SIZE)
6052   {
6053   errorcode = ERR20;
6054   goto PCRE_EARLY_ERROR_RETURN;
6055   }
6056
6057 /* Compute the size of data block needed and get it, either from malloc or
6058 externally provided function. Integer overflow should no longer be possible
6059 because nowadays we limit the maximum value of cd->names_found and
6060 cd->name_entry_size. */
6061
6062 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6063 re = (real_pcre *)(pcre_malloc)(size);
6064
6065 if (re == NULL)
6066   {
6067   errorcode = ERR21;
6068   goto PCRE_EARLY_ERROR_RETURN;
6069   }
6070
6071 /* Put in the magic number, and save the sizes, initial options, internal
6072 flags, and character table pointer. NULL is used for the default character
6073 tables. The nullpad field is at the end; it's there to help in the case when a
6074 regex compiled on a system with 4-byte pointers is run on another with 8-byte
6075 pointers. */
6076
6077 re->magic_number = MAGIC_NUMBER;
6078 re->size = size;
6079 re->options = cd->external_options;
6080 re->flags = cd->external_flags;
6081 re->dummy1 = 0;
6082 re->first_byte = 0;
6083 re->req_byte = 0;
6084 re->name_table_offset = sizeof(real_pcre);
6085 re->name_entry_size = cd->name_entry_size;
6086 re->name_count = cd->names_found;
6087 re->ref_count = 0;
6088 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6089 re->nullpad = NULL;
6090
6091 /* The starting points of the name/number translation table and of the code are
6092 passed around in the compile data block. The start/end pattern and initial
6093 options are already set from the pre-compile phase, as is the name_entry_size
6094 field. Reset the bracket count and the names_found field. Also reset the hwm
6095 field; this time it's used for remembering forward references to subpatterns.
6096 */
6097
6098 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
6099 cd->bracount = 0;
6100 cd->names_found = 0;
6101 cd->name_table = (uschar *)re + re->name_table_offset;
6102 codestart = cd->name_table + re->name_entry_size * re->name_count;
6103 cd->start_code = codestart;
6104 cd->hwm = cworkspace;
6105 cd->req_varyopt = 0;
6106 cd->had_accept = FALSE;
6107
6108 /* Set up a starting, non-extracting bracket, then compile the expression. On
6109 error, errorcode will be set non-zero, so we don't need to look at the result
6110 of the function here. */
6111
6112 ptr = (const uschar *)pattern + skipatstart;
6113 code = (uschar *)codestart;
6114 *code = OP_BRA;
6115 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6116   &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6117 re->top_bracket = cd->bracount;
6118 re->top_backref = cd->top_backref;
6119 re->flags = cd->external_flags;
6120
6121 if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */
6122
6123 /* If not reached end of pattern on success, there's an excess bracket. */
6124
6125 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6126
6127 /* Fill in the terminating state and check for disastrous overflow, but
6128 if debugging, leave the test till after things are printed out. */
6129
6130 *code++ = OP_END;
6131
6132 #ifndef DEBUG
6133 if (code - codestart > length) errorcode = ERR23;
6134 #endif
6135
6136 /* Fill in any forward references that are required. */
6137
6138 while (errorcode == 0 && cd->hwm > cworkspace)
6139   {
6140   int offset, recno;
6141   const uschar *groupptr;
6142   cd->hwm -= LINK_SIZE;
6143   offset = GET(cd->hwm, 0);
6144   recno = GET(codestart, offset);
6145   groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6146   if (groupptr == NULL) errorcode = ERR53;
6147     else PUT(((uschar *)codestart), offset, groupptr - codestart);
6148   }
6149
6150 /* Give an error if there's back reference to a non-existent capturing
6151 subpattern. */
6152
6153 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6154
6155 /* Failed to compile, or error while post-processing */
6156
6157 if (errorcode != 0)
6158   {
6159   (pcre_free)(re);
6160   PCRE_EARLY_ERROR_RETURN:
6161   *erroroffset = ptr - (const uschar *)pattern;
6162   PCRE_EARLY_ERROR_RETURN2:
6163   *errorptr = find_error_text(errorcode);
6164   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6165   return NULL;
6166   }
6167
6168 /* If the anchored option was not passed, set the flag if we can determine that
6169 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6170 as starting with .* when DOTALL is set).
6171
6172 Otherwise, if we know what the first byte has to be, save it, because that
6173 speeds up unanchored matches no end. If not, see if we can set the
6174 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6175 start with ^. and also when all branches start with .* for non-DOTALL matches.
6176 */
6177
6178 if ((re->options & PCRE_ANCHORED) == 0)
6179   {
6180   int temp_options = re->options;   /* May get changed during these scans */
6181   if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6182     re->options |= PCRE_ANCHORED;
6183   else
6184     {
6185     if (firstbyte < 0)
6186       firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6187     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
6188       {
6189       int ch = firstbyte & 255;
6190       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6191          cd->fcc[ch] == ch)? ch : firstbyte;
6192       re->flags |= PCRE_FIRSTSET;
6193       }
6194     else if (is_startline(codestart, 0, cd->backref_map))
6195       re->flags |= PCRE_STARTLINE;
6196     }
6197   }
6198
6199 /* For an anchored pattern, we use the "required byte" only if it follows a
6200 variable length item in the regex. Remove the caseless flag for non-caseable
6201 bytes. */
6202
6203 if (reqbyte >= 0 &&
6204      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6205   {
6206   int ch = reqbyte & 255;
6207   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6208     cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6209   re->flags |= PCRE_REQCHSET;
6210   }
6211
6212 /* Print out the compiled data if debugging is enabled. This is never the
6213 case when building a production library. */
6214
6215 #ifdef DEBUG
6216
6217 printf("Length = %d top_bracket = %d top_backref = %d\n",
6218   length, re->top_bracket, re->top_backref);
6219
6220 printf("Options=%08x\n", re->options);
6221
6222 if ((re->flags & PCRE_FIRSTSET) != 0)
6223   {
6224   int ch = re->first_byte & 255;
6225   const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6226     "" : " (caseless)";
6227   if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6228     else printf("First char = \\x%02x%s\n", ch, caseless);
6229   }
6230
6231 if ((re->flags & PCRE_REQCHSET) != 0)
6232   {
6233   int ch = re->req_byte & 255;
6234   const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6235     "" : " (caseless)";
6236   if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6237     else printf("Req char = \\x%02x%s\n", ch, caseless);
6238   }
6239
6240 pcre_printint(re, stdout, TRUE);
6241
6242 /* This check is done here in the debugging case so that the code that
6243 was compiled can be seen. */
6244
6245 if (code - codestart > length)
6246   {
6247   (pcre_free)(re);
6248   *errorptr = find_error_text(ERR23);
6249   *erroroffset = ptr - (uschar *)pattern;
6250   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6251   return NULL;
6252   }
6253 #endif   /* DEBUG */
6254
6255 return (pcre *)re;
6256 }
6257
6258 /* End of pcre_compile.c */