Modules/pypcre.c

   1
   2 /*************************************************
   3 *      Perl-Compatible Regular Expressions       *
   4 *************************************************/
   5
   6 /*   DO NOT EDIT THIS FILE! */
   7
   8 /* This file is automatically written by the merge-files.py script
   9 included with the PCRE distribution for Python; it's produced from
  10 several C files, and code is removed in the process.  If you want to
  11 modify the code or track down bugs, it will be much easier to work
  12 with the code in its original, multiple-file form.  Don't edit this
  13 file by hand, or submit patches to it.
  14
  15 The Python-specific PCRE distribution can be retrieved from
  16        http://starship.skyport.net/crew/amk/regex/
  17
  18 The unmodified original PCRE distribution is available at
  19 ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
  20 written by: Philip Hazel <ph10@cam.ac.uk>
  21
  22 Extensively modified by the Python String-SIG: <string-sig@python.org>
  23 Send bug reports to:                           <string-sig@python.org>
  24 (They'll figure out if it's a bug in PCRE or in the Python-specific
  25 changes.)
  26
  27            Copyright (c) 1997 University of Cambridge
  28
  29 -----------------------------------------------------------------------------
  30 Permission is granted to anyone to use this software for any purpose on any
  31 computer system, and to redistribute it freely, subject to the following
  32 restrictions:
  33
  34 1. This software is distributed in the hope that it will be useful,
  35    but WITHOUT ANY WARRANTY; without even the implied warranty of
  36    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  37
  38 2. The origin of this software must not be misrepresented, either by
  39    explicit claim or by omission.
  40
  41 3. Altered versions must be plainly marked as such, and must not be
  42    misrepresented as being the original software.
  43 -----------------------------------------------------------------------------
  44 */
  45
  46
  47 #define FOR_PYTHON
  48 #include "Python.h"
  49 #include "pcre-int.h"
  50 #include <ctype.h>
  51 #include "graminit.h"
  52
  53 /*************************************************
  54 *      Perl-Compatible Regular Expressions       *
  55 *************************************************/
  56
  57 /* This file is automatically written by the makechartables auxiliary
  58 program. If you edit it by hand, you might like to edit the Makefile to
  59 prevent its ever being regenerated. */
  60
  61 /* This table is a lower casing table. */
  62
  63 unsigned char pcre_lcc[] = {
  64     0,  1,  2,  3,  4,  5,  6,  7,
  65     8,  9, 10, 11, 12, 13, 14, 15,
  66    16, 17, 18, 19, 20, 21, 22, 23,
  67    24, 25, 26, 27, 28, 29, 30, 31,
  68    32, 33, 34, 35, 36, 37, 38, 39,
  69    40, 41, 42, 43, 44, 45, 46, 47,
  70    48, 49, 50, 51, 52, 53, 54, 55,
  71    56, 57, 58, 59, 60, 61, 62, 63,
  72    64, 97, 98, 99,100,101,102,103,
  73   104,105,106,107,108,109,110,111,
  74   112,113,114,115,116,117,118,119,
  75   120,121,122, 91, 92, 93, 94, 95,
  76    96, 97, 98, 99,100,101,102,103,
  77   104,105,106,107,108,109,110,111,
  78   112,113,114,115,116,117,118,119,
  79   120,121,122,123,124,125,126,127,
  80   128,129,130,131,132,133,134,135,
  81   136,137,138,139,140,141,142,143,
  82   144,145,146,147,148,149,150,151,
  83   152,153,154,155,156,157,158,159,
  84   160,161,162,163,164,165,166,167,
  85   168,169,170,171,172,173,174,175,
  86   176,177,178,179,180,181,182,183,
  87   184,185,186,187,188,189,190,191,
  88   192,193,194,195,196,197,198,199,
  89   200,201,202,203,204,205,206,207,
  90   208,209,210,211,212,213,214,215,
  91   216,217,218,219,220,221,222,223,
  92   224,225,226,227,228,229,230,231,
  93   232,233,234,235,236,237,238,239,
  94   240,241,242,243,244,245,246,247,
  95   248,249,250,251,252,253,254,255 };
  96
  97 /* This table is a case flipping table. */
  98
  99 unsigned char pcre_fcc[] = {
 100     0,  1,  2,  3,  4,  5,  6,  7,
 101     8,  9, 10, 11, 12, 13, 14, 15,
 102    16, 17, 18, 19, 20, 21, 22, 23,
 103    24, 25, 26, 27, 28, 29, 30, 31,
 104    32, 33, 34, 35, 36, 37, 38, 39,
 105    40, 41, 42, 43, 44, 45, 46, 47,
 106    48, 49, 50, 51, 52, 53, 54, 55,
 107    56, 57, 58, 59, 60, 61, 62, 63,
 108    64, 97, 98, 99,100,101,102,103,
 109   104,105,106,107,108,109,110,111,
 110   112,113,114,115,116,117,118,119,
 111   120,121,122, 91, 92, 93, 94, 95,
 112    96, 65, 66, 67, 68, 69, 70, 71,
 113    72, 73, 74, 75, 76, 77, 78, 79,
 114    80, 81, 82, 83, 84, 85, 86, 87,
 115    88, 89, 90,123,124,125,126,127,
 116   128,129,130,131,132,133,134,135,
 117   136,137,138,139,140,141,142,143,
 118   144,145,146,147,148,149,150,151,
 119   152,153,154,155,156,157,158,159,
 120   160,161,162,163,164,165,166,167,
 121   168,169,170,171,172,173,174,175,
 122   176,177,178,179,180,181,182,183,
 123   184,185,186,187,188,189,190,191,
 124   192,193,194,195,196,197,198,199,
 125   200,201,202,203,204,205,206,207,
 126   208,209,210,211,212,213,214,215,
 127   216,217,218,219,220,221,222,223,
 128   224,225,226,227,228,229,230,231,
 129   232,233,234,235,236,237,238,239,
 130   240,241,242,243,244,245,246,247,
 131   248,249,250,251,252,253,254,255 };
 132
 133 /* This table contains bit maps for digits, letters, 'word' chars, and
 134 white space. Each map is 32 bytes long and the bits run from the least
 135 significant end of each byte. */
 136
 137 unsigned char pcre_cbits[] = {
 138   0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
 139   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 140   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 141   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 142
 143   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 144   0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
 145   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 146   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 147
 148   0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
 149   0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
 150   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 151   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 152
 153   0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
 154   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 155   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 156   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
 157
 158 /* This table identifies various classes of character by individual bits:
 159   0x01   white space character
 160   0x02   letter
 161   0x04   decimal digit
 162   0x08   hexadecimal digit
 163   0x10   alphanumeric or '_'
 164   0x80   regular expression metacharacter or binary zero
 165 */
 166
 167 unsigned char pcre_ctypes[] = {
 168   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
 169   0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /*   8- 15 */
 170   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
 171   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
 172   0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /*    - '  */
 173   0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /*  ( - /  */
 174   0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /*  0 - 7  */
 175   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /*  8 - ?  */
 176   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  @ - G  */
 177   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  H - O  */
 178   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  P - W  */
 179   0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /*  X - _  */
 180   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  ` - g  */
 181   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  h - o  */
 182   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  p - w  */
 183   0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /*  x -127 */
 184   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
 185   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
 186   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
 187   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
 188   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
 189   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
 190   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
 191   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
 192   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
 193   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
 194   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
 195   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
 196   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
 197   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
 198   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
 199   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
 200
 201 /* End of chartables.c */
 202 /*************************************************
 203 *      Perl-Compatible Regular Expressions       *
 204 *************************************************/
 205
 206 /*
 207 This is a library of functions to support regular expressions whose syntax
 208 and semantics are as close as possible to those of the Perl 5 language. See
 209 the file Tech.Notes for some information on the internals.
 210
 211 Written by: Philip Hazel <ph10@cam.ac.uk>
 212
 213            Copyright (c) 1998 University of Cambridge
 214
 215 -----------------------------------------------------------------------------
 216 Permission is granted to anyone to use this software for any purpose on any
 217 computer system, and to redistribute it freely, subject to the following
 218 restrictions:
 219
 220 1. This software is distributed in the hope that it will be useful,
 221    but WITHOUT ANY WARRANTY; without even the implied warranty of
 222    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 223
 224 2. The origin of this software must not be misrepresented, either by
 225    explicit claim or by omission.
 226
 227 3. Altered versions must be plainly marked as such, and must not be
 228    misrepresented as being the original software.
 229 -----------------------------------------------------------------------------
 230 */
 231
 232
 233 /* Include the internals header, which itself includes Standard C headers plus
 234 the external pcre header. */
 235
 236
 237
 238
 239 /*************************************************
 240 *          Create bitmap of starting chars       *
 241 *************************************************/
 242
 243 /* This function scans a compiled unanchored expression and attempts to build a
 244 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
 245 goes by, we may be able to get more clever at doing this.
 246
 247 Arguments:
 248   code         points to an expression
 249   start_bits   points to a 32-byte table, initialized to 0
 250
 251 Returns:       TRUE if table built, FALSE otherwise
 252 */
 253
 254 static BOOL
 255 set_start_bits(const uschar *code, uschar *start_bits)
 256 {
 257 register int c;
 258 volatile int dummy;
 259
 260 do
 261   {
 262   const uschar *tcode = code + 3;
 263   BOOL try_next = TRUE;
 264
 265   while (try_next)
 266     {
 267     try_next = FALSE;
 268
 269     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
 270       {
 271       if (!set_start_bits(tcode, start_bits)) return FALSE;
 272       }
 273
 274     else switch(*tcode)
 275       {
 276       default:
 277       return FALSE;
 278
 279       /* BRAZERO does the bracket, but carries on. */
 280
 281       case OP_BRAZERO:
 282       case OP_BRAMINZERO:
 283       if (!set_start_bits(++tcode, start_bits)) return FALSE;
 284       dummy = 1;
 285       do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
 286       tcode += 3;
 287       try_next = TRUE;
 288       break;
 289
 290       /* Single-char * or ? sets the bit and tries the next item */
 291
 292       case OP_STAR:
 293       case OP_MINSTAR:
 294       case OP_QUERY:
 295       case OP_MINQUERY:
 296       start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
 297       tcode += 2;
 298       try_next = TRUE;
 299       break;
 300
 301       /* Single-char upto sets the bit and tries the next */
 302
 303       case OP_UPTO:
 304       case OP_MINUPTO:
 305       start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));
 306       tcode += 4;
 307       try_next = TRUE;
 308       break;
 309
 310       /* At least one single char sets the bit and stops */
 311
 312       case OP_EXACT:       /* Fall through */
 313       tcode++;
 314
 315       case OP_CHARS:       /* Fall through */
 316       tcode++;
 317
 318       case OP_PLUS:
 319       case OP_MINPLUS:
 320       start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
 321       break;
 322
 323       /* Single character type sets the bits and stops */
 324
 325       case OP_NOT_DIGIT:
 326       for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
 327       break;
 328
 329       case OP_DIGIT:
 330       for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
 331       break;
 332
 333       case OP_NOT_WHITESPACE:
 334       for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
 335       break;
 336
 337       case OP_WHITESPACE:
 338       for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
 339       break;
 340
 341       case OP_NOT_WORDCHAR:
 342       for (c = 0; c < 32; c++)
 343         start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
 344       break;
 345
 346       case OP_WORDCHAR:
 347       for (c = 0; c < 32; c++)
 348         start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
 349       break;
 350
 351       /* One or more character type fudges the pointer and restarts, knowing
 352       it will hit a single character type and stop there. */
 353
 354       case OP_TYPEPLUS:
 355       case OP_TYPEMINPLUS:
 356       tcode++;
 357       try_next = TRUE;
 358       break;
 359
 360       case OP_TYPEEXACT:
 361       tcode += 3;
 362       try_next = TRUE;
 363       break;
 364
 365       /* Zero or more repeats of character types set the bits and then
 366       try again. */
 367
 368       case OP_TYPEUPTO:
 369       case OP_TYPEMINUPTO:
 370       tcode += 2;               /* Fall through */
 371
 372       case OP_TYPESTAR:
 373       case OP_TYPEMINSTAR:
 374       case OP_TYPEQUERY:
 375       case OP_TYPEMINQUERY:
 376       switch(tcode[1])
 377         {
 378         case OP_NOT_DIGIT:
 379         for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
 380         break;
 381
 382         case OP_DIGIT:
 383         for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
 384         break;
 385
 386         case OP_NOT_WHITESPACE:
 387         for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
 388         break;
 389
 390         case OP_WHITESPACE:
 391         for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
 392         break;
 393
 394         case OP_NOT_WORDCHAR:
 395         for (c = 0; c < 32; c++)
 396           start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
 397         break;
 398
 399         case OP_WORDCHAR:
 400         for (c = 0; c < 32; c++)
 401           start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
 402         break;
 403         }
 404
 405       tcode += 2;
 406       try_next = TRUE;
 407       break;
 408
 409       /* Character class: set the bits and either carry on or not,
 410       according to the repeat count. */
 411
 412       case OP_CLASS:
 413       case OP_NEGCLASS:
 414         {
 415         tcode++;
 416         for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
 417         tcode += 32;
 418         switch (*tcode)
 419           {
 420           case OP_CRSTAR:
 421           case OP_CRMINSTAR:
 422           case OP_CRQUERY:
 423           case OP_CRMINQUERY:
 424           tcode++;
 425           try_next = TRUE;
 426           break;
 427
 428           case OP_CRRANGE:
 429           case OP_CRMINRANGE:
 430           if (((tcode[1] << 8) + tcode[2]) == 0)
 431             {
 432             tcode += 5;
 433             try_next = TRUE;
 434             }
 435           break;
 436           }
 437         }
 438       break; /* End of class handling */
 439
 440       }      /* End of switch */
 441     }        /* End of try_next loop */
 442
 443   code += (code[1] << 8) + code[2];   /* Advance to next branch */
 444   }
 445 while (*code == OP_ALT);
 446 return TRUE;
 447 }
 448
 449
 450
 451 /*************************************************
 452 *          Study a compiled expression           *
 453 *************************************************/
 454
 455 /* This function is handed a compiled expression that it must study to produce
 456 information that will speed up the matching. It returns a pcre_extra block
 457 which then gets handed back to pcre_exec().
 458
 459 Arguments:
 460   re        points to the compiled expression
 461   options   contains option bits
 462   errorptr  points to where to place error messages;
 463             set NULL unless error
 464
 465 Returns:    pointer to a pcre_extra block,
 466             NULL on error or if no optimization possible
 467 */
 468
 469 pcre_extra *
 470 pcre_study(const pcre *external_re, int options, const char **errorptr)
 471 {
 472 BOOL caseless;
 473 uschar start_bits[32];
 474 real_pcre_extra *extra;
 475 const real_pcre *re = (const real_pcre *)external_re;
 476
 477 *errorptr = NULL;
 478
 479 if (re == NULL || re->magic_number != MAGIC_NUMBER)
 480   {
 481   *errorptr = "argument is not a compiled regular expression";
 482   return NULL;
 483   }
 484
 485 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
 486   {
 487   *errorptr = "unknown or incorrect option bit(s) set";
 488   return NULL;
 489   }
 490
 491 /* Caseless can either be from the compiled regex or from options. */
 492
 493 caseless = ((re->options | options) & PCRE_CASELESS) != 0;
 494
 495 /* For an anchored pattern, or an unanchored pattern that has a first char, or a
 496 multiline pattern that matches only at "line starts", no further processing at
 497 present. */
 498
 499 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
 500   return NULL;
 501
 502 /* See if we can find a fixed set of initial characters for the pattern. */
 503
 504 memset(start_bits, 0, 32 * sizeof(uschar));
 505 if (!set_start_bits(re->code, start_bits)) return NULL;
 506
 507 /* If this studying is caseless, scan the created bit map and duplicate the
 508 bits for any letters. */
 509
 510 if (caseless)
 511   {
 512   register int c;
 513   for (c = 0; c < 256; c++)
 514     {
 515     if ((start_bits[c/8] & (1 << (c&7))) != 0 &&
 516         (pcre_ctypes[c] & ctype_letter) != 0)
 517       {
 518       int d = pcre_fcc[c];
 519       start_bits[d/8] |= (1 << (d&7));
 520       }
 521     }
 522   }
 523
 524 /* Get an "extra" block and put the information therein. */
 525
 526 extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
 527
 528 if (extra == NULL)
 529   {
 530   *errorptr = "failed to get memory";
 531   return NULL;
 532   }
 533
 534 extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
 535 memcpy(extra->start_bits, start_bits, sizeof(start_bits));
 536
 537 return (pcre_extra *)extra;
 538 }
 539
 540 /* End of study.c */
 541 /*************************************************
 542 *      Perl-Compatible Regular Expressions       *
 543 *************************************************/
 544
 545 /*
 546 This is a library of functions to support regular expressions whose syntax
 547 and semantics are as close as possible to those of the Perl 5 language. See
 548 the file Tech.Notes for some information on the internals.
 549
 550 Written by: Philip Hazel <ph10@cam.ac.uk>
 551
 552            Copyright (c) 1998 University of Cambridge
 553
 554 -----------------------------------------------------------------------------
 555 Permission is granted to anyone to use this software for any purpose on any
 556 computer system, and to redistribute it freely, subject to the following
 557 restrictions:
 558
 559 1. This software is distributed in the hope that it will be useful,
 560    but WITHOUT ANY WARRANTY; without even the implied warranty of
 561    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 562
 563 2. The origin of this software must not be misrepresented, either by
 564    explicit claim or by omission.
 565
 566 3. Altered versions must be plainly marked as such, and must not be
 567    misrepresented as being the original software.
 568 -----------------------------------------------------------------------------
 569 */
 570
 571
 572 /* Define DEBUG to get debugging output on stdout. */
 573
 574 /* #define DEBUG */
 575
 576 /* Use a macro for debugging printing, 'cause that eliminates the the use
 577 of #ifdef inline, and there are *still* stupid compilers about that don't like
 578 indented pre-processor statements. I suppose it's only been 10 years... */
 579
 580 #undef DPRINTF
 581 #ifdef DEBUG
 582 #define DPRINTF(p) printf p
 583 #else
 584 #define DPRINTF(p) /*nothing*/
 585 #endif
 586
 587 /* Include the internals header, which itself includes Standard C headers plus
 588 the external pcre header. */
 589
 590
 591
 592
 593 #ifndef Py_eval_input
 594 /* For Python 1.4, graminit.h has to be explicitly included */
 595 #define Py_eval_input eval_input
 596
 597 #endif /* FOR_PYTHON */
 598
 599 /* Allow compilation as C++ source code, should anybody want to do that. */
 600
 601 #ifdef __cplusplus
 602 #define class pcre_class
 603 #endif
 604
 605
 606 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
 607
 608 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
 609 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
 610
 611 /* Text forms of OP_ values and things, for debugging (not all used) */
 612
 613 #ifdef DEBUG
 614 static const char *OP_names[] = {
 615   "End", "\\A", "\\B", "\\b", "\\D", "\\d",
 616   "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
 617   "localized \\B", "localized \\b", "localized \\W", "localized \\w",
 618   "^", "$", "Any", "chars",
 619   "not",
 620   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
 621   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
 622   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
 623   "*", "*?", "+", "+?", "?", "??", "{", "{",
 624   "class", "negclass", "classL", "Ref",
 625   "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
 626   "Brazero", "Braminzero", "Bra"
 627 };
 628 #endif
 629
 630 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
 631 are simple data values; negative values are for special things like \d and so
 632 on. Zero means further processing is needed (for things like \x), or the escape
 633 is invalid. */
 634
 635 static const short int escapes[] = {
 636     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
 637     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
 638   '@', -ESC_A, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */
 639     0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
 640     0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
 641     0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
 642   '`',      7, -ESC_b,      0, -ESC_d,      0,   '\f',      0,   /* ` - g */
 643     0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */
 644     0,      0,   '\r', -ESC_s,   '\t',      0,   '\v', -ESC_w,   /* p - w */
 645     0,      0,      0                                            /* x - z */
 646 };
 647
 648 /* Definition to allow mutual recursion */
 649
 650 static BOOL
 651 compile_regex(int, int *, uschar **, const uschar **, const char **,
 652               PyObject *);
 653
 654 /* Structure for passing "static" information around between the functions
 655 doing the matching, so that they are thread-safe. */
 656
 657 typedef struct match_data {
 658   int    errorcode;             /* As it says */
 659   int   *offset_vector;         /* Offset vector */
 660   int    offset_end;            /* One past the end */
 661   BOOL   offset_overflow;       /* Set if too many extractions */
 662   BOOL   caseless;              /* Case-independent flag */
 663   BOOL   runtime_caseless;      /* Caseless forced at run time */
 664   BOOL   multiline;             /* Multiline flag */
 665   BOOL   notbol;                /* NOTBOL flag */
 666   BOOL   noteol;                /* NOTEOL flag */
 667   BOOL   dotall;                /* Dot matches any char */
 668   BOOL   endonly;               /* Dollar not before final \n */
 669   const uschar *start_subject;  /* Start of the subject string */
 670   const uschar *end_subject;    /* End of the subject string */
 671   jmp_buf fail_env;             /* Environment for longjump() break out */
 672   const uschar *end_match_ptr;  /* Subject position at end match */
 673   int     end_offset_top;       /* Highwater mark at end of match */
 674   jmp_buf error_env;          /* For longjmp() if an error occurs deep inside a
 675                                    matching operation */
 676   int    length;                /* Length of the allocated stacks */
 677   int    point;                 /* Point to add next item pushed onto stacks */
 678   /* Pointers to the 6 stacks */
 679   int *off_num, *offset_top, *r1, *r2;
 680   const uschar **eptr, **ecode;
 681 } match_data;
 682
 683
 684
 685 /*************************************************
 686 *               Global variables                 *
 687 *************************************************/
 688
 689 /* PCRE is thread-clean and doesn't use any global variables in the normal
 690 sense. However, it calls memory allocation and free functions via the two
 691 indirections below, which are can be changed by the caller, but are shared
 692 between all threads. */
 693
 694 void *(*pcre_malloc)(size_t) = malloc;
 695 void  (*pcre_free)(void *) = free;
 696
 697
 698
 699
 700 /*************************************************
 701 *          Return version string                 *
 702 *************************************************/
 703
 704 const char *
 705 pcre_version(void)
 706 {
 707 return PCRE_VERSION;
 708 }
 709
 710
 711
 712
 713 /*************************************************
 714 *       Return info about a compiled pattern     *
 715 *************************************************/
 716
 717 /* This function picks potentially useful data out of the private
 718 structure.
 719
 720 Arguments:
 721   external_re   points to compiled code
 722   optptr        where to pass back the options
 723   first_char    where to pass back the first character,
 724                 or -1 if multiline and all branches start ^,
 725                 or -2 otherwise
 726
 727 Returns:        number of identifying extraction brackets
 728                 or negative values on error
 729 */
 730
 731 int
 732 pcre_info(const pcre *external_re, int *optptr, int *first_char)
 733 {
 734 const real_pcre *re = (real_pcre *)external_re;
 735 if (re == NULL) return PCRE_ERROR_NULL;
 736 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
 737 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
 738 if (first_char != NULL)
 739   *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
 740      ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
 741 return re->top_bracket;
 742 }
 743
 744
 745
 746
 747 #ifdef DEBUG
 748 /*************************************************
 749 *        Debugging function to print chars       *
 750 *************************************************/
 751
 752 /* Print a sequence of chars in printable format, stopping at the end of the
 753 subject if the requested.
 754
 755 Arguments:
 756   p           points to characters
 757   length      number to print
 758   is_subject  TRUE if printing from within md->start_subject
 759   md          pointer to matching data block, if is_subject is TRUE
 760
 761 Returns:     nothing
 762 */
 763
 764 static void
 765 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
 766 {
 767 int c;
 768 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
 769 while (length-- > 0)
 770   if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
 771 }
 772 #endif
 773
 774
 775
 776
 777 /*************************************************
 778 *         Check subpattern for empty operand     *
 779 *************************************************/
 780
 781 /* This function checks a bracketed subpattern to see if any of the paths
 782 through it could match an empty string. This is used to diagnose an error if
 783 such a subpattern is followed by a quantifier with an unlimited upper bound.
 784
 785 Argument:
 786   code      points to the opening bracket
 787
 788 Returns:    TRUE or FALSE
 789 */
 790
 791 static BOOL
 792 could_be_empty(uschar *code)
 793 {
 794 do {
 795   uschar *cc = code + 3;
 796
 797   /* Scan along the opcodes for this branch; as soon as we find something
 798   that matches a non-empty string, break out and advance to test the next
 799   branch. If we get to the end of the branch, return TRUE for the whole
 800   sub-expression. */
 801
 802   for (;;)
 803     {
 804     /* Test an embedded subpattern; if it could not be empty, break the
 805     loop. Otherwise carry on in the branch. */
 806
 807     if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
 808       {
 809       if (!could_be_empty(cc)) break;
 810       do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
 811       cc += 3;
 812       }
 813
 814     else switch (*cc)
 815       {
 816       /* Reached end of a branch: the subpattern may match the empty string */
 817
 818       case OP_ALT:
 819       case OP_KET:
 820       case OP_KETRMAX:
 821       case OP_KETRMIN:
 822       return TRUE;
 823
 824       /* Skip over entire bracket groups with zero lower bound */
 825
 826       case OP_BRAZERO:
 827       case OP_BRAMINZERO:
 828       cc++;
 829       /* Fall through */
 830
 831       /* Skip over assertive subpatterns */
 832
 833       case OP_ASSERT:
 834       case OP_ASSERT_NOT:
 835       do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
 836       cc += 3;
 837       break;
 838
 839       /* Skip over things that don't match chars */
 840
 841       case OP_SOD:
 842       case OP_EOD:
 843       case OP_CIRC:
 844       case OP_DOLL:
 845       case OP_NOT_WORD_BOUNDARY:
 846       case OP_WORD_BOUNDARY:
 847       case OP_NOT_WORD_BOUNDARY_L:
 848       case OP_WORD_BOUNDARY_L:
 849       cc++;
 850       break;
 851
 852       /* Skip over simple repeats with zero lower bound */
 853
 854       case OP_STAR:
 855       case OP_MINSTAR:
 856       case OP_QUERY:
 857       case OP_MINQUERY:
 858       case OP_NOTSTAR:
 859       case OP_NOTMINSTAR:
 860       case OP_NOTQUERY:
 861       case OP_NOTMINQUERY:
 862       case OP_TYPESTAR:
 863       case OP_TYPEMINSTAR:
 864       case OP_TYPEQUERY:
 865       case OP_TYPEMINQUERY:
 866       cc += 2;
 867       break;
 868
 869       /* Skip over UPTOs (lower bound is zero) */
 870
 871       case OP_UPTO:
 872       case OP_MINUPTO:
 873       case OP_TYPEUPTO:
 874       case OP_TYPEMINUPTO:
 875       cc += 4;
 876       break;
 877
 878       /* Check a class or a back reference for a zero minimum */
 879
 880       case OP_CLASS:
 881       case OP_NEGCLASS:
 882       case OP_REF:
 883       case OP_CLASS_L:
 884         switch(*cc)
 885           {
 886           case (OP_REF):    cc += 2; break;
 887           case (OP_CLASS): case (OP_NEGCLASS): cc += 1+32; break;
 888           case (OP_CLASS_L): cc += 1+1+32; break;
 889           }
 890
 891       switch (*cc)
 892         {
 893         case OP_CRSTAR:
 894         case OP_CRMINSTAR:
 895         case OP_CRQUERY:
 896         case OP_CRMINQUERY:
 897         cc++;
 898         break;
 899
 900         case OP_CRRANGE:
 901         case OP_CRMINRANGE:
 902         if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
 903         cc += 3;
 904         break;
 905
 906         default:
 907         goto NEXT_BRANCH;
 908         }
 909       break;
 910
 911       /* Anything else matches at least one character */
 912
 913       default:
 914       goto NEXT_BRANCH;
 915       }
 916     }
 917
 918   NEXT_BRANCH:
 919   code += (code[1] << 8) + code[2];
 920   }
 921 while (*code == OP_ALT);
 922
 923 /* No branches match the empty string */
 924
 925 return FALSE;
 926 }
 927
 928 /* Determine the length of a group ID in an expression like
 929    (?P<foo_123>...)
 930 Arguments:
 931   ptr        pattern position pointer (say that 3 times fast)
 932   finalchar  the character that will mark the end of the ID
 933   errorptr   points to the pointer to the error message
 934 */
 935
 936 static int
 937 get_group_id(const uschar *ptr, char finalchar, const char **errorptr)
 938 {
 939   const uschar *start = ptr;
 940
 941   /* If the first character is not in \w, or is in \w but is a digit,
 942      report an error */
 943   if (!(pcre_ctypes[*ptr] & ctype_word) ||
 944       (pcre_ctypes[*ptr++] & ctype_digit))
 945     {
 946       *errorptr = "(?P identifier must start with a letter or underscore";
 947       return 0;
 948     }
 949
 950   /* Increment ptr until we either hit a null byte, the desired
 951      final character, or a non-word character */
 952   for(; (*ptr != 0) && (*ptr != finalchar) &&
 953         (pcre_ctypes[*ptr] & ctype_word); ptr++)
 954     {
 955       /* Empty loop body */
 956     }
 957   if (*ptr==finalchar)
 958     return ptr-start;
 959   if (*ptr==0)
 960     {
 961       *errorptr = "unterminated (?P identifier";
 962       return 0;
 963     }
 964   *errorptr = "illegal character in (?P identifier";
 965   return 0;
 966 }
 967
 968 /*************************************************
 969 *            Handle escapes                      *
 970 *************************************************/
 971
 972 /* This function is called when a \ has been encountered. It either returns a
 973 positive value for a simple escape such as \n, or a negative value which
 974 encodes one of the more complicated things such as \d. On entry, ptr is
 975 pointing at the \. On exit, it is on the final character of the escape
 976 sequence.
 977
 978 Arguments:
 979   ptrptr     points to the pattern position pointer
 980   errorptr   points to the pointer to the error message
 981   bracount   number of previous extracting brackets
 982   options    the options bits
 983   isclass    TRUE if inside a character class
 984
 985 Returns:     zero or positive => a data character
 986              negative => a special escape sequence
 987              on error, errorptr is set
 988 */
 989
 990 static int
 991 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
 992              int options, BOOL isclass)
 993 {
 994 const uschar *ptr = *ptrptr;
 995 int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */
 996 int i;
 997
 998 if (c == 0) *errorptr = ERR1;
 999
1000 /* Digits or letters may have special meaning; all others are literals. */
1001
1002 else if (c < '0' || c > 'z') {}
1003
1004 /* Do an initial lookup in a table. A non-zero result is something that can be
1005 returned immediately. Otherwise further processing may be required. */
1006
1007 else if ((i = escapes[c - '0']) != 0) c = i;
1008
1009 /* Escapes that need further processing, or are illegal. */
1010
1011 else
1012   {
1013
1014   switch (c)
1015     {
1016     /* The handling of escape sequences consisting of a string of digits
1017     starting with one that is not zero is not straightforward. By experiment,
1018     the way Perl works seems to be as follows:
1019
1020     Outside a character class, the digits are read as a decimal number. If the
1021     number is less than 10, or if there are that many previous extracting
1022     left brackets, then it is a back reference. Otherwise, up to three octal
1023     digits are read to form an escaped byte. Thus \123 is likely to be octal
1024     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
1025     value is greater than 377, the least significant 8 bits are taken. Inside a
1026     character class, \ followed by a digit is always an octal number. */
1027
1028     case '1': case '2': case '3': case '4': case '5':
1029     case '6': case '7': case '8': case '9':
1030
1031     {
1032       /* PYTHON: Try to compute an octal value for a character */
1033       for(c=0, i=0; ptr[i]!=0 && i<3; i++)
1034         {
1035           if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
1036             c = (c * 8 + ptr[i]-'0') & 255;
1037           else
1038             break; /* Non-octal character--break out of the loop */
1039         }
1040       /* It's a character if there were exactly 3 octal digits, or if
1041          we're inside a character class and there was at least one
1042          octal digit. */
1043       if ( (i == 3) || (isclass && i!=0) )
1044         {
1045           ptr += i-1;
1046           break;
1047         }
1048       c = ptr[0]; /* Restore the first character after the \ */
1049       c -= '0'; i = 1;
1050       while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
1051         {
1052           c = c * 10 + ptr[1] - '0';
1053           ptr++; i++;
1054         }
1055       if (c > 255 - ESC_REF) *errorptr = "back reference too big";
1056       c = -(ESC_REF + c);
1057     }
1058   break;
1059
1060     /* \0 always starts an octal number, but we may drop through to here with a
1061     larger first octal digit */
1062
1063     case '0':
1064     c -= '0';
1065     while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
1066       ptr[1] != '8' && ptr[1] != '9')
1067         c = (c * 8 + *(++ptr) - '0') & 255;
1068     break;
1069
1070     /* Special escapes not starting with a digit are straightforward */
1071
1072     case 'x':
1073   c = 0;
1074   while ( (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
1075     {
1076     ptr++;
1077     c = c * 16 + pcre_lcc[*ptr] -
1078       (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
1079     c &= 255;
1080     }
1081   break;
1082
1083
1084     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1085     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1086     for Perl compatibility, it is a literal. */
1087
1088     default:
1089     if ((options & PCRE_EXTRA) != 0) switch(c)
1090       {
1091       case 'X':
1092       c = -ESC_X;      /* This could be a lookup if it ever got into Perl */
1093       break;
1094
1095       default:
1096       *errorptr = ERR3;
1097       break;
1098       }
1099     break;
1100     }
1101   }
1102
1103 *ptrptr = ptr;
1104 return c;
1105 }
1106
1107
1108
1109 /*************************************************
1110 *            Check for counted repeat            *
1111 *************************************************/
1112
1113 /* This function is called when a '{' is encountered in a place where it might
1114 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1115 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1116 where the ddds are digits.
1117
1118 Arguments:
1119   p         pointer to the first char after '{'
1120
1121 Returns:    TRUE or FALSE
1122 */
1123
1124 static BOOL
1125 is_counted_repeat(const uschar *p)
1126 {
1127 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1128 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1129 if (*p == '}') return TRUE;
1130
1131 if (*p++ != ',') return FALSE;
1132 if (*p == '}') return TRUE;
1133
1134 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1135 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1136 return (*p == '}');
1137 }
1138
1139
1140
1141 /*************************************************
1142 *         Read repeat counts                     *
1143 *************************************************/
1144
1145 /* Read an item of the form {n,m} and return the values. This is called only
1146 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1147 so the syntax is guaranteed to be correct, but we need to check the values.
1148
1149 Arguments:
1150   p          pointer to first char after '{'
1151   minp       pointer to int for min
1152   maxp       pointer to int for max
1153              returned as -1 if no max
1154   errorptr   points to pointer to error message
1155
1156 Returns:     pointer to '}' on success;
1157              current ptr on error, with errorptr set
1158 */
1159
1160 static const uschar *
1161 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1162 {
1163 int min = 0;
1164 int max = -1;
1165
1166 while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1167
1168 if (*p == '}') max = min; else
1169   {
1170   if (*(++p) != '}')
1171     {
1172     max = 0;
1173     while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1174     if (max < min)
1175       {
1176       *errorptr = ERR4;
1177       return p;
1178       }
1179     }
1180   }
1181
1182 /* Do paranoid checks, then fill in the required variables, and pass back the
1183 pointer to the terminating '}'. */
1184
1185 if (min > 65535 || max > 65535)
1186   *errorptr = ERR5;
1187 else
1188   {
1189   *minp = min;
1190   *maxp = max;
1191   }
1192 return p;
1193 }
1194
1195
1196
1197 /*************************************************
1198 *           Compile one branch                   *
1199 *************************************************/
1200
1201 /* Scan the pattern, compiling it into the code vector.
1202
1203 Arguments:
1204   options    the option bits
1205   bracket    points to number of brackets used
1206   code       points to the pointer to the current code point
1207   ptrptr     points to the current pattern pointer
1208   errorptr   points to pointer to error message
1209
1210 Returns:     TRUE on success
1211              FALSE, with *errorptr set on error
1212 */
1213
1214 static BOOL
1215 compile_branch(int options, int *brackets, uschar **codeptr,
1216                const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
1217 {
1218 int repeat_type, op_type;
1219 int repeat_min, repeat_max;
1220 int bravalue, length;
1221 int greedy_default, greedy_non_default;
1222 register int c;
1223 register uschar *code = *codeptr;
1224 const uschar *ptr = *ptrptr;
1225 const uschar *oldptr;
1226 uschar *previous = NULL;
1227 uschar class[32];
1228 uschar *class_flag;  /* Pointer to the single-byte flag for OP_CLASS_L */
1229
1230 /* Set up the default and non-default settings for greediness */
1231
1232 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1233 greedy_non_default = greedy_default ^ 1;
1234
1235 /* Switch on next character until the end of the branch */
1236
1237 for (;; ptr++)
1238   {
1239   BOOL negate_class;
1240   int  class_charcount;
1241   int  class_lastchar;
1242
1243   c = *ptr;
1244   if ((options & PCRE_EXTENDED) != 0)
1245     {
1246     if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1247     if (c == '#')
1248       {
1249       while ((c = *(++ptr)) != 0 && c != '\n');
1250       continue;
1251       }
1252     }
1253
1254   switch(c)
1255     {
1256     /* The branch terminates at end of string, |, or ). */
1257
1258     case 0:
1259     case '|':
1260     case ')':
1261     *codeptr = code;
1262     *ptrptr = ptr;
1263     return TRUE;
1264
1265     /* Handle single-character metacharacters */
1266
1267     case '^':
1268     previous = NULL;
1269     *code++ = OP_CIRC;
1270     break;
1271
1272     case '$':
1273     previous = NULL;
1274     *code++ = OP_DOLL;
1275     break;
1276
1277     case '.':
1278     previous = code;
1279     *code++ = OP_ANY;
1280     break;
1281
1282     /* Character classes. These always build a 32-byte bitmap of the permitted
1283     characters, except in the special case where there is only one character.
1284     For negated classes, we build the map as usual, then invert it at the end.
1285     */
1286
1287     case '[':
1288     previous = code;
1289     if (options & PCRE_LOCALE)
1290       {
1291         *code++ = OP_CLASS_L;
1292         /* Set the flag for localized classes (like \w) to 0 */
1293         class_flag = code;
1294         *class_flag = 0;
1295       }
1296     else
1297       {
1298         *code++ = OP_CLASS;
1299         class_flag = NULL;
1300       }
1301
1302     /* If the first character is '^', set the negation flag, and use a
1303     different opcode. This only matters if caseless matching is specified at
1304     runtime. */
1305
1306     if ((c = *(++ptr)) == '^')
1307       {
1308       negate_class = TRUE;
1309       if (*(code-1)==OP_CLASS) *(code-1) = OP_NEGCLASS;
1310       c = *(++ptr);
1311       }
1312     else negate_class = FALSE;
1313
1314     /* Keep a count of chars so that we can optimize the case of just a single
1315     character. */
1316
1317     class_charcount = 0;
1318     class_lastchar = -1;
1319
1320     /* Initialize the 32-char bit map to all zeros. We have to build the
1321     map in a temporary bit of store, in case the class contains only 1
1322     character, because in that case the compiled code doesn't use the
1323     bit map. */
1324
1325     memset(class, 0, 32 * sizeof(uschar));
1326
1327     /* Process characters until ] is reached. By writing this as a "do" it
1328     means that an initial ] is taken as a data character. */
1329
1330     do
1331       {
1332       if (c == 0)
1333         {
1334         *errorptr = ERR6;
1335         goto FAILED;
1336         }
1337
1338       /* Backslash may introduce a single character, or it may introduce one
1339       of the specials, which just set a flag. Escaped items are checked for
1340       validity in the pre-compiling pass. The sequence \b is a special case.
1341       Inside a class (and only there) it is treated as backspace. Elsewhere
1342       it marks a word boundary. Other escapes have preset maps ready to
1343       or into the one we are building. We assume they have more than one
1344       character in them, so set class_count bigger than one. */
1345
1346       if (c == '\\')
1347         {
1348         c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1349         if (-c == ESC_b) c = '\b';
1350         else if (c < 0)
1351           {
1352           class_charcount = 10;
1353           switch (-c)
1354             {
1355             case ESC_d:
1356               {
1357                 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
1358               }
1359             continue;
1360
1361             case ESC_D:
1362               {
1363                 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
1364               }
1365             continue;
1366
1367             case ESC_w:
1368             if (options & PCRE_LOCALE)
1369               {
1370                 *class_flag |= 1;
1371               }
1372             else
1373               {
1374                 for (c = 0; c < 32; c++)
1375                   class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1376               }
1377             continue;
1378
1379             case ESC_W:
1380             if (options & PCRE_LOCALE)
1381               {
1382                 *class_flag |= 2;
1383               }
1384             else
1385               {
1386                 for (c = 0; c < 32; c++)
1387                   class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1388               }
1389             continue;
1390
1391             case ESC_s:
1392               {
1393                 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
1394               }
1395             continue;
1396
1397             case ESC_S:
1398               {
1399                 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
1400               }
1401             continue;
1402
1403             default:
1404             *errorptr = ERR7;
1405             goto FAILED;
1406             }
1407           }
1408         /* Fall through if single character */
1409         }
1410
1411       /* A single character may be followed by '-' to form a range. However,
1412       Perl does not permit ']' to be the end of the range. A '-' character
1413       here is treated as a literal. */
1414
1415       if (ptr[1] == '-' && ptr[2] != ']')
1416         {
1417         int d;
1418         ptr += 2;
1419         d = *ptr;
1420
1421         if (d == 0)
1422           {
1423           *errorptr = ERR6;
1424           goto FAILED;
1425           }
1426
1427         /* The second part of a range can be a single-character escape, but
1428         not any of the other escapes. */
1429
1430         if (d == '\\')
1431           {
1432           d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1433           if (d < 0)
1434             {
1435             if (d == -ESC_b) d = '\b'; else
1436               {
1437               *errorptr = ERR7;
1438               goto FAILED;
1439               }
1440             }
1441           }
1442
1443         if (d < c)
1444           {
1445           *errorptr = ERR8;
1446           goto FAILED;
1447           }
1448
1449         for (; c <= d; c++)
1450           {
1451           class[c/8] |= (1 << (c&7));
1452           if ((options & PCRE_CASELESS) != 0)
1453             {
1454             int uc = pcre_fcc[c];           /* flip case */
1455             class[uc/8] |= (1 << (uc&7));
1456             }
1457           class_charcount++;                /* in case a one-char range */
1458           class_lastchar = c;
1459           }
1460         continue;   /* Go get the next char in the class */
1461         }
1462
1463       /* Handle a lone single character - we can get here for a normal
1464       non-escape char, or after \ that introduces a single character. */
1465
1466       class [c/8] |= (1 << (c&7));
1467       if ((options & PCRE_CASELESS) != 0)
1468         {
1469         c = pcre_fcc[c];   /* flip case */
1470         class[c/8] |= (1 << (c&7));
1471         }
1472       class_charcount++;
1473       class_lastchar = c;
1474       }
1475
1476     /* Loop until ']' reached; the check for end of string happens inside the
1477     loop. This "while" is the end of the "do" above. */
1478
1479     while ((c = *(++ptr)) != ']');
1480
1481     /* If class_charcount is 1 and class_lastchar is not negative, we saw
1482     precisely one character. This doesn't need the whole 32-byte bit map.
1483     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1484     it's negative. */
1485
1486     if (class_charcount == 1 && class_lastchar >= 0)
1487       {
1488       if (negate_class)
1489         {
1490         code[-1] = OP_NOT;
1491         }
1492       else
1493         {
1494         code[-1] = OP_CHARS;
1495         *code++ = 1;
1496         }
1497       *code++ = class_lastchar;
1498       }
1499
1500     /* Otherwise, negate the 32-byte map if necessary, and copy it into
1501     the code vector. */
1502
1503     else
1504       {
1505         /* If this is a localized opcode, bump the code pointer up */
1506         if (class_flag) code++;
1507       if (negate_class)
1508         {
1509           if (class_flag) *class_flag = (*class_flag) ^ 63;
1510           for (c = 0; c < 32; c++) code[c] = ~class[c];
1511         }
1512       else
1513         memcpy(code, class, 32);
1514       code += 32;
1515       }
1516     break;
1517
1518     /* Various kinds of repeat */
1519
1520     case '{':
1521     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1522     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
1523     if (*errorptr != NULL) goto FAILED;
1524     goto REPEAT;
1525
1526     case '*':
1527     repeat_min = 0;
1528     repeat_max = -1;
1529     goto REPEAT;
1530
1531     case '+':
1532     repeat_min = 1;
1533     repeat_max = -1;
1534     goto REPEAT;
1535
1536     case '?':
1537     repeat_min = 0;
1538     repeat_max = 1;
1539
1540     REPEAT:
1541     if (previous == NULL)
1542       {
1543       *errorptr = ERR9;
1544       goto FAILED;
1545       }
1546
1547     /* If the next character is '?' this is a minimizing repeat, by default,
1548     but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1549     next character. */
1550
1551     if (ptr[1] == '?')
1552       { repeat_type = greedy_non_default; ptr++; }
1553     else repeat_type = greedy_default;
1554
1555     /* If the maximum is zero then the minimum must also be zero; Perl allows
1556     this case, so we do too - by simply omitting the item altogether. */
1557
1558     if (repeat_max == 0) code = previous;
1559
1560     /* If previous was a string of characters, chop off the last one and use it
1561     as the subject of the repeat. If there was only one character, we can
1562     abolish the previous item altogether. */
1563
1564     else if (*previous == OP_CHARS)
1565       {
1566       int len = previous[1];
1567       if (len == 1)
1568         {
1569         c = previous[2];
1570         code = previous;
1571         }
1572       else
1573         {
1574         c = previous[len+1];
1575         previous[1]--;
1576         code--;
1577         }
1578       op_type = 0;                 /* Use single-char op codes */
1579       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
1580       }
1581
1582     /* If previous was a single negated character ([^a] or similar), we use
1583     one of the special opcodes, replacing it. The code is shared with single-
1584     character repeats by adding a suitable offset into repeat_type. */
1585
1586     else if ((int)*previous == OP_NOT)
1587       {
1588       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
1589       c = previous[1];
1590       code = previous;
1591       goto OUTPUT_SINGLE_REPEAT;
1592       }
1593
1594     /* If previous was a character type match (\d or similar), abolish it and
1595     create a suitable repeat item. The code is shared with single-character
1596     repeats by adding a suitable offset into repeat_type. */
1597
1598     else if ((int)*previous < OP_CIRC || *previous == OP_ANY)
1599       {
1600       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
1601       c = *previous;
1602       code = previous;
1603
1604       OUTPUT_SINGLE_REPEAT:
1605       repeat_type += op_type;      /* Combine both values for many cases */
1606
1607       /* A minimum of zero is handled either as the special case * or ?, or as
1608       an UPTO, with the maximum given. */
1609
1610       if (repeat_min == 0)
1611         {
1612         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1613           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1614         else
1615           {
1616           *code++ = OP_UPTO + repeat_type;
1617           *code++ = repeat_max >> 8;
1618           *code++ = (repeat_max & 255);
1619           }
1620         }
1621
1622       /* The case {1,} is handled as the special case + */
1623
1624       else if (repeat_min == 1 && repeat_max == -1)
1625         *code++ = OP_PLUS + repeat_type;
1626
1627       /* The case {n,n} is just an EXACT, while the general case {n,m} is
1628       handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1629
1630       else
1631         {
1632         if (repeat_min != 1)
1633           {
1634           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
1635           *code++ = repeat_min >> 8;
1636           *code++ = (repeat_min & 255);
1637           }
1638
1639         /* If the minimum is 1 and the previous item was a character string,
1640         we either have to put back the item that got canceled if the string
1641         length was 1, or add the character back onto the end of a longer
1642         string. For a character type nothing need be done; it will just get
1643         put back naturally. Note that the final character is always going to
1644         get added below. */
1645
1646         else if (*previous == OP_CHARS)
1647           {
1648           if (code == previous) code += 2; else previous[1]++;
1649           }
1650
1651         /*  For a single negated character we also have to put back the
1652         item that got canceled. */
1653
1654         else if (*previous == OP_NOT) code++;
1655
1656         /* If the maximum is unlimited, insert an OP_STAR. */
1657
1658         if (repeat_max < 0)
1659           {
1660           *code++ = c;
1661           *code++ = OP_STAR + repeat_type;
1662           }
1663
1664         /* Else insert an UPTO if the max is greater than the min. */
1665
1666         else if (repeat_max != repeat_min)
1667           {
1668           *code++ = c;
1669           repeat_max -= repeat_min;
1670           *code++ = OP_UPTO + repeat_type;
1671           *code++ = repeat_max >> 8;
1672           *code++ = (repeat_max & 255);
1673           }
1674         }
1675
1676       /* The character or character type itself comes last in all cases. */
1677
1678       *code++ = c;
1679       }
1680
1681     /* If previous was a character class or a back reference, we put the repeat
1682     stuff after it. */
1683
1684     else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1685              *previous==OP_CLASS_L || *previous == OP_REF)
1686       {
1687       if (repeat_min == 0 && repeat_max == -1)
1688         *code++ = OP_CRSTAR + repeat_type;
1689       else if (repeat_min == 1 && repeat_max == -1)
1690         *code++ = OP_CRPLUS + repeat_type;
1691       else if (repeat_min == 0 && repeat_max == 1)
1692         *code++ = OP_CRQUERY + repeat_type;
1693       else
1694         {
1695         *code++ = OP_CRRANGE + repeat_type;
1696         *code++ = repeat_min >> 8;
1697         *code++ = repeat_min & 255;
1698         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
1699         *code++ = repeat_max >> 8;
1700         *code++ = repeat_max & 255;
1701         }
1702       }
1703
1704     /* If previous was a bracket group, we may have to replicate it in certain
1705     cases. If the maximum repeat count is unlimited, check that the bracket
1706     group cannot match the empty string, and diagnose an error if it can. */
1707
1708     else if ((int)*previous >= OP_BRA)
1709       {
1710       int i;
1711       int len = code - previous;
1712
1713       if (repeat_max == -1 && could_be_empty(previous))
1714         {
1715         *errorptr = ERR10;
1716         goto FAILED;
1717         }
1718
1719       /* If the minimum is greater than zero, and the maximum is unlimited or
1720       equal to the minimum, the first copy remains where it is, and is
1721       replicated up to the minimum number of times. This case includes the +
1722       repeat, but of course no replication is needed in that case. */
1723
1724       if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1725         {
1726         for (i = 1; i < repeat_min; i++)
1727           {
1728           memcpy(code, previous, len);
1729           code += len;
1730           }
1731         }
1732
1733       /* If the minimum is zero, stick BRAZERO in front of the first copy.
1734       Then, if there is a fixed upper limit, replicated up to that many times,
1735       sticking BRAZERO in front of all the optional ones. */
1736
1737       else
1738         {
1739         if (repeat_min == 0)
1740           {
1741           memmove(previous+1, previous, len);
1742           code++;
1743           *previous++ = OP_BRAZERO + repeat_type;
1744           }
1745
1746         for (i = 1; i < repeat_min; i++)
1747           {
1748           memcpy(code, previous, len);
1749           code += len;
1750           }
1751
1752         for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1753           {
1754           *code++ = OP_BRAZERO + repeat_type;
1755           memcpy(code, previous, len);
1756           code += len;
1757           }
1758         }
1759
1760       /* If the maximum is unlimited, set a repeater in the final copy. */
1761
1762       if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1763       }
1764
1765     /* Else there's some kind of shambles */
1766
1767     else
1768       {
1769       *errorptr = ERR11;
1770       goto FAILED;
1771       }
1772
1773     /* In all case we no longer have a previous item. */
1774
1775     previous = NULL;
1776     break;
1777
1778
1779     /* Start of nested bracket sub-expression, or comment or lookahead.
1780     First deal with special things that can come after a bracket; all are
1781     introduced by ?, and the appearance of any of them means that this is not a
1782     referencing group. They were checked for validity in the first pass over
1783     the string, so we don't have to check for syntax errors here.  */
1784
1785     case '(':
1786     previous = code;              /* Only real brackets can be repeated */
1787     if (*(++ptr) == '?')
1788       {
1789       bravalue = OP_BRA;
1790
1791       switch (*(++ptr))
1792         {
1793         case '#':
1794         case 'i':
1795         case 'L':
1796         case 'm':
1797         case 's':
1798         case 'x':
1799         ptr++;
1800         while (*ptr != ')') ptr++;
1801         previous = NULL;
1802         continue;
1803
1804         case ':':                 /* Non-extracting bracket */
1805         ptr++;
1806         break;
1807
1808         case '=':                 /* Assertions can't be repeated */
1809         bravalue = OP_ASSERT;
1810         ptr++;
1811         previous = NULL;
1812         break;
1813
1814         case '!':
1815         bravalue = OP_ASSERT_NOT;
1816         ptr++;
1817         previous = NULL;
1818         break;
1819
1820         case ('P'):
1821           ptr++;
1822           if (*ptr=='<')
1823             {
1824               /* (?P<groupname>...) */
1825               int idlen;
1826               PyObject *string, *intobj;
1827
1828               ptr++;
1829               idlen = get_group_id(ptr, '>', errorptr);
1830               if (*errorptr) {
1831                 goto FAILED;
1832               }
1833               string = PyString_FromStringAndSize((char*)ptr, idlen);
1834               intobj = PyInt_FromLong( brackets[0] + 1 );
1835               if (intobj == NULL || string == NULL)
1836                 {
1837                   Py_XDECREF(string);
1838                   Py_XDECREF(intobj);
1839                   *errorptr = "exception raised";
1840                   goto FAILED;
1841                 }
1842               PyDict_SetItem(dictionary, string, intobj);
1843               Py_DECREF(string); Py_DECREF(intobj); /* XXX DECREF commented out! */
1844               ptr += idlen+1;  /* Point to rest of expression */
1845               goto do_grouping_bracket;
1846             }
1847           if (*ptr=='=')
1848             {
1849               /* (?P=groupname) */
1850               int idlen, refnum;
1851               PyObject *string, *intobj;
1852
1853               ptr++;
1854               idlen = get_group_id(ptr, ')', errorptr);
1855               if (*errorptr) {
1856                 goto FAILED;
1857               }
1858               string = PyString_FromStringAndSize((char *)ptr, idlen);
1859               if (string==NULL) {
1860                   *errorptr = "exception raised";
1861                   goto FAILED;
1862                 }
1863               intobj = PyDict_GetItem(dictionary, string);
1864               if (intobj==NULL) {
1865                 Py_DECREF(string);
1866                 *errorptr = "?P= group identifier isn't defined";
1867                 goto FAILED;
1868               }
1869
1870               refnum = PyInt_AsLong(intobj);
1871               Py_DECREF(string);
1872               /* The caller doesn't own the reference to the value
1873                  returned from PyDict_GetItem, so intobj is not
1874                  DECREF'ed. */
1875
1876               *code++ = OP_REF;
1877               *code++ = refnum;
1878               /* The continue will cause the top-level for() loop to
1879                  be resumed, so ptr will be immediately incremented.
1880                  Therefore, the following line adds just idlen, not
1881                  idlen+1 */
1882               ptr += idlen;
1883               continue;
1884             }
1885           /* The character after ?P is neither < nor =, so
1886              report an error.  Add more Python-extensions here. */
1887           *errorptr="unknown after (?P";
1888           goto FAILED;
1889
1890         case '>':                         /* "Match once" brackets */
1891         if ((options & PCRE_EXTRA) != 0)  /* Not yet standard */
1892           {
1893           bravalue = OP_ONCE;
1894           ptr++;
1895           previous = NULL;
1896           break;
1897           }
1898         /* Else fall through */
1899
1900         default:
1901         *errorptr = ERR12;
1902         goto FAILED;
1903         }
1904       }
1905
1906     /* Else we have a referencing group */
1907
1908     else
1909       {
1910       do_grouping_bracket:
1911       if (++(*brackets) > EXTRACT_MAX)
1912         {
1913         *errorptr = ERR13;
1914         goto FAILED;
1915         }
1916       bravalue = OP_BRA + *brackets;
1917       }
1918
1919     /* Process nested bracketed re; at end pointer is on the bracket. We copy
1920     code into a non-register variable in order to be able to pass its address
1921     because some compilers complain otherwise. */
1922
1923     *code = bravalue;
1924       {
1925       uschar *mcode = code;
1926       if (!compile_regex(options, brackets, &mcode, &ptr, errorptr, dictionary))
1927         goto FAILED;
1928       code = mcode;
1929       }
1930
1931     if (*ptr != ')')
1932       {
1933       *errorptr = ERR14;
1934       goto FAILED;
1935       }
1936     break;
1937
1938     /* Check \ for being a real metacharacter; if not, fall through and handle
1939     it as a data character at the start of a string. Escape items are checked
1940     for validity in the pre-compiling pass. */
1941
1942     case '\\':
1943     oldptr = ptr;
1944     c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1945
1946     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1947     are arranged to be the negation of the corresponding OP_values. For the
1948     back references, the values are ESC_REF plus the reference number. Only
1949     back references and those types that consume a character may be repeated.
1950     We can test for values between ESC_b and ESC_Z for the latter; this may
1951     have to change if any new ones are ever created. */
1952
1953     if (c < 0)
1954       {
1955       if (-c >= ESC_REF)
1956         {
1957         int refnum = -c - ESC_REF;
1958         if (*brackets < refnum)
1959           {
1960           *errorptr = ERR15;
1961           goto FAILED;
1962           }
1963         previous = code;
1964         *code++ = OP_REF;
1965         *code++ = refnum;
1966         }
1967       else
1968         {
1969         previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1970         if ( (options & PCRE_LOCALE) != 0)
1971           {
1972             switch (c)
1973               {
1974                 case (-ESC_b): c = -OP_WORD_BOUNDARY_L; break;
1975                 case (-ESC_B): c = -OP_NOT_WORD_BOUNDARY_L; break;
1976                 case (-ESC_w): c = -OP_WORDCHAR_L; break;
1977                 case (-ESC_W): c = -OP_NOT_WORDCHAR_L; break;
1978               }
1979           }
1980         *code++ = -c;
1981         }
1982       continue;
1983       }
1984
1985     /* Data character: Reset and fall through */
1986
1987     ptr = oldptr;
1988     c = '\\';
1989
1990     /* Handle a run of data characters until a metacharacter is encountered.
1991     The first character is guaranteed not to be whitespace or # when the
1992     extended flag is set. */
1993
1994     NORMAL_CHAR:
1995     default:
1996     previous = code;
1997     *code = OP_CHARS;
1998     code += 2;
1999     length = 0;
2000
2001     do
2002       {
2003       if ((options & PCRE_EXTENDED) != 0)
2004         {
2005         if ((pcre_ctypes[c] & ctype_space) != 0) continue;
2006         if (c == '#')
2007           {
2008           while ((c = *(++ptr)) != 0 && c != '\n');
2009           if (c == 0) break;
2010           continue;
2011           }
2012         }
2013
2014       /* Backslash may introduce a data char or a metacharacter. Escaped items
2015       are checked for validity in the pre-compiling pass. Stop the string
2016       before a metaitem. */
2017
2018       if (c == '\\')
2019         {
2020         oldptr = ptr;
2021         c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
2022         if (c < 0) { ptr = oldptr; break; }
2023         }
2024
2025       /* Ordinary character or single-char escape */
2026
2027       *code++ = c;
2028       length++;
2029       }
2030
2031     /* This "while" is the end of the "do" above. */
2032
2033     while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2034
2035     /* Compute the length and set it in the data vector, and advance to
2036     the next state. */
2037
2038     previous[1] = length;
2039     if (length < 255) ptr--;
2040     break;
2041     }
2042   }                   /* end of big loop */
2043
2044 /* Control never reaches here by falling through, only by a goto for all the
2045 error states. Pass back the position in the pattern so that it can be displayed
2046 to the user for diagnosing the error. */
2047
2048 FAILED:
2049 *ptrptr = ptr;
2050 return FALSE;
2051 }
2052
2053
2054
2055
2056 /*************************************************
2057 *     Compile sequence of alternatives           *
2058 *************************************************/
2059
2060 /* On entry, ptr is pointing past the bracket character, but on return
2061 it points to the closing bracket, or vertical bar, or end of string.
2062 The code variable is pointing at the byte into which the BRA operator has been
2063 stored.
2064
2065 Argument:
2066   options   the option bits
2067   brackets  -> int containing the number of extracting brackets used
2068   codeptr   -> the address of the current code pointer
2069   ptrptr    -> the address of the current pattern pointer
2070   errorptr  -> pointer to error message
2071
2072 Returns:    TRUE on success
2073 */
2074
2075 static BOOL
2076 compile_regex(int options, int *brackets, uschar **codeptr,
2077   const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
2078 {
2079 const uschar *ptr = *ptrptr;
2080 uschar *code = *codeptr;
2081 uschar *start_bracket = code;
2082
2083 for (;;)
2084   {
2085   int length;
2086   uschar *last_branch = code;
2087
2088   code += 3;
2089   if (!compile_branch(options, brackets, &code, &ptr, errorptr, dictionary))
2090     {
2091     *ptrptr = ptr;
2092     return FALSE;
2093     }
2094
2095   /* Fill in the length of the last branch */
2096
2097   length = code - last_branch;
2098   last_branch[1] = length >> 8;
2099   last_branch[2] = length & 255;
2100
2101   /* Reached end of expression, either ')' or end of pattern. Insert a
2102   terminating ket and the length of the whole bracketed item, and return,
2103   leaving the pointer at the terminating char. */
2104
2105   if (*ptr != '|')
2106     {
2107     length = code - start_bracket;
2108     *code++ = OP_KET;
2109     *code++ = length >> 8;
2110     *code++ = length & 255;
2111     *codeptr = code;
2112     *ptrptr = ptr;
2113     return TRUE;
2114     }
2115
2116   /* Another branch follows; insert an "or" node and advance the pointer. */
2117
2118   *code = OP_ALT;
2119   ptr++;
2120   }
2121 /* Control never reaches here */
2122 }
2123
2124
2125
2126 /*************************************************
2127 *          Check for anchored expression         *
2128 *************************************************/
2129
2130 /* Try to find out if this is an anchored regular expression. Consider each
2131 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2132 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2133 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2134 counts, since OP_CIRC can match in the middle.
2135
2136 A branch is also implicitly anchored if it starts with .* because that will try
2137 the rest of the pattern at all possible matching points, so there is no point
2138 trying them again.
2139
2140 Argument:  points to start of expression (the bracket)
2141 Returns:   TRUE or FALSE
2142 */
2143
2144 static BOOL
2145 is_anchored(register const uschar *code, BOOL multiline)
2146 {
2147 do {
2148    int op = (int)code[3];
2149    if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
2150      { if (!is_anchored(code+3, multiline)) return FALSE; }
2151    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2152      { if (code[4] != OP_ANY) return FALSE; }
2153    else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
2154    code += (code[1] << 8) + code[2];
2155    }
2156 while (*code == OP_ALT);
2157 return TRUE;
2158 }
2159
2160
2161
2162 /*************************************************
2163 *     Check for start with \n line expression    *
2164 *************************************************/
2165
2166 /* This is called for multiline expressions to try to find out if every branch
2167 starts with ^ so that "first char" processing can be done to speed things up.
2168
2169 Argument:  points to start of expression (the bracket)
2170 Returns:   TRUE or FALSE
2171 */
2172
2173 static BOOL
2174 is_startline(const uschar *code)
2175 {
2176 do {
2177    if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2178      { if (!is_startline(code+3)) return FALSE; }
2179    else if (code[3] != OP_CIRC) return FALSE;
2180    code += (code[1] << 8) + code[2];
2181    }
2182 while (*code == OP_ALT);
2183 return TRUE;
2184 }
2185
2186
2187
2188 /*************************************************
2189 *          Check for fixed first char            *
2190 *************************************************/
2191
2192 /* Try to find out if there is a fixed first character. This is called for
2193 unanchored expressions, as it speeds up their processing quite considerably.
2194 Consider each alternative branch. If they all start with the same char, or with
2195 a bracket all of whose alternatives start with the same char (recurse ad lib),
2196 then we return that char, otherwise -1.
2197
2198 Argument:  points to start of expression (the bracket)
2199 Returns:   -1 or the fixed first char
2200 */
2201
2202 static int
2203 find_firstchar(uschar *code)
2204 {
2205 register int c = -1;
2206 do
2207   {
2208   register int charoffset = 4;
2209
2210   if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2211     {
2212     register int d;
2213     if ((d = find_firstchar(code+3)) < 0) return -1;
2214     if (c < 0) c = d; else if (c != d) return -1;
2215     }
2216
2217   else switch(code[3])
2218     {
2219     default:
2220     return -1;
2221
2222     case OP_EXACT:       /* Fall through */
2223     charoffset++;
2224
2225     case OP_CHARS:       /* Fall through */
2226     charoffset++;
2227
2228     case OP_PLUS:
2229     case OP_MINPLUS:
2230     if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
2231     break;
2232     }
2233   code += (code[1] << 8) + code[2];
2234   }
2235 while (*code == OP_ALT);
2236 return c;
2237 }
2238
2239
2240
2241 /*************************************************
2242 *        Compile a Regular Expression            *
2243 *************************************************/
2244
2245 /* This function takes a string and returns a pointer to a block of store
2246 holding a compiled version of the expression.
2247
2248 Arguments:
2249   pattern      the regular expression
2250   options      various option bits
2251   errorptr     pointer to pointer to error text
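  erroroffset  ptr offset in pattern where error was detected
  dictionary   Python dictionary passed down to the compiler; it records each
               (?P<name>...) group name mapped to its group number, and is
               consulted to resolve (?P=name) references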
2253
2254 Returns:       pointer to compiled data block, or NULL on error,
2255                with errorptr and erroroffset set
2256 */
2257
2258 pcre *
2259 pcre_compile(const char *pattern, int options, const char **errorptr,
2260              int *erroroffset, PyObject *dictionary)
2261 {
2262 real_pcre *re;
2263 int spaces = 0;
2264 int length = 3;      /* For initial BRA plus length */
2265 int runlength;
2266 int c, size;
2267 int bracount = 0;
2268 int brastack[200];
2269 int top_backref = 0;
2270 unsigned int brastackptr = 0;
2271 uschar *code;
2272 const uschar *ptr;
2273
2274 #ifdef DEBUG
2275 uschar *code_base, *code_end;
2276 #endif
2277
2278 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2279 can do is just return NULL. */
2280
2281 if (errorptr == NULL) return NULL;
2282 *errorptr = NULL;
2283
2284 /* However, we can give a message for this error */
2285
2286 if (erroroffset == NULL)
2287   {
2288   *errorptr = ERR16;
2289   return NULL;
2290   }
2291 *erroroffset = 0;
2292
2293 if ((options & ~PUBLIC_OPTIONS) != 0)
2294   {
2295   *errorptr = ERR17;
2296   return NULL;
2297   }
2298
2299 DPRINTF(("------------------------------------------------------------------\n"));
2300 DPRINTF(("%s\n", pattern));
2301
2302 /* The first thing to do is to make a pass over the pattern to compute the
2303 amount of store required to hold the compiled code. This does not have to be
2304 perfect as long as errors are overestimates. At the same time we can detect any
2305 internal flag settings. Make an attempt to correct for any counted white space
2306 if an "extended" flag setting appears late in the pattern. We can't be so
2307 clever for #-comments. */
2308
2309 ptr = (const uschar *)(pattern - 1);
2310 while ((c = *(++ptr)) != 0)
2311   {
2312   int min, max;
2313   int class_charcount;
2314
2315   if ((pcre_ctypes[c] & ctype_space) != 0)
2316     {
2317     if ((options & PCRE_EXTENDED) != 0) continue;
2318     spaces++;
2319     }
2320
2321   if (c == '#' && (options & PCRE_EXTENDED) != 0)
2322     {
2323     while ((c = *(++ptr)) != 0 && c != '\n');
2324     continue;
2325     }
2326
2327   switch(c)
2328     {
2329     /* A backslashed item may be an escaped "normal" character or a
2330     character type. For a "normal" character, put the pointers and
2331     character back so that tests for whitespace etc. in the input
2332     are done correctly. */
2333
2334     case '\\':
2335       {
2336       const uschar *save_ptr = ptr;
2337       c = check_escape(&ptr, errorptr, bracount, options, FALSE);
2338       if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2339       if (c >= 0)
2340         {
2341         ptr = save_ptr;
2342         c = '\\';
2343         goto NORMAL_CHAR;
2344         }
2345       }
2346     length++;
2347
2348     /* A back reference needs an additional char, plus either one or 5
2349     bytes for a repeat. We also need to keep the value of the highest
2350     back reference. */
2351
2352     if (c <= -ESC_REF)
2353       {
2354       int refnum = -c - ESC_REF;
2355       if (refnum > top_backref) top_backref = refnum;
2356       length++;   /* For single back reference */
2357       if (ptr[1] == '{' && is_counted_repeat(ptr+2))
2358         {
2359         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2360         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2361         if ((min == 0 && (max == 1 || max == -1)) ||
2362           (min == 1 && max == -1))
2363             length++;
2364         else length += 5;
2365         if (ptr[1] == '?') ptr++;
2366         }
2367       }
2368     continue;
2369
2370     case '^':
2371     case '.':
2372     case '$':
2373     case '*':     /* These repeats won't be after brackets; */
2374     case '+':     /* those are handled separately */
2375     case '?':
2376     length++;
2377     continue;
2378
2379     /* This covers the cases of repeats after a single char, metachar, class,
2380     or back reference. */
2381
2382     case '{':
2383     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
2384     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
2385     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2386     if ((min == 0 && (max == 1 || max == -1)) ||
2387       (min == 1 && max == -1))
2388         length++;
2389     else
2390       {
2391       length--;   /* Uncount the original char or metachar */
2392       if (min == 1) length++; else if (min > 0) length += 4;
2393       if (max > 0) length += 4; else length += 2;
2394       }
2395     if (ptr[1] == '?') ptr++;
2396     continue;
2397
2398     /* An alternation contains an offset to the next branch or ket. */
2399     case '|':
2400     length += 3;
2401     continue;
2402
2403     /* A character class uses 33 characters. Don't worry about character types
2404     that aren't allowed in classes - they'll get picked up during the compile.
2405     A character class that contains only one character uses 2 or 3 bytes,
2406     depending on whether it is negated or not. Notice this where we can. */
2407
2408     case '[':
2409     class_charcount = 0;
2410     if (*(++ptr) == '^') ptr++;
2411     do
2412       {
2413       if (*ptr == '\\')
2414         {
2415         int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
2416         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2417         if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2418         }
2419       else class_charcount++;
2420       ptr++;
2421       }
2422     while (*ptr != 0 && *ptr != ']');
2423
2424     /* Repeats for negated single chars are handled by the general code */
2425
2426     if (class_charcount == 1) length += 3; else
2427       {
2428       length += 33;
2429       if (options & PCRE_LOCALE) length++;  /* Add a byte for the localization flag */
2430
2431       /* A repeat needs either 1 or 5 bytes. */
2432
2433       if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
2434         {
2435         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2436         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2437         if ((min == 0 && (max == 1 || max == -1)) ||
2438           (min == 1 && max == -1))
2439             length++;
2440         else length += 5;
2441         if (ptr[1] == '?') ptr++;
2442         }
2443       }
2444     continue;
2445
2446     /* Brackets may be genuine groups or special things */
2447
2448     case '(':
2449
2450     /* Handle special forms of bracket, which all start (? */
2451
2452     if (ptr[1] == '?') switch (c = ptr[2])
2453       {
2454       /* Skip over comments entirely */
2455       case '#':
2456       ptr += 3;
2457       while (*ptr != 0 && *ptr != ')') ptr++;
2458       if (*ptr == 0)
2459         {
2460         *errorptr = ERR18;
2461         goto PCRE_ERROR_RETURN;
2462         }
2463       continue;
2464
2465       /* Non-referencing groups and lookaheads just move the pointer on, and
2466       then behave like a non-special bracket, except that they don't increment
2467       the count of extracting brackets. */
2468
2469       case ':':
2470       case '=':
2471       case '!':
2472       ptr += 2;
2473       break;
2474
2475       case ('P'):
2476         {
2477           int idlen;
2478           switch (*ptr++) {
2479           case ('<'):
2480             idlen = get_group_id(ptr++, '>', errorptr);
2481             if (*errorptr) goto PCRE_ERROR_RETURN;
2482             ptr += idlen+1;
2483             break;
2484           case ('='):
2485             idlen = get_group_id(ptr++, ')', errorptr);
2486             if (*errorptr) goto PCRE_ERROR_RETURN;
2487             ptr += idlen+1;
2488             length++;
2489             break;
2490           }
2491         }
2492         break;
2493
2494       /* Ditto for the "once only" bracket, allowed only if the extra bit
2495       is set. */
2496
2497       case '>':
2498       if ((options & PCRE_EXTRA) != 0)
2499         {
2500         ptr += 2;
2501         break;
2502         }
2503       /* Else fall through */
2504
2505       /* Else loop setting valid options until ) is met. Anything else is an
2506       error. */
2507
2508       default:
2509       ptr += 2;
2510       for (;; ptr++)
2511         {
2512         if ((c = *ptr) == 'i')
2513           {
2514           options |= PCRE_CASELESS;
2515           continue;
2516           }
2517         else if ((c = *ptr) == 'L')
2518           {
2519           options |= PCRE_LOCALE;
2520           continue;
2521           }
2522         else if ((c = *ptr) == 'm')
2523           {
2524           options |= PCRE_MULTILINE;
2525           continue;
2526           }
2527         else if (c == 's')
2528           {
2529           options |= PCRE_DOTALL;
2530           continue;
2531           }
2532         else if (c == 'x')
2533           {
2534           options |= PCRE_EXTENDED;
2535           length -= spaces;          /* Already counted spaces */
2536           continue;
2537           }
2538         else if (c == ')') break;
2539
2540         *errorptr = ERR12;
2541         goto PCRE_ERROR_RETURN;
2542         }
2543       continue;                      /* End of this bracket handling */
2544       }
2545
2546     /* Extracting brackets must be counted so we can process escapes in a
2547     Perlish way. */
2548
2549     else bracount++;
2550
2551     /* Non-special forms of bracket. Save length for computing whole length
2552     at end if there's a repeat that requires duplication of the group. */
2553
2554     if (brastackptr >= sizeof(brastack)/sizeof(int))
2555       {
2556       *errorptr = ERR19;
2557       goto PCRE_ERROR_RETURN;
2558       }
2559
2560     brastack[brastackptr++] = length;
2561     length += 3;
2562     continue;
2563
2564     /* Handle ket. Look for subsequent max/min; for certain sets of values we
2565     have to replicate this bracket up to that many times. If brastackptr is
2566     0 this is an unmatched bracket which will generate an error, but take care
2567     not to try to access brastack[-1]. */
2568
2569     case ')':
2570     length += 3;
2571       {
2572       int minval = 1;
2573       int maxval = 1;
2574       int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
2575
2576       /* Leave ptr at the final char; for read_repeat_counts this happens
2577       automatically; for the others we need an increment. */
2578
2579       if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
2580         {
2581         ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
2582         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2583         }
2584       else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2585       else if (c == '+') { maxval = -1; ptr++; }
2586       else if (c == '?') { minval = 0; ptr++; }
2587
2588       /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2589       if there is a limited maximum we have to replicate up to maxval-1 times
2590       and allow for a BRAZERO item before each optional copy, as we also have
2591       to do before the first copy if the minimum is zero. */
2592
2593       if (minval == 0) length++;
2594         else if (minval > 1) length += (minval - 1) * duplength;
2595       if (maxval > minval) length += (maxval - minval) * (duplength + 1);
2596       }
2597     continue;
2598
2599     /* Non-special character. For a run of such characters the length required
2600     is the number of characters + 2, except that the maximum run length is 255.
2601     We won't get a skipped space or a non-data escape or the start of a #
2602     comment as the first character, so the length can't be zero. */
2603
2604     NORMAL_CHAR:
2605     default:
2606     length += 2;
2607     runlength = 0;
2608     do
2609       {
2610       if ((pcre_ctypes[c] & ctype_space) != 0)
2611         {
2612         if ((options & PCRE_EXTENDED) != 0) continue;
2613         spaces++;
2614         }
2615
2616       if (c == '#' && (options & PCRE_EXTENDED) != 0)
2617         {
2618         while ((c = *(++ptr)) != 0 && c != '\n');
2619         continue;
2620         }
2621
2622       /* Backslash may introduce a data char or a metacharacter; stop the
2623       string before the latter. */
2624
2625       if (c == '\\')
2626         {
2627         const uschar *saveptr = ptr;
2628         c = check_escape(&ptr, errorptr, bracount, options, FALSE);
2629         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2630         if (c < 0) { ptr = saveptr; break; }
2631         }
2632
2633       /* Ordinary character or single-char escape */
2634
2635       runlength++;
2636       }
2637
2638     /* This "while" is the end of the "do" above. */
2639
2640     while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2641
2642     ptr--;
2643     length += runlength;
2644     continue;
2645     }
2646   }
2647
2648 length += 4;    /* For final KET and END */
2649
2650 if (length > 65539)
2651   {
2652   *errorptr = ERR20;
2653   return NULL;
2654   }
2655
2656 /* Compute the size of data block needed and get it, either from malloc or
2657 externally provided function. We specify "code[0]" in the offsetof() expression
2658 rather than just "code", because it has been reported that one broken compiler
2659 fails on "code" because it is also an independent variable. It should make no
2660 difference to the value of the offsetof(). */
2661
2662 size = length + offsetof(real_pcre, code[0]);
2663 re = (real_pcre *)(pcre_malloc)(size+50);
2664
2665 if (re == NULL)
2666   {
2667   *errorptr = ERR21;
2668   return NULL;
2669   }
2670
2671 /* Put in the magic number and the options. */
2672
2673 re->magic_number = MAGIC_NUMBER;
2674 re->options = options;
2675
2676 /* Set up a starting, non-extracting bracket, then compile the expression. On
2677 error, *errorptr will be set non-NULL, so we don't need to look at the result
2678 of the function here. */
2679
2680 ptr = (const uschar *)pattern;
2681 code = re->code;
2682 *code = OP_BRA;
2683 bracount = 0;
2684 (void)compile_regex(options, &bracount, &code, &ptr, errorptr, dictionary);
2685 re->top_bracket = bracount;
2686 re->top_backref = top_backref;
2687
2688 /* If not reached end of pattern on success, there's an excess bracket. */
2689
2690 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2691
2692 /* Fill in the terminating state and check for disastrous overflow, but
2693 if debugging, leave the test till after things are printed out. */
2694
2695 *code++ = OP_END;
2696
2697
2698 #ifndef DEBUG
2699 if (code - re->code > length) *errorptr = ERR23;
2700 #endif
2701
2702 /* Failed to compile */
2703
2704 if (*errorptr != NULL)
2705   {
2706   (pcre_free)(re);
2707   PCRE_ERROR_RETURN:
2708   *erroroffset = ptr - (const uschar *)pattern;
2709   return NULL;
2710   }
2711
2712 /* If the anchored option was not passed, set flag if we can determine that it
2713 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2714 we can determine what the first character has to be, because that speeds up
2715 unanchored matches no end. In the case of multiline matches, an alternative is
2716 to set the PCRE_STARTLINE flag if all branches start with ^. */
2717
2718 if ((options & PCRE_ANCHORED) == 0)
2719   {
2720   if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
2721     re->options |= PCRE_ANCHORED;
2722   else
2723     {
2724     int ch = find_firstchar(re->code);
2725     if (ch >= 0)
2726       {
2727       re->first_char = ch;
2728       re->options |= PCRE_FIRSTSET;
2729       }
2730     else if (is_startline(re->code))
2731       re->options |= PCRE_STARTLINE;
2732     }
2733   }
2734
2735 /* Print out the compiled data for debugging */
2736
2737 #ifdef DEBUG
2738
2739 printf("Length = %d top_bracket = %d top_backref=%d\n",
2740   length, re->top_bracket, re->top_backref);
2741
2742 if (re->options != 0)
2743   {
2744   printf("%s%s%s%s%s%s%s%s\n",
2745     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2746     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2747     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2748     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2749     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2750     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2751     ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2752     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2753   }
2754
2755 if ((re->options & PCRE_FIRSTSET) != 0)
2756   {
2757   if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2758     else printf("First char = \\x%02x\n", re->first_char);
2759   }
2760
2761 code_end = code;
2762 code_base = code = re->code;
2763
2764 while (code < code_end)
2765   {
2766   int charlength;
2767
2768   printf("%3d ", code - code_base);
2769
2770   if (*code >= OP_BRA)
2771     {
2772     printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2773     code += 2;
2774     }
2775
2776   else switch(*code)
2777     {
2778     case OP_CHARS:
2779     charlength = *(++code);
2780     printf("%3d ", charlength);
2781     while (charlength-- > 0)
2782       if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2783     break;
2784
2785     case OP_KETRMAX:
2786     case OP_KETRMIN:
2787     case OP_ALT:
2788     case OP_KET:
2789     case OP_ASSERT:
2790     case OP_ASSERT_NOT:
2791     case OP_ONCE:
2792     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2793     code += 2;
2794     break;
2795
2796     case OP_STAR:
2797     case OP_MINSTAR:
2798     case OP_PLUS:
2799     case OP_MINPLUS:
2800     case OP_QUERY:
2801     case OP_MINQUERY:
2802     case OP_TYPESTAR:
2803     case OP_TYPEMINSTAR:
2804     case OP_TYPEPLUS:
2805     case OP_TYPEMINPLUS:
2806     case OP_TYPEQUERY:
2807     case OP_TYPEMINQUERY:
2808     if (*code >= OP_TYPESTAR)
2809       printf("    %s", OP_names[code[1]]);
2810     else if (isprint(c = code[1])) printf("    %c", c);
2811       else printf("    \\x%02x", c);
2812     printf("%s", OP_names[*code++]);
2813     break;
2814
2815     case OP_EXACT:
2816     case OP_UPTO:
2817     case OP_MINUPTO:
2818     if (isprint(c = code[3])) printf("    %c{", c);
2819       else printf("    \\x%02x{", c);
2820     if (*code != OP_EXACT) printf("0,");
2821     printf("%d}", (code[1] << 8) + code[2]);
2822     if (*code == OP_MINUPTO) printf("?");
2823     code += 3;
2824     break;
2825
2826     case OP_TYPEEXACT:
2827     case OP_TYPEUPTO:
2828     case OP_TYPEMINUPTO:
2829     printf("    %s{", OP_names[code[3]]);
2830     if (*code != OP_TYPEEXACT) printf(",");
2831     printf("%d}", (code[1] << 8) + code[2]);
2832     if (*code == OP_TYPEMINUPTO) printf("?");
2833     code += 3;
2834     break;
2835
2836     case OP_NOT:
2837     if (isprint(c = *(++code))) printf("    [^%c]", c);
2838       else printf("    [^\\x%02x]", c);
2839     break;
2840
2841     case OP_NOTSTAR:
2842     case OP_NOTMINSTAR:
2843     case OP_NOTPLUS:
2844     case OP_NOTMINPLUS:
2845     case OP_NOTQUERY:
2846     case OP_NOTMINQUERY:
2847     if (isprint(c = code[1])) printf("    [^%c]", c);
2848       else printf("    [^\\x%02x]", c);
2849     printf("%s", OP_names[*code++]);
2850     break;
2851
2852     case OP_NOTEXACT:
2853     case OP_NOTUPTO:
2854     case OP_NOTMINUPTO:
2855     if (isprint(c = code[3])) printf("    [^%c]{", c);
2856       else printf("    [^\\x%02x]{", c);
2857     if (*code != OP_NOTEXACT) printf(",");
2858     printf("%d}", (code[1] << 8) + code[2]);
2859     if (*code == OP_NOTMINUPTO) printf("?");
2860     code += 3;
2861     break;
2862
2863     case OP_REF:
2864     printf("    \\%d", *(++code));
2865     code ++;
2866     goto CLASS_REF_REPEAT;
2867
2868     case OP_CLASS:
2869     case OP_NEGCLASS:
2870     case OP_CLASS_L:
2871       {
2872       int i, min, max;
2873
2874       if (*code==OP_CLASS_L)
2875         {
2876           code++;
2877           printf("Locflag = %i ", *code++);
2878           printf("    [");
2879         }
2880       else
2881         {
2882           if (*code++ == OP_CLASS) printf("    [");
2883           else printf("   ^[");
2884         }
2885
2886
2887       for (i = 0; i < 256; i++)
2888         {
2889         if ((code[i/8] & (1 << (i&7))) != 0)
2890           {
2891           int j;
2892           for (j = i+1; j < 256; j++)
2893             if ((code[j/8] & (1 << (j&7))) == 0) break;
2894           if (i == '-' || i == ']') printf("\\");
2895           if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2896           if (--j > i)
2897             {
2898             printf("-");
2899             if (j == '-' || j == ']') printf("\\");
2900             if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2901             }
2902           i = j;
2903           }
2904         }
2905       printf("]");
2906       code += 32;
2907       /*      code ++;*/
2908
2909       CLASS_REF_REPEAT:
2910
2911       switch(*code)
2912         {
2913         case OP_CRSTAR:
2914         case OP_CRMINSTAR:
2915         case OP_CRPLUS:
2916         case OP_CRMINPLUS:
2917         case OP_CRQUERY:
2918         case OP_CRMINQUERY:
2919         printf("%s", OP_names[*code]);
2920         break;
2921
2922         case OP_CRRANGE:
2923         case OP_CRMINRANGE:
2924         min = (code[1] << 8) + code[2];
2925         max = (code[3] << 8) + code[4];
2926         if (max == 0) printf("{%d,}", min);
2927         else printf("{%d,%d}", min, max);
2928         if (*code == OP_CRMINRANGE) printf("?");
2929         code += 4;
2930         break;
2931
2932         default:
2933         code--;
2934         }
2935       }
2936     break;
2937
2938     /* Anything else is just a one-node item */
2939
2940     default:
2941     printf("    %s", OP_names[*code]);
2942     break;
2943     }
2944
2945   code++;
2946   printf("\n");
2947   }
2948 printf("------------------------------------------------------------------\n");
2949
2950 /* This check is done here in the debugging case so that the code that
2951 was compiled can be seen. */
2952
2953 if (code - re->code > length)
2954   {
2955   printf("length=%i, code length=%i\n", length, code-re->code);
2956   *errorptr = ERR23;
2957   (pcre_free)(re);
2958   *erroroffset = ptr - (uschar *)pattern;
2959   return NULL;
2960   }
2961 #endif
2962
2963 return (pcre *)re;
2964 }
2965
2966
2967
2968 /*************************************************
2969 *        Match a character type                  *
2970 *************************************************/
2971
2972 /* Not used in all the places it might be as it's sometimes faster
2973 to put the code inline.
2974
2975 Arguments:
2976   type        the character type
2977   c           the character
2978   dotall      the dotall flag
2979
2980 Returns:      TRUE if character is of the type
2981 */
2982
2983 static BOOL
2984 match_type(int type, int c, BOOL dotall)
2985 {
2986
2987 #ifdef DEBUG
2988 if (isprint(c)) printf("matching subject %c against ", c);
2989   else printf("matching subject \\x%02x against ", c);
2990 printf("%s\n", OP_names[type]);
2991 #endif
2992
2993 switch(type)
2994   {
2995   case OP_ANY:            return dotall || c != '\n';
2996   case OP_NOT_DIGIT:      return (pcre_ctypes[c] & ctype_digit) == 0;
2997   case OP_DIGIT:          return (pcre_ctypes[c] & ctype_digit) != 0;
2998   case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2999   case OP_WHITESPACE:     return (pcre_ctypes[c] & ctype_space) != 0;
3000   case OP_NOT_WORDCHAR:   return (pcre_ctypes[c] & ctype_word) == 0;
3001   case OP_WORDCHAR:       return (pcre_ctypes[c] & ctype_word) != 0;
3002   case OP_NOT_WORDCHAR_L: return (c!='_' && !isalnum(c));
3003   case OP_WORDCHAR_L:     return (c=='_' || isalnum(c));
3004   }
3005 return FALSE;
3006 }
3007
3008
3009
3010 /*************************************************
3011 *          Match a back-reference                *
3012 *************************************************/
3013
3014 /* If a back reference hasn't been set, the match fails.
3015
3016 Arguments:
3017   number      reference number
3018   eptr        points into the subject
3019   length      length to be matched
3020   md          points to match data block
3021
3022 Returns:      TRUE if matched
3023 */
3024
3025 static BOOL
3026 match_ref(int number, register const uschar *eptr, int length, match_data *md)
3027 {
3028 const uschar *p = md->start_subject + md->offset_vector[number];
3029
3030 #ifdef DEBUG
3031 if (eptr >= md->end_subject)
3032   printf("matching subject <null>");
3033 else
3034   {
3035   printf("matching subject ");
3036   pchars(eptr, length, TRUE, md);
3037   }
3038 printf(" against backref ");
3039 pchars(p, length, FALSE, md);
3040 printf("\n");
3041 #endif
3042
3043 /* Always fail if not enough characters left */
3044
3045 if (length > md->end_subject - p) return FALSE;
3046
3047 /* Separate the caseless case for speed */
3048
3049 if (md->caseless)
3050   { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
3051 else
3052   { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3053
3054 return TRUE;
3055 }
3056
3057 static int free_stack(match_data *md)
3058 {
3059 /* Free any stack space that was allocated by the call to match(). */
3060 if (md->off_num)    PyMem_DEL(md->off_num);
3061 if (md->offset_top) PyMem_DEL(md->offset_top);
3062 if (md->r1)         PyMem_DEL(md->r1);
3063 if (md->r2)         PyMem_DEL(md->r2);
3064 if (md->eptr)       PyMem_DEL((char *)md->eptr);
3065 if (md->ecode)      PyMem_DEL((char *)md->ecode);
3066 return 0;
3067 }
3068
3069 static int grow_stack(match_data *md)
3070 {
3071   if (md->length != 0)
3072     {
3073       md->length = md->length + md->length/2;
3074     }
3075   else
3076     {
3077       int string_len = md->end_subject - md->start_subject + 1;
3078       if (string_len < 80) {md->length = string_len; }
3079       else {md->length = 80;}
3080     }
3081   PyMem_RESIZE(md->offset_top, int, md->length);
3082   /* Can't realloc a pointer-to-const; cast const away. */
3083   md->eptr = (const uschar **)PyMem_Realloc((void *)md->eptr,
3084                                             sizeof(uschar *) * md->length);
3085   md->ecode = (const uschar **)PyMem_Realloc((void *)md->ecode,
3086                                              sizeof(uschar *) * md->length);
3087   PyMem_RESIZE(md->off_num, int, md->length);
3088   PyMem_RESIZE(md->r1, int, md->length);
3089   PyMem_RESIZE(md->r2, int, md->length);
3090   if (md->offset_top == NULL || md->eptr == NULL || md->ecode == NULL ||
3091       md->off_num == NULL || md->r1 == NULL || md->r2 == NULL)
3092     {
3093       PyErr_NoMemory();
3094       longjmp(md->error_env, 1);
3095     }
3096   return 0;
3097 }
3098
3099
3100 /*************************************************
3101 *         Match from current position            *
3102 *************************************************/
3103
3104 /* On entry ecode points to the first opcode, and eptr to the first character.
3105
3106 Arguments:
3107    eptr        pointer in subject
3108    ecode       position in code
3109    offset_top  current top pointer
3110    md          pointer to "static" info for the match
3111
3112 Returns:       TRUE if matched
3113 */
3114
3115 static BOOL
3116 match(register const uschar *eptr, register const uschar *ecode, int offset_top,
3117   match_data *md)
3118 {
3119   int save_stack_position = md->point;
3120 match_loop:
3121
3122 #define SUCCEED goto succeed
3123 #define FAIL    goto fail
3124
3125 for (;;)
3126   {
3127   int min, max, ctype;
3128   register int i;
3129   register int c;
3130   BOOL minimize = FALSE;
3131
3132   /* Opening bracket. Check the alternative branches in turn, failing if none
3133   match. We have to set the start offset if required and there is space
3134   in the offset vector so that it is available for subsequent back references
3135   if the bracket matches. However, if the bracket fails, we must put back the
3136   previous value of both offsets in case they were set by a previous copy of
3137   the same bracket. Don't worry about setting the flag for the error case here;
3138   that is handled in the code for KET. */
3139
3140   if ((int)*ecode >= OP_BRA)
3141     {
3142     int number = (*ecode - OP_BRA) << 1;
3143     int save_offset1 = 0, save_offset2 = 0;
3144
3145     DPRINTF(("start bracket %d\n", number/2));
3146
3147     if (number > 0 && number < md->offset_end)
3148       {
3149       save_offset1 = md->offset_vector[number];
3150       save_offset2 = md->offset_vector[number+1];
3151       md->offset_vector[number] = eptr - md->start_subject;
3152
3153       DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
3154       }
3155
3156     /* Recurse for all the alternatives. */
3157
3158     do
3159       {
3160       if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
3161       ecode += (ecode[1] << 8) + ecode[2];
3162       }
3163     while (*ecode == OP_ALT);
3164
3165     DPRINTF(("bracket %d failed\n", number/2));
3166
3167     if (number > 0 && number < md->offset_end)
3168       {
3169       md->offset_vector[number] = save_offset1;
3170       md->offset_vector[number+1] = save_offset2;
3171       }
3172
3173     FAIL;
3174     }
3175
3176   /* Other types of node can be handled by a switch */
3177
3178   switch(*ecode)
3179     {
3180     case OP_END:
3181     md->end_match_ptr = eptr;          /* Record where we ended */
3182     md->end_offset_top = offset_top;   /* and how many extracts were taken */
3183     SUCCEED;
3184
3185     /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3186     whole thing doesn't match, so we have to get out via a longjmp(). */
3187
3188     case OP_CUT:
3189     if (match(eptr, ecode+1, offset_top, md)) SUCCEED;
3190     longjmp(md->fail_env, 1);
3191
3192     /* Assertion brackets. Check the alternative branches in turn - the
3193     matching won't pass the KET for an assertion. If any one branch matches,
3194     the assertion is true. */
3195
3196     case OP_ASSERT:
3197     do
3198       {
3199       if (match(eptr, ecode+3, offset_top, md)) break;
3200       ecode += (ecode[1] << 8) + ecode[2];
3201       }
3202     while (*ecode == OP_ALT);
3203     if (*ecode == OP_KET) FAIL;
3204
3205     /* Continue from after the assertion, updating the offsets high water
3206     mark, since extracts may have been taken during the assertion. */
3207
3208     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3209     ecode += 3;
3210     offset_top = md->end_offset_top;
3211     continue;
3212
3213     /* Negative assertion: all branches must fail to match */
3214
3215     case OP_ASSERT_NOT:
3216     do
3217       {
3218       if (match(eptr, ecode+3, offset_top, md)) FAIL;
3219       ecode += (ecode[1] << 8) + ecode[2];
3220       }
3221     while (*ecode == OP_ALT);
3222     ecode += 3;
3223     continue;
3224
3225     /* "Once" brackets are like assertion brackets except that after a match,
3226     the point in the subject string is not moved back. Thus there can never be
3227     a move back into the brackets. Check the alternative branches in turn - the
3228     matching won't pass the KET for this kind of subpattern. If any one branch
3229     matches, we carry on, leaving the subject pointer. */
3230
3231     case OP_ONCE:
3232     do
3233       {
3234       if (match(eptr, ecode+3, offset_top, md)) break;
3235       ecode += (ecode[1] << 8) + ecode[2];
3236       }
3237     while (*ecode == OP_ALT);
3238     if (*ecode == OP_KET) FAIL;
3239
3240     /* Continue as from after the assertion, updating the offsets high water
3241     mark, since extracts may have been taken. */
3242
3243     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3244     ecode += 3;
3245     offset_top = md->end_offset_top;
3246     eptr = md->end_match_ptr;
3247     continue;
3248
3249     /* An alternation is the end of a branch; scan along to find the end of the
3250     bracketed group and go to there. */
3251
3252     case OP_ALT:
3253     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3254     break;
3255
3256     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3257     that it may occur zero times. It may repeat infinitely, or not at all -
3258     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3259     repeat limits are compiled as a number of copies, with the optional ones
3260     preceded by BRAZERO or BRAMINZERO. */
3261
3262     case OP_BRAZERO:
3263       {
3264       const uschar *next = ecode+1;
3265       if (match(eptr, next, offset_top, md)) SUCCEED;
3266       do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3267       ecode = next + 3;
3268       }
3269     break;
3270
3271     case OP_BRAMINZERO:
3272       {
3273       const uschar *next = ecode+1;
3274       do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3275       if (match(eptr, next+3, offset_top, md)) SUCCEED;
3276       ecode++;
3277       }
3278     break;;
3279
3280     /* End of a group, repeated or non-repeating. If we are at the end of
3281     an assertion "group", stop matching and SUCCEED, but record the
3282     current high water mark for use by positive assertions. */
3283
3284     case OP_KET:
3285     case OP_KETRMIN:
3286     case OP_KETRMAX:
3287       {
3288       int number;
3289       const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3290
3291       if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
3292         {
3293         md->end_match_ptr = eptr;      /* For ONCE */
3294         md->end_offset_top = offset_top;
3295         SUCCEED;
3296         }
3297
3298       /* In all other cases we have to check the group number back at the
3299       start and if necessary complete handling an extraction by setting the
3300       final offset and bumping the high water mark. */
3301
3302       number = (*prev - OP_BRA) << 1;
3303
3304       DPRINTF(("end bracket %d\n", number/2));
3305
3306       if (number > 0)
3307         {
3308         if (number >= md->offset_end) md->offset_overflow = TRUE; else
3309           {
3310           md->offset_vector[number+1] = eptr - md->start_subject;
3311           if (offset_top <= number) offset_top = number + 2;
3312           }
3313         }
3314
3315       /* For a non-repeating ket, just advance to the next node and continue at
3316       this level. */
3317
3318       if (*ecode == OP_KET)
3319         {
3320         ecode += 3;
3321         break;
3322         }
3323
3324       /* The repeating kets try the rest of the pattern or restart from the
3325       preceding bracket, in the appropriate order. */
3326
3327       if (*ecode == OP_KETRMIN)
3328         {
3329         const uschar *ptr;
3330         if (match(eptr, ecode+3, offset_top, md)) goto succeed;
3331         /* Handle alternation inside the BRA...KET; push the additional
3332            alternatives onto the stack */
3333         ptr=prev;
3334         do {
3335           ptr += (ptr[1]<<8)+ ptr[2];
3336           if (*ptr==OP_ALT)
3337             {
3338               if (md->length == md->point)
3339                 {
3340                   grow_stack(md);
3341                 }
3342               md->offset_top[md->point] = offset_top;
3343               md->eptr[md->point]       = eptr;
3344               md->ecode[md->point]      = ptr+3;
3345               md->r1[md->point]         = 0;
3346               md->r2[md->point]         = 0;
3347               md->off_num[md->point]    = 0;
3348               md->point++;
3349             }
3350         } while (*ptr==OP_ALT);
3351         ecode=prev+3; goto match_loop;
3352         }
3353       else  /* OP_KETRMAX */
3354         {
3355         const uschar *ptr;
3356         /*int points_pushed=0;*/
3357
3358         /* Push one failure point, that will resume matching at the code after
3359            the KETRMAX opcode. */
3360         if (md->length == md->point)
3361           {
3362             grow_stack(md);
3363           }
3364         md->offset_top[md->point] = offset_top;
3365         md->eptr[md->point]       = eptr;
3366         md->ecode[md->point]      = ecode+3;
3367         md->r1[md->point]         = md->offset_vector[number];
3368         md->r2[md->point]         = md->offset_vector[number+1];
3369         md->off_num[md->point]    = number;
3370         md->point++;
3371
3372         md->offset_vector[number] = eptr - md->start_subject;
3373         /* Handle alternation inside the BRA...KET; push each of the
3374            additional alternatives onto the stack */
3375         ptr=prev;
3376         do {
3377           ptr += (ptr[1]<<8)+ ptr[2];
3378           if (*ptr==OP_ALT)
3379             {
3380               if (md->length == md->point)
3381                 if (md->length == md->point)
3382                   {
3383                     grow_stack(md);
3384                   }
3385               md->offset_top[md->point] = offset_top;
3386               md->eptr[md->point]       = eptr;
3387               md->ecode[md->point]      = ptr+3;
3388               md->r1[md->point]         = 0;
3389               md->r2[md->point]         = 0;
3390               md->off_num[md->point]    = 0;
3391               md->point++;
3392               /*points_pushed++;*/
3393             }
3394         } while (*ptr==OP_ALT);
3395         /* Jump to the first (or only) alternative and resume trying to match */
3396         ecode=prev+3; goto match_loop;
3397         }
3398       }
3399
3400     /* Start of subject unless notbol, or after internal newline if multiline */
3401
3402     case OP_CIRC:
3403     if (md->notbol && eptr == md->start_subject) FAIL;
3404     if (md->multiline)
3405       {
3406       if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
3407       ecode++;
3408       break;
3409       }
3410     /* ... else fall through */
3411
3412     /* Start of subject assertion */
3413
3414     case OP_SOD:
3415     if (eptr != md->start_subject) FAIL;
3416     ecode++;
3417     break;
3418
3419     /* Assert before internal newline if multiline, or before
3420     a terminating newline unless endonly is set, else end of subject unless
3421     noteol is set. */
3422
3423     case OP_DOLL:
3424     if (md->noteol && eptr >= md->end_subject) FAIL;
3425     if (md->multiline)
3426       {
3427       if (eptr < md->end_subject && *eptr != '\n') FAIL;
3428       ecode++;
3429       break;
3430       }
3431     else if (!md->endonly)
3432       {
3433       if (eptr < md->end_subject - 1 ||
3434          (eptr == md->end_subject - 1 && *eptr != '\n')) FAIL;
3435       ecode++;
3436       break;
3437       }
3438     /* ... else fall through */
3439
3440     /* End of subject assertion */
3441
3442     case OP_EOD:
3443     if (eptr < md->end_subject) FAIL;
3444     ecode++;
3445     break;
3446
3447     /* Word boundary assertions */
3448
3449     case OP_NOT_WORD_BOUNDARY:
3450     case OP_WORD_BOUNDARY:
3451       {
3452       BOOL prev_is_word = (eptr != md->start_subject) &&
3453         ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3454       BOOL cur_is_word = (eptr < md->end_subject) &&
3455         ((pcre_ctypes[*eptr] & ctype_word) != 0);
3456       if ((*ecode++ == OP_WORD_BOUNDARY)?
3457            cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3458         FAIL;
3459       }
3460     break;
3461
3462     case OP_NOT_WORD_BOUNDARY_L:
3463     case OP_WORD_BOUNDARY_L:
3464       {
3465         BOOL prev_is_word = (eptr != md->start_subject) &&
3466           (isalnum(eptr[-1]) || eptr[-1]=='_');
3467         BOOL cur_is_word = (eptr < md->end_subject) &&
3468           (isalnum(*eptr) || *eptr=='_');
3469         if ((*ecode++ == OP_WORD_BOUNDARY_L)?
3470             cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3471           FAIL;
3472       }
3473       break;
3474
3475
3476     /* Match a single character type; inline for speed */
3477
3478     case OP_ANY:
3479     if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
3480     if (eptr++ >= md->end_subject) FAIL;
3481     ecode++;
3482     break;
3483
3484     case OP_NOT_DIGIT:
3485     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3486       FAIL;
3487     ecode++;
3488     break;
3489
3490     case OP_DIGIT:
3491     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3492       FAIL;
3493     ecode++;
3494     break;
3495
3496     case OP_NOT_WHITESPACE:
3497     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3498       FAIL;
3499     ecode++;
3500     break;
3501
3502     case OP_WHITESPACE:
3503     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3504       FAIL;
3505     ecode++;
3506     break;
3507
3508     case OP_NOT_WORDCHAR:
3509     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3510       FAIL;
3511     ecode++;
3512     break;
3513
3514     case OP_WORDCHAR:
3515     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3516       FAIL;
3517     ecode++;
3518     break;
3519
3520     case OP_NOT_WORDCHAR_L:
3521     if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ))
3522       FAIL;
3523     eptr++;
3524     ecode++;
3525     break;
3526
3527     case OP_WORDCHAR_L:
3528     if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ))
3529       FAIL;
3530     eptr++;
3531     ecode++;
3532     break;
3533
3534     /* Match a back reference, possibly repeatedly. Look past the end of the
3535     item to see if there is repeat information following. The code is similar
3536     to that for character classes, but repeated for efficiency. Then obey
3537     similar code to character type repeats - written out again for speed.
3538     However, if the referenced string is the empty string, always treat
3539     it as matched, any number of times (otherwise there could be infinite
3540     loops). */
3541
3542     case OP_REF:
3543       {
3544       int length;
3545       int number = ecode[1] << 1;                /* Doubled reference number */
3546       ecode += 2;                                /* Advance past the item */
3547
3548       if (number >= offset_top || md->offset_vector[number] < 0)
3549         {
3550         md->errorcode = PCRE_ERROR_BADREF;
3551         FAIL;
3552         }
3553
3554       length = md->offset_vector[number+1] - md->offset_vector[number];
3555
3556       switch (*ecode)
3557         {
3558         case OP_CRSTAR:
3559         case OP_CRMINSTAR:
3560         case OP_CRPLUS:
3561         case OP_CRMINPLUS:
3562         case OP_CRQUERY:
3563         case OP_CRMINQUERY:
3564         c = *ecode++ - OP_CRSTAR;
3565         minimize = (c & 1) != 0;
3566         min = rep_min[c];                 /* Pick up values from tables; */
3567         max = rep_max[c];                 /* zero for max => infinity */
3568         if (max == 0) max = INT_MAX;
3569         break;
3570
3571         case OP_CRRANGE:
3572         case OP_CRMINRANGE:
3573         minimize = (*ecode == OP_CRMINRANGE);
3574         min = (ecode[1] << 8) + ecode[2];
3575         max = (ecode[3] << 8) + ecode[4];
3576         if (max == 0) max = INT_MAX;
3577         ecode += 5;
3578         break;
3579
3580         default:               /* No repeat follows */
3581         if (!match_ref(number, eptr, length, md)) FAIL;
3582         eptr += length;
3583         continue;              /* With the main loop */
3584         }
3585
3586       /* If the length of the reference is zero, just continue with the
3587       main loop. */
3588
3589       if (length == 0) continue;
3590
3591       /* First, ensure the minimum number of matches are present. We get back
3592       the length of the reference string explicitly rather than passing the
3593       address of eptr, so that eptr can be a register variable. */
3594
3595       for (i = 1; i <= min; i++)
3596         {
3597         if (!match_ref(number, eptr, length, md)) FAIL;
3598         eptr += length;
3599         }
3600
3601       /* If min = max, continue at the same level without recursion.
3602       They are not both allowed to be zero. */
3603
3604       if (min == max) continue;
3605
3606       /* If minimizing, keep trying and advancing the pointer */
3607
3608       if (minimize)
3609         {
3610         for (i = min;; i++)
3611           {
3612           if (match(eptr, ecode, offset_top, md)) SUCCEED;
3613           if (i >= max || !match_ref(number, eptr, length, md))
3614             FAIL;
3615           eptr += length;
3616           }
3617         /* Control never gets here */
3618         }
3619
3620       /* If maximizing, find the longest string and work backwards */
3621
3622       else
3623         {
3624         const uschar *pp = eptr;
3625         for (i = min; i < max; i++)
3626           {
3627           if (!match_ref(number, eptr, length, md)) break;
3628           eptr += length;
3629           }
3630         while (eptr >= pp)
3631           {
3632           if (match(eptr, ecode, offset_top, md)) SUCCEED;
3633           eptr -= length;
3634           }
3635         FAIL;
3636         }
3637       }
3638     /* Control never gets here */
3639
3640     /* Match a character class, possibly repeatedly. Look past the end of the
3641     item to see if there is repeat information following. Then obey similar
3642     code to character type repeats - written out again for speed. If caseless
3643     matching was set at runtime but not at compile time, we have to check both
3644     versions of a character, and we have to behave differently for positive and
3645     negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
3646     treated differently. */
3647
3648     case OP_CLASS:
3649     case OP_NEGCLASS:
3650       {
3651       BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
3652       const uschar *data = ecode + 1;  /* Save for matching */
3653       ecode += 33;                     /* Advance past the item */
3654
3655       switch (*ecode)
3656         {
3657         case OP_CRSTAR:
3658         case OP_CRMINSTAR:
3659         case OP_CRPLUS:
3660         case OP_CRMINPLUS:
3661         case OP_CRQUERY:
3662         case OP_CRMINQUERY:
3663         c = *ecode++ - OP_CRSTAR;
3664         minimize = (c & 1) != 0;
3665         min = rep_min[c];                 /* Pick up values from tables; */
3666         max = rep_max[c];                 /* zero for max => infinity */
3667         if (max == 0) max = INT_MAX;
3668         break;
3669
3670         case OP_CRRANGE:
3671         case OP_CRMINRANGE:
3672         minimize = (*ecode == OP_CRMINRANGE);
3673         min = (ecode[1] << 8) + ecode[2];
3674         max = (ecode[3] << 8) + ecode[4];
3675         if (max == 0) max = INT_MAX;
3676         ecode += 5;
3677         break;
3678
3679         default:               /* No repeat follows */
3680           min = max = 1;
3681           break;
3682         }
3683
3684       /* First, ensure the minimum number of matches are present. */
3685
3686       for (i = 1; i <= min; i++)
3687         {
3688         if (eptr >= md->end_subject) FAIL;
3689         c = *eptr++;
3690
3691         /* Either not runtime caseless, or it was a positive class. For
3692         runtime caseless, continue if either case is in the map. */
3693
3694         if (!nasty_case)
3695           {
3696           if ((data[c/8] & (1 << (c&7))) != 0) continue;
3697           if (md->runtime_caseless)
3698             {
3699             c = pcre_fcc[c];
3700             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3701             }
3702           }
3703
3704         /* Runtime caseless and it was a negative class. Continue only if
3705         both cases are in the map. */
3706
3707         else
3708           {
3709            if ((data[c/8] & (1 << (c&7))) == 0) FAIL;
3710            c = pcre_fcc[c];
3711            if ((data[c/8] & (1 << (c&7))) != 0) continue;
3712            }
3713
3714         FAIL;
3715         }
3716
3717       /* If max == min we can continue with the main loop without the
3718       need to recurse. */
3719
3720       if (min == max) continue;
3721
3722       /* If minimizing, keep testing the rest of the expression and advancing
3723       the pointer while it matches the class. */
3724
3725       if (minimize)
3726         {
3727         for (i = min;; i++)
3728           {
3729           if (match(eptr, ecode, offset_top, md)) SUCCEED;
3730           if (i >= max || eptr >= md->end_subject) FAIL;
3731           c = *eptr++;
3732
3733           /* Either not runtime caseless, or it was a positive class. For
3734           runtime caseless, continue if either case is in the map. */
3735
3736           if (!nasty_case)
3737             {
3738             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3739             if (md->runtime_caseless)
3740               {
3741               c = pcre_fcc[c];
3742               if ((data[c/8] & (1 << (c&7))) != 0) continue;
3743               }
3744             }
3745
3746           /* Runtime caseless and it was a negative class. Continue only if
3747           both cases are in the map. */
3748
3749           else
3750              {
3751              if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
3752              c = pcre_fcc[c];
3753              if ((data[c/8] & (1 << (c&7))) != 0) continue;
3754              }
3755
3756           FAIL;
3757           }
3758         /* Control never gets here */
3759         }
3760
3761       /* If maximizing, find the longest possible run, then work backwards. */
3762
3763       else
3764         {
3765         const uschar *pp = eptr;
3766         for (i = min; i < max; eptr++, i++)
3767           {
3768           if (eptr >= md->end_subject) break;
3769           c = *eptr;
3770
3771           /* Either not runtime caseless, or it was a positive class. For
3772           runtime caseless, continue if either case is in the map. */
3773
3774           if (!nasty_case)
3775             {
3776             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3777             if (md->runtime_caseless)
3778               {
3779               c = pcre_fcc[c];
3780               if ((data[c/8] & (1 << (c&7))) != 0) continue;
3781               }
3782             }
3783
3784           /* Runtime caseless and it was a negative class. Continue only if
3785           both cases are in the map. */
3786
3787           else
3788             {
3789             if ((data[c/8] & (1 << (c&7))) == 0) break;
3790             c = pcre_fcc[c];
3791             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3792             }
3793
3794           break;
3795           }
3796
3797         while (eptr >= pp)
3798           if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3799         FAIL;
3800         }
3801       }
3802     /* Control never gets here */
3803
3804    /* OP_CLASS_L opcode: handles localized character classes */
3805
3806    case OP_CLASS_L:
3807      {
3808       const uschar *data = ecode + 1;  /* Save for matching */
3809       const uschar locale_flag = *data;
3810       ecode++; data++;          /* The localization support adds an extra byte */
3811
3812       ecode += 33;               /* Advance past the item */
3813
3814       switch (*ecode)
3815         {
3816         case OP_CRSTAR:
3817         case OP_CRMINSTAR:
3818         case OP_CRPLUS:
3819         case OP_CRMINPLUS:
3820         case OP_CRQUERY:
3821         case OP_CRMINQUERY:
3822         c = *ecode++ - OP_CRSTAR;
3823         minimize = (c & 1) != 0;
3824         min = rep_min[c];                 /* Pick up values from tables; */
3825         max = rep_max[c];                 /* zero for max => infinity */
3826         if (max == 0) max = INT_MAX;
3827         break;
3828
3829         case OP_CRRANGE:
3830         case OP_CRMINRANGE:
3831         minimize = (*ecode == OP_CRMINRANGE);
3832         min = (ecode[1] << 8) + ecode[2];
3833         max = (ecode[3] << 8) + ecode[4];
3834         if (max == 0) max = INT_MAX;
3835         ecode += 5;
3836         break;
3837
3838         default:               /* No repeat follows */
3839         if (eptr >= md->end_subject) FAIL;
3840         c = *eptr++;
3841         if ((data[c/8] & (1 << (c&7))) != 0) continue;    /* With main loop */
3842         if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3843         if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3844 #if 0
3845         if ( (locale_flag &  4) && isdigit(c) ) continue;    /* Locale \d */
3846         if ( (locale_flag &  8) && !isdigit(c) ) continue;   /* Locale \D */
3847         if ( (locale_flag & 16) && isspace(c) ) continue;    /* Locale \s */
3848         if ( (locale_flag & 32) && !isspace(c) ) continue;   /* Locale \S */
3849 #endif
3850
3851         if (md->runtime_caseless)
3852           {
3853           c = pcre_fcc[c];
3854           if ((data[c/8] & (1 << (c&7))) != 0) continue;  /* With main loop */
3855
3856           if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3857           if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3858           }
3859         FAIL;
3860         }
3861
3862       /* First, ensure the minimum number of matches are present. */
3863
3864       for (i = 1; i <= min; i++)
3865         {
3866         if (eptr >= md->end_subject) FAIL;
3867         c = *eptr++;
3868         if ((data[c/8] & (1 << (c&7))) != 0) continue;
3869         if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3870         if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3871
3872         if (md->runtime_caseless)
3873           {
3874           c = pcre_fcc[c];
3875           if ((data[c/8] & (1 << (c&7))) != 0) continue;
3876           if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3877           if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3878           }
3879         FAIL;
3880         }
3881
3882       /* If max == min we can continue with the main loop without the
3883       need to recurse. */
3884
3885       if (min == max) continue;
3886
3887       /* If minimizing, keep testing the rest of the expression and advancing
3888       the pointer while it matches the class. */
3889
3890       if (minimize)
3891         {
3892         for (i = min;; i++)
3893           {
3894           if (match(eptr, ecode, offset_top, md)) SUCCEED;
3895           if (i >= max || eptr >= md->end_subject) FAIL;
3896           c = *eptr++;
3897           if ((data[c/8] & (1 << (c&7))) != 0) continue;
3898           if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3899           if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3900
3901           if (md->runtime_caseless)
3902             {
3903             c = pcre_fcc[c];
3904             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3905             if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3906             if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3907             }
3908           FAIL;
3909           }
3910         /* Control never gets here */
3911         }
3912
3913       /* If maximizing, find the longest possible run, then work backwards. */
3914
3915       else
3916         {
3917         const uschar *pp = eptr;
3918         for (i = min; i < max; eptr++, i++)
3919           {
3920           if (eptr >= md->end_subject) break;
3921           c = *eptr;
3922           if ((data[c/8] & (1 << (c&7))) != 0) continue;
3923           if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3924           if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3925           if (md->runtime_caseless)
3926             {
3927             c = pcre_fcc[c];
3928             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3929             if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3930             if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3931             }
3932           break;
3933           }
3934
3935         while (eptr >= pp)
3936           if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3937         FAIL;
3938         }
3939       }
3940     /* Control never gets here */
3941
3942     /* Match a run of characters */
3943
3944     case OP_CHARS:
3945       {
3946       register int length = ecode[1];
3947       ecode += 2;
3948
3949 #ifdef DEBUG  /* Sigh. Some compilers never learn. */
3950       if (eptr >= md->end_subject)
3951         printf("matching subject <null> against pattern ");
3952       else
3953         {
3954         printf("matching subject ");
3955         pchars(eptr, length, TRUE, md);
3956         printf(" against pattern ");
3957         }
3958       pchars(ecode, length, FALSE, md);
3959       printf("\n");
3960 #endif
3961
3962       if (length > md->end_subject - eptr) FAIL;
3963       if (md->caseless)
3964         {
3965         while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL;
3966         }
3967       else
3968         {
3969         while (length-- > 0) if (*ecode++ != *eptr++) FAIL;
3970         }
3971       }
3972     break;
3973
3974     /* Match a single character repeatedly; different opcodes share code. */
3975
3976     case OP_EXACT:
3977     min = max = (ecode[1] << 8) + ecode[2];
3978     ecode += 3;
3979     goto REPEATCHAR;
3980
3981     case OP_UPTO:
3982     case OP_MINUPTO:
3983     min = 0;
3984     max = (ecode[1] << 8) + ecode[2];
3985     minimize = *ecode == OP_MINUPTO;
3986     ecode += 3;
3987     goto REPEATCHAR;
3988
3989     case OP_STAR:
3990     case OP_MINSTAR:
3991     case OP_PLUS:
3992     case OP_MINPLUS:
3993     case OP_QUERY:
3994     case OP_MINQUERY:
3995     c = *ecode++ - OP_STAR;
3996     minimize = (c & 1) != 0;
3997     min = rep_min[c];                 /* Pick up values from tables; */
3998     max = rep_max[c];                 /* zero for max => infinity */
3999     if (max == 0) max = INT_MAX;
4000
4001     /* Common code for all repeated single-character matches. We can give
4002     up quickly if there are fewer than the minimum number of characters left in
4003     the subject. */
4004
4005     REPEATCHAR:
4006     if (min > md->end_subject - eptr) FAIL;
4007     c = *ecode++;
4008
4009     /* The code is duplicated for the caseless and caseful cases, for speed,
4010     since matching characters is likely to be quite common. First, ensure the
4011     minimum number of matches are present. If min = max, continue at the same
4012     level without recursing. Otherwise, if minimizing, keep trying the rest of
4013     the expression and advancing one matching character if failing, up to the
4014     maximum. Alternatively, if maximizing, find the maximum number of
4015     characters and work backwards. */
4016
4017     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4018       max, eptr));
4019
4020     if (md->caseless)
4021       {
4022       c = pcre_lcc[c];
4023       for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
4024       if (min == max) continue;
4025       if (minimize)
4026         {
4027         for (i = min;; i++)
4028           {
4029           if (match(eptr, ecode, offset_top, md)) SUCCEED;
4030           if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
4031             FAIL;
4032           }
4033         /* Control never gets here */
4034         }
4035       else
4036         {
4037         const uschar *pp = eptr;
4038         for (i = min; i < max; i++)
4039           {
4040           if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
4041           eptr++;
4042           }
4043         while (eptr >= pp)
4044           if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4045         FAIL;
4046         }
4047       /* Control never gets here */
4048       }
4049
4050     /* Caseful comparisons */
4051
4052     else
4053       {
4054       for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
4055       if (min == max) continue;
4056       if (minimize)
4057         {
4058         for (i = min;; i++)
4059           {
4060           if (match(eptr, ecode, offset_top, md)) SUCCEED;
4061           if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL;
4062           }
4063         /* Control never gets here */
4064         }
4065       else
4066         {
4067         const uschar *pp = eptr;
4068         for (i = min; i < max; i++)
4069           {
4070           if (eptr >= md->end_subject || c != *eptr) break;
4071           eptr++;
4072           }
4073         while (eptr >= pp)
4074          if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4075         FAIL;
4076         }
4077       }
4078     /* Control never gets here */
4079
4080     /* Match a negated single character */
4081
4082     case OP_NOT:
4083     if (eptr >= md->end_subject) FAIL;
4084     ecode++;
4085     if (md->caseless)
4086       {
4087       if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) FAIL;
4088       }
4089     else
4090       {
4091       if (*ecode++ == *eptr++) FAIL;
4092       }
4093     break;
4094
4095     /* Match a negated single character repeatedly. This is almost a repeat of
4096     the code for a repeated single character, but I haven't found a nice way of
4097     commoning these up that doesn't require a test of the positive/negative
4098     option for each character match. Maybe that wouldn't add very much to the
4099     time taken, but character matching *is* what this is all about... */
4100
4101     case OP_NOTEXACT:
4102     min = max = (ecode[1] << 8) + ecode[2];
4103     ecode += 3;
4104     goto REPEATNOTCHAR;
4105
4106     case OP_NOTUPTO:
4107     case OP_NOTMINUPTO:
4108     min = 0;
4109     max = (ecode[1] << 8) + ecode[2];
4110     minimize = *ecode == OP_NOTMINUPTO;
4111     ecode += 3;
4112     goto REPEATNOTCHAR;
4113
4114     case OP_NOTSTAR:
4115     case OP_NOTMINSTAR:
4116     case OP_NOTPLUS:
4117     case OP_NOTMINPLUS:
4118     case OP_NOTQUERY:
4119     case OP_NOTMINQUERY:
4120     c = *ecode++ - OP_NOTSTAR;
4121     minimize = (c & 1) != 0;
4122     min = rep_min[c];                 /* Pick up values from tables; */
4123     max = rep_max[c];                 /* zero for max => infinity */
4124     if (max == 0) max = INT_MAX;
4125
4126     /* Common code for all repeated single-character matches. We can give
4127     up quickly if there are fewer than the minimum number of characters left in
4128     the subject. */
4129
4130     REPEATNOTCHAR:
4131     if (min > md->end_subject - eptr) FAIL;
4132     c = *ecode++;
4133
4134     /* The code is duplicated for the caseless and caseful cases, for speed,
4135     since matching characters is likely to be quite common. First, ensure the
4136     minimum number of matches are present. If min = max, continue at the same
4137     level without recursing. Otherwise, if minimizing, keep trying the rest of
4138     the expression and advancing one matching character if failing, up to the
4139     maximum. Alternatively, if maximizing, find the maximum number of
4140     characters and work backwards. */
4141
4142     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4143       max, eptr));
4144
4145     if (md->caseless)
4146       {
4147       c = pcre_lcc[c];
4148       for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) FAIL;
4149       if (min == max) continue;
4150       if (minimize)
4151         {
4152         for (i = min;; i++)
4153           {
4154           if (match(eptr, ecode, offset_top, md)) SUCCEED;
4155           if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
4156             FAIL;
4157           }
4158         /* Control never gets here */
4159         }
4160       else
4161         {
4162         const uschar *pp = eptr;
4163         for (i = min; i < max; i++)
4164           {
4165           if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
4166           eptr++;
4167           }
4168         while (eptr >= pp)
4169           if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4170         FAIL;
4171         }
4172       /* Control never gets here */
4173       }
4174
4175     /* Caseful comparisons */
4176
4177     else
4178       {
4179       for (i = 1; i <= min; i++) if (c == *eptr++) FAIL;
4180       if (min == max) continue;
4181       if (minimize)
4182         {
4183         for (i = min;; i++)
4184           {
4185           if (match(eptr, ecode, offset_top, md)) SUCCEED;
4186           if (i >= max || eptr >= md->end_subject || c == *eptr++) FAIL;
4187           }
4188         /* Control never gets here */
4189         }
4190       else
4191         {
4192         const uschar *pp = eptr;
4193         for (i = min; i < max; i++)
4194           {
4195           if (eptr >= md->end_subject || c == *eptr) break;
4196           eptr++;
4197           }
4198         while (eptr >= pp)
4199          if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4200         FAIL;
4201         }
4202       }
4203     /* Control never gets here */
4204
4205     /* Match a single character type repeatedly; several different opcodes
4206     share code. This is very similar to the code for single characters, but we
4207     repeat it in the interests of efficiency. */
4208
4209     case OP_TYPEEXACT:
4210     min = max = (ecode[1] << 8) + ecode[2];
4211     minimize = TRUE;
4212     ecode += 3;
4213     goto REPEATTYPE;
4214
4215     case OP_TYPEUPTO:
4216     case OP_TYPEMINUPTO:
4217     min = 0;
4218     max = (ecode[1] << 8) + ecode[2];
4219     minimize = *ecode == OP_TYPEMINUPTO;
4220     ecode += 3;
4221     goto REPEATTYPE;
4222
4223     case OP_TYPESTAR:
4224     case OP_TYPEMINSTAR:
4225     case OP_TYPEPLUS:
4226     case OP_TYPEMINPLUS:
4227     case OP_TYPEQUERY:
4228     case OP_TYPEMINQUERY:
4229     c = *ecode++ - OP_TYPESTAR;
4230     minimize = (c & 1) != 0;
4231     min = rep_min[c];                 /* Pick up values from tables; */
4232     max = rep_max[c];                 /* zero for max => infinity */
4233     if (max == 0) max = INT_MAX;
4234
4235     /* Common code for all repeated single character type matches */
4236
4237     REPEATTYPE:
4238     ctype = *ecode++;      /* Code for the character type */
4239
4240     /* First, ensure the minimum number of matches are present. Use inline
4241     code for maximizing the speed, and do the type test once at the start
4242     (i.e. keep it out of the loop). Also test that there are at least the
4243     minimum number of characters before we start. */
4244
4245     if (min > md->end_subject - eptr) FAIL;
4246     if (min > 0) switch(ctype)
4247       {
4248       case OP_ANY:
4249       if (!md->dotall)
4250         { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
4251       else eptr += min;
4252       break;
4253
4254       case OP_NOT_DIGIT:
4255       for (i = 1; i <= min; i++)
4256         if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
4257       break;
4258
4259       case OP_DIGIT:
4260       for (i = 1; i <= min; i++)
4261         if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
4262       break;
4263
4264       case OP_NOT_WHITESPACE:
4265       for (i = 1; i <= min; i++)
4266         if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
4267       break;
4268
4269       case OP_WHITESPACE:
4270       for (i = 1; i <= min; i++)
4271         if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
4272       break;
4273
4274       case OP_NOT_WORDCHAR:
4275       for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
4276         FAIL;
4277       break;
4278
4279       case OP_WORDCHAR:
4280       for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
4281         FAIL;
4282       break;
4283
4284       case OP_NOT_WORDCHAR_L:
4285       for (i = 1; i <= min; i++, eptr++) if (*eptr=='_' || isalnum(*eptr))
4286         FAIL;
4287       break;
4288
4289       case OP_WORDCHAR_L:
4290       for (i = 1; i <= min; i++, eptr++) if (*eptr!='_' && !isalnum(*eptr))
4291         FAIL;
4292       break;
4293       }
4294
4295     /* If min = max, continue at the same level without recursing */
4296
4297     if (min == max) continue;
4298
4299     /* If minimizing, we have to test the rest of the pattern before each
4300     subsequent match, so inlining isn't much help; just use the function. */
4301
4302     if (minimize)
4303       {
4304       for (i = min;; i++)
4305         {
4306         if (match(eptr, ecode, offset_top, md)) SUCCEED;
4307         if (i >= max || eptr >= md->end_subject ||
4308           !match_type(ctype, *eptr++, md->dotall))
4309             FAIL;
4310         }
4311       /* Control never gets here */
4312       }
4313
4314     /* If maximizing it is worth using inline code for speed, doing the type
4315     test once at the start (i.e. keep it out of the loop). */
4316
4317     else
4318       {
4319       const uschar *pp = eptr;
4320       switch(ctype)
4321         {
4322         case OP_ANY:
4323         if (!md->dotall)
4324           {
4325           for (i = min; i < max; i++)
4326             {
4327             if (eptr >= md->end_subject || *eptr == '\n') break;
4328             eptr++;
4329             }
4330           }
4331         else
4332           {
4333           c = max - min;
4334           if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4335           eptr += c;
4336           }
4337         break;
4338
4339         case OP_NOT_DIGIT:
4340         for (i = min; i < max; i++)
4341           {
4342           if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
4343             break;
4344           eptr++;
4345           }
4346         break;
4347
4348         case OP_DIGIT:
4349         for (i = min; i < max; i++)
4350           {
4351           if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
4352             break;
4353           eptr++;
4354           }
4355         break;
4356
4357         case OP_NOT_WHITESPACE:
4358         for (i = min; i < max; i++)
4359           {
4360           if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
4361             break;
4362           eptr++;
4363           }
4364         break;
4365
4366         case OP_WHITESPACE:
4367         for (i = min; i < max; i++)
4368           {
4369           if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
4370             break;
4371           eptr++;
4372           }
4373         break;
4374
4375         case OP_NOT_WORDCHAR:
4376         for (i = min; i < max; i++)
4377           {
4378           if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
4379             break;
4380           eptr++;
4381           }
4382         break;
4383
4384         case OP_WORDCHAR:
4385         for (i = min; i < max; i++)
4386           {
4387             if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
4388               break;
4389             eptr++;
4390           }
4391         break;
4392         case OP_NOT_WORDCHAR_L:
4393           for (i = min; i < max; i++)
4394          {
4395          if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ) )
4396            break;
4397          eptr++;
4398          }
4399        break;
4400
4401        case OP_WORDCHAR_L:
4402        for (i = min; i < max; i++)
4403          {
4404          if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ) )
4405              break;
4406           eptr++;
4407           }
4408         break;
4409         }
4410
4411       while (eptr >= pp)
4412         if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4413       FAIL;
4414       }
4415     /* Control never gets here */
4416
4417     /* There's been some horrible disaster. */
4418
4419     default:
4420     DPRINTF(("Unknown opcode %d\n", *ecode));
4421     md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4422     FAIL;
4423     }
4424
4425   /* Do not stick any code in here without much thought; it is assumed
4426   that "continue" in the code above comes out to here to repeat the main
4427   loop. */
4428
4429   }             /* End of main loop */
4430 /* Control never reaches here */
4431
4432 fail:
4433  if (md->point > save_stack_position)
4434  {
4435    /* If there are still points remaining on the stack, pop the next one off */
4436    int off_num;
4437
4438    md->point--;
4439    offset_top = md->offset_top[md->point];
4440    eptr       = md->eptr[md->point];
4441    ecode      = md->ecode[md->point];
4442    off_num    = md->off_num[md->point];
4443    md->offset_vector[off_num]   = md->r1[md->point];
4444    md->offset_vector[off_num+1] = md->r2[md->point];
4445    goto match_loop;
4446   }
4447    /* Failure, and nothing left on the stack, so end this function call */
4448
4449  /* Restore the top of the stack to where it was before this function
4450     call.  This lets us use one stack for everything; recursive calls
4451     can push and pop information, and may increase the stack.  When
4452     the call returns, the parent function can resume pushing and
4453     popping wherever it was. */
4454
4455  md->point = save_stack_position;
4456  return FALSE;
4457
4458 succeed:
4459  return TRUE;
4460 }
4461
4462
4463
4464 /*************************************************
4465 *         Segregate setjmp()                     *
4466 *************************************************/
4467
4468 /* The -Wall option of gcc gives warnings for all local variables when setjmp()
4469 is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
4470 hide it in a separate function. This is called only when PCRE_EXTRA is set,
4471 since it's needed only for the extension \X option, and with any luck, a good
4472 compiler will spot the tail recursion and compile it efficiently.
4473
4474 Arguments:
4475    eptr        pointer in subject
4476    ecode       position in code
4477    offset_top  current top pointer
4478    md          pointer to "static" info for the match
4479
4480 Returns:       TRUE if matched
4481 */
4482
4483 static BOOL
4484 match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
4485   match_data *match_block)
4486 {
4487 return setjmp(match_block->fail_env) == 0 &&
4488       match(eptr, ecode, offset_top, match_block);
4489 }
4490
4491
4492
4493 /*************************************************
4494 *         Execute a Regular Expression           *
4495 *************************************************/
4496
4497 /* This function applies a compiled re to a subject string and picks out
4498 portions of the string if it matches. Two elements in the vector are set for
4499 each substring: the offsets to the start and end of the substring.
4500
4501 Arguments:
4502   external_re     points to the compiled expression
4503   external_extra  points to "hints" from pcre_study() or is NULL
4504   subject         points to the subject string
4505   length          length of subject string (may contain binary zeros)
4506   options         option bits
4507   offsets         points to a vector of ints to be filled in with offsets
4508   offsetcount     the number of elements in the vector
4509
4510 Returns:          > 0 => success; value is the number of elements filled in
4511                   = 0 => success, but offsets is not big enough
4512                    -1 => failed to match
4513                  < -1 => some kind of unexpected problem
4514 */
4515
4516 int
4517 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4518   const char *subject, int length, int start_pos, int options,
4519   int *offsets, int offsetcount)
4520 {
4521   /* The "volatile" directives are to make gcc -Wall stop complaining
4522      that these variables can be clobbered by the longjmp.  Hopefully
4523      they won't cost too much performance. */
4524 volatile int resetcount, ocount;
4525 volatile int first_char = -1;
4526 const uschar * volatile start_bits = NULL;
4527 const uschar * volatile start_match = (const uschar *)subject + start_pos;
4528 match_data match_block;
4529 const uschar *end_subject;
4530 const real_pcre *re = (const real_pcre *)external_re;
4531 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4532 volatile BOOL using_temporary_offsets = FALSE;
4533 volatile BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4534 volatile BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4535
4536 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4537
4538 if (re == NULL || subject == NULL ||
4539    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4540 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4541
4542 match_block.start_subject = (const uschar *)subject;
4543 match_block.end_subject = match_block.start_subject + length;
4544 end_subject = match_block.end_subject;
4545
4546 match_block.caseless  = ((re->options | options) & PCRE_CASELESS) != 0;
4547 match_block.runtime_caseless = match_block.caseless &&
4548   (re->options & PCRE_CASELESS) == 0;
4549
4550 match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
4551 match_block.dotall    = ((re->options | options) & PCRE_DOTALL) != 0;
4552 match_block.endonly   = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
4553
4554 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4555 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4556
4557 match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */
4558
4559 /* Set the stack state to empty */
4560   match_block.off_num = match_block.offset_top = NULL;
4561   match_block.r1 = match_block.r2 = NULL;
4562   match_block.eptr = match_block.ecode = NULL;
4563   match_block.point = match_block.length = 0;
4564
4565 /* If the expression has got more back references than the offsets supplied can
4566 hold, we get a temporary bit of working store to use during the matching.
4567 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4568 of 2. */
4569
4570 ocount = offsetcount & (-2);
4571 if (re->top_backref > 0 && re->top_backref >= ocount/2)
4572   {
4573   ocount = re->top_backref * 2 + 2;
4574   match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4575   if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4576   using_temporary_offsets = TRUE;
4577   DPRINTF(("Got memory to hold back references\n"));
4578   }
4579 else match_block.offset_vector = offsets;
4580
4581 match_block.offset_end = ocount;
4582 match_block.offset_overflow = FALSE;
4583
4584 /* Compute the minimum number of offsets that we need to reset each time. Doing
4585 this makes a huge difference to execution time when there aren't many brackets
4586 in the pattern. */
4587
4588 resetcount = 2 + re->top_bracket * 2;
4589 if (resetcount > offsetcount) resetcount = ocount;
4590
4591 /* If MULTILINE is set at exec time but was not set at compile time, and the
4592 anchored flag is set, we must re-check because a setting provoked by ^ in the
4593 pattern is not right in multi-line mode. Calling is_anchored() again here does
4594 the right check, because multiline is now set. If it now yields FALSE, the
4595 expression must have had ^ starting some of its branches. Check to see if
4596 that is true for *all* branches, and if so, set the startline flag. */
4597
4598 if (match_block.multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
4599     !is_anchored(re->code, match_block.multiline))
4600   {
4601   anchored = FALSE;
4602   if (is_startline(re->code)) startline = TRUE;
4603   }
4604
4605 /* Set up the first character to match, if available. The first_char value is
4606 never set for an anchored regular expression, but the anchoring may be forced
4607 at run time, so we have to test for anchoring. The first char may be unset for
4608 an unanchored pattern, of course. If there's no first char and the pattern was
4609 studied, there may be a bitmap of possible first characters. However, we can
4610 use this only if the caseless state of the studying was correct. */
4611
4612 if (!anchored)
4613   {
4614   if ((re->options & PCRE_FIRSTSET) != 0)
4615     {
4616     first_char = re->first_char;
4617     if (match_block.caseless) first_char = pcre_lcc[first_char];
4618     }
4619   else
4620     if (!startline && extra != NULL &&
4621       (extra->options & PCRE_STUDY_MAPPED) != 0 &&
4622       ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
4623         start_bits = extra->start_bits;
4624   }
4625
4626 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4627
4628 do
4629   {
4630   int rc;
4631   register int *iptr = match_block.offset_vector;
4632   register int *iend = iptr + resetcount;
4633
4634   /* Reset the maximum number of extractions we might see. */
4635
4636   while (iptr < iend) *iptr++ = -1;
4637
4638   /* Advance to a unique first char if possible */
4639
4640   if (first_char >= 0)
4641     {
4642     if (match_block.caseless)
4643       while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4644         start_match++;
4645     else
4646       while (start_match < end_subject && *start_match != first_char)
4647         start_match++;
4648     }
4649
4650   /* Or to just after \n for a multiline match if possible */
4651
4652   else if (startline)
4653     {
4654     if (start_match > match_block.start_subject)
4655       {
4656       while (start_match < end_subject && start_match[-1] != '\n')
4657         start_match++;
4658       }
4659     }
4660
4661   /* Or to a non-unique first char */
4662
4663   else if (start_bits != NULL)
4664     {
4665     while (start_match < end_subject)
4666       {
4667       register int c = *start_match;
4668       if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4669       }
4670     }
4671
4672 #ifdef DEBUG  /* Sigh. Some compilers never learn. */
4673   printf(">>>> Match against: ");
4674   pchars(start_match, end_subject - start_match, TRUE, &match_block);
4675   printf("\n");
4676 #endif
4677
4678   /* When a match occurs, substrings will be set for all internal extractions;
4679   we just need to set up the whole thing as substring 0 before returning. If
4680   there were too many extractions, set the return code to zero. In the case
4681   where we had to get some local store to hold offsets for backreferences, copy
4682   those back references that we can. In this case there need not be overflow
4683   if certain parts of the pattern were not used.
4684
4685   Before starting the match, we have to set up a longjmp() target to enable
4686   the "cut" operation to fail a match completely without backtracking. This
4687   is done in a separate function to avoid compiler warnings. We need not do
4688   it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
4689   enabled. */
4690
4691   /* To handle errors such as running out of memory for the failure
4692      stack, we need to save this location via setjmp(), so
4693      error-handling code can call longjmp() to jump out of deeply-nested code. */
4694   if (setjmp(match_block.error_env)==0)
4695     {
4696
4697   if ((re->options & PCRE_EXTRA) != 0)
4698     {
4699     if (!match_with_setjmp(start_match, re->code, 2, &match_block))
4700       continue;
4701     }
4702   else if (!match(start_match, re->code, 2, &match_block)) continue;
4703
4704   /* Copy the offset information from temporary store if necessary */
4705
4706   if (using_temporary_offsets)
4707     {
4708     if (offsetcount >= 4)
4709       {
4710       memcpy(offsets + 2, match_block.offset_vector + 2,
4711         (offsetcount - 2) * sizeof(int));
4712       DPRINTF(("Copied offsets from temporary memory\n"));
4713       }
4714     if (match_block.end_offset_top > offsetcount)
4715       match_block.offset_overflow = TRUE;
4716
4717     DPRINTF(("Freeing temporary memory\n"));
4718     (pcre_free)(match_block.offset_vector);
4719     }
4720
4721   rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4722
4723   if (match_block.offset_end < 2) rc = 0; else
4724     {
4725     offsets[0] = start_match - match_block.start_subject;
4726     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4727     }
4728
4729   DPRINTF((">>>> returning %d\n", rc));
4730   free_stack(&match_block);
4731   return rc;
4732   }  /* End of (if setjmp(match_block.error_env)...) */
4733   free_stack(&match_block);
4734
4735   /* Return an error code; pcremodule.c will preserve the exception */
4736   if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY;
4737   }
4738 while (!anchored &&
4739        match_block.errorcode == PCRE_ERROR_NOMATCH &&
4740        start_match++ < end_subject);
4741
4742 if (using_temporary_offsets)
4743   {
4744   DPRINTF(("Freeing temporary memory\n"));
4745   (pcre_free)(match_block.offset_vector);
4746   }
4747
4748 #ifdef DEBUG
4749 printf(">>>> returning %d\n", match_block.errorcode);
4750 #endif
4751
4752  free_stack(&match_block);
4753  return match_block.errorcode;
4754 }
4755
4756 /* End of pcre.c */