Modules/pypcre.c

   1
   2 /*************************************************
   3 *      Perl-Compatible Regular Expressions       *
   4 *************************************************/
   5
   6 /*   DO NOT EDIT THIS FILE! */
   7
   8 /* This file is automatically written by the merge-files.py script
   9 included with the PCRE distribution for Python; it's produced from
  10 several C files, and code is removed in the process.  If you want to
  11 modify the code or track down bugs, it will be much easier to work
  12 with the code in its original, multiple-file form.  Don't edit this
  13 file by hand, or submit patches to it.
  14
  15 The Python-specific PCRE distribution can be retrieved from
  16        http://starship.skyport.net/crew/amk/regex/
  17
  18 The unmodified original PCRE distribution is available at
  19 ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
  20 written by: Philip Hazel <ph10@cam.ac.uk>
  21
  22 Extensively modified by the Python String-SIG: <string-sig@python.org>
  23 Send bug reports to:                           <string-sig@python.org>
  24 (They'll figure out if it's a bug in PCRE or in the Python-specific
  25 changes.)
  26
  27            Copyright (c) 1997 University of Cambridge
  28
  29 -----------------------------------------------------------------------------
  30 Permission is granted to anyone to use this software for any purpose on any
  31 computer system, and to redistribute it freely, subject to the following
  32 restrictions:
  33
  34 1. This software is distributed in the hope that it will be useful,
  35    but WITHOUT ANY WARRANTY; without even the implied warranty of
  36    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  37
  38 2. The origin of this software must not be misrepresented, either by
  39    explicit claim or by omission.
  40
  41 3. Altered versions must be plainly marked as such, and must not be
  42    misrepresented as being the original software.
  43 -----------------------------------------------------------------------------
  44 */
  45
  46
  47 #define FOR_PYTHON
  48 #include "pcre-int.h"
  49 #include "Python.h"
  50 #include "mymalloc.h"
  51 #include <ctype.h>
  52 #include "graminit.h"
  53
  54 /*************************************************
  55 *      Perl-Compatible Regular Expressions       *
  56 *************************************************/
  57
  58 /* This file is automatically written by the makechartables auxiliary
  59 program. If you edit it by hand, you might like to edit the Makefile to
  60 prevent its ever being regenerated. */
  61
  62 /* This table is a lower casing table. */
  63
  64 unsigned char pcre_lcc[] = {
  65     0,  1,  2,  3,  4,  5,  6,  7,
  66     8,  9, 10, 11, 12, 13, 14, 15,
  67    16, 17, 18, 19, 20, 21, 22, 23,
  68    24, 25, 26, 27, 28, 29, 30, 31,
  69    32, 33, 34, 35, 36, 37, 38, 39,
  70    40, 41, 42, 43, 44, 45, 46, 47,
  71    48, 49, 50, 51, 52, 53, 54, 55,
  72    56, 57, 58, 59, 60, 61, 62, 63,
  73    64, 97, 98, 99,100,101,102,103,
  74   104,105,106,107,108,109,110,111,
  75   112,113,114,115,116,117,118,119,
  76   120,121,122, 91, 92, 93, 94, 95,
  77    96, 97, 98, 99,100,101,102,103,
  78   104,105,106,107,108,109,110,111,
  79   112,113,114,115,116,117,118,119,
  80   120,121,122,123,124,125,126,127,
  81   128,129,130,131,132,133,134,135,
  82   136,137,138,139,140,141,142,143,
  83   144,145,146,147,148,149,150,151,
  84   152,153,154,155,156,157,158,159,
  85   160,161,162,163,164,165,166,167,
  86   168,169,170,171,172,173,174,175,
  87   176,177,178,179,180,181,182,183,
  88   184,185,186,187,188,189,190,191,
  89   192,193,194,195,196,197,198,199,
  90   200,201,202,203,204,205,206,207,
  91   208,209,210,211,212,213,214,215,
  92   216,217,218,219,220,221,222,223,
  93   224,225,226,227,228,229,230,231,
  94   232,233,234,235,236,237,238,239,
  95   240,241,242,243,244,245,246,247,
  96   248,249,250,251,252,253,254,255 };
  97
  98 /* This table is a case flipping table. */
  99
 100 unsigned char pcre_fcc[] = {
 101     0,  1,  2,  3,  4,  5,  6,  7,
 102     8,  9, 10, 11, 12, 13, 14, 15,
 103    16, 17, 18, 19, 20, 21, 22, 23,
 104    24, 25, 26, 27, 28, 29, 30, 31,
 105    32, 33, 34, 35, 36, 37, 38, 39,
 106    40, 41, 42, 43, 44, 45, 46, 47,
 107    48, 49, 50, 51, 52, 53, 54, 55,
 108    56, 57, 58, 59, 60, 61, 62, 63,
 109    64, 97, 98, 99,100,101,102,103,
 110   104,105,106,107,108,109,110,111,
 111   112,113,114,115,116,117,118,119,
 112   120,121,122, 91, 92, 93, 94, 95,
 113    96, 65, 66, 67, 68, 69, 70, 71,
 114    72, 73, 74, 75, 76, 77, 78, 79,
 115    80, 81, 82, 83, 84, 85, 86, 87,
 116    88, 89, 90,123,124,125,126,127,
 117   128,129,130,131,132,133,134,135,
 118   136,137,138,139,140,141,142,143,
 119   144,145,146,147,148,149,150,151,
 120   152,153,154,155,156,157,158,159,
 121   160,161,162,163,164,165,166,167,
 122   168,169,170,171,172,173,174,175,
 123   176,177,178,179,180,181,182,183,
 124   184,185,186,187,188,189,190,191,
 125   192,193,194,195,196,197,198,199,
 126   200,201,202,203,204,205,206,207,
 127   208,209,210,211,212,213,214,215,
 128   216,217,218,219,220,221,222,223,
 129   224,225,226,227,228,229,230,231,
 130   232,233,234,235,236,237,238,239,
 131   240,241,242,243,244,245,246,247,
 132   248,249,250,251,252,253,254,255 };
 133
 134 /* This table contains bit maps for digits, letters, 'word' chars, and
 135 white space. Each map is 32 bytes long and the bits run from the least
 136 significant end of each byte. */
 137
 138 unsigned char pcre_cbits[] = {
 139   0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
 140   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 141   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 142   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 143
 144   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 145   0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
 146   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 147   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 148
 149   0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
 150   0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
 151   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 152   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 153
 154   0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
 155   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 156   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 157   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
 158
 159 /* This table identifies various classes of character by individual bits:
 160   0x01   white space character
 161   0x02   letter
 162   0x04   decimal digit
 163   0x08   hexadecimal digit
 164   0x10   alphanumeric or '_'
 165   0x80   regular expression metacharacter or binary zero
 166 */
 167
 168 unsigned char pcre_ctypes[] = {
 169   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
 170   0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /*   8- 15 */
 171   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
 172   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
 173   0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /*    - '  */
 174   0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /*  ( - /  */
 175   0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /*  0 - 7  */
 176   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /*  8 - ?  */
 177   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  @ - G  */
 178   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  H - O  */
 179   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  P - W  */
 180   0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /*  X - _  */
 181   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  ` - g  */
 182   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  h - o  */
 183   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  p - w  */
 184   0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /*  x -127 */
 185   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
 186   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
 187   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
 188   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
 189   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
 190   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
 191   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
 192   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
 193   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
 194   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
 195   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
 196   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
 197   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
 198   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
 199   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
 200   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
 201
 202 /* End of chartables.c */
 203 /*************************************************
 204 *      Perl-Compatible Regular Expressions       *
 205 *************************************************/
 206
 207 /*
 208 This is a library of functions to support regular expressions whose syntax
 209 and semantics are as close as possible to those of the Perl 5 language. See
 210 the file Tech.Notes for some information on the internals.
 211
 212 Written by: Philip Hazel <ph10@cam.ac.uk>
 213
 214            Copyright (c) 1998 University of Cambridge
 215
 216 -----------------------------------------------------------------------------
 217 Permission is granted to anyone to use this software for any purpose on any
 218 computer system, and to redistribute it freely, subject to the following
 219 restrictions:
 220
 221 1. This software is distributed in the hope that it will be useful,
 222    but WITHOUT ANY WARRANTY; without even the implied warranty of
 223    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 224
 225 2. The origin of this software must not be misrepresented, either by
 226    explicit claim or by omission.
 227
 228 3. Altered versions must be plainly marked as such, and must not be
 229    misrepresented as being the original software.
 230 -----------------------------------------------------------------------------
 231 */
 232
 233
 234 /* Include the internals header, which itself includes Standard C headers plus
 235 the external pcre header. */
 236
 237
 238
 239
 240 /*************************************************
 241 *          Create bitmap of starting chars       *
 242 *************************************************/
 243
 244 /* This function scans a compiled unanchored expression and attempts to build a
 245 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
 246 goes by, we may be able to get more clever at doing this.
 247
 248 Arguments:
 249   code         points to an expression
 250   start_bits   points to a 32-byte table, initialized to 0
 251
 252 Returns:       TRUE if table built, FALSE otherwise
 253 */
 254
 255 static BOOL
 256 set_start_bits(const uschar *code, uschar *start_bits)
 257 {
 258 register int c;
 259
 260 do
 261   {
 262   const uschar *tcode = code + 3;
 263   BOOL try_next = TRUE;
 264
 265   while (try_next)
 266     {
 267     try_next = FALSE;
 268
 269     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
 270       {
 271       if (!set_start_bits(tcode, start_bits)) return FALSE;
 272       }
 273
 274     else switch(*tcode)
 275       {
 276       default:
 277       return FALSE;
 278
 279       /* BRAZERO does the bracket, but carries on. */
 280
 281       case OP_BRAZERO:
 282       case OP_BRAMINZERO:
 283       if (!set_start_bits(++tcode, start_bits)) return FALSE;
 284       do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
 285       tcode += 3;
 286       try_next = TRUE;
 287       break;
 288
 289       /* Single-char * or ? sets the bit and tries the next item */
 290
 291       case OP_STAR:
 292       case OP_MINSTAR:
 293       case OP_QUERY:
 294       case OP_MINQUERY:
 295       start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
 296       tcode += 2;
 297       try_next = TRUE;
 298       break;
 299
 300       /* Single-char upto sets the bit and tries the next */
 301
 302       case OP_UPTO:
 303       case OP_MINUPTO:
 304       start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));
 305       tcode += 4;
 306       try_next = TRUE;
 307       break;
 308
 309       /* At least one single char sets the bit and stops */
 310
 311       case OP_EXACT:       /* Fall through */
 312       tcode++;
 313
 314       case OP_CHARS:       /* Fall through */
 315       tcode++;
 316
 317       case OP_PLUS:
 318       case OP_MINPLUS:
 319       start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
 320       break;
 321
 322       /* Single character type sets the bits and stops */
 323
 324       case OP_NOT_DIGIT:
 325       for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
 326       break;
 327
 328       case OP_DIGIT:
 329       for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
 330       break;
 331
 332       case OP_NOT_WHITESPACE:
 333       for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
 334       break;
 335
 336       case OP_WHITESPACE:
 337       for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
 338       break;
 339
 340       case OP_NOT_WORDCHAR:
 341       for (c = 0; c < 32; c++)
 342         start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
 343       break;
 344
 345       case OP_WORDCHAR:
 346       for (c = 0; c < 32; c++)
 347         start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
 348       break;
 349
 350       /* One or more character type fudges the pointer and restarts, knowing
 351       it will hit a single character type and stop there. */
 352
 353       case OP_TYPEPLUS:
 354       case OP_TYPEMINPLUS:
 355       tcode++;
 356       try_next = TRUE;
 357       break;
 358
 359       case OP_TYPEEXACT:
 360       tcode += 3;
 361       try_next = TRUE;
 362       break;
 363
 364       /* Zero or more repeats of character types set the bits and then
 365       try again. */
 366
 367       case OP_TYPEUPTO:
 368       case OP_TYPEMINUPTO:
 369       tcode += 2;               /* Fall through */
 370
 371       case OP_TYPESTAR:
 372       case OP_TYPEMINSTAR:
 373       case OP_TYPEQUERY:
 374       case OP_TYPEMINQUERY:
 375       switch(tcode[1])
 376         {
 377         case OP_NOT_DIGIT:
 378         for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
 379         break;
 380
 381         case OP_DIGIT:
 382         for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
 383         break;
 384
 385         case OP_NOT_WHITESPACE:
 386         for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
 387         break;
 388
 389         case OP_WHITESPACE:
 390         for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
 391         break;
 392
 393         case OP_NOT_WORDCHAR:
 394         for (c = 0; c < 32; c++)
 395           start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
 396         break;
 397
 398         case OP_WORDCHAR:
 399         for (c = 0; c < 32; c++)
 400           start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
 401         break;
 402         }
 403
 404       tcode += 2;
 405       try_next = TRUE;
 406       break;
 407
 408       /* Character class: set the bits and either carry on or not,
 409       according to the repeat count. */
 410
 411       case OP_CLASS:
 412       case OP_NEGCLASS:
 413         {
 414         tcode++;
 415         for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
 416         tcode += 32;
 417         switch (*tcode)
 418           {
 419           case OP_CRSTAR:
 420           case OP_CRMINSTAR:
 421           case OP_CRQUERY:
 422           case OP_CRMINQUERY:
 423           tcode++;
 424           try_next = TRUE;
 425           break;
 426
 427           case OP_CRRANGE:
 428           case OP_CRMINRANGE:
 429           if (((tcode[1] << 8) + tcode[2]) == 0)
 430             {
 431             tcode += 5;
 432             try_next = TRUE;
 433             }
 434           break;
 435           }
 436         }
 437       break; /* End of class handling */
 438
 439       }      /* End of switch */
 440     }        /* End of try_next loop */
 441
 442   code += (code[1] << 8) + code[2];   /* Advance to next branch */
 443   }
 444 while (*code == OP_ALT);
 445 return TRUE;
 446 }
 447
 448
 449
 450 /*************************************************
 451 *          Study a compiled expression           *
 452 *************************************************/
 453
 454 /* This function is handed a compiled expression that it must study to produce
 455 information that will speed up the matching. It returns a pcre_extra block
 456 which then gets handed back to pcre_exec().
 457
 458 Arguments:
 459   re        points to the compiled expression
 460   options   contains option bits
 461   errorptr  points to where to place error messages;
 462             set NULL unless error
 463
 464 Returns:    pointer to a pcre_extra block,
 465             NULL on error or if no optimization possible
 466 */
 467
 468 pcre_extra *
 469 pcre_study(const pcre *external_re, int options, const char **errorptr)
 470 {
 471 BOOL caseless;
 472 uschar start_bits[32];
 473 real_pcre_extra *extra;
 474 const real_pcre *re = (const real_pcre *)external_re;
 475
 476 *errorptr = NULL;
 477
 478 if (re == NULL || re->magic_number != MAGIC_NUMBER)
 479   {
 480   *errorptr = "argument is not a compiled regular expression";
 481   return NULL;
 482   }
 483
 484 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
 485   {
 486   *errorptr = "unknown or incorrect option bit(s) set";
 487   return NULL;
 488   }
 489
 490 /* Caseless can either be from the compiled regex or from options. */
 491
 492 caseless = ((re->options | options) & PCRE_CASELESS) != 0;
 493
 494 /* For an anchored pattern, or an unchored pattern that has a first char, or a
 495 multiline pattern that matches only at "line starts", no further processing at
 496 present. */
 497
 498 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
 499   return NULL;
 500
 501 /* See if we can find a fixed set of initial characters for the pattern. */
 502
 503 memset(start_bits, 0, 32 * sizeof(uschar));
 504 if (!set_start_bits(re->code, start_bits)) return NULL;
 505
 506 /* If this studying is caseless, scan the created bit map and duplicate the
 507 bits for any letters. */
 508
 509 if (caseless)
 510   {
 511   register int c;
 512   for (c = 0; c < 256; c++)
 513     {
 514     if ((start_bits[c/8] & (1 << (c&7))) != 0 &&
 515         (pcre_ctypes[c] & ctype_letter) != 0)
 516       {
 517       int d = pcre_fcc[c];
 518       start_bits[d/8] |= (1 << (d&7));
 519       }
 520     }
 521   }
 522
 523 /* Get an "extra" block and put the information therein. */
 524
 525 extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
 526
 527 if (extra == NULL)
 528   {
 529   *errorptr = "failed to get memory";
 530   return NULL;
 531   }
 532
 533 extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
 534 memcpy(extra->start_bits, start_bits, sizeof(start_bits));
 535
 536 return (pcre_extra *)extra;
 537 }
 538
 539 /* End of study.c */
 540 /*************************************************
 541 *      Perl-Compatible Regular Expressions       *
 542 *************************************************/
 543
 544 /*
 545 This is a library of functions to support regular expressions whose syntax
 546 and semantics are as close as possible to those of the Perl 5 language. See
 547 the file Tech.Notes for some information on the internals.
 548
 549 Written by: Philip Hazel <ph10@cam.ac.uk>
 550
 551            Copyright (c) 1998 University of Cambridge
 552
 553 -----------------------------------------------------------------------------
 554 Permission is granted to anyone to use this software for any purpose on any
 555 computer system, and to redistribute it freely, subject to the following
 556 restrictions:
 557
 558 1. This software is distributed in the hope that it will be useful,
 559    but WITHOUT ANY WARRANTY; without even the implied warranty of
 560    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 561
 562 2. The origin of this software must not be misrepresented, either by
 563    explicit claim or by omission.
 564
 565 3. Altered versions must be plainly marked as such, and must not be
 566    misrepresented as being the original software.
 567 -----------------------------------------------------------------------------
 568 */
 569
 570
 571 /* Define DEBUG to get debugging output on stdout. */
 572
 573 /* #define DEBUG */
 574
 575 /* Use a macro for debugging printing, 'cause that eliminates the the use
 576 of #ifdef inline, and there are *still* stupid compilers about that don't like
 577 indented pre-processor statements. I suppose it's only been 10 years... */
 578
 579 #ifdef DEBUG
 580 #define DPRINTF(p) printf p
 581 #else
 582 #define DPRINTF(p) /*nothing*/
 583 #endif
 584
 585 /* Include the internals header, which itself includes Standard C headers plus
 586 the external pcre header. */
 587
 588
 589
 590
 591 #ifndef Py_eval_input
 592 /* For Python 1.4, graminit.h has to be explicitly included */
 593 #define Py_eval_input eval_input
 594
 595 #endif /* FOR_PYTHON */
 596
 597 /* Allow compilation as C++ source code, should anybody want to do that. */
 598
 599 #ifdef __cplusplus
 600 #define class pcre_class
 601 #endif
 602
 603
 604 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
 605
 606 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
 607 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
 608
 609 /* Text forms of OP_ values and things, for debugging (not all used) */
 610
 611 #ifdef DEBUG
 612 static const char *OP_names[] = {
 613   "End", "\\A", "\\B", "\\b", "\\D", "\\d",
 614   "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
 615   "localized \\B", "localized \\b", "localized \\W", "localized \\w",
 616   "^", "$", "Any", "chars",
 617   "not",
 618   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
 619   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
 620   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
 621   "*", "*?", "+", "+?", "?", "??", "{", "{",
 622   "class", "negclass", "classL", "Ref",
 623   "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
 624   "Brazero", "Braminzero", "Bra"
 625 };
 626 #endif
 627
 628 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
 629 are simple data values; negative values are for special things like \d and so
 630 on. Zero means further processing is needed (for things like \x), or the escape
 631 is invalid. */
 632
 633 static const short int escapes[] = {
 634     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
 635     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
 636   '@', -ESC_A, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */
 637     0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
 638     0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
 639     0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
 640   '`',      7, -ESC_b,      0, -ESC_d,      0,   '\f',      0,   /* ` - g */
 641     0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */
 642     0,      0,   '\r', -ESC_s,   '\t',      0,   '\v', -ESC_w,   /* p - w */
 643     0,      0,      0                                            /* x - z */
 644 };
 645
 646 /* Definition to allow mutual recursion */
 647
 648 static BOOL
 649 compile_regex(int, int *, uschar **, const uschar **, const char **,
 650               PyObject *);
 651
 652 /* Structure for passing "static" information around between the functions
 653 doing the matching, so that they are thread-safe. */
 654
 655 typedef struct match_data {
 656   int    errorcode;             /* As it says */
 657   int   *offset_vector;         /* Offset vector */
 658   int    offset_end;            /* One past the end */
 659   BOOL   offset_overflow;       /* Set if too many extractions */
 660   BOOL   caseless;              /* Case-independent flag */
 661   BOOL   runtime_caseless;      /* Caseless forced at run time */
 662   BOOL   multiline;             /* Multiline flag */
 663   BOOL   notbol;                /* NOTBOL flag */
 664   BOOL   noteol;                /* NOTEOL flag */
 665   BOOL   dotall;                /* Dot matches any char */
 666   BOOL   endonly;               /* Dollar not before final \n */
 667   const uschar *start_subject;  /* Start of the subject string */
 668   const uschar *end_subject;    /* End of the subject string */
 669   jmp_buf fail_env;             /* Environment for longjump() break out */
 670   const uschar *end_match_ptr;  /* Subject position at end match */
 671   int     end_offset_top;       /* Highwater mark at end of match */
 672   jmp_buf error_env;          /* For longjmp() if an error occurs deep inside a
 673                                    matching operation */
 674   int    length;                /* Length of the allocated stacks */
 675   int    point;                 /* Point to add next item pushed onto stacks */
 676   /* Pointers to the 6 stacks */
 677   int *off_num, *offset_top, *r1, *r2;
 678   const uschar **eptr, **ecode;
 679 } match_data;
 680
 681
 682
 683 /*************************************************
 684 *               Global variables                 *
 685 *************************************************/
 686
 687 /* PCRE is thread-clean and doesn't use any global variables in the normal
 688 sense. However, it calls memory allocation and free functions via the two
 689 indirections below, which are can be changed by the caller, but are shared
 690 between all threads. */
 691
 692 void *(*pcre_malloc)(size_t) = malloc;
 693 void  (*pcre_free)(void *) = free;
 694
 695
 696
 697
 698 /*************************************************
 699 *          Return version string                 *
 700 *************************************************/
 701
 702 const char *
 703 pcre_version(void)
 704 {
 705 return PCRE_VERSION;
 706 }
 707
 708
 709
 710
 711 /*************************************************
 712 *       Return info about a compiled pattern     *
 713 *************************************************/
 714
 715 /* This function picks potentially useful data out of the private
 716 structure.
 717
 718 Arguments:
 719   external_re   points to compiled code
 720   optptr        where to pass back the options
 721   first_char    where to pass back the first character,
 722                 or -1 if multiline and all branches start ^,
 723                 or -2 otherwise
 724
 725 Returns:        number of identifying extraction brackets
 726                 or negative values on error
 727 */
 728
 729 int
 730 pcre_info(const pcre *external_re, int *optptr, int *first_char)
 731 {
 732 const real_pcre *re = (real_pcre *)external_re;
 733 if (re == NULL) return PCRE_ERROR_NULL;
 734 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
 735 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
 736 if (first_char != NULL)
 737   *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
 738      ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
 739 return re->top_bracket;
 740 }
 741
 742
 743
 744
 745 #ifdef DEBUG
 746 /*************************************************
 747 *        Debugging function to print chars       *
 748 *************************************************/
 749
 750 /* Print a sequence of chars in printable format, stopping at the end of the
 751 subject if the requested.
 752
 753 Arguments:
 754   p           points to characters
 755   length      number to print
 756   is_subject  TRUE if printing from within md->start_subject
 757   md          pointer to matching data block, if is_subject is TRUE
 758
 759 Returns:     nothing
 760 */
 761
 762 static void
 763 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
 764 {
 765 int c;
 766 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
 767 while (length-- > 0)
 768   if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
 769 }
 770 #endif
 771
 772
 773
 774
 775 /*************************************************
 776 *         Check subpattern for empty operand     *
 777 *************************************************/
 778
 779 /* This function checks a bracketed subpattern to see if any of the paths
 780 through it could match an empty string. This is used to diagnose an error if
 781 such a subpattern is followed by a quantifier with an unlimited upper bound.
 782
 783 Argument:
 784   code      points to the opening bracket
 785
 786 Returns:    TRUE or FALSE
 787 */
 788
 789 static BOOL
 790 could_be_empty(uschar *code)
 791 {
 792 do {
 793   uschar *cc = code + 3;
 794
 795   /* Scan along the opcodes for this branch; as soon as we find something
 796   that matches a non-empty string, break out and advance to test the next
 797   branch. If we get to the end of the branch, return TRUE for the whole
 798   sub-expression. */
 799
 800   for (;;)
 801     {
 802     /* Test an embedded subpattern; if it could not be empty, break the
 803     loop. Otherwise carry on in the branch. */
 804
 805     if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
 806       {
 807       if (!could_be_empty(cc)) break;
 808       do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
 809       cc += 3;
 810       }
 811
 812     else switch (*cc)
 813       {
 814       /* Reached end of a branch: the subpattern may match the empty string */
 815
 816       case OP_ALT:
 817       case OP_KET:
 818       case OP_KETRMAX:
 819       case OP_KETRMIN:
 820       return TRUE;
 821
 822       /* Skip over entire bracket groups with zero lower bound */
 823
 824       case OP_BRAZERO:
 825       case OP_BRAMINZERO:
 826       cc++;
 827       /* Fall through */
 828
 829       /* Skip over assertive subpatterns */
 830
 831       case OP_ASSERT:
 832       case OP_ASSERT_NOT:
 833       do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
 834       cc += 3;
 835       break;
 836
 837       /* Skip over things that don't match chars */
 838
 839       case OP_SOD:
 840       case OP_EOD:
 841       case OP_CIRC:
 842       case OP_DOLL:
 843       case OP_NOT_WORD_BOUNDARY:
 844       case OP_WORD_BOUNDARY:
 845       case OP_NOT_WORD_BOUNDARY_L:
 846       case OP_WORD_BOUNDARY_L:
 847       cc++;
 848       break;
 849
 850       /* Skip over simple repeats with zero lower bound */
 851
 852       case OP_STAR:
 853       case OP_MINSTAR:
 854       case OP_QUERY:
 855       case OP_MINQUERY:
 856       case OP_NOTSTAR:
 857       case OP_NOTMINSTAR:
 858       case OP_NOTQUERY:
 859       case OP_NOTMINQUERY:
 860       case OP_TYPESTAR:
 861       case OP_TYPEMINSTAR:
 862       case OP_TYPEQUERY:
 863       case OP_TYPEMINQUERY:
 864       cc += 2;
 865       break;
 866
 867       /* Skip over UPTOs (lower bound is zero) */
 868
 869       case OP_UPTO:
 870       case OP_MINUPTO:
 871       case OP_TYPEUPTO:
 872       case OP_TYPEMINUPTO:
 873       cc += 4;
 874       break;
 875
 876       /* Check a class or a back reference for a zero minimum */
 877
 878       case OP_CLASS:
 879       case OP_NEGCLASS:
 880       case OP_REF:
 881       case OP_CLASS_L:
 882         switch(*cc)
 883           {
 884           case (OP_REF):    cc += 2; break;
 885           case (OP_CLASS): case (OP_NEGCLASS): cc += 1+32; break;
 886           case (OP_CLASS_L): cc += 1+1+32; break;
 887           }
 888
 889       switch (*cc)
 890         {
 891         case OP_CRSTAR:
 892         case OP_CRMINSTAR:
 893         case OP_CRQUERY:
 894         case OP_CRMINQUERY:
 895         cc++;
 896         break;
 897
 898         case OP_CRRANGE:
 899         case OP_CRMINRANGE:
 900         if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
 901         cc += 3;
 902         break;
 903
 904         default:
 905         goto NEXT_BRANCH;
 906         }
 907       break;
 908
 909       /* Anything else matches at least one character */
 910
 911       default:
 912       goto NEXT_BRANCH;
 913       }
 914     }
 915
 916   NEXT_BRANCH:
 917   code += (code[1] << 8) + code[2];
 918   }
 919 while (*code == OP_ALT);
 920
 921 /* No branches match the empty string */
 922
 923 return FALSE;
 924 }
 925
 926 /* Determine the length of a group ID in an expression like
 927    (?P<foo_123>...)
 928 Arguments:
 929   ptr        pattern position pointer (say that 3 times fast)
 930   finalchar  the character that will mark the end of the ID
 931   errorptr   points to the pointer to the error message
 932 */
 933
 934 static int
 935 get_group_id(const uschar *ptr, char finalchar, const char **errorptr)
 936 {
 937   const uschar *start = ptr;
 938
 939   /* If the first character is not in \w, or is in \w but is a digit,
 940      report an error */
 941   if (!(pcre_ctypes[*ptr] & ctype_word) ||
 942       (pcre_ctypes[*ptr++] & ctype_digit))
 943     {
 944       *errorptr = "(?P identifier must start with a letter or underscore";
 945       return 0;
 946     }
 947
 948   /* Increment ptr until we either hit a null byte, the desired
 949      final character, or a non-word character */
 950   for(; (*ptr != 0) && (*ptr != finalchar) &&
 951         (pcre_ctypes[*ptr] & ctype_word); ptr++)
 952     {
 953       /* Empty loop body */
 954     }
 955   if (*ptr==finalchar)
 956     return ptr-start;
 957   if (*ptr==0)
 958     {
 959       *errorptr = "unterminated (?P identifier";
 960       return 0;
 961     }
 962   *errorptr = "illegal character in (?P identifier";
 963   return 0;
 964 }
 965
 966 /*************************************************
 967 *            Handle escapes                      *
 968 *************************************************/
 969
 970 /* This function is called when a \ has been encountered. It either returns a
 971 positive value for a simple escape such as \n, or a negative value which
 972 encodes one of the more complicated things such as \d. On entry, ptr is
 973 pointing at the \. On exit, it is on the final character of the escape
 974 sequence.
 975
 976 Arguments:
 977   ptrptr     points to the pattern position pointer
 978   errorptr   points to the pointer to the error message
 979   bracount   number of previous extracting brackets
 980   options    the options bits
 981   isclass    TRUE if inside a character class
 982
 983 Returns:     zero or positive => a data character
 984              negative => a special escape sequence
 985              on error, errorptr is set
 986 */
 987
 988 static int
 989 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
 990              int options, BOOL isclass)
 991 {
 992 const uschar *ptr = *ptrptr;
 993 int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */
 994 int i;
 995
 996 if (c == 0) *errorptr = ERR1;
 997
 998 /* Digits or letters may have special meaning; all others are literals. */
 999
1000 else if (c < '0' || c > 'z') {}
1001
1002 /* Do an initial lookup in a table. A non-zero result is something that can be
1003 returned immediately. Otherwise further processing may be required. */
1004
1005 else if ((i = escapes[c - '0']) != 0) c = i;
1006
1007 /* Escapes that need further processing, or are illegal. */
1008
1009 else
1010   {
1011
1012   switch (c)
1013     {
1014     /* The handling of escape sequences consisting of a string of digits
1015     starting with one that is not zero is not straightforward. By experiment,
1016     the way Perl works seems to be as follows:
1017
1018     Outside a character class, the digits are read as a decimal number. If the
1019     number is less than 10, or if there are that many previous extracting
1020     left brackets, then it is a back reference. Otherwise, up to three octal
1021     digits are read to form an escaped byte. Thus \123 is likely to be octal
1022     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
1023     value is greater than 377, the least significant 8 bits are taken. Inside a
1024     character class, \ followed by a digit is always an octal number. */
1025
1026     case '1': case '2': case '3': case '4': case '5':
1027     case '6': case '7': case '8': case '9':
1028
1029     {
1030       /* PYTHON: Try to compute an octal value for a character */
1031       for(c=0, i=0; ptr[i]!=0 && i<3; i++)
1032         {
1033           if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
1034             c = c * 8 + ptr[i]-'0';
1035           else
1036             break; /* Non-octal character--break out of the loop */
1037         }
1038       /* It's a character if there were exactly 3 octal digits, or if
1039          we're inside a character class and there was at least one
1040          octal digit. */
1041       if ( (i == 3) || (isclass && i!=0) )
1042         {
1043           ptr += i-1;
1044           break;
1045         }
1046       c = ptr[0]; /* Restore the first character after the \ */
1047       c -= '0'; i = 1;
1048       while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
1049         {
1050           c = c * 10 + ptr[1] - '0';
1051           ptr++; i++;
1052         }
1053       if (c > 255 - ESC_REF) *errorptr = "back reference too big";
1054       c = -(ESC_REF + c);
1055     }
1056   break;
1057
1058     /* \0 always starts an octal number, but we may drop through to here with a
1059     larger first octal digit */
1060
1061     case '0':
1062     c -= '0';
1063     while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
1064       ptr[1] != '8' && ptr[1] != '9')
1065         c = c * 8 + *(++ptr) - '0';
1066     break;
1067
1068     /* Special escapes not starting with a digit are straightforward */
1069
1070     case 'x':
1071   c = 0;
1072   while ( (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
1073     {
1074     ptr++;
1075     c = c * 16 + pcre_lcc[*ptr] -
1076       (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
1077     c &= 255;
1078     }
1079   break;
1080
1081
1082     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1083     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1084     for Perl compatibility, it is a literal. */
1085
1086     default:
1087     if ((options & PCRE_EXTRA) != 0) switch(c)
1088       {
1089       case 'X':
1090       c = -ESC_X;      /* This could be a lookup if it ever got into Perl */
1091       break;
1092
1093       default:
1094       *errorptr = ERR3;
1095       break;
1096       }
1097     break;
1098     }
1099   }
1100
1101 *ptrptr = ptr;
1102 return c;
1103 }
1104
1105
1106
1107 /*************************************************
1108 *            Check for counted repeat            *
1109 *************************************************/
1110
1111 /* This function is called when a '{' is encountered in a place where it might
1112 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1113 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1114 where the ddds are digits.
1115
1116 Arguments:
1117   p         pointer to the first char after '{'
1118
1119 Returns:    TRUE or FALSE
1120 */
1121
1122 static BOOL
1123 is_counted_repeat(const uschar *p)
1124 {
1125 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1126 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1127 if (*p == '}') return TRUE;
1128
1129 if (*p++ != ',') return FALSE;
1130 if (*p == '}') return TRUE;
1131
1132 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1133 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1134 return (*p == '}');
1135 }
1136
1137
1138
1139 /*************************************************
1140 *         Read repeat counts                     *
1141 *************************************************/
1142
1143 /* Read an item of the form {n,m} and return the values. This is called only
1144 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1145 so the syntax is guaranteed to be correct, but we need to check the values.
1146
1147 Arguments:
1148   p          pointer to first char after '{'
1149   minp       pointer to int for min
1150   maxp       pointer to int for max
1151              returned as -1 if no max
1152   errorptr   points to pointer to error message
1153
1154 Returns:     pointer to '}' on success;
1155              current ptr on error, with errorptr set
1156 */
1157
1158 static const uschar *
1159 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1160 {
1161 int min = 0;
1162 int max = -1;
1163
1164 while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1165
1166 if (*p == '}') max = min; else
1167   {
1168   if (*(++p) != '}')
1169     {
1170     max = 0;
1171     while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1172     if (max < min)
1173       {
1174       *errorptr = ERR4;
1175       return p;
1176       }
1177     }
1178   }
1179
1180 /* Do paranoid checks, then fill in the required variables, and pass back the
1181 pointer to the terminating '}'. */
1182
1183 if (min > 65535 || max > 65535)
1184   *errorptr = ERR5;
1185 else
1186   {
1187   *minp = min;
1188   *maxp = max;
1189   }
1190 return p;
1191 }
1192
1193
1194
1195 /*************************************************
1196 *           Compile one branch                   *
1197 *************************************************/
1198
1199 /* Scan the pattern, compiling it into the code vector.
1200
1201 Arguments:
1202   options    the option bits
1203   bracket    points to number of brackets used
1204   code       points to the pointer to the current code point
1205   ptrptr     points to the current pattern pointer
1206   errorptr   points to pointer to error message
1207
1208 Returns:     TRUE on success
1209              FALSE, with *errorptr set on error
1210 */
1211
1212 static BOOL
1213 compile_branch(int options, int *brackets, uschar **codeptr,
1214                const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
1215 {
1216 int repeat_type, op_type;
1217 int repeat_min, repeat_max;
1218 int bravalue, length;
1219 int greedy_default, greedy_non_default;
1220 register int c;
1221 register uschar *code = *codeptr;
1222 const uschar *ptr = *ptrptr;
1223 const uschar *oldptr;
1224 uschar *previous = NULL;
1225 uschar class[32];
1226 uschar *class_flag;  /* Pointer to the single-byte flag for OP_CLASS_L */
1227
1228 /* Set up the default and non-default settings for greediness */
1229
1230 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1231 greedy_non_default = greedy_default ^ 1;
1232
1233 /* Switch on next character until the end of the branch */
1234
1235 for (;; ptr++)
1236   {
1237   BOOL negate_class;
1238   int  class_charcount;
1239   int  class_lastchar;
1240
1241   c = *ptr;
1242   if ((options & PCRE_EXTENDED) != 0)
1243     {
1244     if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1245     if (c == '#')
1246       {
1247       while ((c = *(++ptr)) != 0 && c != '\n');
1248       continue;
1249       }
1250     }
1251
1252   switch(c)
1253     {
1254     /* The branch terminates at end of string, |, or ). */
1255
1256     case 0:
1257     case '|':
1258     case ')':
1259     *codeptr = code;
1260     *ptrptr = ptr;
1261     return TRUE;
1262
1263     /* Handle single-character metacharacters */
1264
1265     case '^':
1266     previous = NULL;
1267     *code++ = OP_CIRC;
1268     break;
1269
1270     case '$':
1271     previous = NULL;
1272     *code++ = OP_DOLL;
1273     break;
1274
1275     case '.':
1276     previous = code;
1277     *code++ = OP_ANY;
1278     break;
1279
1280     /* Character classes. These always build a 32-byte bitmap of the permitted
1281     characters, except in the special case where there is only one character.
1282     For negated classes, we build the map as usual, then invert it at the end.
1283     */
1284
1285     case '[':
1286     previous = code;
1287     if (options & PCRE_LOCALE)
1288       {
1289         *code++ = OP_CLASS_L;
1290         /* Set the flag for localized classes (like \w) to 0 */
1291         class_flag = code;
1292         *class_flag = 0;
1293       }
1294     else
1295       {
1296         *code++ = OP_CLASS;
1297         class_flag = NULL;
1298       }
1299
1300     /* If the first character is '^', set the negation flag, and use a
1301     different opcode. This only matters if caseless matching is specified at
1302     runtime. */
1303
1304     if ((c = *(++ptr)) == '^')
1305       {
1306       negate_class = TRUE;
1307       if (*(code-1)==OP_CLASS) *(code-1) = OP_NEGCLASS;
1308       c = *(++ptr);
1309       }
1310     else negate_class = FALSE;
1311
1312     /* Keep a count of chars so that we can optimize the case of just a single
1313     character. */
1314
1315     class_charcount = 0;
1316     class_lastchar = -1;
1317
1318     /* Initialize the 32-char bit map to all zeros. We have to build the
1319     map in a temporary bit of store, in case the class contains only 1
1320     character, because in that case the compiled code doesn't use the
1321     bit map. */
1322
1323     memset(class, 0, 32 * sizeof(uschar));
1324
1325     /* Process characters until ] is reached. By writing this as a "do" it
1326     means that an initial ] is taken as a data character. */
1327
1328     do
1329       {
1330       if (c == 0)
1331         {
1332         *errorptr = ERR6;
1333         goto FAILED;
1334         }
1335
1336       /* Backslash may introduce a single character, or it may introduce one
1337       of the specials, which just set a flag. Escaped items are checked for
1338       validity in the pre-compiling pass. The sequence \b is a special case.
1339       Inside a class (and only there) it is treated as backspace. Elsewhere
1340       it marks a word boundary. Other escapes have preset maps ready to
1341       or into the one we are building. We assume they have more than one
1342       character in them, so set class_count bigger than one. */
1343
1344       if (c == '\\')
1345         {
1346         c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1347         if (-c == ESC_b) c = '\b';
1348         else if (c < 0)
1349           {
1350           class_charcount = 10;
1351           switch (-c)
1352             {
1353             case ESC_d:
1354               {
1355                 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
1356               }
1357             continue;
1358
1359             case ESC_D:
1360               {
1361                 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
1362               }
1363             continue;
1364
1365             case ESC_w:
1366             if (options & PCRE_LOCALE)
1367               {
1368                 *class_flag |= 1;
1369               }
1370             else
1371               {
1372                 for (c = 0; c < 32; c++)
1373                   class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1374               }
1375             continue;
1376
1377             case ESC_W:
1378             if (options & PCRE_LOCALE)
1379               {
1380                 *class_flag |= 2;
1381               }
1382             else
1383               {
1384                 for (c = 0; c < 32; c++)
1385                   class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1386               }
1387             continue;
1388
1389             case ESC_s:
1390               {
1391                 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
1392               }
1393             continue;
1394
1395             case ESC_S:
1396               {
1397                 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
1398               }
1399             continue;
1400
1401             default:
1402             *errorptr = ERR7;
1403             goto FAILED;
1404             }
1405           }
1406         /* Fall through if single character */
1407         }
1408
1409       /* A single character may be followed by '-' to form a range. However,
1410       Perl does not permit ']' to be the end of the range. A '-' character
1411       here is treated as a literal. */
1412
1413       if (ptr[1] == '-' && ptr[2] != ']')
1414         {
1415         int d;
1416         ptr += 2;
1417         d = *ptr;
1418
1419         if (d == 0)
1420           {
1421           *errorptr = ERR6;
1422           goto FAILED;
1423           }
1424
1425         /* The second part of a range can be a single-character escape, but
1426         not any of the other escapes. */
1427
1428         if (d == '\\')
1429           {
1430           d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1431           if (d < 0)
1432             {
1433             if (d == -ESC_b) d = '\b'; else
1434               {
1435               *errorptr = ERR7;
1436               goto FAILED;
1437               }
1438             }
1439           }
1440
1441         if (d < c)
1442           {
1443           *errorptr = ERR8;
1444           goto FAILED;
1445           }
1446
1447         for (; c <= d; c++)
1448           {
1449           class[c/8] |= (1 << (c&7));
1450           if ((options & PCRE_CASELESS) != 0)
1451             {
1452             int uc = pcre_fcc[c];           /* flip case */
1453             class[uc/8] |= (1 << (uc&7));
1454             }
1455           class_charcount++;                /* in case a one-char range */
1456           class_lastchar = c;
1457           }
1458         continue;   /* Go get the next char in the class */
1459         }
1460
1461       /* Handle a lone single character - we can get here for a normal
1462       non-escape char, or after \ that introduces a single character. */
1463
1464       class [c/8] |= (1 << (c&7));
1465       if ((options & PCRE_CASELESS) != 0)
1466         {
1467         c = pcre_fcc[c];   /* flip case */
1468         class[c/8] |= (1 << (c&7));
1469         }
1470       class_charcount++;
1471       class_lastchar = c;
1472       }
1473
1474     /* Loop until ']' reached; the check for end of string happens inside the
1475     loop. This "while" is the end of the "do" above. */
1476
1477     while ((c = *(++ptr)) != ']');
1478
1479     /* If class_charcount is 1 and class_lastchar is not negative, we saw
1480     precisely one character. This doesn't need the whole 32-byte bit map.
1481     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1482     it's negative. */
1483
1484     if (class_charcount == 1 && class_lastchar >= 0)
1485       {
1486       if (negate_class)
1487         {
1488         code[-1] = OP_NOT;
1489         }
1490       else
1491         {
1492         code[-1] = OP_CHARS;
1493         *code++ = 1;
1494         }
1495       *code++ = class_lastchar;
1496       }
1497
1498     /* Otherwise, negate the 32-byte map if necessary, and copy it into
1499     the code vector. */
1500
1501     else
1502       {
1503         /* If this is a localized opcode, bump the code pointer up */
1504         if (class_flag) code++;
1505       if (negate_class)
1506         {
1507           if (class_flag) *class_flag = (*class_flag) ^ 63;
1508           for (c = 0; c < 32; c++) code[c] = ~class[c];
1509         }
1510       else
1511         memcpy(code, class, 32);
1512       code += 32;
1513       }
1514     break;
1515
1516     /* Various kinds of repeat */
1517
1518     case '{':
1519     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1520     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
1521     if (*errorptr != NULL) goto FAILED;
1522     goto REPEAT;
1523
1524     case '*':
1525     repeat_min = 0;
1526     repeat_max = -1;
1527     goto REPEAT;
1528
1529     case '+':
1530     repeat_min = 1;
1531     repeat_max = -1;
1532     goto REPEAT;
1533
1534     case '?':
1535     repeat_min = 0;
1536     repeat_max = 1;
1537
1538     REPEAT:
1539     if (previous == NULL)
1540       {
1541       *errorptr = ERR9;
1542       goto FAILED;
1543       }
1544
1545     /* If the next character is '?' this is a minimizing repeat, by default,
1546     but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1547     next character. */
1548
1549     if (ptr[1] == '?')
1550       { repeat_type = greedy_non_default; ptr++; }
1551     else repeat_type = greedy_default;
1552
1553     /* If the maximum is zero then the minimum must also be zero; Perl allows
1554     this case, so we do too - by simply omitting the item altogether. */
1555
1556     if (repeat_max == 0) code = previous;
1557
1558     /* If previous was a string of characters, chop off the last one and use it
1559     as the subject of the repeat. If there was only one character, we can
1560     abolish the previous item altogether. */
1561
1562     else if (*previous == OP_CHARS)
1563       {
1564       int len = previous[1];
1565       if (len == 1)
1566         {
1567         c = previous[2];
1568         code = previous;
1569         }
1570       else
1571         {
1572         c = previous[len+1];
1573         previous[1]--;
1574         code--;
1575         }
1576       op_type = 0;                 /* Use single-char op codes */
1577       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
1578       }
1579
1580     /* If previous was a single negated character ([^a] or similar), we use
1581     one of the special opcodes, replacing it. The code is shared with single-
1582     character repeats by adding a suitable offset into repeat_type. */
1583
1584     else if ((int)*previous == OP_NOT)
1585       {
1586       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
1587       c = previous[1];
1588       code = previous;
1589       goto OUTPUT_SINGLE_REPEAT;
1590       }
1591
1592     /* If previous was a character type match (\d or similar), abolish it and
1593     create a suitable repeat item. The code is shared with single-character
1594     repeats by adding a suitable offset into repeat_type. */
1595
1596     else if ((int)*previous < OP_CIRC || *previous == OP_ANY)
1597       {
1598       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
1599       c = *previous;
1600       code = previous;
1601
1602       OUTPUT_SINGLE_REPEAT:
1603       repeat_type += op_type;      /* Combine both values for many cases */
1604
1605       /* A minimum of zero is handled either as the special case * or ?, or as
1606       an UPTO, with the maximum given. */
1607
1608       if (repeat_min == 0)
1609         {
1610         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1611           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1612         else
1613           {
1614           *code++ = OP_UPTO + repeat_type;
1615           *code++ = repeat_max >> 8;
1616           *code++ = (repeat_max & 255);
1617           }
1618         }
1619
1620       /* The case {1,} is handled as the special case + */
1621
1622       else if (repeat_min == 1 && repeat_max == -1)
1623         *code++ = OP_PLUS + repeat_type;
1624
1625       /* The case {n,n} is just an EXACT, while the general case {n,m} is
1626       handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1627
1628       else
1629         {
1630         if (repeat_min != 1)
1631           {
1632           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
1633           *code++ = repeat_min >> 8;
1634           *code++ = (repeat_min & 255);
1635           }
1636
1637         /* If the mininum is 1 and the previous item was a character string,
1638         we either have to put back the item that got cancelled if the string
1639         length was 1, or add the character back onto the end of a longer
1640         string. For a character type nothing need be done; it will just get
1641         put back naturally. Note that the final character is always going to
1642         get added below. */
1643
1644         else if (*previous == OP_CHARS)
1645           {
1646           if (code == previous) code += 2; else previous[1]++;
1647           }
1648
1649         /*  For a single negated character we also have to put back the
1650         item that got cancelled. */
1651
1652         else if (*previous == OP_NOT) code++;
1653
1654         /* If the maximum is unlimited, insert an OP_STAR. */
1655
1656         if (repeat_max < 0)
1657           {
1658           *code++ = c;
1659           *code++ = OP_STAR + repeat_type;
1660           }
1661
1662         /* Else insert an UPTO if the max is greater than the min. */
1663
1664         else if (repeat_max != repeat_min)
1665           {
1666           *code++ = c;
1667           repeat_max -= repeat_min;
1668           *code++ = OP_UPTO + repeat_type;
1669           *code++ = repeat_max >> 8;
1670           *code++ = (repeat_max & 255);
1671           }
1672         }
1673
1674       /* The character or character type itself comes last in all cases. */
1675
1676       *code++ = c;
1677       }
1678
1679     /* If previous was a character class or a back reference, we put the repeat
1680     stuff after it. */
1681
1682     else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1683              *previous==OP_CLASS_L || *previous == OP_REF)
1684       {
1685       if (repeat_min == 0 && repeat_max == -1)
1686         *code++ = OP_CRSTAR + repeat_type;
1687       else if (repeat_min == 1 && repeat_max == -1)
1688         *code++ = OP_CRPLUS + repeat_type;
1689       else if (repeat_min == 0 && repeat_max == 1)
1690         *code++ = OP_CRQUERY + repeat_type;
1691       else
1692         {
1693         *code++ = OP_CRRANGE + repeat_type;
1694         *code++ = repeat_min >> 8;
1695         *code++ = repeat_min & 255;
1696         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
1697         *code++ = repeat_max >> 8;
1698         *code++ = repeat_max & 255;
1699         }
1700       }
1701
1702     /* If previous was a bracket group, we may have to replicate it in certain
1703     cases. If the maximum repeat count is unlimited, check that the bracket
1704     group cannot match the empty string, and diagnose an error if it can. */
1705
1706     else if ((int)*previous >= OP_BRA)
1707       {
1708       int i;
1709       int len = code - previous;
1710
1711       if (repeat_max == -1 && could_be_empty(previous))
1712         {
1713         *errorptr = ERR10;
1714         goto FAILED;
1715         }
1716
1717       /* If the minimum is greater than zero, and the maximum is unlimited or
1718       equal to the minimum, the first copy remains where it is, and is
1719       replicated up to the minimum number of times. This case includes the +
1720       repeat, but of course no replication is needed in that case. */
1721
1722       if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1723         {
1724         for (i = 1; i < repeat_min; i++)
1725           {
1726           memcpy(code, previous, len);
1727           code += len;
1728           }
1729         }
1730
1731       /* If the minimum is zero, stick BRAZERO in front of the first copy.
1732       Then, if there is a fixed upper limit, replicated up to that many times,
1733       sticking BRAZERO in front of all the optional ones. */
1734
1735       else
1736         {
1737         if (repeat_min == 0)
1738           {
1739           memmove(previous+1, previous, len);
1740           code++;
1741           *previous++ = OP_BRAZERO + repeat_type;
1742           }
1743
1744         for (i = 1; i < repeat_min; i++)
1745           {
1746           memcpy(code, previous, len);
1747           code += len;
1748           }
1749
1750         for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1751           {
1752           *code++ = OP_BRAZERO + repeat_type;
1753           memcpy(code, previous, len);
1754           code += len;
1755           }
1756         }
1757
1758       /* If the maximum is unlimited, set a repeater in the final copy. */
1759
1760       if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1761       }
1762
1763     /* Else there's some kind of shambles */
1764
1765     else
1766       {
1767       *errorptr = ERR11;
1768       goto FAILED;
1769       }
1770
1771     /* In all case we no longer have a previous item. */
1772
1773     previous = NULL;
1774     break;
1775
1776
1777     /* Start of nested bracket sub-expression, or comment or lookahead.
1778     First deal with special things that can come after a bracket; all are
1779     introduced by ?, and the appearance of any of them means that this is not a
1780     referencing group. They were checked for validity in the first pass over
1781     the string, so we don't have to check for syntax errors here.  */
1782
1783     case '(':
1784     previous = code;              /* Only real brackets can be repeated */
1785     if (*(++ptr) == '?')
1786       {
1787       bravalue = OP_BRA;
1788
1789       switch (*(++ptr))
1790         {
1791         case '#':
1792         case 'i':
1793         case 'L':
1794         case 'm':
1795         case 's':
1796         case 'x':
1797         ptr++;
1798         while (*ptr != ')') ptr++;
1799         previous = NULL;
1800         continue;
1801
1802         case ':':                 /* Non-extracting bracket */
1803         ptr++;
1804         break;
1805
1806         case '=':                 /* Assertions can't be repeated */
1807         bravalue = OP_ASSERT;
1808         ptr++;
1809         previous = NULL;
1810         break;
1811
1812         case '!':
1813         bravalue = OP_ASSERT_NOT;
1814         ptr++;
1815         previous = NULL;
1816         break;
1817
1818         case ('P'):
1819           ptr++;
1820           if (*ptr=='<')
1821             {
1822               /* (?P<groupname>...) */
1823               int idlen;
1824               PyObject *string, *intobj;
1825
1826               ptr++;
1827               idlen = get_group_id(ptr, '>', errorptr);
1828               if (*errorptr) {
1829                 goto FAILED;
1830               }
1831               string = PyString_FromStringAndSize((char*)ptr, idlen);
1832               intobj = PyInt_FromLong( brackets[0] + 1 );
1833               if (intobj == NULL || string == NULL)
1834                 {
1835                   Py_XDECREF(string);
1836                   Py_XDECREF(intobj);
1837                   *errorptr = "exception raised";
1838                   goto FAILED;
1839                 }
1840               PyDict_SetItem(dictionary, string, intobj);
1841               Py_DECREF(string); Py_DECREF(intobj); /* XXX DECREF commented out! */
1842               ptr += idlen+1;  /* Point to rest of expression */
1843               goto do_grouping_bracket;
1844             }
1845           if (*ptr=='=')
1846             {
1847               /* (?P=groupname) */
1848               int idlen, refnum;
1849               PyObject *string, *intobj;
1850
1851               ptr++;
1852               idlen = get_group_id(ptr, ')', errorptr);
1853               if (*errorptr) {
1854                 goto FAILED;
1855               }
1856               string = PyString_FromStringAndSize((char *)ptr, idlen);
1857               if (string==NULL) {
1858                   *errorptr = "exception raised";
1859                   goto FAILED;
1860                 }
1861               intobj = PyDict_GetItem(dictionary, string);
1862               if (intobj==NULL) {
1863                 Py_DECREF(string);
1864                 *errorptr = "?P= group identifier isn't defined";
1865                 goto FAILED;
1866               }
1867
1868               refnum = PyInt_AsLong(intobj);
1869               Py_DECREF(string);
1870               /* The caller doesn't own the reference to the value
1871                  returned from PyDict_GetItem, so intobj is not
1872                  DECREF'ed. */
1873
1874               *code++ = OP_REF;
1875               *code++ = refnum;
1876               /* The continue will cause the top-level for() loop to
1877                  be resumed, so ptr will be immediately incremented.
1878                  Therefore, the following line adds just idlen, not
1879                  idlen+1 */
1880               ptr += idlen;
1881               continue;
1882             }
1883           /* The character after ?P is neither < nor =, so
1884              report an error.  Add more Python-extensions here. */
1885           *errorptr="unknown after (?P";
1886           goto FAILED;
1887
1888         case '>':                         /* "Match once" brackets */
1889         if ((options & PCRE_EXTRA) != 0)  /* Not yet standard */
1890           {
1891           bravalue = OP_ONCE;
1892           ptr++;
1893           previous = NULL;
1894           break;
1895           }
1896         /* Else fall through */
1897
1898         default:
1899         *errorptr = ERR12;
1900         goto FAILED;
1901         }
1902       }
1903
1904     /* Else we have a referencing group */
1905
1906     else
1907       {
1908       do_grouping_bracket:
1909       if (++(*brackets) > EXTRACT_MAX)
1910         {
1911         *errorptr = ERR13;
1912         goto FAILED;
1913         }
1914       bravalue = OP_BRA + *brackets;
1915       }
1916
1917     /* Process nested bracketed re; at end pointer is on the bracket. We copy
1918     code into a non-register variable in order to be able to pass its address
1919     because some compilers complain otherwise. */
1920
1921     *code = bravalue;
1922       {
1923       uschar *mcode = code;
1924       if (!compile_regex(options, brackets, &mcode, &ptr, errorptr, dictionary))
1925         goto FAILED;
1926       code = mcode;
1927       }
1928
1929     if (*ptr != ')')
1930       {
1931       *errorptr = ERR14;
1932       goto FAILED;
1933       }
1934     break;
1935
1936     /* Check \ for being a real metacharacter; if not, fall through and handle
1937     it as a data character at the start of a string. Escape items are checked
1938     for validity in the pre-compiling pass. */
1939
1940     case '\\':
1941     oldptr = ptr;
1942     c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1943
1944     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1945     are arranged to be the negation of the corresponding OP_values. For the
1946     back references, the values are ESC_REF plus the reference number. Only
1947     back references and those types that consume a character may be repeated.
1948     We can test for values between ESC_b and ESC_Z for the latter; this may
1949     have to change if any new ones are ever created. */
1950
1951     if (c < 0)
1952       {
1953       if (-c >= ESC_REF)
1954         {
1955         int refnum = -c - ESC_REF;
1956         if (*brackets < refnum)
1957           {
1958           *errorptr = ERR15;
1959           goto FAILED;
1960           }
1961         previous = code;
1962         *code++ = OP_REF;
1963         *code++ = refnum;
1964         }
1965       else
1966         {
1967         previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1968         if ( (options & PCRE_LOCALE) != 0)
1969           {
1970             switch (c)
1971               {
1972                 case (-ESC_b): c = -OP_WORD_BOUNDARY_L; break;
1973                 case (-ESC_B): c = -OP_NOT_WORD_BOUNDARY_L; break;
1974                 case (-ESC_w): c = -OP_WORDCHAR_L; break;
1975                 case (-ESC_W): c = -OP_NOT_WORDCHAR_L; break;
1976               }
1977           }
1978         *code++ = -c;
1979         }
1980       continue;
1981       }
1982
1983     /* Data character: Reset and fall through */
1984
1985     ptr = oldptr;
1986     c = '\\';
1987
1988     /* Handle a run of data characters until a metacharacter is encountered.
1989     The first character is guaranteed not to be whitespace or # when the
1990     extended flag is set. */
1991
1992     NORMAL_CHAR:
1993     default:
1994     previous = code;
1995     *code = OP_CHARS;
1996     code += 2;
1997     length = 0;
1998
1999     do
2000       {
2001       if ((options & PCRE_EXTENDED) != 0)
2002         {
2003         if ((pcre_ctypes[c] & ctype_space) != 0) continue;
2004         if (c == '#')
2005           {
2006           while ((c = *(++ptr)) != 0 && c != '\n');
2007           if (c == 0) break;
2008           continue;
2009           }
2010         }
2011
2012       /* Backslash may introduce a data char or a metacharacter. Escaped items
2013       are checked for validity in the pre-compiling pass. Stop the string
2014       before a metaitem. */
2015
2016       if (c == '\\')
2017         {
2018         oldptr = ptr;
2019         c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
2020         if (c < 0) { ptr = oldptr; break; }
2021         }
2022
2023       /* Ordinary character or single-char escape */
2024
2025       *code++ = c;
2026       length++;
2027       }
2028
2029     /* This "while" is the end of the "do" above. */
2030
2031     while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2032
2033     /* Compute the length and set it in the data vector, and advance to
2034     the next state. */
2035
2036     previous[1] = length;
2037     if (length < 255) ptr--;
2038     break;
2039     }
2040   }                   /* end of big loop */
2041
2042 /* Control never reaches here by falling through, only by a goto for all the
2043 error states. Pass back the position in the pattern so that it can be displayed
2044 to the user for diagnosing the error. */
2045
2046 FAILED:
2047 *ptrptr = ptr;
2048 return FALSE;
2049 }
2050
2051
2052
2053
2054 /*************************************************
2055 *     Compile sequence of alternatives           *
2056 *************************************************/
2057
2058 /* On entry, ptr is pointing past the bracket character, but on return
2059 it points to the closing bracket, or vertical bar, or end of string.
2060 The code variable is pointing at the byte into which the BRA operator has been
2061 stored.
2062
2063 Argument:
2064   options   the option bits
2065   brackets  -> int containing the number of extracting brackets used
2066   codeptr   -> the address of the current code pointer
2067   ptrptr    -> the address of the current pattern pointer
2068   errorptr  -> pointer to error message
2069
2070 Returns:    TRUE on success
2071 */
2072
2073 static BOOL
2074 compile_regex(int options, int *brackets, uschar **codeptr,
2075   const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
2076 {
2077 const uschar *ptr = *ptrptr;
2078 uschar *code = *codeptr;
2079 uschar *start_bracket = code;
2080
2081 for (;;)
2082   {
2083   int length;
2084   uschar *last_branch = code;
2085
2086   code += 3;
2087   if (!compile_branch(options, brackets, &code, &ptr, errorptr, dictionary))
2088     {
2089     *ptrptr = ptr;
2090     return FALSE;
2091     }
2092
2093   /* Fill in the length of the last branch */
2094
2095   length = code - last_branch;
2096   last_branch[1] = length >> 8;
2097   last_branch[2] = length & 255;
2098
2099   /* Reached end of expression, either ')' or end of pattern. Insert a
2100   terminating ket and the length of the whole bracketed item, and return,
2101   leaving the pointer at the terminating char. */
2102
2103   if (*ptr != '|')
2104     {
2105     length = code - start_bracket;
2106     *code++ = OP_KET;
2107     *code++ = length >> 8;
2108     *code++ = length & 255;
2109     *codeptr = code;
2110     *ptrptr = ptr;
2111     return TRUE;
2112     }
2113
2114   /* Another branch follows; insert an "or" node and advance the pointer. */
2115
2116   *code = OP_ALT;
2117   ptr++;
2118   }
2119 /* Control never reaches here */
2120 }
2121
2122
2123
2124 /*************************************************
2125 *          Check for anchored expression         *
2126 *************************************************/
2127
2128 /* Try to find out if this is an anchored regular expression. Consider each
2129 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2130 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2131 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2132 counts, since OP_CIRC can match in the middle.
2133
2134 A branch is also implicitly anchored if it starts with .* because that will try
2135 the rest of the pattern at all possible matching points, so there is no point
2136 trying them again.
2137
2138 Argument:  points to start of expression (the bracket)
2139 Returns:   TRUE or FALSE
2140 */
2141
2142 static BOOL
2143 is_anchored(register const uschar *code, BOOL multiline)
2144 {
2145 do {
2146    int op = (int)code[3];
2147    if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
2148      { if (!is_anchored(code+3, multiline)) return FALSE; }
2149    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2150      { if (code[4] != OP_ANY) return FALSE; }
2151    else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
2152    code += (code[1] << 8) + code[2];
2153    }
2154 while (*code == OP_ALT);
2155 return TRUE;
2156 }
2157
2158
2159
2160 /*************************************************
2161 *     Check for start with \n line expression    *
2162 *************************************************/
2163
2164 /* This is called for multiline expressions to try to find out if every branch
2165 starts with ^ so that "first char" processing can be done to speed things up.
2166
2167 Argument:  points to start of expression (the bracket)
2168 Returns:   TRUE or FALSE
2169 */
2170
2171 static BOOL
2172 is_startline(const uschar *code)
2173 {
2174 do {
2175    if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2176      { if (!is_startline(code+3)) return FALSE; }
2177    else if (code[3] != OP_CIRC) return FALSE;
2178    code += (code[1] << 8) + code[2];
2179    }
2180 while (*code == OP_ALT);
2181 return TRUE;
2182 }
2183
2184
2185
2186 /*************************************************
2187 *          Check for fixed first char            *
2188 *************************************************/
2189
2190 /* Try to find out if there is a fixed first character. This is called for
2191 unanchored expressions, as it speeds up their processing quite considerably.
2192 Consider each alternative branch. If they all start with the same char, or with
2193 a bracket all of whose alternatives start with the same char (recurse ad lib),
2194 then we return that char, otherwise -1.
2195
2196 Argument:  points to start of expression (the bracket)
2197 Returns:   -1 or the fixed first char
2198 */
2199
2200 static int
2201 find_firstchar(uschar *code)
2202 {
2203 register int c = -1;
2204 do
2205   {
2206   register int charoffset = 4;
2207
2208   if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2209     {
2210     register int d;
2211     if ((d = find_firstchar(code+3)) < 0) return -1;
2212     if (c < 0) c = d; else if (c != d) return -1;
2213     }
2214
2215   else switch(code[3])
2216     {
2217     default:
2218     return -1;
2219
2220     case OP_EXACT:       /* Fall through */
2221     charoffset++;
2222
2223     case OP_CHARS:       /* Fall through */
2224     charoffset++;
2225
2226     case OP_PLUS:
2227     case OP_MINPLUS:
2228     if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
2229     break;
2230     }
2231   code += (code[1] << 8) + code[2];
2232   }
2233 while (*code == OP_ALT);
2234 return c;
2235 }
2236
2237
2238
2239 /*************************************************
2240 *        Compile a Regular Expression            *
2241 *************************************************/
2242
2243 /* This function takes a string and returns a pointer to a block of store
2244 holding a compiled version of the expression.
2245
2246 Arguments:
2247   pattern      the regular expression
2248   options      various option bits
2249   errorptr     pointer to pointer to error text
2250   erroroffset  ptr offset in pattern where error was detected
2251
2252 Returns:       pointer to compiled data block, or NULL on error,
2253                with errorptr and erroroffset set
2254 */
2255
2256 pcre *
2257 pcre_compile(const char *pattern, int options, const char **errorptr,
2258              int *erroroffset, PyObject *dictionary)
2259 {
2260 real_pcre *re;
2261 int spaces = 0;
2262 int length = 3;      /* For initial BRA plus length */
2263 int runlength;
2264 int c, size;
2265 int bracount = 0;
2266 int brastack[200];
2267 int top_backref = 0;
2268 unsigned int brastackptr = 0;
2269 uschar *code;
2270 const uschar *ptr;
2271
2272 #ifdef DEBUG
2273 uschar *code_base, *code_end;
2274 #endif
2275
2276 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2277 can do is just return NULL. */
2278
2279 if (errorptr == NULL) return NULL;
2280 *errorptr = NULL;
2281
2282 /* However, we can give a message for this error */
2283
2284 if (erroroffset == NULL)
2285   {
2286   *errorptr = ERR16;
2287   return NULL;
2288   }
2289 *erroroffset = 0;
2290
2291 if ((options & ~PUBLIC_OPTIONS) != 0)
2292   {
2293   *errorptr = ERR17;
2294   return NULL;
2295   }
2296
2297 DPRINTF(("------------------------------------------------------------------\n"));
2298 DPRINTF(("%s\n", pattern));
2299
2300 /* The first thing to do is to make a pass over the pattern to compute the
2301 amount of store required to hold the compiled code. This does not have to be
2302 perfect as long as errors are overestimates. At the same time we can detect any
2303 internal flag settings. Make an attempt to correct for any counted white space
2304 if an "extended" flag setting appears late in the pattern. We can't be so
2305 clever for #-comments. */
2306
2307 ptr = (const uschar *)(pattern - 1);
2308 while ((c = *(++ptr)) != 0)
2309   {
2310   int min, max;
2311   int class_charcount;
2312
2313   if ((pcre_ctypes[c] & ctype_space) != 0)
2314     {
2315     if ((options & PCRE_EXTENDED) != 0) continue;
2316     spaces++;
2317     }
2318
2319   if (c == '#' && (options & PCRE_EXTENDED) != 0)
2320     {
2321     while ((c = *(++ptr)) != 0 && c != '\n');
2322     continue;
2323     }
2324
2325   switch(c)
2326     {
2327     /* A backslashed item may be an escaped "normal" character or a
2328     character type. For a "normal" character, put the pointers and
2329     character back so that tests for whitespace etc. in the input
2330     are done correctly. */
2331
2332     case '\\':
2333       {
2334       const uschar *save_ptr = ptr;
2335       c = check_escape(&ptr, errorptr, bracount, options, FALSE);
2336       if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2337       if (c >= 0)
2338         {
2339         ptr = save_ptr;
2340         c = '\\';
2341         goto NORMAL_CHAR;
2342         }
2343       }
2344     length++;
2345
2346     /* A back reference needs an additional char, plus either one or 5
2347     bytes for a repeat. We also need to keep the value of the highest
2348     back reference. */
2349
2350     if (c <= -ESC_REF)
2351       {
2352       int refnum = -c - ESC_REF;
2353       if (refnum > top_backref) top_backref = refnum;
2354       length++;   /* For single back reference */
2355       if (ptr[1] == '{' && is_counted_repeat(ptr+2))
2356         {
2357         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2358         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2359         if ((min == 0 && (max == 1 || max == -1)) ||
2360           (min == 1 && max == -1))
2361             length++;
2362         else length += 5;
2363         if (ptr[1] == '?') ptr++;
2364         }
2365       }
2366     continue;
2367
2368     case '^':
2369     case '.':
2370     case '$':
2371     case '*':     /* These repeats won't be after brackets; */
2372     case '+':     /* those are handled separately */
2373     case '?':
2374     length++;
2375     continue;
2376
2377     /* This covers the cases of repeats after a single char, metachar, class,
2378     or back reference. */
2379
2380     case '{':
2381     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
2382     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
2383     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2384     if ((min == 0 && (max == 1 || max == -1)) ||
2385       (min == 1 && max == -1))
2386         length++;
2387     else
2388       {
2389       length--;   /* Uncount the original char or metachar */
2390       if (min == 1) length++; else if (min > 0) length += 4;
2391       if (max > 0) length += 4; else length += 2;
2392       }
2393     if (ptr[1] == '?') ptr++;
2394     continue;
2395
2396     /* An alternation contains an offset to the next branch or ket. */
2397     case '|':
2398     length += 3;
2399     continue;
2400
2401     /* A character class uses 33 characters. Don't worry about character types
2402     that aren't allowed in classes - they'll get picked up during the compile.
2403     A character class that contains only one character uses 2 or 3 bytes,
2404     depending on whether it is negated or not. Notice this where we can. */
2405
2406     case '[':
2407     class_charcount = 0;
2408     if (*(++ptr) == '^') ptr++;
2409     do
2410       {
2411       if (*ptr == '\\')
2412         {
2413         int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
2414         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2415         if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2416         }
2417       else class_charcount++;
2418       ptr++;
2419       }
2420     while (*ptr != 0 && *ptr != ']');
2421
2422     /* Repeats for negated single chars are handled by the general code */
2423
2424     if (class_charcount == 1) length += 3; else
2425       {
2426       length += 33;
2427       if (options & PCRE_LOCALE) length++;  /* Add a byte for the localization flag */
2428
2429       /* A repeat needs either 1 or 5 bytes. */
2430
2431       if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
2432         {
2433         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2434         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2435         if ((min == 0 && (max == 1 || max == -1)) ||
2436           (min == 1 && max == -1))
2437             length++;
2438         else length += 5;
2439         if (ptr[1] == '?') ptr++;
2440         }
2441       }
2442     continue;
2443
2444     /* Brackets may be genuine groups or special things */
2445
2446     case '(':
2447
2448     /* Handle special forms of bracket, which all start (? */
2449
2450     if (ptr[1] == '?') switch (c = ptr[2])
2451       {
2452       /* Skip over comments entirely */
2453       case '#':
2454       ptr += 3;
2455       while (*ptr != 0 && *ptr != ')') ptr++;
2456       if (*ptr == 0)
2457         {
2458         *errorptr = ERR18;
2459         goto PCRE_ERROR_RETURN;
2460         }
2461       continue;
2462
2463       /* Non-referencing groups and lookaheads just move the pointer on, and
2464       then behave like a non-special bracket, except that they don't increment
2465       the count of extracting brackets. */
2466
2467       case ':':
2468       case '=':
2469       case '!':
2470       ptr += 2;
2471       break;
2472
2473       case ('P'):
2474         {
2475           int idlen;
2476           switch (*ptr++) {
2477           case ('<'):
2478             idlen = get_group_id(ptr++, '>', errorptr);
2479             if (*errorptr) goto PCRE_ERROR_RETURN;
2480             ptr += idlen+1;
2481             break;
2482           case ('='):
2483             idlen = get_group_id(ptr++, ')', errorptr);
2484             if (*errorptr) goto PCRE_ERROR_RETURN;
2485             ptr += idlen+1;
2486             length++;
2487             break;
2488           }
2489         }
2490         break;
2491
2492       /* Ditto for the "once only" bracket, allowed only if the extra bit
2493       is set. */
2494
2495       case '>':
2496       if ((options & PCRE_EXTRA) != 0)
2497         {
2498         ptr += 2;
2499         break;
2500         }
2501       /* Else fall through */
2502
2503       /* Else loop setting valid options until ) is met. Anything else is an
2504       error. */
2505
2506       default:
2507       ptr += 2;
2508       for (;; ptr++)
2509         {
2510         if ((c = *ptr) == 'i')
2511           {
2512           options |= PCRE_CASELESS;
2513           continue;
2514           }
2515         else if ((c = *ptr) == 'L')
2516           {
2517           options |= PCRE_LOCALE;
2518           continue;
2519           }
2520         else if ((c = *ptr) == 'm')
2521           {
2522           options |= PCRE_MULTILINE;
2523           continue;
2524           }
2525         else if (c == 's')
2526           {
2527           options |= PCRE_DOTALL;
2528           continue;
2529           }
2530         else if (c == 'x')
2531           {
2532           options |= PCRE_EXTENDED;
2533           length -= spaces;          /* Already counted spaces */
2534           continue;
2535           }
2536         else if (c == ')') break;
2537
2538         *errorptr = ERR12;
2539         goto PCRE_ERROR_RETURN;
2540         }
2541       continue;                      /* End of this bracket handling */
2542       }
2543
2544     /* Extracting brackets must be counted so we can process escapes in a
2545     Perlish way. */
2546
2547     else bracount++;
2548
2549     /* Non-special forms of bracket. Save length for computing whole length
2550     at end if there's a repeat that requires duplication of the group. */
2551
2552     if (brastackptr >= sizeof(brastack)/sizeof(int))
2553       {
2554       *errorptr = ERR19;
2555       goto PCRE_ERROR_RETURN;
2556       }
2557
2558     brastack[brastackptr++] = length;
2559     length += 3;
2560     continue;
2561
2562     /* Handle ket. Look for subsequent max/min; for certain sets of values we
2563     have to replicate this bracket up to that many times. If brastackptr is
2564     0 this is an unmatched bracket which will generate an error, but take care
2565     not to try to access brastack[-1]. */
2566
2567     case ')':
2568     length += 3;
2569       {
2570       int minval = 1;
2571       int maxval = 1;
2572       int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
2573
2574       /* Leave ptr at the final char; for read_repeat_counts this happens
2575       automatically; for the others we need an increment. */
2576
2577       if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
2578         {
2579         ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
2580         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2581         }
2582       else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2583       else if (c == '+') { maxval = -1; ptr++; }
2584       else if (c == '?') { minval = 0; ptr++; }
2585
2586       /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2587       if there is a limited maximum we have to replicate up to maxval-1 times
2588       and allow for a BRAZERO item before each optional copy, as we also have
2589       to do before the first copy if the minimum is zero. */
2590
2591       if (minval == 0) length++;
2592         else if (minval > 1) length += (minval - 1) * duplength;
2593       if (maxval > minval) length += (maxval - minval) * (duplength + 1);
2594       }
2595     continue;
2596
2597     /* Non-special character. For a run of such characters the length required
2598     is the number of characters + 2, except that the maximum run length is 255.
2599     We won't get a skipped space or a non-data escape or the start of a #
2600     comment as the first character, so the length can't be zero. */
2601
2602     NORMAL_CHAR:
2603     default:
2604     length += 2;
2605     runlength = 0;
2606     do
2607       {
2608       if ((pcre_ctypes[c] & ctype_space) != 0)
2609         {
2610         if ((options & PCRE_EXTENDED) != 0) continue;
2611         spaces++;
2612         }
2613
2614       if (c == '#' && (options & PCRE_EXTENDED) != 0)
2615         {
2616         while ((c = *(++ptr)) != 0 && c != '\n');
2617         continue;
2618         }
2619
2620       /* Backslash may introduce a data char or a metacharacter; stop the
2621       string before the latter. */
2622
2623       if (c == '\\')
2624         {
2625         const uschar *saveptr = ptr;
2626         c = check_escape(&ptr, errorptr, bracount, options, FALSE);
2627         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2628         if (c < 0) { ptr = saveptr; break; }
2629         }
2630
2631       /* Ordinary character or single-char escape */
2632
2633       runlength++;
2634       }
2635
2636     /* This "while" is the end of the "do" above. */
2637
2638     while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2639
2640     ptr--;
2641     length += runlength;
2642     continue;
2643     }
2644   }
2645
2646 length += 4;    /* For final KET and END */
2647
2648 if (length > 65539)
2649   {
2650   *errorptr = ERR20;
2651   return NULL;
2652   }
2653
2654 /* Compute the size of data block needed and get it, either from malloc or
2655 externally provided function. We specify "code[0]" in the offsetof() expression
2656 rather than just "code", because it has been reported that one broken compiler
2657 fails on "code" because it is also an independent variable. It should make no
2658 difference to the value of the offsetof(). */
2659
2660 size = length + offsetof(real_pcre, code[0]);
2661 re = (real_pcre *)(pcre_malloc)(size+50);
2662
2663 if (re == NULL)
2664   {
2665   *errorptr = ERR21;
2666   return NULL;
2667   }
2668
2669 /* Put in the magic number and the options. */
2670
2671 re->magic_number = MAGIC_NUMBER;
2672 re->options = options;
2673
2674 /* Set up a starting, non-extracting bracket, then compile the expression. On
2675 error, *errorptr will be set non-NULL, so we don't need to look at the result
2676 of the function here. */
2677
2678 ptr = (const uschar *)pattern;
2679 code = re->code;
2680 *code = OP_BRA;
2681 bracount = 0;
2682 (void)compile_regex(options, &bracount, &code, &ptr, errorptr, dictionary);
2683 re->top_bracket = bracount;
2684 re->top_backref = top_backref;
2685
2686 /* If not reached end of pattern on success, there's an excess bracket. */
2687
2688 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2689
2690 /* Fill in the terminating state and check for disastrous overflow, but
2691 if debugging, leave the test till after things are printed out. */
2692
2693 *code++ = OP_END;
2694
2695
2696 #ifndef DEBUG
2697 if (code - re->code > length) *errorptr = ERR23;
2698 #endif
2699
2700 /* Failed to compile */
2701
2702 if (*errorptr != NULL)
2703   {
2704   (pcre_free)(re);
2705   PCRE_ERROR_RETURN:
2706   *erroroffset = ptr - (const uschar *)pattern;
2707   return NULL;
2708   }
2709
2710 /* If the anchored option was not passed, set flag if we can determine that it
2711 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2712 we can determine what the first character has to be, because that speeds up
2713 unanchored matches no end. In the case of multiline matches, an alternative is
2714 to set the PCRE_STARTLINE flag if all branches start with ^. */
2715
2716 if ((options & PCRE_ANCHORED) == 0)
2717   {
2718   if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
2719     re->options |= PCRE_ANCHORED;
2720   else
2721     {
2722     int ch = find_firstchar(re->code);
2723     if (ch >= 0)
2724       {
2725       re->first_char = ch;
2726       re->options |= PCRE_FIRSTSET;
2727       }
2728     else if (is_startline(re->code))
2729       re->options |= PCRE_STARTLINE;
2730     }
2731   }
2732
2733 /* Print out the compiled data for debugging */
2734
2735 #ifdef DEBUG
2736
2737 printf("Length = %d top_bracket = %d top_backref=%d\n",
2738   length, re->top_bracket, re->top_backref);
2739
2740 if (re->options != 0)
2741   {
2742   printf("%s%s%s%s%s%s%s%s\n",
2743     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2744     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2745     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2746     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2747     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2748     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2749     ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2750     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2751   }
2752
2753 if ((re->options & PCRE_FIRSTSET) != 0)
2754   {
2755   if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2756     else printf("First char = \\x%02x\n", re->first_char);
2757   }
2758
2759 code_end = code;
2760 code_base = code = re->code;
2761
2762 while (code < code_end)
2763   {
2764   int charlength;
2765
2766   printf("%3d ", code - code_base);
2767
2768   if (*code >= OP_BRA)
2769     {
2770     printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2771     code += 2;
2772     }
2773
2774   else switch(*code)
2775     {
2776     case OP_CHARS:
2777     charlength = *(++code);
2778     printf("%3d ", charlength);
2779     while (charlength-- > 0)
2780       if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2781     break;
2782
2783     case OP_KETRMAX:
2784     case OP_KETRMIN:
2785     case OP_ALT:
2786     case OP_KET:
2787     case OP_ASSERT:
2788     case OP_ASSERT_NOT:
2789     case OP_ONCE:
2790     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2791     code += 2;
2792     break;
2793
2794     case OP_STAR:
2795     case OP_MINSTAR:
2796     case OP_PLUS:
2797     case OP_MINPLUS:
2798     case OP_QUERY:
2799     case OP_MINQUERY:
2800     case OP_TYPESTAR:
2801     case OP_TYPEMINSTAR:
2802     case OP_TYPEPLUS:
2803     case OP_TYPEMINPLUS:
2804     case OP_TYPEQUERY:
2805     case OP_TYPEMINQUERY:
2806     if (*code >= OP_TYPESTAR)
2807       printf("    %s", OP_names[code[1]]);
2808     else if (isprint(c = code[1])) printf("    %c", c);
2809       else printf("    \\x%02x", c);
2810     printf("%s", OP_names[*code++]);
2811     break;
2812
2813     case OP_EXACT:
2814     case OP_UPTO:
2815     case OP_MINUPTO:
2816     if (isprint(c = code[3])) printf("    %c{", c);
2817       else printf("    \\x%02x{", c);
2818     if (*code != OP_EXACT) printf("0,");
2819     printf("%d}", (code[1] << 8) + code[2]);
2820     if (*code == OP_MINUPTO) printf("?");
2821     code += 3;
2822     break;
2823
2824     case OP_TYPEEXACT:
2825     case OP_TYPEUPTO:
2826     case OP_TYPEMINUPTO:
2827     printf("    %s{", OP_names[code[3]]);
2828     if (*code != OP_TYPEEXACT) printf(",");
2829     printf("%d}", (code[1] << 8) + code[2]);
2830     if (*code == OP_TYPEMINUPTO) printf("?");
2831     code += 3;
2832     break;
2833
2834     case OP_NOT:
2835     if (isprint(c = *(++code))) printf("    [^%c]", c);
2836       else printf("    [^\\x%02x]", c);
2837     break;
2838
2839     case OP_NOTSTAR:
2840     case OP_NOTMINSTAR:
2841     case OP_NOTPLUS:
2842     case OP_NOTMINPLUS:
2843     case OP_NOTQUERY:
2844     case OP_NOTMINQUERY:
2845     if (isprint(c = code[1])) printf("    [^%c]", c);
2846       else printf("    [^\\x%02x]", c);
2847     printf("%s", OP_names[*code++]);
2848     break;
2849
2850     case OP_NOTEXACT:
2851     case OP_NOTUPTO:
2852     case OP_NOTMINUPTO:
2853     if (isprint(c = code[3])) printf("    [^%c]{", c);
2854       else printf("    [^\\x%02x]{", c);
2855     if (*code != OP_NOTEXACT) printf(",");
2856     printf("%d}", (code[1] << 8) + code[2]);
2857     if (*code == OP_NOTMINUPTO) printf("?");
2858     code += 3;
2859     break;
2860
2861     case OP_REF:
2862     printf("    \\%d", *(++code));
2863     code ++;
2864     goto CLASS_REF_REPEAT;
2865
2866     case OP_CLASS:
2867     case OP_NEGCLASS:
2868     case OP_CLASS_L:
2869       {
2870       int i, min, max;
2871
2872       if (*code==OP_CLASS_L)
2873         {
2874           code++;
2875           printf("Locflag = %i ", *code++);
2876           printf("    [");
2877         }
2878       else
2879         {
2880           if (*code++ == OP_CLASS) printf("    [");
2881           else printf("   ^[");
2882         }
2883
2884
2885       for (i = 0; i < 256; i++)
2886         {
2887         if ((code[i/8] & (1 << (i&7))) != 0)
2888           {
2889           int j;
2890           for (j = i+1; j < 256; j++)
2891             if ((code[j/8] & (1 << (j&7))) == 0) break;
2892           if (i == '-' || i == ']') printf("\\");
2893           if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2894           if (--j > i)
2895             {
2896             printf("-");
2897             if (j == '-' || j == ']') printf("\\");
2898             if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2899             }
2900           i = j;
2901           }
2902         }
2903       printf("]");
2904       code += 32;
2905       /*      code ++;*/
2906
2907       CLASS_REF_REPEAT:
2908
2909       switch(*code)
2910         {
2911         case OP_CRSTAR:
2912         case OP_CRMINSTAR:
2913         case OP_CRPLUS:
2914         case OP_CRMINPLUS:
2915         case OP_CRQUERY:
2916         case OP_CRMINQUERY:
2917         printf("%s", OP_names[*code]);
2918         break;
2919
2920         case OP_CRRANGE:
2921         case OP_CRMINRANGE:
2922         min = (code[1] << 8) + code[2];
2923         max = (code[3] << 8) + code[4];
2924         if (max == 0) printf("{%d,}", min);
2925         else printf("{%d,%d}", min, max);
2926         if (*code == OP_CRMINRANGE) printf("?");
2927         code += 4;
2928         break;
2929
2930         default:
2931         code--;
2932         }
2933       }
2934     break;
2935
2936     /* Anything else is just a one-node item */
2937
2938     default:
2939     printf("    %s", OP_names[*code]);
2940     break;
2941     }
2942
2943   code++;
2944   printf("\n");
2945   }
2946 printf("------------------------------------------------------------------\n");
2947
2948 /* This check is done here in the debugging case so that the code that
2949 was compiled can be seen. */
2950
2951 if (code - re->code > length)
2952   {
2953   printf("length=%i, code length=%i\n", length, code-re->code);
2954   *errorptr = ERR23;
2955   (pcre_free)(re);
2956   *erroroffset = ptr - (uschar *)pattern;
2957   return NULL;
2958   }
2959 #endif
2960
2961 return (pcre *)re;
2962 }
2963
2964
2965
2966 /*************************************************
2967 *        Match a character type                  *
2968 *************************************************/
2969
2970 /* Not used in all the places it might be as it's sometimes faster
2971 to put the code inline.
2972
2973 Arguments:
2974   type        the character type
2975   c           the character
2976   dotall      the dotall flag
2977
2978 Returns:      TRUE if character is of the type
2979 */
2980
2981 static BOOL
2982 match_type(int type, int c, BOOL dotall)
2983 {
2984
2985 #ifdef DEBUG
2986 if (isprint(c)) printf("matching subject %c against ", c);
2987   else printf("matching subject \\x%02x against ", c);
2988 printf("%s\n", OP_names[type]);
2989 #endif
2990
2991 switch(type)
2992   {
2993   case OP_ANY:            return dotall || c != '\n';
2994   case OP_NOT_DIGIT:      return (pcre_ctypes[c] & ctype_digit) == 0;
2995   case OP_DIGIT:          return (pcre_ctypes[c] & ctype_digit) != 0;
2996   case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2997   case OP_WHITESPACE:     return (pcre_ctypes[c] & ctype_space) != 0;
2998   case OP_NOT_WORDCHAR:   return (pcre_ctypes[c] & ctype_word) == 0;
2999   case OP_WORDCHAR:       return (pcre_ctypes[c] & ctype_word) != 0;
3000   case OP_NOT_WORDCHAR_L: return (c!='_' && !isalnum(c));
3001   case OP_WORDCHAR_L:     return (c=='_' || isalnum(c));
3002   }
3003 return FALSE;
3004 }
3005
3006
3007
3008 /*************************************************
3009 *          Match a back-reference                *
3010 *************************************************/
3011
3012 /* If a back reference hasn't been set, the match fails.
3013
3014 Arguments:
3015   number      reference number
3016   eptr        points into the subject
3017   length      length to be matched
3018   md          points to match data block
3019
3020 Returns:      TRUE if matched
3021 */
3022
3023 static BOOL
3024 match_ref(int number, register const uschar *eptr, int length, match_data *md)
3025 {
3026 const uschar *p = md->start_subject + md->offset_vector[number];
3027
3028 #ifdef DEBUG
3029 if (eptr >= md->end_subject)
3030   printf("matching subject <null>");
3031 else
3032   {
3033   printf("matching subject ");
3034   pchars(eptr, length, TRUE, md);
3035   }
3036 printf(" against backref ");
3037 pchars(p, length, FALSE, md);
3038 printf("\n");
3039 #endif
3040
3041 /* Always fail if not enough characters left */
3042
3043 if (length > md->end_subject - p) return FALSE;
3044
3045 /* Separate the caseless case for speed */
3046
3047 if (md->caseless)
3048   { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
3049 else
3050   { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3051
3052 return TRUE;
3053 }
3054
3055 static int free_stack(match_data *md)
3056 {
3057 /* Free any stack space that was allocated by the call to match(). */
3058 if (md->off_num)    free(md->off_num);
3059 if (md->offset_top) free(md->offset_top);
3060 if (md->r1)         free(md->r1);
3061 if (md->r2)         free(md->r2);
3062 if (md->eptr)       free((char *)md->eptr);
3063 if (md->ecode)      free((char *)md->ecode);
3064 return 0;
3065 }
3066
3067 static int grow_stack(match_data *md)
3068 {
3069   if (md->length != 0)
3070     {
3071       md->length = md->length + md->length/2;
3072     }
3073   else
3074     {
3075       int string_len = md->end_subject - md->start_subject + 1;
3076       if (string_len < 80) {md->length = string_len; }
3077       else {md->length = 80;}
3078     }
3079   PyMem_RESIZE(md->offset_top, int, md->length);
3080   PyMem_RESIZE(md->eptr, const uschar *, md->length);
3081   PyMem_RESIZE(md->ecode, const uschar *, md->length);
3082   PyMem_RESIZE(md->off_num, int, md->length);
3083   PyMem_RESIZE(md->r1, int, md->length);
3084   PyMem_RESIZE(md->r2, int, md->length);
3085   if (md->offset_top == NULL || md->eptr == NULL || md->ecode == NULL ||
3086       md->off_num == NULL || md->r1 == NULL || md->r2 == NULL)
3087     {
3088       PyErr_NoMemory();
3089       longjmp(md->error_env, 1);
3090     }
3091   return 0;
3092 }
3093
3094
3095 /*************************************************
3096 *         Match from current position            *
3097 *************************************************/
3098
3099 /* On entry ecode points to the first opcode, and eptr to the first character.
3100
3101 Arguments:
3102    eptr        pointer in subject
3103    ecode       position in code
3104    offset_top  current top pointer
3105    md          pointer to "static" info for the match
3106
3107 Returns:       TRUE if matched
3108 */
3109
3110 static BOOL
3111 match(register const uschar *eptr, register const uschar *ecode, int offset_top,
3112   match_data *md)
3113 {
3114   int save_stack_position = md->point;
3115 match_loop:
3116
3117 #define SUCCEED goto succeed
3118 #define FAIL    goto fail
3119
3120 for (;;)
3121   {
3122   int min, max, ctype;
3123   register int i;
3124   register int c;
3125   BOOL minimize = FALSE;
3126
3127   /* Opening bracket. Check the alternative branches in turn, failing if none
3128   match. We have to set the start offset if required and there is space
3129   in the offset vector so that it is available for subsequent back references
3130   if the bracket matches. However, if the bracket fails, we must put back the
3131   previous value of both offsets in case they were set by a previous copy of
3132   the same bracket. Don't worry about setting the flag for the error case here;
3133   that is handled in the code for KET. */
3134
3135   if ((int)*ecode >= OP_BRA)
3136     {
3137     int number = (*ecode - OP_BRA) << 1;
3138     int save_offset1 = 0, save_offset2 = 0;
3139
3140     DPRINTF(("start bracket %d\n", number/2));
3141
3142     if (number > 0 && number < md->offset_end)
3143       {
3144       save_offset1 = md->offset_vector[number];
3145       save_offset2 = md->offset_vector[number+1];
3146       md->offset_vector[number] = eptr - md->start_subject;
3147
3148       DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
3149       }
3150
3151     /* Recurse for all the alternatives. */
3152
3153     do
3154       {
3155       if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
3156       ecode += (ecode[1] << 8) + ecode[2];
3157       }
3158     while (*ecode == OP_ALT);
3159
3160     DPRINTF(("bracket %d failed\n", number/2));
3161
3162     if (number > 0 && number < md->offset_end)
3163       {
3164       md->offset_vector[number] = save_offset1;
3165       md->offset_vector[number+1] = save_offset2;
3166       }
3167
3168     FAIL;
3169     }
3170
3171   /* Other types of node can be handled by a switch */
3172
3173   switch(*ecode)
3174     {
3175     case OP_END:
3176     md->end_match_ptr = eptr;          /* Record where we ended */
3177     md->end_offset_top = offset_top;   /* and how many extracts were taken */
3178     SUCCEED;
3179
3180     /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3181     whole thing doesn't match, so we have to get out via a longjmp(). */
3182
3183     case OP_CUT:
3184     if (match(eptr, ecode+1, offset_top, md)) SUCCEED;
3185     longjmp(md->fail_env, 1);
3186
3187     /* Assertion brackets. Check the alternative branches in turn - the
3188     matching won't pass the KET for an assertion. If any one branch matches,
3189     the assertion is true. */
3190
3191     case OP_ASSERT:
3192     do
3193       {
3194       if (match(eptr, ecode+3, offset_top, md)) break;
3195       ecode += (ecode[1] << 8) + ecode[2];
3196       }
3197     while (*ecode == OP_ALT);
3198     if (*ecode == OP_KET) FAIL;
3199
3200     /* Continue from after the assertion, updating the offsets high water
3201     mark, since extracts may have been taken during the assertion. */
3202
3203     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3204     ecode += 3;
3205     offset_top = md->end_offset_top;
3206     continue;
3207
3208     /* Negative assertion: all branches must fail to match */
3209
3210     case OP_ASSERT_NOT:
3211     do
3212       {
3213       if (match(eptr, ecode+3, offset_top, md)) FAIL;
3214       ecode += (ecode[1] << 8) + ecode[2];
3215       }
3216     while (*ecode == OP_ALT);
3217     ecode += 3;
3218     continue;
3219
3220     /* "Once" brackets are like assertion brackets except that after a match,
3221     the point in the subject string is not moved back. Thus there can never be
3222     a move back into the brackets. Check the alternative branches in turn - the
3223     matching won't pass the KET for this kind of subpattern. If any one branch
3224     matches, we carry on, leaving the subject pointer. */
3225
3226     case OP_ONCE:
3227     do
3228       {
3229       if (match(eptr, ecode+3, offset_top, md)) break;
3230       ecode += (ecode[1] << 8) + ecode[2];
3231       }
3232     while (*ecode == OP_ALT);
3233     if (*ecode == OP_KET) FAIL;
3234
3235     /* Continue as from after the assertion, updating the offsets high water
3236     mark, since extracts may have been taken. */
3237
3238     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3239     ecode += 3;
3240     offset_top = md->end_offset_top;
3241     eptr = md->end_match_ptr;
3242     continue;
3243
3244     /* An alternation is the end of a branch; scan along to find the end of the
3245     bracketed group and go to there. */
3246
3247     case OP_ALT:
3248     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3249     break;
3250
3251     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3252     that it may occur zero times. It may repeat infinitely, or not at all -
3253     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3254     repeat limits are compiled as a number of copies, with the optional ones
3255     preceded by BRAZERO or BRAMINZERO. */
3256
3257     case OP_BRAZERO:
3258       {
3259       const uschar *next = ecode+1;
3260       if (match(eptr, next, offset_top, md)) SUCCEED;
3261       do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3262       ecode = next + 3;
3263       }
3264     break;
3265
3266     case OP_BRAMINZERO:
3267       {
3268       const uschar *next = ecode+1;
3269       do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3270       if (match(eptr, next+3, offset_top, md)) SUCCEED;
3271       ecode++;
3272       }
3273     break;;
3274
3275     /* End of a group, repeated or non-repeating. If we are at the end of
3276     an assertion "group", stop matching and SUCCEED, but record the
3277     current high water mark for use by positive assertions. */
3278
3279     case OP_KET:
3280     case OP_KETRMIN:
3281     case OP_KETRMAX:
3282       {
3283       int number;
3284       const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3285
3286       if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
3287         {
3288         md->end_match_ptr = eptr;      /* For ONCE */
3289         md->end_offset_top = offset_top;
3290         SUCCEED;
3291         }
3292
3293       /* In all other cases we have to check the group number back at the
3294       start and if necessary complete handling an extraction by setting the
3295       final offset and bumping the high water mark. */
3296
3297       number = (*prev - OP_BRA) << 1;
3298
3299       DPRINTF(("end bracket %d\n", number/2));
3300
3301       if (number > 0)
3302         {
3303         if (number >= md->offset_end) md->offset_overflow = TRUE; else
3304           {
3305           md->offset_vector[number+1] = eptr - md->start_subject;
3306           if (offset_top <= number) offset_top = number + 2;
3307           }
3308         }
3309
3310       /* For a non-repeating ket, just advance to the next node and continue at
3311       this level. */
3312
3313       if (*ecode == OP_KET)
3314         {
3315         ecode += 3;
3316         break;
3317         }
3318
3319       /* The repeating kets try the rest of the pattern or restart from the
3320       preceding bracket, in the appropriate order. */
3321
3322       if (*ecode == OP_KETRMIN)
3323         {
3324         const uschar *ptr;
3325         if (match(eptr, ecode+3, offset_top, md)) goto succeed;
3326         /* Handle alternation inside the BRA...KET; push the additional
3327            alternatives onto the stack */
3328         ptr=prev;
3329         do {
3330           ptr += (ptr[1]<<8)+ ptr[2];
3331           if (*ptr==OP_ALT)
3332             {
3333               if (md->length == md->point)
3334                 {
3335                   grow_stack(md);
3336                 }
3337               md->offset_top[md->point] = offset_top;
3338               md->eptr[md->point]       = eptr;
3339               md->ecode[md->point]      = ptr+3;
3340               md->r1[md->point]         = 0;
3341               md->r2[md->point]         = 0;
3342               md->off_num[md->point]    = 0;
3343               md->point++;
3344             }
3345         } while (*ptr==OP_ALT);
3346         ecode=prev+3; goto match_loop;
3347         }
3348       else  /* OP_KETRMAX */
3349         {
3350         const uschar *ptr;
3351         /*int points_pushed=0;*/
3352
3353         /* Push one failure point, that will resume matching at the code after
3354            the KETRMAX opcode. */
3355         if (md->length == md->point)
3356           {
3357             grow_stack(md);
3358           }
3359         md->offset_top[md->point] = offset_top;
3360         md->eptr[md->point]       = eptr;
3361         md->ecode[md->point]      = ecode+3;
3362         md->r1[md->point]         = md->offset_vector[number];
3363         md->r2[md->point]         = md->offset_vector[number+1];
3364         md->off_num[md->point]    = number;
3365         md->point++;
3366
3367         md->offset_vector[number] = eptr - md->start_subject;
3368         /* Handle alternation inside the BRA...KET; push each of the
3369            additional alternatives onto the stack */
3370         ptr=prev;
3371         do {
3372           ptr += (ptr[1]<<8)+ ptr[2];
3373           if (*ptr==OP_ALT)
3374             {
3375               if (md->length == md->point)
3376                 if (md->length == md->point)
3377                   {
3378                     grow_stack(md);
3379                   }
3380               md->offset_top[md->point] = offset_top;
3381               md->eptr[md->point]       = eptr;
3382               md->ecode[md->point]      = ptr+3;
3383               md->r1[md->point]         = 0;
3384               md->r2[md->point]         = 0;
3385               md->off_num[md->point]    = 0;
3386               md->point++;
3387               /*points_pushed++;*/
3388             }
3389         } while (*ptr==OP_ALT);
3390         /* Jump to the first (or only) alternative and resume trying to match */
3391         ecode=prev+3; goto match_loop;
3392         }
3393       }
3394
3395     /* Start of subject unless notbol, or after internal newline if multiline */
3396
3397     case OP_CIRC:
3398     if (md->notbol && eptr == md->start_subject) FAIL;
3399     if (md->multiline)
3400       {
3401       if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
3402       ecode++;
3403       break;
3404       }
3405     /* ... else fall through */
3406
3407     /* Start of subject assertion */
3408
3409     case OP_SOD:
3410     if (eptr != md->start_subject) FAIL;
3411     ecode++;
3412     break;
3413
3414     /* Assert before internal newline if multiline, or before
3415     a terminating newline unless endonly is set, else end of subject unless
3416     noteol is set. */
3417
3418     case OP_DOLL:
3419     if (md->noteol && eptr >= md->end_subject) FAIL;
3420     if (md->multiline)
3421       {
3422       if (eptr < md->end_subject && *eptr != '\n') FAIL;
3423       ecode++;
3424       break;
3425       }
3426     else if (!md->endonly)
3427       {
3428       if (eptr < md->end_subject - 1 ||
3429          (eptr == md->end_subject - 1 && *eptr != '\n')) FAIL;
3430       ecode++;
3431       break;
3432       }
3433     /* ... else fall through */
3434
3435     /* End of subject assertion */
3436
3437     case OP_EOD:
3438     if (eptr < md->end_subject) FAIL;
3439     ecode++;
3440     break;
3441
3442     /* Word boundary assertions */
3443
3444     case OP_NOT_WORD_BOUNDARY:
3445     case OP_WORD_BOUNDARY:
3446       {
3447       BOOL prev_is_word = (eptr != md->start_subject) &&
3448         ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3449       BOOL cur_is_word = (eptr < md->end_subject) &&
3450         ((pcre_ctypes[*eptr] & ctype_word) != 0);
3451       if ((*ecode++ == OP_WORD_BOUNDARY)?
3452            cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3453         FAIL;
3454       }
3455     break;
3456
3457     case OP_NOT_WORD_BOUNDARY_L:
3458     case OP_WORD_BOUNDARY_L:
3459       {
3460         BOOL prev_is_word = (eptr != md->start_subject) &&
3461           (isalnum(eptr[-1]) || eptr[-1]=='_');
3462         BOOL cur_is_word = (eptr < md->end_subject) &&
3463           (isalnum(*eptr) || *eptr=='_');
3464         if ((*ecode++ == OP_WORD_BOUNDARY_L)?
3465             cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3466           FAIL;
3467       }
3468       break;
3469
3470
3471     /* Match a single character type; inline for speed */
3472
3473     case OP_ANY:
3474     if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
3475     if (eptr++ >= md->end_subject) FAIL;
3476     ecode++;
3477     break;
3478
3479     case OP_NOT_DIGIT:
3480     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3481       FAIL;
3482     ecode++;
3483     break;
3484
3485     case OP_DIGIT:
3486     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3487       FAIL;
3488     ecode++;
3489     break;
3490
3491     case OP_NOT_WHITESPACE:
3492     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3493       FAIL;
3494     ecode++;
3495     break;
3496
3497     case OP_WHITESPACE:
3498     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3499       FAIL;
3500     ecode++;
3501     break;
3502
3503     case OP_NOT_WORDCHAR:
3504     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3505       FAIL;
3506     ecode++;
3507     break;
3508
3509     case OP_WORDCHAR:
3510     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3511       FAIL;
3512     ecode++;
3513     break;
3514
3515     case OP_NOT_WORDCHAR_L:
3516     if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ))
3517       FAIL;
3518     eptr++;
3519     ecode++;
3520     break;
3521
3522     case OP_WORDCHAR_L:
3523     if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ))
3524       FAIL;
3525     eptr++;
3526     ecode++;
3527     break;
3528
3529     /* Match a back reference, possibly repeatedly. Look past the end of the
3530     item to see if there is repeat information following. The code is similar
3531     to that for character classes, but repeated for efficiency. Then obey
3532     similar code to character type repeats - written out again for speed.
3533     However, if the referenced string is the empty string, always treat
3534     it as matched, any number of times (otherwise there could be infinite
3535     loops). */
3536
3537     case OP_REF:
3538       {
3539       int length;
3540       int number = ecode[1] << 1;                /* Doubled reference number */
3541       ecode += 2;                                /* Advance past the item */
3542
3543       if (number >= offset_top || md->offset_vector[number] < 0)
3544         {
3545         md->errorcode = PCRE_ERROR_BADREF;
3546         FAIL;
3547         }
3548
3549       length = md->offset_vector[number+1] - md->offset_vector[number];
3550
3551       switch (*ecode)
3552         {
3553         case OP_CRSTAR:
3554         case OP_CRMINSTAR:
3555         case OP_CRPLUS:
3556         case OP_CRMINPLUS:
3557         case OP_CRQUERY:
3558         case OP_CRMINQUERY:
3559         c = *ecode++ - OP_CRSTAR;
3560         minimize = (c & 1) != 0;
3561         min = rep_min[c];                 /* Pick up values from tables; */
3562         max = rep_max[c];                 /* zero for max => infinity */
3563         if (max == 0) max = INT_MAX;
3564         break;
3565
3566         case OP_CRRANGE:
3567         case OP_CRMINRANGE:
3568         minimize = (*ecode == OP_CRMINRANGE);
3569         min = (ecode[1] << 8) + ecode[2];
3570         max = (ecode[3] << 8) + ecode[4];
3571         if (max == 0) max = INT_MAX;
3572         ecode += 5;
3573         break;
3574
3575         default:               /* No repeat follows */
3576         if (!match_ref(number, eptr, length, md)) FAIL;
3577         eptr += length;
3578         continue;              /* With the main loop */
3579         }
3580
3581       /* If the length of the reference is zero, just continue with the
3582       main loop. */
3583
3584       if (length == 0) continue;
3585
3586       /* First, ensure the minimum number of matches are present. We get back
3587       the length of the reference string explicitly rather than passing the
3588       address of eptr, so that eptr can be a register variable. */
3589
3590       for (i = 1; i <= min; i++)
3591         {
3592         if (!match_ref(number, eptr, length, md)) FAIL;
3593         eptr += length;
3594         }
3595
3596       /* If min = max, continue at the same level without recursion.
3597       They are not both allowed to be zero. */
3598
3599       if (min == max) continue;
3600
3601       /* If minimizing, keep trying and advancing the pointer */
3602
3603       if (minimize)
3604         {
3605         for (i = min;; i++)
3606           {
3607           if (match(eptr, ecode, offset_top, md)) SUCCEED;
3608           if (i >= max || !match_ref(number, eptr, length, md))
3609             FAIL;
3610           eptr += length;
3611           }
3612         /* Control never gets here */
3613         }
3614
3615       /* If maximizing, find the longest string and work backwards */
3616
3617       else
3618         {
3619         const uschar *pp = eptr;
3620         for (i = min; i < max; i++)
3621           {
3622           if (!match_ref(number, eptr, length, md)) break;
3623           eptr += length;
3624           }
3625         while (eptr >= pp)
3626           {
3627           if (match(eptr, ecode, offset_top, md)) SUCCEED;
3628           eptr -= length;
3629           }
3630         FAIL;
3631         }
3632       }
3633     /* Control never gets here */
3634
3635     /* Match a character class, possibly repeatedly. Look past the end of the
3636     item to see if there is repeat information following. Then obey similar
3637     code to character type repeats - written out again for speed. If caseless
3638     matching was set at runtime but not at compile time, we have to check both
3639     versions of a character, and we have to behave differently for positive and
3640     negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
3641     treated differently. */
3642
3643     case OP_CLASS:
3644     case OP_NEGCLASS:
3645       {
3646       BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
3647       const uschar *data = ecode + 1;  /* Save for matching */
3648       ecode += 33;                     /* Advance past the item */
3649
3650       switch (*ecode)
3651         {
3652         case OP_CRSTAR:
3653         case OP_CRMINSTAR:
3654         case OP_CRPLUS:
3655         case OP_CRMINPLUS:
3656         case OP_CRQUERY:
3657         case OP_CRMINQUERY:
3658         c = *ecode++ - OP_CRSTAR;
3659         minimize = (c & 1) != 0;
3660         min = rep_min[c];                 /* Pick up values from tables; */
3661         max = rep_max[c];                 /* zero for max => infinity */
3662         if (max == 0) max = INT_MAX;
3663         break;
3664
3665         case OP_CRRANGE:
3666         case OP_CRMINRANGE:
3667         minimize = (*ecode == OP_CRMINRANGE);
3668         min = (ecode[1] << 8) + ecode[2];
3669         max = (ecode[3] << 8) + ecode[4];
3670         if (max == 0) max = INT_MAX;
3671         ecode += 5;
3672         break;
3673
3674         default:               /* No repeat follows */
3675           min = max = 1;
3676           break;
3677         }
3678
3679       /* First, ensure the minimum number of matches are present. */
3680
3681       for (i = 1; i <= min; i++)
3682         {
3683         if (eptr >= md->end_subject) FAIL;
3684         c = *eptr++;
3685
3686         /* Either not runtime caseless, or it was a positive class. For
3687         runtime caseless, continue if either case is in the map. */
3688
3689         if (!nasty_case)
3690           {
3691           if ((data[c/8] & (1 << (c&7))) != 0) continue;
3692           if (md->runtime_caseless)
3693             {
3694             c = pcre_fcc[c];
3695             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3696             }
3697           }
3698
3699         /* Runtime caseless and it was a negative class. Continue only if
3700         both cases are in the map. */
3701
3702         else
3703           {
3704            if ((data[c/8] & (1 << (c&7))) == 0) FAIL;
3705            c = pcre_fcc[c];
3706            if ((data[c/8] & (1 << (c&7))) != 0) continue;
3707            }
3708
3709         FAIL;
3710         }
3711
3712       /* If max == min we can continue with the main loop without the
3713       need to recurse. */
3714
3715       if (min == max) continue;
3716
3717       /* If minimizing, keep testing the rest of the expression and advancing
3718       the pointer while it matches the class. */
3719
3720       if (minimize)
3721         {
3722         for (i = min;; i++)
3723           {
3724           if (match(eptr, ecode, offset_top, md)) SUCCEED;
3725           if (i >= max || eptr >= md->end_subject) FAIL;
3726           c = *eptr++;
3727
3728           /* Either not runtime caseless, or it was a positive class. For
3729           runtime caseless, continue if either case is in the map. */
3730
3731           if (!nasty_case)
3732             {
3733             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3734             if (md->runtime_caseless)
3735               {
3736               c = pcre_fcc[c];
3737               if ((data[c/8] & (1 << (c&7))) != 0) continue;
3738               }
3739             }
3740
3741           /* Runtime caseless and it was a negative class. Continue only if
3742           both cases are in the map. */
3743
3744           else
3745              {
3746              if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
3747              c = pcre_fcc[c];
3748              if ((data[c/8] & (1 << (c&7))) != 0) continue;
3749              }
3750
3751           FAIL;
3752           }
3753         /* Control never gets here */
3754         }
3755
3756       /* If maximizing, find the longest possible run, then work backwards. */
3757
3758       else
3759         {
3760         const uschar *pp = eptr;
3761         for (i = min; i < max; eptr++, i++)
3762           {
3763           if (eptr >= md->end_subject) break;
3764           c = *eptr;
3765
3766           /* Either not runtime caseless, or it was a positive class. For
3767           runtime caseless, continue if either case is in the map. */
3768
3769           if (!nasty_case)
3770             {
3771             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3772             if (md->runtime_caseless)
3773               {
3774               c = pcre_fcc[c];
3775               if ((data[c/8] & (1 << (c&7))) != 0) continue;
3776               }
3777             }
3778
3779           /* Runtime caseless and it was a negative class. Continue only if
3780           both cases are in the map. */
3781
3782           else
3783             {
3784             if ((data[c/8] & (1 << (c&7))) == 0) break;
3785             c = pcre_fcc[c];
3786             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3787             }
3788
3789           break;
3790           }
3791
3792         while (eptr >= pp)
3793           if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3794         FAIL;
3795         }
3796       }
3797     /* Control never gets here */
3798
3799    /* OP_CLASS_L opcode: handles localized character classes */
3800
3801    case OP_CLASS_L:
3802      {
3803       const uschar *data = ecode + 1;  /* Save for matching */
3804       const uschar locale_flag = *data;
3805       ecode++; data++;          /* The localization support adds an extra byte */
3806
3807       ecode += 33;               /* Advance past the item */
3808
3809       switch (*ecode)
3810         {
3811         case OP_CRSTAR:
3812         case OP_CRMINSTAR:
3813         case OP_CRPLUS:
3814         case OP_CRMINPLUS:
3815         case OP_CRQUERY:
3816         case OP_CRMINQUERY:
3817         c = *ecode++ - OP_CRSTAR;
3818         minimize = (c & 1) != 0;
3819         min = rep_min[c];                 /* Pick up values from tables; */
3820         max = rep_max[c];                 /* zero for max => infinity */
3821         if (max == 0) max = INT_MAX;
3822         break;
3823
3824         case OP_CRRANGE:
3825         case OP_CRMINRANGE:
3826         minimize = (*ecode == OP_CRMINRANGE);
3827         min = (ecode[1] << 8) + ecode[2];
3828         max = (ecode[3] << 8) + ecode[4];
3829         if (max == 0) max = INT_MAX;
3830         ecode += 5;
3831         break;
3832
3833         default:               /* No repeat follows */
3834         if (eptr >= md->end_subject) FAIL;
3835         c = *eptr++;
3836         if ((data[c/8] & (1 << (c&7))) != 0) continue;    /* With main loop */
3837         if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3838         if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3839 #if 0
3840         if ( (locale_flag &  4) && isdigit(c) ) continue;    /* Locale \d */
3841         if ( (locale_flag &  8) && !isdigit(c) ) continue;   /* Locale \D */
3842         if ( (locale_flag & 16) && isspace(c) ) continue;    /* Locale \s */
3843         if ( (locale_flag & 32) && !isspace(c) ) continue;   /* Locale \S */
3844 #endif
3845
3846         if (md->runtime_caseless)
3847           {
3848           c = pcre_fcc[c];
3849           if ((data[c/8] & (1 << (c&7))) != 0) continue;  /* With main loop */
3850
3851           if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3852           if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3853           }
3854         FAIL;
3855         }
3856
3857       /* First, ensure the minimum number of matches are present. */
3858
3859       for (i = 1; i <= min; i++)
3860         {
3861         if (eptr >= md->end_subject) FAIL;
3862         c = *eptr++;
3863         if ((data[c/8] & (1 << (c&7))) != 0) continue;
3864         if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3865         if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3866
3867         if (md->runtime_caseless)
3868           {
3869           c = pcre_fcc[c];
3870           if ((data[c/8] & (1 << (c&7))) != 0) continue;
3871           if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3872           if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3873           }
3874         FAIL;
3875         }
3876
3877       /* If max == min we can continue with the main loop without the
3878       need to recurse. */
3879
3880       if (min == max) continue;
3881
3882       /* If minimizing, keep testing the rest of the expression and advancing
3883       the pointer while it matches the class. */
3884
3885       if (minimize)
3886         {
3887         for (i = min;; i++)
3888           {
3889           if (match(eptr, ecode, offset_top, md)) SUCCEED;
3890           if (i >= max || eptr >= md->end_subject) FAIL;
3891           c = *eptr++;
3892           if ((data[c/8] & (1 << (c&7))) != 0) continue;
3893           if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3894           if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3895
3896           if (md->runtime_caseless)
3897             {
3898             c = pcre_fcc[c];
3899             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3900             if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3901             if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3902             }
3903           FAIL;
3904           }
3905         /* Control never gets here */
3906         }
3907
3908       /* If maximizing, find the longest possible run, then work backwards. */
3909
3910       else
3911         {
3912         const uschar *pp = eptr;
3913         for (i = min; i < max; eptr++, i++)
3914           {
3915           if (eptr >= md->end_subject) break;
3916           c = *eptr;
3917           if ((data[c/8] & (1 << (c&7))) != 0) continue;
3918           if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3919           if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3920           if (md->runtime_caseless)
3921             {
3922             c = pcre_fcc[c];
3923             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3924             if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3925             if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3926             }
3927           break;
3928           }
3929
3930         while (eptr >= pp)
3931           if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3932         FAIL;
3933         }
3934       }
3935     /* Control never gets here */
3936
3937     /* Match a run of characters */
3938
3939     case OP_CHARS:
3940       {
3941       register int length = ecode[1];
3942       ecode += 2;
3943
3944 #ifdef DEBUG  /* Sigh. Some compilers never learn. */
3945       if (eptr >= md->end_subject)
3946         printf("matching subject <null> against pattern ");
3947       else
3948         {
3949         printf("matching subject ");
3950         pchars(eptr, length, TRUE, md);
3951         printf(" against pattern ");
3952         }
3953       pchars(ecode, length, FALSE, md);
3954       printf("\n");
3955 #endif
3956
3957       if (length > md->end_subject - eptr) FAIL;
3958       if (md->caseless)
3959         {
3960         while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL;
3961         }
3962       else
3963         {
3964         while (length-- > 0) if (*ecode++ != *eptr++) FAIL;
3965         }
3966       }
3967     break;
3968
3969     /* Match a single character repeatedly; different opcodes share code. */
3970
3971     case OP_EXACT:
3972     min = max = (ecode[1] << 8) + ecode[2];
3973     ecode += 3;
3974     goto REPEATCHAR;
3975
3976     case OP_UPTO:
3977     case OP_MINUPTO:
3978     min = 0;
3979     max = (ecode[1] << 8) + ecode[2];
3980     minimize = *ecode == OP_MINUPTO;
3981     ecode += 3;
3982     goto REPEATCHAR;
3983
3984     case OP_STAR:
3985     case OP_MINSTAR:
3986     case OP_PLUS:
3987     case OP_MINPLUS:
3988     case OP_QUERY:
3989     case OP_MINQUERY:
3990     c = *ecode++ - OP_STAR;
3991     minimize = (c & 1) != 0;
3992     min = rep_min[c];                 /* Pick up values from tables; */
3993     max = rep_max[c];                 /* zero for max => infinity */
3994     if (max == 0) max = INT_MAX;
3995
3996     /* Common code for all repeated single-character matches. We can give
3997     up quickly if there are fewer than the minimum number of characters left in
3998     the subject. */
3999
4000     REPEATCHAR:
4001     if (min > md->end_subject - eptr) FAIL;
4002     c = *ecode++;
4003
4004     /* The code is duplicated for the caseless and caseful cases, for speed,
4005     since matching characters is likely to be quite common. First, ensure the
4006     minimum number of matches are present. If min = max, continue at the same
4007     level without recursing. Otherwise, if minimizing, keep trying the rest of
4008     the expression and advancing one matching character if failing, up to the
4009     maximum. Alternatively, if maximizing, find the maximum number of
4010     characters and work backwards. */
4011
4012     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4013       max, eptr));
4014
4015     if (md->caseless)
4016       {
4017       c = pcre_lcc[c];
4018       for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
4019       if (min == max) continue;
4020       if (minimize)
4021         {
4022         for (i = min;; i++)
4023           {
4024           if (match(eptr, ecode, offset_top, md)) SUCCEED;
4025           if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
4026             FAIL;
4027           }
4028         /* Control never gets here */
4029         }
4030       else
4031         {
4032         const uschar *pp = eptr;
4033         for (i = min; i < max; i++)
4034           {
4035           if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
4036           eptr++;
4037           }
4038         while (eptr >= pp)
4039           if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4040         FAIL;
4041         }
4042       /* Control never gets here */
4043       }
4044
4045     /* Caseful comparisons */
4046
4047     else
4048       {
4049       for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
4050       if (min == max) continue;
4051       if (minimize)
4052         {
4053         for (i = min;; i++)
4054           {
4055           if (match(eptr, ecode, offset_top, md)) SUCCEED;
4056           if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL;
4057           }
4058         /* Control never gets here */
4059         }
4060       else
4061         {
4062         const uschar *pp = eptr;
4063         for (i = min; i < max; i++)
4064           {
4065           if (eptr >= md->end_subject || c != *eptr) break;
4066           eptr++;
4067           }
4068         while (eptr >= pp)
4069          if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4070         FAIL;
4071         }
4072       }
4073     /* Control never gets here */
4074
4075     /* Match a negated single character */
4076
4077     case OP_NOT:
4078     if (eptr >= md->end_subject) FAIL;
4079     ecode++;
4080     if (md->caseless)
4081       {
4082       if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) FAIL;
4083       }
4084     else
4085       {
4086       if (*ecode++ == *eptr++) FAIL;
4087       }
4088     break;
4089
4090     /* Match a negated single character repeatedly. This is almost a repeat of
4091     the code for a repeated single character, but I haven't found a nice way of
4092     commoning these up that doesn't require a test of the positive/negative
4093     option for each character match. Maybe that wouldn't add very much to the
4094     time taken, but character matching *is* what this is all about... */
4095
4096     case OP_NOTEXACT:
4097     min = max = (ecode[1] << 8) + ecode[2];
4098     ecode += 3;
4099     goto REPEATNOTCHAR;
4100
4101     case OP_NOTUPTO:
4102     case OP_NOTMINUPTO:
4103     min = 0;
4104     max = (ecode[1] << 8) + ecode[2];
4105     minimize = *ecode == OP_NOTMINUPTO;
4106     ecode += 3;
4107     goto REPEATNOTCHAR;
4108
4109     case OP_NOTSTAR:
4110     case OP_NOTMINSTAR:
4111     case OP_NOTPLUS:
4112     case OP_NOTMINPLUS:
4113     case OP_NOTQUERY:
4114     case OP_NOTMINQUERY:
4115     c = *ecode++ - OP_NOTSTAR;
4116     minimize = (c & 1) != 0;
4117     min = rep_min[c];                 /* Pick up values from tables; */
4118     max = rep_max[c];                 /* zero for max => infinity */
4119     if (max == 0) max = INT_MAX;
4120
4121     /* Common code for all repeated single-character matches. We can give
4122     up quickly if there are fewer than the minimum number of characters left in
4123     the subject. */
4124
4125     REPEATNOTCHAR:
4126     if (min > md->end_subject - eptr) FAIL;
4127     c = *ecode++;
4128
4129     /* The code is duplicated for the caseless and caseful cases, for speed,
4130     since matching characters is likely to be quite common. First, ensure the
4131     minimum number of matches are present. If min = max, continue at the same
4132     level without recursing. Otherwise, if minimizing, keep trying the rest of
4133     the expression and advancing one matching character if failing, up to the
4134     maximum. Alternatively, if maximizing, find the maximum number of
4135     characters and work backwards. */
4136
4137     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4138       max, eptr));
4139
4140     if (md->caseless)
4141       {
4142       c = pcre_lcc[c];
4143       for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) FAIL;
4144       if (min == max) continue;
4145       if (minimize)
4146         {
4147         for (i = min;; i++)
4148           {
4149           if (match(eptr, ecode, offset_top, md)) SUCCEED;
4150           if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
4151             FAIL;
4152           }
4153         /* Control never gets here */
4154         }
4155       else
4156         {
4157         const uschar *pp = eptr;
4158         for (i = min; i < max; i++)
4159           {
4160           if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
4161           eptr++;
4162           }
4163         while (eptr >= pp)
4164           if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4165         FAIL;
4166         }
4167       /* Control never gets here */
4168       }
4169
4170     /* Caseful comparisons */
4171
4172     else
4173       {
4174       for (i = 1; i <= min; i++) if (c == *eptr++) FAIL;
4175       if (min == max) continue;
4176       if (minimize)
4177         {
4178         for (i = min;; i++)
4179           {
4180           if (match(eptr, ecode, offset_top, md)) SUCCEED;
4181           if (i >= max || eptr >= md->end_subject || c == *eptr++) FAIL;
4182           }
4183         /* Control never gets here */
4184         }
4185       else
4186         {
4187         const uschar *pp = eptr;
4188         for (i = min; i < max; i++)
4189           {
4190           if (eptr >= md->end_subject || c == *eptr) break;
4191           eptr++;
4192           }
4193         while (eptr >= pp)
4194          if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4195         FAIL;
4196         }
4197       }
4198     /* Control never gets here */
4199
4200     /* Match a single character type repeatedly; several different opcodes
4201     share code. This is very similar to the code for single characters, but we
4202     repeat it in the interests of efficiency. */
4203
4204     case OP_TYPEEXACT:
4205     min = max = (ecode[1] << 8) + ecode[2];
4206     minimize = TRUE;
4207     ecode += 3;
4208     goto REPEATTYPE;
4209
4210     case OP_TYPEUPTO:
4211     case OP_TYPEMINUPTO:
4212     min = 0;
4213     max = (ecode[1] << 8) + ecode[2];
4214     minimize = *ecode == OP_TYPEMINUPTO;
4215     ecode += 3;
4216     goto REPEATTYPE;
4217
4218     case OP_TYPESTAR:
4219     case OP_TYPEMINSTAR:
4220     case OP_TYPEPLUS:
4221     case OP_TYPEMINPLUS:
4222     case OP_TYPEQUERY:
4223     case OP_TYPEMINQUERY:
4224     c = *ecode++ - OP_TYPESTAR;
4225     minimize = (c & 1) != 0;
4226     min = rep_min[c];                 /* Pick up values from tables; */
4227     max = rep_max[c];                 /* zero for max => infinity */
4228     if (max == 0) max = INT_MAX;
4229
4230     /* Common code for all repeated single character type matches */
4231
4232     REPEATTYPE:
4233     ctype = *ecode++;      /* Code for the character type */
4234
4235     /* First, ensure the minimum number of matches are present. Use inline
4236     code for maximizing the speed, and do the type test once at the start
4237     (i.e. keep it out of the loop). Also test that there are at least the
4238     minimum number of characters before we start. */
4239
4240     if (min > md->end_subject - eptr) FAIL;
4241     if (min > 0) switch(ctype)
4242       {
4243       case OP_ANY:
4244       if (!md->dotall)
4245         { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
4246       else eptr += min;
4247       break;
4248
4249       case OP_NOT_DIGIT:
4250       for (i = 1; i <= min; i++)
4251         if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
4252       break;
4253
4254       case OP_DIGIT:
4255       for (i = 1; i <= min; i++)
4256         if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
4257       break;
4258
4259       case OP_NOT_WHITESPACE:
4260       for (i = 1; i <= min; i++)
4261         if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
4262       break;
4263
4264       case OP_WHITESPACE:
4265       for (i = 1; i <= min; i++)
4266         if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
4267       break;
4268
4269       case OP_NOT_WORDCHAR:
4270       for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
4271         FAIL;
4272       break;
4273
4274       case OP_WORDCHAR:
4275       for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
4276         FAIL;
4277       break;
4278
4279       case OP_NOT_WORDCHAR_L:
4280       for (i = 1; i <= min; i++, eptr++) if (*eptr=='_' || isalnum(*eptr))
4281         FAIL;
4282       break;
4283
4284       case OP_WORDCHAR_L:
4285       for (i = 1; i <= min; i++, eptr++) if (*eptr!='_' && !isalnum(*eptr))
4286         FAIL;
4287       break;
4288       }
4289
4290     /* If min = max, continue at the same level without recursing */
4291
4292     if (min == max) continue;
4293
4294     /* If minimizing, we have to test the rest of the pattern before each
4295     subsequent match, so inlining isn't much help; just use the function. */
4296
4297     if (minimize)
4298       {
4299       for (i = min;; i++)
4300         {
4301         if (match(eptr, ecode, offset_top, md)) SUCCEED;
4302         if (i >= max || eptr >= md->end_subject ||
4303           !match_type(ctype, *eptr++, md->dotall))
4304             FAIL;
4305         }
4306       /* Control never gets here */
4307       }
4308
4309     /* If maximizing it is worth using inline code for speed, doing the type
4310     test once at the start (i.e. keep it out of the loop). */
4311
4312     else
4313       {
4314       const uschar *pp = eptr;
4315       switch(ctype)
4316         {
4317         case OP_ANY:
4318         if (!md->dotall)
4319           {
4320           for (i = min; i < max; i++)
4321             {
4322             if (eptr >= md->end_subject || *eptr == '\n') break;
4323             eptr++;
4324             }
4325           }
4326         else
4327           {
4328           c = max - min;
4329           if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4330           eptr += c;
4331           }
4332         break;
4333
4334         case OP_NOT_DIGIT:
4335         for (i = min; i < max; i++)
4336           {
4337           if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
4338             break;
4339           eptr++;
4340           }
4341         break;
4342
4343         case OP_DIGIT:
4344         for (i = min; i < max; i++)
4345           {
4346           if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
4347             break;
4348           eptr++;
4349           }
4350         break;
4351
4352         case OP_NOT_WHITESPACE:
4353         for (i = min; i < max; i++)
4354           {
4355           if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
4356             break;
4357           eptr++;
4358           }
4359         break;
4360
4361         case OP_WHITESPACE:
4362         for (i = min; i < max; i++)
4363           {
4364           if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
4365             break;
4366           eptr++;
4367           }
4368         break;
4369
4370         case OP_NOT_WORDCHAR:
4371         for (i = min; i < max; i++)
4372           {
4373           if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
4374             break;
4375           eptr++;
4376           }
4377         break;
4378
4379         case OP_WORDCHAR:
4380         for (i = min; i < max; i++)
4381           {
4382             if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
4383               break;
4384             eptr++;
4385           }
4386         break;
4387         case OP_NOT_WORDCHAR_L:
4388           for (i = min; i < max; i++)
4389          {
4390          if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ) )
4391            break;
4392          eptr++;
4393          }
4394        break;
4395
4396        case OP_WORDCHAR_L:
4397        for (i = min; i < max; i++)
4398          {
4399          if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ) )
4400              break;
4401           eptr++;
4402           }
4403         break;
4404         }
4405
4406       while (eptr >= pp)
4407         if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4408       FAIL;
4409       }
4410     /* Control never gets here */
4411
4412     /* There's been some horrible disaster. */
4413
4414     default:
4415     DPRINTF(("Unknown opcode %d\n", *ecode));
4416     md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4417     FAIL;
4418     }
4419
4420   /* Do not stick any code in here without much thought; it is assumed
4421   that "continue" in the code above comes out to here to repeat the main
4422   loop. */
4423
4424   }             /* End of main loop */
4425 /* Control never reaches here */
4426
4427 fail:
4428  if (md->point > save_stack_position)
4429  {
4430    /* If there are still points remaining on the stack, pop the next one off */
4431    int off_num;
4432
4433    md->point--;
4434    offset_top = md->offset_top[md->point];
4435    eptr       = md->eptr[md->point];
4436    ecode      = md->ecode[md->point];
4437    off_num    = md->off_num[md->point];
4438    md->offset_vector[off_num]   = md->r1[md->point];
4439    md->offset_vector[off_num+1] = md->r2[md->point];
4440    goto match_loop;
4441   }
4442    /* Failure, and nothing left on the stack, so end this function call */
4443
4444  /* Restore the top of the stack to where it was before this function
4445     call.  This lets us use one stack for everything; recursive calls
4446     can push and pop information, and may increase the stack.  When
4447     the call returns, the parent function can resume pushing and
4448     popping wherever it was. */
4449
4450  md->point = save_stack_position;
4451  return FALSE;
4452
4453 succeed:
4454  return TRUE;
4455 }
4456
4457
4458
4459 /*************************************************
4460 *         Segregate setjmp()                     *
4461 *************************************************/
4462
4463 /* The -Wall option of gcc gives warnings for all local variables when setjmp()
4464 is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
4465 hide it in a separate function. This is called only when PCRE_EXTRA is set,
4466 since it's needed only for the extension \X option, and with any luck, a good
4467 compiler will spot the tail recursion and compile it efficiently.
4468
4469 Arguments:
4470    eptr        pointer in subject
4471    ecode       position in code
4472    offset_top  current top pointer
4473    md          pointer to "static" info for the match
4474
4475 Returns:       TRUE if matched
4476 */
4477
4478 static BOOL
4479 match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
4480   match_data *match_block)
4481 {
4482 return setjmp(match_block->fail_env) == 0 &&
4483       match(eptr, ecode, offset_top, match_block);
4484 }
4485
4486
4487
4488 /*************************************************
4489 *         Execute a Regular Expression           *
4490 *************************************************/
4491
4492 /* This function applies a compiled re to a subject string and picks out
4493 portions of the string if it matches. Two elements in the vector are set for
4494 each substring: the offsets to the start and end of the substring.
4495
4496 Arguments:
4497   external_re     points to the compiled expression
4498   external_extra  points to "hints" from pcre_study() or is NULL
4499   subject         points to the subject string
4500   length          length of subject string (may contain binary zeros)
4501   options         option bits
4502   offsets         points to a vector of ints to be filled in with offsets
4503   offsetcount     the number of elements in the vector
4504
4505 Returns:          > 0 => success; value is the number of elements filled in
4506                   = 0 => success, but offsets is not big enough
4507                    -1 => failed to match
4508                  < -1 => some kind of unexpected problem
4509 */
4510
4511 int
4512 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4513   const char *subject, int length, int start_pos, int options,
4514   int *offsets, int offsetcount)
4515 {
4516   /* The "volatile" directives are to make gcc -Wall stop complaining
4517      that these variables can be clobbered by the longjmp.  Hopefully
4518      they won't cost too much performance. */
4519 volatile int resetcount, ocount;
4520 volatile int first_char = -1;
4521 match_data match_block;
4522 const uschar *start_bits = NULL;
4523 const uschar *start_match = (const uschar *)subject + start_pos;
4524 const uschar *end_subject;
4525 const real_pcre *re = (const real_pcre *)external_re;
4526 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4527 volatile BOOL using_temporary_offsets = FALSE;
4528 volatile BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4529 volatile BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4530
4531 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4532
4533 if (re == NULL || subject == NULL ||
4534    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4535 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4536
4537 match_block.start_subject = (const uschar *)subject;
4538 match_block.end_subject = match_block.start_subject + length;
4539 end_subject = match_block.end_subject;
4540
4541 match_block.caseless  = ((re->options | options) & PCRE_CASELESS) != 0;
4542 match_block.runtime_caseless = match_block.caseless &&
4543   (re->options & PCRE_CASELESS) == 0;
4544
4545 match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
4546 match_block.dotall    = ((re->options | options) & PCRE_DOTALL) != 0;
4547 match_block.endonly   = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
4548
4549 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4550 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4551
4552 match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */
4553
4554 /* Set the stack state to empty */
4555   match_block.off_num = match_block.offset_top = NULL;
4556   match_block.r1 = match_block.r2 = NULL;
4557   match_block.eptr = match_block.ecode = NULL;
4558   match_block.point = match_block.length = 0;
4559
4560 /* If the expression has got more back references than the offsets supplied can
4561 hold, we get a temporary bit of working store to use during the matching.
4562 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4563 of 2. */
4564
4565 ocount = offsetcount & (-2);
4566 if (re->top_backref > 0 && re->top_backref >= ocount/2)
4567   {
4568   ocount = re->top_backref * 2 + 2;
4569   match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4570   if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4571   using_temporary_offsets = TRUE;
4572   DPRINTF(("Got memory to hold back references\n"));
4573   }
4574 else match_block.offset_vector = offsets;
4575
4576 match_block.offset_end = ocount;
4577 match_block.offset_overflow = FALSE;
4578
4579 /* Compute the minimum number of offsets that we need to reset each time. Doing
4580 this makes a huge difference to execution time when there aren't many brackets
4581 in the pattern. */
4582
4583 resetcount = 2 + re->top_bracket * 2;
4584 if (resetcount > offsetcount) resetcount = ocount;
4585
4586 /* If MULTILINE is set at exec time but was not set at compile time, and the
4587 anchored flag is set, we must re-check because a setting provoked by ^ in the
4588 pattern is not right in multi-line mode. Calling is_anchored() again here does
4589 the right check, because multiline is now set. If it now yields FALSE, the
4590 expression must have had ^ starting some of its branches. Check to see if
4591 that is true for *all* branches, and if so, set the startline flag. */
4592
4593 if (match_block.multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
4594     !is_anchored(re->code, match_block.multiline))
4595   {
4596   anchored = FALSE;
4597   if (is_startline(re->code)) startline = TRUE;
4598   }
4599
4600 /* Set up the first character to match, if available. The first_char value is
4601 never set for an anchored regular expression, but the anchoring may be forced
4602 at run time, so we have to test for anchoring. The first char may be unset for
4603 an unanchored pattern, of course. If there's no first char and the pattern was
4604 studied, the may be a bitmap of possible first characters. However, we can
4605 use this only if the caseless state of the studying was correct. */
4606
4607 if (!anchored)
4608   {
4609   if ((re->options & PCRE_FIRSTSET) != 0)
4610     {
4611     first_char = re->first_char;
4612     if (match_block.caseless) first_char = pcre_lcc[first_char];
4613     }
4614   else
4615     if (!startline && extra != NULL &&
4616       (extra->options & PCRE_STUDY_MAPPED) != 0 &&
4617       ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
4618         start_bits = extra->start_bits;
4619   }
4620
4621 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4622
4623 do
4624   {
4625   int rc;
4626   register int *iptr = match_block.offset_vector;
4627   register int *iend = iptr + resetcount;
4628
4629   /* Reset the maximum number of extractions we might see. */
4630
4631   while (iptr < iend) *iptr++ = -1;
4632
4633   /* Advance to a unique first char if possible */
4634
4635   if (first_char >= 0)
4636     {
4637     if (match_block.caseless)
4638       while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4639         start_match++;
4640     else
4641       while (start_match < end_subject && *start_match != first_char)
4642         start_match++;
4643     }
4644
4645   /* Or to just after \n for a multiline match if possible */
4646
4647   else if (startline)
4648     {
4649     if (start_match > match_block.start_subject)
4650       {
4651       while (start_match < end_subject && start_match[-1] != '\n')
4652         start_match++;
4653       }
4654     }
4655
4656   /* Or to a non-unique first char */
4657
4658   else if (start_bits != NULL)
4659     {
4660     while (start_match < end_subject)
4661       {
4662       register int c = *start_match;
4663       if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4664       }
4665     }
4666
4667 #ifdef DEBUG  /* Sigh. Some compilers never learn. */
4668   printf(">>>> Match against: ");
4669   pchars(start_match, end_subject - start_match, TRUE, &match_block);
4670   printf("\n");
4671 #endif
4672
4673   /* When a match occurs, substrings will be set for all internal extractions;
4674   we just need to set up the whole thing as substring 0 before returning. If
4675   there were too many extractions, set the return code to zero. In the case
4676   where we had to get some local store to hold offsets for backreferences, copy
4677   those back references that we can. In this case there need not be overflow
4678   if certain parts of the pattern were not used.
4679
4680   Before starting the match, we have to set up a longjmp() target to enable
4681   the "cut" operation to fail a match completely without backtracking. This
4682   is done in a separate function to avoid compiler warnings. We need not do
4683   it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
4684   enabled. */
4685
4686   /* To handle errors such as running out of memory for the failure
4687      stack, we need to save this location via setjmp(), so
4688      error-handling code can call longjmp() to jump out of deeply-nested code. */
4689   if (setjmp(match_block.error_env)==0)
4690     {
4691
4692   if ((re->options & PCRE_EXTRA) != 0)
4693     {
4694     if (!match_with_setjmp(start_match, re->code, 2, &match_block))
4695       continue;
4696     }
4697   else if (!match(start_match, re->code, 2, &match_block)) continue;
4698
4699   /* Copy the offset information from temporary store if necessary */
4700
4701   if (using_temporary_offsets)
4702     {
4703     if (offsetcount >= 4)
4704       {
4705       memcpy(offsets + 2, match_block.offset_vector + 2,
4706         (offsetcount - 2) * sizeof(int));
4707       DPRINTF(("Copied offsets from temporary memory\n"));
4708       }
4709     if (match_block.end_offset_top > offsetcount)
4710       match_block.offset_overflow = TRUE;
4711
4712     DPRINTF(("Freeing temporary memory\n"));
4713     (pcre_free)(match_block.offset_vector);
4714     }
4715
4716   rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4717
4718   if (match_block.offset_end < 2) rc = 0; else
4719     {
4720     offsets[0] = start_match - match_block.start_subject;
4721     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4722     }
4723
4724   DPRINTF((">>>> returning %d\n", rc));
4725   free_stack(&match_block);
4726   return rc;
4727   }  /* End of (if setjmp(match_block.error_env)...) */
4728   free_stack(&match_block);
4729
4730   /* Return an error code; pcremodule.c will preserve the exception */
4731   if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY;
4732   }
4733 while (!anchored &&
4734        match_block.errorcode == PCRE_ERROR_NOMATCH &&
4735        start_match++ < end_subject);
4736
4737 if (using_temporary_offsets)
4738   {
4739   DPRINTF(("Freeing temporary memory\n"));
4740   (pcre_free)(match_block.offset_vector);
4741   }
4742
4743 #ifdef DEBUG
4744 printf(">>>> returning %d\n", match_block.errorcode);
4745 #endif
4746
4747  return match_block.errorcode;
4748 }
4749
4750 /* End of pcre.c */