Modules/pypcre.c

   1
   2 /*************************************************
   3 *      Perl-Compatible Regular Expressions       *
   4 *************************************************/
   5
   6 /*   DO NOT EDIT THIS FILE! */
   7
   8 /* This file is automatically written by the merge-files.py script
   9 included with the PCRE distribution for Python; it's produced from
  10 several C files, and code is removed in the process.  If you want to
  11 modify the code or track down bugs, it will be much easier to work
  12 with the code in its original, multiple-file form.  Don't edit this
  13 file by hand, or submit patches to it.
  14
  15 The Python-specific PCRE distribution can be retrieved from
  16        http://starship.skyport.net/crew/amk/regex/
  17
  18 The unmodified original PCRE distribution is available at
  19 ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
  20 written by: Philip Hazel <ph10@cam.ac.uk>
  21
  22 Extensively modified by the Python String-SIG: <string-sig@python.org>
  23 Send bug reports to:                           <string-sig@python.org>
  24 (They'll figure out if it's a bug in PCRE or in the Python-specific
  25 changes.)
  26
  27            Copyright (c) 1997 University of Cambridge
  28
  29 -----------------------------------------------------------------------------
  30 Permission is granted to anyone to use this software for any purpose on any
  31 computer system, and to redistribute it freely, subject to the following
  32 restrictions:
  33
  34 1. This software is distributed in the hope that it will be useful,
  35    but WITHOUT ANY WARRANTY; without even the implied warranty of
  36    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  37
  38 2. The origin of this software must not be misrepresented, either by
  39    explicit claim or by omission.
  40
  41 3. Altered versions must be plainly marked as such, and must not be
  42    misrepresented as being the original software.
  43 -----------------------------------------------------------------------------
  44 */
  45
  46
  47 #define FOR_PYTHON
  48 #include "pcre-int.h"
  49 #include "Python.h"
  50 #include <ctype.h>
  51 #include "graminit.h"
  52
  53 /*************************************************
  54 *      Perl-Compatible Regular Expressions       *
  55 *************************************************/
  56
  57 /* This file is automatically written by the makechartables auxiliary
  58 program. If you edit it by hand, you might like to edit the Makefile to
  59 prevent its ever being regenerated. */
  60
  61 /* This table is a lower casing table. */
  62
  63 unsigned char pcre_lcc[] = {
  64     0,  1,  2,  3,  4,  5,  6,  7,
  65     8,  9, 10, 11, 12, 13, 14, 15,
  66    16, 17, 18, 19, 20, 21, 22, 23,
  67    24, 25, 26, 27, 28, 29, 30, 31,
  68    32, 33, 34, 35, 36, 37, 38, 39,
  69    40, 41, 42, 43, 44, 45, 46, 47,
  70    48, 49, 50, 51, 52, 53, 54, 55,
  71    56, 57, 58, 59, 60, 61, 62, 63,
  72    64, 97, 98, 99,100,101,102,103,
  73   104,105,106,107,108,109,110,111,
  74   112,113,114,115,116,117,118,119,
  75   120,121,122, 91, 92, 93, 94, 95,
  76    96, 97, 98, 99,100,101,102,103,
  77   104,105,106,107,108,109,110,111,
  78   112,113,114,115,116,117,118,119,
  79   120,121,122,123,124,125,126,127,
  80   128,129,130,131,132,133,134,135,
  81   136,137,138,139,140,141,142,143,
  82   144,145,146,147,148,149,150,151,
  83   152,153,154,155,156,157,158,159,
  84   160,161,162,163,164,165,166,167,
  85   168,169,170,171,172,173,174,175,
  86   176,177,178,179,180,181,182,183,
  87   184,185,186,187,188,189,190,191,
  88   192,193,194,195,196,197,198,199,
  89   200,201,202,203,204,205,206,207,
  90   208,209,210,211,212,213,214,215,
  91   216,217,218,219,220,221,222,223,
  92   224,225,226,227,228,229,230,231,
  93   232,233,234,235,236,237,238,239,
  94   240,241,242,243,244,245,246,247,
  95   248,249,250,251,252,253,254,255 };
  96
  97 /* This table is a case flipping table. */
  98
  99 unsigned char pcre_fcc[] = {
 100     0,  1,  2,  3,  4,  5,  6,  7,
 101     8,  9, 10, 11, 12, 13, 14, 15,
 102    16, 17, 18, 19, 20, 21, 22, 23,
 103    24, 25, 26, 27, 28, 29, 30, 31,
 104    32, 33, 34, 35, 36, 37, 38, 39,
 105    40, 41, 42, 43, 44, 45, 46, 47,
 106    48, 49, 50, 51, 52, 53, 54, 55,
 107    56, 57, 58, 59, 60, 61, 62, 63,
 108    64, 97, 98, 99,100,101,102,103,
 109   104,105,106,107,108,109,110,111,
 110   112,113,114,115,116,117,118,119,
 111   120,121,122, 91, 92, 93, 94, 95,
 112    96, 65, 66, 67, 68, 69, 70, 71,
 113    72, 73, 74, 75, 76, 77, 78, 79,
 114    80, 81, 82, 83, 84, 85, 86, 87,
 115    88, 89, 90,123,124,125,126,127,
 116   128,129,130,131,132,133,134,135,
 117   136,137,138,139,140,141,142,143,
 118   144,145,146,147,148,149,150,151,
 119   152,153,154,155,156,157,158,159,
 120   160,161,162,163,164,165,166,167,
 121   168,169,170,171,172,173,174,175,
 122   176,177,178,179,180,181,182,183,
 123   184,185,186,187,188,189,190,191,
 124   192,193,194,195,196,197,198,199,
 125   200,201,202,203,204,205,206,207,
 126   208,209,210,211,212,213,214,215,
 127   216,217,218,219,220,221,222,223,
 128   224,225,226,227,228,229,230,231,
 129   232,233,234,235,236,237,238,239,
 130   240,241,242,243,244,245,246,247,
 131   248,249,250,251,252,253,254,255 };
 132
 133 /* This table contains bit maps for digits, letters, 'word' chars, and
 134 white space. Each map is 32 bytes long and the bits run from the least
 135 significant end of each byte. */
 136
 137 unsigned char pcre_cbits[] = {
 138   0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
 139   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 140   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 141   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 142
 143   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 144   0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
 145   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 146   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 147
 148   0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
 149   0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
 150   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 151   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 152
 153   0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
 154   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 155   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 156   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
 157
 158 /* This table identifies various classes of character by individual bits:
 159   0x01   white space character
 160   0x02   letter
 161   0x04   decimal digit
 162   0x08   hexadecimal digit
 163   0x10   alphanumeric or '_'
 164   0x80   regular expression metacharacter or binary zero
 165 */
 166
 167 unsigned char pcre_ctypes[] = {
 168   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
 169   0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /*   8- 15 */
 170   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
 171   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
 172   0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /*    - '  */
 173   0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /*  ( - /  */
 174   0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /*  0 - 7  */
 175   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /*  8 - ?  */
 176   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  @ - G  */
 177   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  H - O  */
 178   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  P - W  */
 179   0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /*  X - _  */
 180   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  ` - g  */
 181   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  h - o  */
 182   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  p - w  */
 183   0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /*  x -127 */
 184   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
 185   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
 186   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
 187   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
 188   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
 189   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
 190   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
 191   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
 192   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
 193   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
 194   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
 195   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
 196   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
 197   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
 198   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
 199   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
 200
 201 /* End of chartables.c */
 202 /*************************************************
 203 *      Perl-Compatible Regular Expressions       *
 204 *************************************************/
 205
 206 /*
 207 This is a library of functions to support regular expressions whose syntax
 208 and semantics are as close as possible to those of the Perl 5 language. See
 209 the file Tech.Notes for some information on the internals.
 210
 211 Written by: Philip Hazel <ph10@cam.ac.uk>
 212
 213            Copyright (c) 1998 University of Cambridge
 214
 215 -----------------------------------------------------------------------------
 216 Permission is granted to anyone to use this software for any purpose on any
 217 computer system, and to redistribute it freely, subject to the following
 218 restrictions:
 219
 220 1. This software is distributed in the hope that it will be useful,
 221    but WITHOUT ANY WARRANTY; without even the implied warranty of
 222    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 223
 224 2. The origin of this software must not be misrepresented, either by
 225    explicit claim or by omission.
 226
 227 3. Altered versions must be plainly marked as such, and must not be
 228    misrepresented as being the original software.
 229 -----------------------------------------------------------------------------
 230 */
 231
 232
 233 /* Include the internals header, which itself includes Standard C headers plus
 234 the external pcre header. */
 235
 236
 237
 238
 239 /*************************************************
 240 *          Create bitmap of starting chars       *
 241 *************************************************/
 242
 243 /* This function scans a compiled unanchored expression and attempts to build a
 244 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
 245 goes by, we may be able to get more clever at doing this.
 246
 247 Arguments:
 248   code         points to an expression
 249   start_bits   points to a 32-byte table, initialized to 0
 250
 251 Returns:       TRUE if table built, FALSE otherwise
 252 */
 253
 254 static BOOL
 255 set_start_bits(const uschar *code, uschar *start_bits)
 256 {
 257 register int c;
 258 volatile int dummy;
 259
 260 do
 261   {
 262   const uschar *tcode = code + 3;
 263   BOOL try_next = TRUE;
 264
 265   while (try_next)
 266     {
 267     try_next = FALSE;
 268
 269     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
 270       {
 271       if (!set_start_bits(tcode, start_bits)) return FALSE;
 272       }
 273
 274     else switch(*tcode)
 275       {
 276       default:
 277       return FALSE;
 278
 279       /* BRAZERO does the bracket, but carries on. */
 280
 281       case OP_BRAZERO:
 282       case OP_BRAMINZERO:
 283       if (!set_start_bits(++tcode, start_bits)) return FALSE;
 284       dummy = 1;
 285       do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
 286       tcode += 3;
 287       try_next = TRUE;
 288       break;
 289
 290       /* Single-char * or ? sets the bit and tries the next item */
 291
 292       case OP_STAR:
 293       case OP_MINSTAR:
 294       case OP_QUERY:
 295       case OP_MINQUERY:
 296       start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
 297       tcode += 2;
 298       try_next = TRUE;
 299       break;
 300
 301       /* Single-char upto sets the bit and tries the next */
 302
 303       case OP_UPTO:
 304       case OP_MINUPTO:
 305       start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));
 306       tcode += 4;
 307       try_next = TRUE;
 308       break;
 309
 310       /* At least one single char sets the bit and stops */
 311
 312       case OP_EXACT:       /* Fall through */
 313       tcode++;
 314
 315       case OP_CHARS:       /* Fall through */
 316       tcode++;
 317
 318       case OP_PLUS:
 319       case OP_MINPLUS:
 320       start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
 321       break;
 322
 323       /* Single character type sets the bits and stops */
 324
 325       case OP_NOT_DIGIT:
 326       for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
 327       break;
 328
 329       case OP_DIGIT:
 330       for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
 331       break;
 332
 333       case OP_NOT_WHITESPACE:
 334       for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
 335       break;
 336
 337       case OP_WHITESPACE:
 338       for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
 339       break;
 340
 341       case OP_NOT_WORDCHAR:
 342       for (c = 0; c < 32; c++)
 343         start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
 344       break;
 345
 346       case OP_WORDCHAR:
 347       for (c = 0; c < 32; c++)
 348         start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
 349       break;
 350
 351       /* One or more character type fudges the pointer and restarts, knowing
 352       it will hit a single character type and stop there. */
 353
 354       case OP_TYPEPLUS:
 355       case OP_TYPEMINPLUS:
 356       tcode++;
 357       try_next = TRUE;
 358       break;
 359
 360       case OP_TYPEEXACT:
 361       tcode += 3;
 362       try_next = TRUE;
 363       break;
 364
 365       /* Zero or more repeats of character types set the bits and then
 366       try again. */
 367
 368       case OP_TYPEUPTO:
 369       case OP_TYPEMINUPTO:
 370       tcode += 2;               /* Fall through */
 371
 372       case OP_TYPESTAR:
 373       case OP_TYPEMINSTAR:
 374       case OP_TYPEQUERY:
 375       case OP_TYPEMINQUERY:
 376       switch(tcode[1])
 377         {
 378         case OP_NOT_DIGIT:
 379         for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
 380         break;
 381
 382         case OP_DIGIT:
 383         for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
 384         break;
 385
 386         case OP_NOT_WHITESPACE:
 387         for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
 388         break;
 389
 390         case OP_WHITESPACE:
 391         for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
 392         break;
 393
 394         case OP_NOT_WORDCHAR:
 395         for (c = 0; c < 32; c++)
 396           start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
 397         break;
 398
 399         case OP_WORDCHAR:
 400         for (c = 0; c < 32; c++)
 401           start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
 402         break;
 403         }
 404
 405       tcode += 2;
 406       try_next = TRUE;
 407       break;
 408
 409       /* Character class: set the bits and either carry on or not,
 410       according to the repeat count. */
 411
 412       case OP_CLASS:
 413       case OP_NEGCLASS:
 414         {
 415         tcode++;
 416         for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
 417         tcode += 32;
 418         switch (*tcode)
 419           {
 420           case OP_CRSTAR:
 421           case OP_CRMINSTAR:
 422           case OP_CRQUERY:
 423           case OP_CRMINQUERY:
 424           tcode++;
 425           try_next = TRUE;
 426           break;
 427
 428           case OP_CRRANGE:
 429           case OP_CRMINRANGE:
 430           if (((tcode[1] << 8) + tcode[2]) == 0)
 431             {
 432             tcode += 5;
 433             try_next = TRUE;
 434             }
 435           break;
 436           }
 437         }
 438       break; /* End of class handling */
 439
 440       }      /* End of switch */
 441     }        /* End of try_next loop */
 442
 443   code += (code[1] << 8) + code[2];   /* Advance to next branch */
 444   }
 445 while (*code == OP_ALT);
 446 return TRUE;
 447 }
 448
 449
 450
 451 /*************************************************
 452 *          Study a compiled expression           *
 453 *************************************************/
 454
 455 /* This function is handed a compiled expression that it must study to produce
 456 information that will speed up the matching. It returns a pcre_extra block
 457 which then gets handed back to pcre_exec().
 458
 459 Arguments:
 460   re        points to the compiled expression
 461   options   contains option bits
 462   errorptr  points to where to place error messages;
 463             set NULL unless error
 464
 465 Returns:    pointer to a pcre_extra block,
 466             NULL on error or if no optimization possible
 467 */
 468
 469 pcre_extra *
 470 pcre_study(const pcre *external_re, int options, const char **errorptr)
 471 {
 472 BOOL caseless;
 473 uschar start_bits[32];
 474 real_pcre_extra *extra;
 475 const real_pcre *re = (const real_pcre *)external_re;
 476
 477 *errorptr = NULL;
 478
 479 if (re == NULL || re->magic_number != MAGIC_NUMBER)
 480   {
 481   *errorptr = "argument is not a compiled regular expression";
 482   return NULL;
 483   }
 484
 485 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
 486   {
 487   *errorptr = "unknown or incorrect option bit(s) set";
 488   return NULL;
 489   }
 490
 491 /* Caseless can either be from the compiled regex or from options. */
 492
 493 caseless = ((re->options | options) & PCRE_CASELESS) != 0;
 494
 495 /* For an anchored pattern, or an unanchored pattern that has a first char, or a
 496 multiline pattern that matches only at "line starts", no further processing at
 497 present. */
 498
 499 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
 500   return NULL;
 501
 502 /* See if we can find a fixed set of initial characters for the pattern. */
 503
 504 memset(start_bits, 0, 32 * sizeof(uschar));
 505 if (!set_start_bits(re->code, start_bits)) return NULL;
 506
 507 /* If this studying is caseless, scan the created bit map and duplicate the
 508 bits for any letters. */
 509
 510 if (caseless)
 511   {
 512   register int c;
 513   for (c = 0; c < 256; c++)
 514     {
 515     if ((start_bits[c/8] & (1 << (c&7))) != 0 &&
 516         (pcre_ctypes[c] & ctype_letter) != 0)
 517       {
 518       int d = pcre_fcc[c];
 519       start_bits[d/8] |= (1 << (d&7));
 520       }
 521     }
 522   }
 523
 524 /* Get an "extra" block and put the information therein. */
 525
 526 extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
 527
 528 if (extra == NULL)
 529   {
 530   *errorptr = "failed to get memory";
 531   return NULL;
 532   }
 533
 534 extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
 535 memcpy(extra->start_bits, start_bits, sizeof(start_bits));
 536
 537 return (pcre_extra *)extra;
 538 }
 539
 540 /* End of study.c */
 541 /*************************************************
 542 *      Perl-Compatible Regular Expressions       *
 543 *************************************************/
 544
 545 /*
 546 This is a library of functions to support regular expressions whose syntax
 547 and semantics are as close as possible to those of the Perl 5 language. See
 548 the file Tech.Notes for some information on the internals.
 549
 550 Written by: Philip Hazel <ph10@cam.ac.uk>
 551
 552            Copyright (c) 1998 University of Cambridge
 553
 554 -----------------------------------------------------------------------------
 555 Permission is granted to anyone to use this software for any purpose on any
 556 computer system, and to redistribute it freely, subject to the following
 557 restrictions:
 558
 559 1. This software is distributed in the hope that it will be useful,
 560    but WITHOUT ANY WARRANTY; without even the implied warranty of
 561    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 562
 563 2. The origin of this software must not be misrepresented, either by
 564    explicit claim or by omission.
 565
 566 3. Altered versions must be plainly marked as such, and must not be
 567    misrepresented as being the original software.
 568 -----------------------------------------------------------------------------
 569 */
 570
 571
 572 /* Define DEBUG to get debugging output on stdout. */
 573
 574 /* #define DEBUG */
 575
 576 /* Use a macro for debugging printing, 'cause that eliminates the the use
 577 of #ifdef inline, and there are *still* stupid compilers about that don't like
 578 indented pre-processor statements. I suppose it's only been 10 years... */
 579
 580 #ifdef DEBUG
 581 #define DPRINTF(p) printf p
 582 #else
 583 #define DPRINTF(p) /*nothing*/
 584 #endif
 585
 586 /* Include the internals header, which itself includes Standard C headers plus
 587 the external pcre header. */
 588
 589
 590
 591
 592 #ifndef Py_eval_input
 593 /* For Python 1.4, graminit.h has to be explicitly included */
 594 #define Py_eval_input eval_input
 595
 596 #endif /* FOR_PYTHON */
 597
 598 /* Allow compilation as C++ source code, should anybody want to do that. */
 599
 600 #ifdef __cplusplus
 601 #define class pcre_class
 602 #endif
 603
 604
 605 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
 606
 607 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
 608 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
 609
 610 /* Text forms of OP_ values and things, for debugging (not all used) */
 611
 612 #ifdef DEBUG
 613 static const char *OP_names[] = {
 614   "End", "\\A", "\\B", "\\b", "\\D", "\\d",
 615   "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
 616   "localized \\B", "localized \\b", "localized \\W", "localized \\w",
 617   "^", "$", "Any", "chars",
 618   "not",
 619   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
 620   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
 621   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
 622   "*", "*?", "+", "+?", "?", "??", "{", "{",
 623   "class", "negclass", "classL", "Ref",
 624   "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
 625   "Brazero", "Braminzero", "Bra"
 626 };
 627 #endif
 628
 629 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
 630 are simple data values; negative values are for special things like \d and so
 631 on. Zero means further processing is needed (for things like \x), or the escape
 632 is invalid. */
 633
 634 static const short int escapes[] = {
 635     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
 636     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
 637   '@', -ESC_A, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */
 638     0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
 639     0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
 640     0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
 641   '`',      7, -ESC_b,      0, -ESC_d,      0,   '\f',      0,   /* ` - g */
 642     0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */
 643     0,      0,   '\r', -ESC_s,   '\t',      0,   '\v', -ESC_w,   /* p - w */
 644     0,      0,      0                                            /* x - z */
 645 };
 646
 647 /* Definition to allow mutual recursion */
 648
 649 static BOOL
 650 compile_regex(int, int *, uschar **, const uschar **, const char **,
 651               PyObject *);
 652
 653 /* Structure for passing "static" information around between the functions
 654 doing the matching, so that they are thread-safe. */
 655
 656 typedef struct match_data {
 657   int    errorcode;             /* As it says */
 658   int   *offset_vector;         /* Offset vector */
 659   int    offset_end;            /* One past the end */
 660   BOOL   offset_overflow;       /* Set if too many extractions */
 661   BOOL   caseless;              /* Case-independent flag */
 662   BOOL   runtime_caseless;      /* Caseless forced at run time */
 663   BOOL   multiline;             /* Multiline flag */
 664   BOOL   notbol;                /* NOTBOL flag */
 665   BOOL   noteol;                /* NOTEOL flag */
 666   BOOL   dotall;                /* Dot matches any char */
 667   BOOL   endonly;               /* Dollar not before final \n */
 668   const uschar *start_subject;  /* Start of the subject string */
 669   const uschar *end_subject;    /* End of the subject string */
 670   jmp_buf fail_env;             /* Environment for longjump() break out */
 671   const uschar *end_match_ptr;  /* Subject position at end match */
 672   int     end_offset_top;       /* Highwater mark at end of match */
 673   jmp_buf error_env;          /* For longjmp() if an error occurs deep inside a
 674                                    matching operation */
 675   int    length;                /* Length of the allocated stacks */
 676   int    point;                 /* Point to add next item pushed onto stacks */
 677   /* Pointers to the 6 stacks */
 678   int *off_num, *offset_top, *r1, *r2;
 679   const uschar **eptr, **ecode;
 680 } match_data;
 681
 682
 683
 684 /*************************************************
 685 *               Global variables                 *
 686 *************************************************/
 687
 688 /* PCRE is thread-clean and doesn't use any global variables in the normal
 689 sense. However, it calls memory allocation and free functions via the two
 690 indirections below, which are can be changed by the caller, but are shared
 691 between all threads. */
 692
 693 void *(*pcre_malloc)(size_t) = malloc;
 694 void  (*pcre_free)(void *) = free;
 695
 696
 697
 698
 699 /*************************************************
 700 *          Return version string                 *
 701 *************************************************/
 702
 703 const char *
 704 pcre_version(void)
 705 {
 706 return PCRE_VERSION;
 707 }
 708
 709
 710
 711
 712 /*************************************************
 713 *       Return info about a compiled pattern     *
 714 *************************************************/
 715
 716 /* This function picks potentially useful data out of the private
 717 structure.
 718
 719 Arguments:
 720   external_re   points to compiled code
 721   optptr        where to pass back the options
 722   first_char    where to pass back the first character,
 723                 or -1 if multiline and all branches start ^,
 724                 or -2 otherwise
 725
 726 Returns:        number of identifying extraction brackets
 727                 or negative values on error
 728 */
 729
 730 int
 731 pcre_info(const pcre *external_re, int *optptr, int *first_char)
 732 {
 733 const real_pcre *re = (real_pcre *)external_re;
 734 if (re == NULL) return PCRE_ERROR_NULL;
 735 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
 736 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
 737 if (first_char != NULL)
 738   *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
 739      ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
 740 return re->top_bracket;
 741 }
 742
 743
 744
 745
 746 #ifdef DEBUG
 747 /*************************************************
 748 *        Debugging function to print chars       *
 749 *************************************************/
 750
 751 /* Print a sequence of chars in printable format, stopping at the end of the
 752 subject if the requested.
 753
 754 Arguments:
 755   p           points to characters
 756   length      number to print
 757   is_subject  TRUE if printing from within md->start_subject
 758   md          pointer to matching data block, if is_subject is TRUE
 759
 760 Returns:     nothing
 761 */
 762
 763 static void
 764 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
 765 {
 766 int c;
 767 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
 768 while (length-- > 0)
 769   if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
 770 }
 771 #endif
 772
 773
 774
 775
 776 /*************************************************
 777 *         Check subpattern for empty operand     *
 778 *************************************************/
 779
 780 /* This function checks a bracketed subpattern to see if any of the paths
 781 through it could match an empty string. This is used to diagnose an error if
 782 such a subpattern is followed by a quantifier with an unlimited upper bound.
 783
 784 Argument:
 785   code      points to the opening bracket
 786
 787 Returns:    TRUE or FALSE
 788 */
 789
 790 static BOOL
 791 could_be_empty(uschar *code)
 792 {
 793 do {
 794   uschar *cc = code + 3;
 795
 796   /* Scan along the opcodes for this branch; as soon as we find something
 797   that matches a non-empty string, break out and advance to test the next
 798   branch. If we get to the end of the branch, return TRUE for the whole
 799   sub-expression. */
 800
 801   for (;;)
 802     {
 803     /* Test an embedded subpattern; if it could not be empty, break the
 804     loop. Otherwise carry on in the branch. */
 805
 806     if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
 807       {
 808       if (!could_be_empty(cc)) break;
 809       do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
 810       cc += 3;
 811       }
 812
 813     else switch (*cc)
 814       {
 815       /* Reached end of a branch: the subpattern may match the empty string */
 816
 817       case OP_ALT:
 818       case OP_KET:
 819       case OP_KETRMAX:
 820       case OP_KETRMIN:
 821       return TRUE;
 822
 823       /* Skip over entire bracket groups with zero lower bound */
 824
 825       case OP_BRAZERO:
 826       case OP_BRAMINZERO:
 827       cc++;
 828       /* Fall through */
 829
 830       /* Skip over assertive subpatterns */
 831
 832       case OP_ASSERT:
 833       case OP_ASSERT_NOT:
 834       do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
 835       cc += 3;
 836       break;
 837
 838       /* Skip over things that don't match chars */
 839
 840       case OP_SOD:
 841       case OP_EOD:
 842       case OP_CIRC:
 843       case OP_DOLL:
 844       case OP_NOT_WORD_BOUNDARY:
 845       case OP_WORD_BOUNDARY:
 846       case OP_NOT_WORD_BOUNDARY_L:
 847       case OP_WORD_BOUNDARY_L:
 848       cc++;
 849       break;
 850
 851       /* Skip over simple repeats with zero lower bound */
 852
 853       case OP_STAR:
 854       case OP_MINSTAR:
 855       case OP_QUERY:
 856       case OP_MINQUERY:
 857       case OP_NOTSTAR:
 858       case OP_NOTMINSTAR:
 859       case OP_NOTQUERY:
 860       case OP_NOTMINQUERY:
 861       case OP_TYPESTAR:
 862       case OP_TYPEMINSTAR:
 863       case OP_TYPEQUERY:
 864       case OP_TYPEMINQUERY:
 865       cc += 2;
 866       break;
 867
 868       /* Skip over UPTOs (lower bound is zero) */
 869
 870       case OP_UPTO:
 871       case OP_MINUPTO:
 872       case OP_TYPEUPTO:
 873       case OP_TYPEMINUPTO:
 874       cc += 4;
 875       break;
 876
 877       /* Check a class or a back reference for a zero minimum */
 878
 879       case OP_CLASS:
 880       case OP_NEGCLASS:
 881       case OP_REF:
 882       case OP_CLASS_L:
 883         switch(*cc)
 884           {
 885           case (OP_REF):    cc += 2; break;
 886           case (OP_CLASS): case (OP_NEGCLASS): cc += 1+32; break;
 887           case (OP_CLASS_L): cc += 1+1+32; break;
 888           }
 889
 890       switch (*cc)
 891         {
 892         case OP_CRSTAR:
 893         case OP_CRMINSTAR:
 894         case OP_CRQUERY:
 895         case OP_CRMINQUERY:
 896         cc++;
 897         break;
 898
 899         case OP_CRRANGE:
 900         case OP_CRMINRANGE:
 901         if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
 902         cc += 3;
 903         break;
 904
 905         default:
 906         goto NEXT_BRANCH;
 907         }
 908       break;
 909
 910       /* Anything else matches at least one character */
 911
 912       default:
 913       goto NEXT_BRANCH;
 914       }
 915     }
 916
 917   NEXT_BRANCH:
 918   code += (code[1] << 8) + code[2];
 919   }
 920 while (*code == OP_ALT);
 921
 922 /* No branches match the empty string */
 923
 924 return FALSE;
 925 }
 926
 927 /* Determine the length of a group ID in an expression like
 928    (?P<foo_123>...)
 929 Arguments:
 930   ptr        pattern position pointer (say that 3 times fast)
 931   finalchar  the character that will mark the end of the ID
 932   errorptr   points to the pointer to the error message
 933 */
 934
 935 static int
 936 get_group_id(const uschar *ptr, char finalchar, const char **errorptr)
 937 {
 938   const uschar *start = ptr;
 939
 940   /* If the first character is not in \w, or is in \w but is a digit,
 941      report an error */
 942   if (!(pcre_ctypes[*ptr] & ctype_word) ||
 943       (pcre_ctypes[*ptr++] & ctype_digit))
 944     {
 945       *errorptr = "(?P identifier must start with a letter or underscore";
 946       return 0;
 947     }
 948
 949   /* Increment ptr until we either hit a null byte, the desired
 950      final character, or a non-word character */
 951   for(; (*ptr != 0) && (*ptr != finalchar) &&
 952         (pcre_ctypes[*ptr] & ctype_word); ptr++)
 953     {
 954       /* Empty loop body */
 955     }
 956   if (*ptr==finalchar)
 957     return ptr-start;
 958   if (*ptr==0)
 959     {
 960       *errorptr = "unterminated (?P identifier";
 961       return 0;
 962     }
 963   *errorptr = "illegal character in (?P identifier";
 964   return 0;
 965 }
 966
 967 /*************************************************
 968 *            Handle escapes                      *
 969 *************************************************/
 970
 971 /* This function is called when a \ has been encountered. It either returns a
 972 positive value for a simple escape such as \n, or a negative value which
 973 encodes one of the more complicated things such as \d. On entry, ptr is
 974 pointing at the \. On exit, it is on the final character of the escape
 975 sequence.
 976
 977 Arguments:
 978   ptrptr     points to the pattern position pointer
 979   errorptr   points to the pointer to the error message
 980   bracount   number of previous extracting brackets
 981   options    the options bits
 982   isclass    TRUE if inside a character class
 983
 984 Returns:     zero or positive => a data character
 985              negative => a special escape sequence
 986              on error, errorptr is set
 987 */
 988
 989 static int
 990 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
 991              int options, BOOL isclass)
 992 {
 993 const uschar *ptr = *ptrptr;
 994 int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */
 995 int i;
 996
 997 if (c == 0) *errorptr = ERR1;
 998
 999 /* Digits or letters may have special meaning; all others are literals. */
1000
1001 else if (c < '0' || c > 'z') {}
1002
1003 /* Do an initial lookup in a table. A non-zero result is something that can be
1004 returned immediately. Otherwise further processing may be required. */
1005
1006 else if ((i = escapes[c - '0']) != 0) c = i;
1007
1008 /* Escapes that need further processing, or are illegal. */
1009
1010 else
1011   {
1012
1013   switch (c)
1014     {
1015     /* The handling of escape sequences consisting of a string of digits
1016     starting with one that is not zero is not straightforward. By experiment,
1017     the way Perl works seems to be as follows:
1018
1019     Outside a character class, the digits are read as a decimal number. If the
1020     number is less than 10, or if there are that many previous extracting
1021     left brackets, then it is a back reference. Otherwise, up to three octal
1022     digits are read to form an escaped byte. Thus \123 is likely to be octal
1023     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
1024     value is greater than 377, the least significant 8 bits are taken. Inside a
1025     character class, \ followed by a digit is always an octal number. */
1026
1027     case '1': case '2': case '3': case '4': case '5':
1028     case '6': case '7': case '8': case '9':
1029
1030     {
1031       /* PYTHON: Try to compute an octal value for a character */
1032       for(c=0, i=0; ptr[i]!=0 && i<3; i++)
1033         {
1034           if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
1035             c = (c * 8 + ptr[i]-'0') & 255;
1036           else
1037             break; /* Non-octal character--break out of the loop */
1038         }
1039       /* It's a character if there were exactly 3 octal digits, or if
1040          we're inside a character class and there was at least one
1041          octal digit. */
1042       if ( (i == 3) || (isclass && i!=0) )
1043         {
1044           ptr += i-1;
1045           break;
1046         }
1047       c = ptr[0]; /* Restore the first character after the \ */
1048       c -= '0'; i = 1;
1049       while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
1050         {
1051           c = c * 10 + ptr[1] - '0';
1052           ptr++; i++;
1053         }
1054       if (c > 255 - ESC_REF) *errorptr = "back reference too big";
1055       c = -(ESC_REF + c);
1056     }
1057   break;
1058
1059     /* \0 always starts an octal number, but we may drop through to here with a
1060     larger first octal digit */
1061
1062     case '0':
1063     c -= '0';
1064     while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
1065       ptr[1] != '8' && ptr[1] != '9')
1066         c = (c * 8 + *(++ptr) - '0') & 255;
1067     break;
1068
1069     /* Special escapes not starting with a digit are straightforward */
1070
1071     case 'x':
1072   c = 0;
1073   while ( (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
1074     {
1075     ptr++;
1076     c = c * 16 + pcre_lcc[*ptr] -
1077       (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
1078     c &= 255;
1079     }
1080   break;
1081
1082
1083     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1084     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1085     for Perl compatibility, it is a literal. */
1086
1087     default:
1088     if ((options & PCRE_EXTRA) != 0) switch(c)
1089       {
1090       case 'X':
1091       c = -ESC_X;      /* This could be a lookup if it ever got into Perl */
1092       break;
1093
1094       default:
1095       *errorptr = ERR3;
1096       break;
1097       }
1098     break;
1099     }
1100   }
1101
1102 *ptrptr = ptr;
1103 return c;
1104 }
1105
1106
1107
1108 /*************************************************
1109 *            Check for counted repeat            *
1110 *************************************************/
1111
1112 /* This function is called when a '{' is encountered in a place where it might
1113 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1114 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1115 where the ddds are digits.
1116
1117 Arguments:
1118   p         pointer to the first char after '{'
1119
1120 Returns:    TRUE or FALSE
1121 */
1122
1123 static BOOL
1124 is_counted_repeat(const uschar *p)
1125 {
1126 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1127 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1128 if (*p == '}') return TRUE;
1129
1130 if (*p++ != ',') return FALSE;
1131 if (*p == '}') return TRUE;
1132
1133 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1134 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1135 return (*p == '}');
1136 }
1137
1138
1139
1140 /*************************************************
1141 *         Read repeat counts                     *
1142 *************************************************/
1143
1144 /* Read an item of the form {n,m} and return the values. This is called only
1145 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1146 so the syntax is guaranteed to be correct, but we need to check the values.
1147
1148 Arguments:
1149   p          pointer to first char after '{'
1150   minp       pointer to int for min
1151   maxp       pointer to int for max
1152              returned as -1 if no max
1153   errorptr   points to pointer to error message
1154
1155 Returns:     pointer to '}' on success;
1156              current ptr on error, with errorptr set
1157 */
1158
1159 static const uschar *
1160 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1161 {
1162 int min = 0;
1163 int max = -1;
1164
1165 while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1166
1167 if (*p == '}') max = min; else
1168   {
1169   if (*(++p) != '}')
1170     {
1171     max = 0;
1172     while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1173     if (max < min)
1174       {
1175       *errorptr = ERR4;
1176       return p;
1177       }
1178     }
1179   }
1180
1181 /* Do paranoid checks, then fill in the required variables, and pass back the
1182 pointer to the terminating '}'. */
1183
1184 if (min > 65535 || max > 65535)
1185   *errorptr = ERR5;
1186 else
1187   {
1188   *minp = min;
1189   *maxp = max;
1190   }
1191 return p;
1192 }
1193
1194
1195
1196 /*************************************************
1197 *           Compile one branch                   *
1198 *************************************************/
1199
1200 /* Scan the pattern, compiling it into the code vector.
1201
1202 Arguments:
1203   options    the option bits
1204   bracket    points to number of brackets used
1205   code       points to the pointer to the current code point
1206   ptrptr     points to the current pattern pointer
1207   errorptr   points to pointer to error message
1208
1209 Returns:     TRUE on success
1210              FALSE, with *errorptr set on error
1211 */
1212
1213 static BOOL
1214 compile_branch(int options, int *brackets, uschar **codeptr,
1215                const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
1216 {
1217 int repeat_type, op_type;
1218 int repeat_min, repeat_max;
1219 int bravalue, length;
1220 int greedy_default, greedy_non_default;
1221 register int c;
1222 register uschar *code = *codeptr;
1223 const uschar *ptr = *ptrptr;
1224 const uschar *oldptr;
1225 uschar *previous = NULL;
1226 uschar class[32];
1227 uschar *class_flag;  /* Pointer to the single-byte flag for OP_CLASS_L */
1228
1229 /* Set up the default and non-default settings for greediness */
1230
1231 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1232 greedy_non_default = greedy_default ^ 1;
1233
1234 /* Switch on next character until the end of the branch */
1235
1236 for (;; ptr++)
1237   {
1238   BOOL negate_class;
1239   int  class_charcount;
1240   int  class_lastchar;
1241
1242   c = *ptr;
1243   if ((options & PCRE_EXTENDED) != 0)
1244     {
1245     if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1246     if (c == '#')
1247       {
1248       while ((c = *(++ptr)) != 0 && c != '\n');
1249       continue;
1250       }
1251     }
1252
1253   switch(c)
1254     {
1255     /* The branch terminates at end of string, |, or ). */
1256
1257     case 0:
1258     case '|':
1259     case ')':
1260     *codeptr = code;
1261     *ptrptr = ptr;
1262     return TRUE;
1263
1264     /* Handle single-character metacharacters */
1265
1266     case '^':
1267     previous = NULL;
1268     *code++ = OP_CIRC;
1269     break;
1270
1271     case '$':
1272     previous = NULL;
1273     *code++ = OP_DOLL;
1274     break;
1275
1276     case '.':
1277     previous = code;
1278     *code++ = OP_ANY;
1279     break;
1280
1281     /* Character classes. These always build a 32-byte bitmap of the permitted
1282     characters, except in the special case where there is only one character.
1283     For negated classes, we build the map as usual, then invert it at the end.
1284     */
1285
1286     case '[':
1287     previous = code;
1288     if (options & PCRE_LOCALE)
1289       {
1290         *code++ = OP_CLASS_L;
1291         /* Set the flag for localized classes (like \w) to 0 */
1292         class_flag = code;
1293         *class_flag = 0;
1294       }
1295     else
1296       {
1297         *code++ = OP_CLASS;
1298         class_flag = NULL;
1299       }
1300
1301     /* If the first character is '^', set the negation flag, and use a
1302     different opcode. This only matters if caseless matching is specified at
1303     runtime. */
1304
1305     if ((c = *(++ptr)) == '^')
1306       {
1307       negate_class = TRUE;
1308       if (*(code-1)==OP_CLASS) *(code-1) = OP_NEGCLASS;
1309       c = *(++ptr);
1310       }
1311     else negate_class = FALSE;
1312
1313     /* Keep a count of chars so that we can optimize the case of just a single
1314     character. */
1315
1316     class_charcount = 0;
1317     class_lastchar = -1;
1318
1319     /* Initialize the 32-char bit map to all zeros. We have to build the
1320     map in a temporary bit of store, in case the class contains only 1
1321     character, because in that case the compiled code doesn't use the
1322     bit map. */
1323
1324     memset(class, 0, 32 * sizeof(uschar));
1325
1326     /* Process characters until ] is reached. By writing this as a "do" it
1327     means that an initial ] is taken as a data character. */
1328
1329     do
1330       {
1331       if (c == 0)
1332         {
1333         *errorptr = ERR6;
1334         goto FAILED;
1335         }
1336
1337       /* Backslash may introduce a single character, or it may introduce one
1338       of the specials, which just set a flag. Escaped items are checked for
1339       validity in the pre-compiling pass. The sequence \b is a special case.
1340       Inside a class (and only there) it is treated as backspace. Elsewhere
1341       it marks a word boundary. Other escapes have preset maps ready to
1342       or into the one we are building. We assume they have more than one
1343       character in them, so set class_count bigger than one. */
1344
1345       if (c == '\\')
1346         {
1347         c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1348         if (-c == ESC_b) c = '\b';
1349         else if (c < 0)
1350           {
1351           class_charcount = 10;
1352           switch (-c)
1353             {
1354             case ESC_d:
1355               {
1356                 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
1357               }
1358             continue;
1359
1360             case ESC_D:
1361               {
1362                 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
1363               }
1364             continue;
1365
1366             case ESC_w:
1367             if (options & PCRE_LOCALE)
1368               {
1369                 *class_flag |= 1;
1370               }
1371             else
1372               {
1373                 for (c = 0; c < 32; c++)
1374                   class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1375               }
1376             continue;
1377
1378             case ESC_W:
1379             if (options & PCRE_LOCALE)
1380               {
1381                 *class_flag |= 2;
1382               }
1383             else
1384               {
1385                 for (c = 0; c < 32; c++)
1386                   class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1387               }
1388             continue;
1389
1390             case ESC_s:
1391               {
1392                 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
1393               }
1394             continue;
1395
1396             case ESC_S:
1397               {
1398                 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
1399               }
1400             continue;
1401
1402             default:
1403             *errorptr = ERR7;
1404             goto FAILED;
1405             }
1406           }
1407         /* Fall through if single character */
1408         }
1409
1410       /* A single character may be followed by '-' to form a range. However,
1411       Perl does not permit ']' to be the end of the range. A '-' character
1412       here is treated as a literal. */
1413
1414       if (ptr[1] == '-' && ptr[2] != ']')
1415         {
1416         int d;
1417         ptr += 2;
1418         d = *ptr;
1419
1420         if (d == 0)
1421           {
1422           *errorptr = ERR6;
1423           goto FAILED;
1424           }
1425
1426         /* The second part of a range can be a single-character escape, but
1427         not any of the other escapes. */
1428
1429         if (d == '\\')
1430           {
1431           d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1432           if (d < 0)
1433             {
1434             if (d == -ESC_b) d = '\b'; else
1435               {
1436               *errorptr = ERR7;
1437               goto FAILED;
1438               }
1439             }
1440           }
1441
1442         if (d < c)
1443           {
1444           *errorptr = ERR8;
1445           goto FAILED;
1446           }
1447
1448         for (; c <= d; c++)
1449           {
1450           class[c/8] |= (1 << (c&7));
1451           if ((options & PCRE_CASELESS) != 0)
1452             {
1453             int uc = pcre_fcc[c];           /* flip case */
1454             class[uc/8] |= (1 << (uc&7));
1455             }
1456           class_charcount++;                /* in case a one-char range */
1457           class_lastchar = c;
1458           }
1459         continue;   /* Go get the next char in the class */
1460         }
1461
1462       /* Handle a lone single character - we can get here for a normal
1463       non-escape char, or after \ that introduces a single character. */
1464
1465       class [c/8] |= (1 << (c&7));
1466       if ((options & PCRE_CASELESS) != 0)
1467         {
1468         c = pcre_fcc[c];   /* flip case */
1469         class[c/8] |= (1 << (c&7));
1470         }
1471       class_charcount++;
1472       class_lastchar = c;
1473       }
1474
1475     /* Loop until ']' reached; the check for end of string happens inside the
1476     loop. This "while" is the end of the "do" above. */
1477
1478     while ((c = *(++ptr)) != ']');
1479
1480     /* If class_charcount is 1 and class_lastchar is not negative, we saw
1481     precisely one character. This doesn't need the whole 32-byte bit map.
1482     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1483     it's negative. */
1484
1485     if (class_charcount == 1 && class_lastchar >= 0)
1486       {
1487       if (negate_class)
1488         {
1489         code[-1] = OP_NOT;
1490         }
1491       else
1492         {
1493         code[-1] = OP_CHARS;
1494         *code++ = 1;
1495         }
1496       *code++ = class_lastchar;
1497       }
1498
1499     /* Otherwise, negate the 32-byte map if necessary, and copy it into
1500     the code vector. */
1501
1502     else
1503       {
1504         /* If this is a localized opcode, bump the code pointer up */
1505         if (class_flag) code++;
1506       if (negate_class)
1507         {
1508           if (class_flag) *class_flag = (*class_flag) ^ 63;
1509           for (c = 0; c < 32; c++) code[c] = ~class[c];
1510         }
1511       else
1512         memcpy(code, class, 32);
1513       code += 32;
1514       }
1515     break;
1516
1517     /* Various kinds of repeat */
1518
1519     case '{':
1520     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1521     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
1522     if (*errorptr != NULL) goto FAILED;
1523     goto REPEAT;
1524
1525     case '*':
1526     repeat_min = 0;
1527     repeat_max = -1;
1528     goto REPEAT;
1529
1530     case '+':
1531     repeat_min = 1;
1532     repeat_max = -1;
1533     goto REPEAT;
1534
1535     case '?':
1536     repeat_min = 0;
1537     repeat_max = 1;
1538
1539     REPEAT:
1540     if (previous == NULL)
1541       {
1542       *errorptr = ERR9;
1543       goto FAILED;
1544       }
1545
1546     /* If the next character is '?' this is a minimizing repeat, by default,
1547     but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1548     next character. */
1549
1550     if (ptr[1] == '?')
1551       { repeat_type = greedy_non_default; ptr++; }
1552     else repeat_type = greedy_default;
1553
1554     /* If the maximum is zero then the minimum must also be zero; Perl allows
1555     this case, so we do too - by simply omitting the item altogether. */
1556
1557     if (repeat_max == 0) code = previous;
1558
1559     /* If previous was a string of characters, chop off the last one and use it
1560     as the subject of the repeat. If there was only one character, we can
1561     abolish the previous item altogether. */
1562
1563     else if (*previous == OP_CHARS)
1564       {
1565       int len = previous[1];
1566       if (len == 1)
1567         {
1568         c = previous[2];
1569         code = previous;
1570         }
1571       else
1572         {
1573         c = previous[len+1];
1574         previous[1]--;
1575         code--;
1576         }
1577       op_type = 0;                 /* Use single-char op codes */
1578       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
1579       }
1580
1581     /* If previous was a single negated character ([^a] or similar), we use
1582     one of the special opcodes, replacing it. The code is shared with single-
1583     character repeats by adding a suitable offset into repeat_type. */
1584
1585     else if ((int)*previous == OP_NOT)
1586       {
1587       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
1588       c = previous[1];
1589       code = previous;
1590       goto OUTPUT_SINGLE_REPEAT;
1591       }
1592
1593     /* If previous was a character type match (\d or similar), abolish it and
1594     create a suitable repeat item. The code is shared with single-character
1595     repeats by adding a suitable offset into repeat_type. */
1596
1597     else if ((int)*previous < OP_CIRC || *previous == OP_ANY)
1598       {
1599       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
1600       c = *previous;
1601       code = previous;
1602
1603       OUTPUT_SINGLE_REPEAT:
1604       repeat_type += op_type;      /* Combine both values for many cases */
1605
1606       /* A minimum of zero is handled either as the special case * or ?, or as
1607       an UPTO, with the maximum given. */
1608
1609       if (repeat_min == 0)
1610         {
1611         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1612           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1613         else
1614           {
1615           *code++ = OP_UPTO + repeat_type;
1616           *code++ = repeat_max >> 8;
1617           *code++ = (repeat_max & 255);
1618           }
1619         }
1620
1621       /* The case {1,} is handled as the special case + */
1622
1623       else if (repeat_min == 1 && repeat_max == -1)
1624         *code++ = OP_PLUS + repeat_type;
1625
1626       /* The case {n,n} is just an EXACT, while the general case {n,m} is
1627       handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1628
1629       else
1630         {
1631         if (repeat_min != 1)
1632           {
1633           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
1634           *code++ = repeat_min >> 8;
1635           *code++ = (repeat_min & 255);
1636           }
1637
1638         /* If the minimum is 1 and the previous item was a character string,
1639         we either have to put back the item that got canceled if the string
1640         length was 1, or add the character back onto the end of a longer
1641         string. For a character type nothing need be done; it will just get
1642         put back naturally. Note that the final character is always going to
1643         get added below. */
1644
1645         else if (*previous == OP_CHARS)
1646           {
1647           if (code == previous) code += 2; else previous[1]++;
1648           }
1649
1650         /*  For a single negated character we also have to put back the
1651         item that got canceled. */
1652
1653         else if (*previous == OP_NOT) code++;
1654
1655         /* If the maximum is unlimited, insert an OP_STAR. */
1656
1657         if (repeat_max < 0)
1658           {
1659           *code++ = c;
1660           *code++ = OP_STAR + repeat_type;
1661           }
1662
1663         /* Else insert an UPTO if the max is greater than the min. */
1664
1665         else if (repeat_max != repeat_min)
1666           {
1667           *code++ = c;
1668           repeat_max -= repeat_min;
1669           *code++ = OP_UPTO + repeat_type;
1670           *code++ = repeat_max >> 8;
1671           *code++ = (repeat_max & 255);
1672           }
1673         }
1674
1675       /* The character or character type itself comes last in all cases. */
1676
1677       *code++ = c;
1678       }
1679
1680     /* If previous was a character class or a back reference, we put the repeat
1681     stuff after it. */
1682
1683     else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1684              *previous==OP_CLASS_L || *previous == OP_REF)
1685       {
1686       if (repeat_min == 0 && repeat_max == -1)
1687         *code++ = OP_CRSTAR + repeat_type;
1688       else if (repeat_min == 1 && repeat_max == -1)
1689         *code++ = OP_CRPLUS + repeat_type;
1690       else if (repeat_min == 0 && repeat_max == 1)
1691         *code++ = OP_CRQUERY + repeat_type;
1692       else
1693         {
1694         *code++ = OP_CRRANGE + repeat_type;
1695         *code++ = repeat_min >> 8;
1696         *code++ = repeat_min & 255;
1697         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
1698         *code++ = repeat_max >> 8;
1699         *code++ = repeat_max & 255;
1700         }
1701       }
1702
1703     /* If previous was a bracket group, we may have to replicate it in certain
1704     cases. If the maximum repeat count is unlimited, check that the bracket
1705     group cannot match the empty string, and diagnose an error if it can. */
1706
1707     else if ((int)*previous >= OP_BRA)
1708       {
1709       int i;
1710       int len = code - previous;
1711
1712       if (repeat_max == -1 && could_be_empty(previous))
1713         {
1714         *errorptr = ERR10;
1715         goto FAILED;
1716         }
1717
1718       /* If the minimum is greater than zero, and the maximum is unlimited or
1719       equal to the minimum, the first copy remains where it is, and is
1720       replicated up to the minimum number of times. This case includes the +
1721       repeat, but of course no replication is needed in that case. */
1722
1723       if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1724         {
1725         for (i = 1; i < repeat_min; i++)
1726           {
1727           memcpy(code, previous, len);
1728           code += len;
1729           }
1730         }
1731
1732       /* If the minimum is zero, stick BRAZERO in front of the first copy.
1733       Then, if there is a fixed upper limit, replicated up to that many times,
1734       sticking BRAZERO in front of all the optional ones. */
1735
1736       else
1737         {
1738         if (repeat_min == 0)
1739           {
1740           memmove(previous+1, previous, len);
1741           code++;
1742           *previous++ = OP_BRAZERO + repeat_type;
1743           }
1744
1745         for (i = 1; i < repeat_min; i++)
1746           {
1747           memcpy(code, previous, len);
1748           code += len;
1749           }
1750
1751         for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1752           {
1753           *code++ = OP_BRAZERO + repeat_type;
1754           memcpy(code, previous, len);
1755           code += len;
1756           }
1757         }
1758
1759       /* If the maximum is unlimited, set a repeater in the final copy. */
1760
1761       if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1762       }
1763
1764     /* Else there's some kind of shambles */
1765
1766     else
1767       {
1768       *errorptr = ERR11;
1769       goto FAILED;
1770       }
1771
1772     /* In all case we no longer have a previous item. */
1773
1774     previous = NULL;
1775     break;
1776
1777
1778     /* Start of nested bracket sub-expression, or comment or lookahead.
1779     First deal with special things that can come after a bracket; all are
1780     introduced by ?, and the appearance of any of them means that this is not a
1781     referencing group. They were checked for validity in the first pass over
1782     the string, so we don't have to check for syntax errors here.  */
1783
1784     case '(':
1785     previous = code;              /* Only real brackets can be repeated */
1786     if (*(++ptr) == '?')
1787       {
1788       bravalue = OP_BRA;
1789
1790       switch (*(++ptr))
1791         {
1792         case '#':
1793         case 'i':
1794         case 'L':
1795         case 'm':
1796         case 's':
1797         case 'x':
1798         ptr++;
1799         while (*ptr != ')') ptr++;
1800         previous = NULL;
1801         continue;
1802
1803         case ':':                 /* Non-extracting bracket */
1804         ptr++;
1805         break;
1806
1807         case '=':                 /* Assertions can't be repeated */
1808         bravalue = OP_ASSERT;
1809         ptr++;
1810         previous = NULL;
1811         break;
1812
1813         case '!':
1814         bravalue = OP_ASSERT_NOT;
1815         ptr++;
1816         previous = NULL;
1817         break;
1818
1819         case ('P'):
1820           ptr++;
1821           if (*ptr=='<')
1822             {
1823               /* (?P<groupname>...) */
1824               int idlen;
1825               PyObject *string, *intobj;
1826
1827               ptr++;
1828               idlen = get_group_id(ptr, '>', errorptr);
1829               if (*errorptr) {
1830                 goto FAILED;
1831               }
1832               string = PyString_FromStringAndSize((char*)ptr, idlen);
1833               intobj = PyInt_FromLong( brackets[0] + 1 );
1834               if (intobj == NULL || string == NULL)
1835                 {
1836                   Py_XDECREF(string);
1837                   Py_XDECREF(intobj);
1838                   *errorptr = "exception raised";
1839                   goto FAILED;
1840                 }
1841               PyDict_SetItem(dictionary, string, intobj);
1842               Py_DECREF(string); Py_DECREF(intobj); /* XXX DECREF commented out! */
1843               ptr += idlen+1;  /* Point to rest of expression */
1844               goto do_grouping_bracket;
1845             }
1846           if (*ptr=='=')
1847             {
1848               /* (?P=groupname) */
1849               int idlen, refnum;
1850               PyObject *string, *intobj;
1851
1852               ptr++;
1853               idlen = get_group_id(ptr, ')', errorptr);
1854               if (*errorptr) {
1855                 goto FAILED;
1856               }
1857               string = PyString_FromStringAndSize((char *)ptr, idlen);
1858               if (string==NULL) {
1859                   *errorptr = "exception raised";
1860                   goto FAILED;
1861                 }
1862               intobj = PyDict_GetItem(dictionary, string);
1863               if (intobj==NULL) {
1864                 Py_DECREF(string);
1865                 *errorptr = "?P= group identifier isn't defined";
1866                 goto FAILED;
1867               }
1868
1869               refnum = PyInt_AsLong(intobj);
1870               Py_DECREF(string);
1871               /* The caller doesn't own the reference to the value
1872                  returned from PyDict_GetItem, so intobj is not
1873                  DECREF'ed. */
1874
1875               *code++ = OP_REF;
1876               *code++ = refnum;
1877               /* The continue will cause the top-level for() loop to
1878                  be resumed, so ptr will be immediately incremented.
1879                  Therefore, the following line adds just idlen, not
1880                  idlen+1 */
1881               ptr += idlen;
1882               continue;
1883             }
1884           /* The character after ?P is neither < nor =, so
1885              report an error.  Add more Python-extensions here. */
1886           *errorptr="unknown after (?P";
1887           goto FAILED;
1888
1889         case '>':                         /* "Match once" brackets */
1890         if ((options & PCRE_EXTRA) != 0)  /* Not yet standard */
1891           {
1892           bravalue = OP_ONCE;
1893           ptr++;
1894           previous = NULL;
1895           break;
1896           }
1897         /* Else fall through */
1898
1899         default:
1900         *errorptr = ERR12;
1901         goto FAILED;
1902         }
1903       }
1904
1905     /* Else we have a referencing group */
1906
1907     else
1908       {
1909       do_grouping_bracket:
1910       if (++(*brackets) > EXTRACT_MAX)
1911         {
1912         *errorptr = ERR13;
1913         goto FAILED;
1914         }
1915       bravalue = OP_BRA + *brackets;
1916       }
1917
1918     /* Process nested bracketed re; at end pointer is on the bracket. We copy
1919     code into a non-register variable in order to be able to pass its address
1920     because some compilers complain otherwise. */
1921
1922     *code = bravalue;
1923       {
1924       uschar *mcode = code;
1925       if (!compile_regex(options, brackets, &mcode, &ptr, errorptr, dictionary))
1926         goto FAILED;
1927       code = mcode;
1928       }
1929
1930     if (*ptr != ')')
1931       {
1932       *errorptr = ERR14;
1933       goto FAILED;
1934       }
1935     break;
1936
1937     /* Check \ for being a real metacharacter; if not, fall through and handle
1938     it as a data character at the start of a string. Escape items are checked
1939     for validity in the pre-compiling pass. */
1940
1941     case '\\':
1942     oldptr = ptr;
1943     c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1944
1945     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1946     are arranged to be the negation of the corresponding OP_values. For the
1947     back references, the values are ESC_REF plus the reference number. Only
1948     back references and those types that consume a character may be repeated.
1949     We can test for values between ESC_b and ESC_Z for the latter; this may
1950     have to change if any new ones are ever created. */
1951
1952     if (c < 0)
1953       {
1954       if (-c >= ESC_REF)
1955         {
1956         int refnum = -c - ESC_REF;
1957         if (*brackets < refnum)
1958           {
1959           *errorptr = ERR15;
1960           goto FAILED;
1961           }
1962         previous = code;
1963         *code++ = OP_REF;
1964         *code++ = refnum;
1965         }
1966       else
1967         {
1968         previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1969         if ( (options & PCRE_LOCALE) != 0)
1970           {
1971             switch (c)
1972               {
1973                 case (-ESC_b): c = -OP_WORD_BOUNDARY_L; break;
1974                 case (-ESC_B): c = -OP_NOT_WORD_BOUNDARY_L; break;
1975                 case (-ESC_w): c = -OP_WORDCHAR_L; break;
1976                 case (-ESC_W): c = -OP_NOT_WORDCHAR_L; break;
1977               }
1978           }
1979         *code++ = -c;
1980         }
1981       continue;
1982       }
1983
1984     /* Data character: Reset and fall through */
1985
1986     ptr = oldptr;
1987     c = '\\';
1988
1989     /* Handle a run of data characters until a metacharacter is encountered.
1990     The first character is guaranteed not to be whitespace or # when the
1991     extended flag is set. */
1992
1993     NORMAL_CHAR:
1994     default:
1995     previous = code;
1996     *code = OP_CHARS;
1997     code += 2;
1998     length = 0;
1999
2000     do
2001       {
2002       if ((options & PCRE_EXTENDED) != 0)
2003         {
2004         if ((pcre_ctypes[c] & ctype_space) != 0) continue;
2005         if (c == '#')
2006           {
2007           while ((c = *(++ptr)) != 0 && c != '\n');
2008           if (c == 0) break;
2009           continue;
2010           }
2011         }
2012
2013       /* Backslash may introduce a data char or a metacharacter. Escaped items
2014       are checked for validity in the pre-compiling pass. Stop the string
2015       before a metaitem. */
2016
2017       if (c == '\\')
2018         {
2019         oldptr = ptr;
2020         c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
2021         if (c < 0) { ptr = oldptr; break; }
2022         }
2023
2024       /* Ordinary character or single-char escape */
2025
2026       *code++ = c;
2027       length++;
2028       }
2029
2030     /* This "while" is the end of the "do" above. */
2031
2032     while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2033
2034     /* Compute the length and set it in the data vector, and advance to
2035     the next state. */
2036
2037     previous[1] = length;
2038     if (length < 255) ptr--;
2039     break;
2040     }
2041   }                   /* end of big loop */
2042
2043 /* Control never reaches here by falling through, only by a goto for all the
2044 error states. Pass back the position in the pattern so that it can be displayed
2045 to the user for diagnosing the error. */
2046
2047 FAILED:
2048 *ptrptr = ptr;
2049 return FALSE;
2050 }
2051
2052
2053
2054
2055 /*************************************************
2056 *     Compile sequence of alternatives           *
2057 *************************************************/
2058
2059 /* On entry, ptr is pointing past the bracket character, but on return
2060 it points to the closing bracket, or vertical bar, or end of string.
2061 The code variable is pointing at the byte into which the BRA operator has been
2062 stored.
2063
2064 Argument:
2065   options   the option bits
2066   brackets  -> int containing the number of extracting brackets used
2067   codeptr   -> the address of the current code pointer
2068   ptrptr    -> the address of the current pattern pointer
2069   errorptr  -> pointer to error message
2070
2071 Returns:    TRUE on success
2072 */
2073
2074 static BOOL
2075 compile_regex(int options, int *brackets, uschar **codeptr,
2076   const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
2077 {
2078 const uschar *ptr = *ptrptr;
2079 uschar *code = *codeptr;
2080 uschar *start_bracket = code;
2081
2082 for (;;)
2083   {
2084   int length;
2085   uschar *last_branch = code;
2086
2087   code += 3;
2088   if (!compile_branch(options, brackets, &code, &ptr, errorptr, dictionary))
2089     {
2090     *ptrptr = ptr;
2091     return FALSE;
2092     }
2093
2094   /* Fill in the length of the last branch */
2095
2096   length = code - last_branch;
2097   last_branch[1] = length >> 8;
2098   last_branch[2] = length & 255;
2099
2100   /* Reached end of expression, either ')' or end of pattern. Insert a
2101   terminating ket and the length of the whole bracketed item, and return,
2102   leaving the pointer at the terminating char. */
2103
2104   if (*ptr != '|')
2105     {
2106     length = code - start_bracket;
2107     *code++ = OP_KET;
2108     *code++ = length >> 8;
2109     *code++ = length & 255;
2110     *codeptr = code;
2111     *ptrptr = ptr;
2112     return TRUE;
2113     }
2114
2115   /* Another branch follows; insert an "or" node and advance the pointer. */
2116
2117   *code = OP_ALT;
2118   ptr++;
2119   }
2120 /* Control never reaches here */
2121 }
2122
2123
2124
2125 /*************************************************
2126 *          Check for anchored expression         *
2127 *************************************************/
2128
2129 /* Try to find out if this is an anchored regular expression. Consider each
2130 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2131 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2132 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2133 counts, since OP_CIRC can match in the middle.
2134
2135 A branch is also implicitly anchored if it starts with .* because that will try
2136 the rest of the pattern at all possible matching points, so there is no point
2137 trying them again.
2138
2139 Argument:  points to start of expression (the bracket)
2140 Returns:   TRUE or FALSE
2141 */
2142
2143 static BOOL
2144 is_anchored(register const uschar *code, BOOL multiline)
2145 {
2146 do {
2147    int op = (int)code[3];
2148    if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
2149      { if (!is_anchored(code+3, multiline)) return FALSE; }
2150    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2151      { if (code[4] != OP_ANY) return FALSE; }
2152    else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
2153    code += (code[1] << 8) + code[2];
2154    }
2155 while (*code == OP_ALT);
2156 return TRUE;
2157 }
2158
2159
2160
2161 /*************************************************
2162 *     Check for start with \n line expression    *
2163 *************************************************/
2164
2165 /* This is called for multiline expressions to try to find out if every branch
2166 starts with ^ so that "first char" processing can be done to speed things up.
2167
2168 Argument:  points to start of expression (the bracket)
2169 Returns:   TRUE or FALSE
2170 */
2171
2172 static BOOL
2173 is_startline(const uschar *code)
2174 {
2175 do {
2176    if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2177      { if (!is_startline(code+3)) return FALSE; }
2178    else if (code[3] != OP_CIRC) return FALSE;
2179    code += (code[1] << 8) + code[2];
2180    }
2181 while (*code == OP_ALT);
2182 return TRUE;
2183 }
2184
2185
2186
2187 /*************************************************
2188 *          Check for fixed first char            *
2189 *************************************************/
2190
2191 /* Try to find out if there is a fixed first character. This is called for
2192 unanchored expressions, as it speeds up their processing quite considerably.
2193 Consider each alternative branch. If they all start with the same char, or with
2194 a bracket all of whose alternatives start with the same char (recurse ad lib),
2195 then we return that char, otherwise -1.
2196
2197 Argument:  points to start of expression (the bracket)
2198 Returns:   -1 or the fixed first char
2199 */
2200
2201 static int
2202 find_firstchar(uschar *code)
2203 {
2204 register int c = -1;
2205 do
2206   {
2207   register int charoffset = 4;
2208
2209   if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2210     {
2211     register int d;
2212     if ((d = find_firstchar(code+3)) < 0) return -1;
2213     if (c < 0) c = d; else if (c != d) return -1;
2214     }
2215
2216   else switch(code[3])
2217     {
2218     default:
2219     return -1;
2220
2221     case OP_EXACT:       /* Fall through */
2222     charoffset++;
2223
2224     case OP_CHARS:       /* Fall through */
2225     charoffset++;
2226
2227     case OP_PLUS:
2228     case OP_MINPLUS:
2229     if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
2230     break;
2231     }
2232   code += (code[1] << 8) + code[2];
2233   }
2234 while (*code == OP_ALT);
2235 return c;
2236 }
2237
2238
2239
2240 /*************************************************
2241 *        Compile a Regular Expression            *
2242 *************************************************/
2243
2244 /* This function takes a string and returns a pointer to a block of store
2245 holding a compiled version of the expression.
2246
2247 Arguments:
2248   pattern      the regular expression
2249   options      various option bits
2250   errorptr     pointer to pointer to error text
2251   erroroffset  ptr offset in pattern where error was detected
2252
2253 Returns:       pointer to compiled data block, or NULL on error,
2254                with errorptr and erroroffset set
2255 */
2256
2257 pcre *
2258 pcre_compile(const char *pattern, int options, const char **errorptr,
2259              int *erroroffset, PyObject *dictionary)
2260 {
2261 real_pcre *re;
2262 int spaces = 0;
2263 int length = 3;      /* For initial BRA plus length */
2264 int runlength;
2265 int c, size;
2266 int bracount = 0;
2267 int brastack[200];
2268 int top_backref = 0;
2269 unsigned int brastackptr = 0;
2270 uschar *code;
2271 const uschar *ptr;
2272
2273 #ifdef DEBUG
2274 uschar *code_base, *code_end;
2275 #endif
2276
2277 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2278 can do is just return NULL. */
2279
2280 if (errorptr == NULL) return NULL;
2281 *errorptr = NULL;
2282
2283 /* However, we can give a message for this error */
2284
2285 if (erroroffset == NULL)
2286   {
2287   *errorptr = ERR16;
2288   return NULL;
2289   }
2290 *erroroffset = 0;
2291
2292 if ((options & ~PUBLIC_OPTIONS) != 0)
2293   {
2294   *errorptr = ERR17;
2295   return NULL;
2296   }
2297
2298 DPRINTF(("------------------------------------------------------------------\n"));
2299 DPRINTF(("%s\n", pattern));
2300
2301 /* The first thing to do is to make a pass over the pattern to compute the
2302 amount of store required to hold the compiled code. This does not have to be
2303 perfect as long as errors are overestimates. At the same time we can detect any
2304 internal flag settings. Make an attempt to correct for any counted white space
2305 if an "extended" flag setting appears late in the pattern. We can't be so
2306 clever for #-comments. */
2307
2308 ptr = (const uschar *)(pattern - 1);
2309 while ((c = *(++ptr)) != 0)
2310   {
2311   int min, max;
2312   int class_charcount;
2313
2314   if ((pcre_ctypes[c] & ctype_space) != 0)
2315     {
2316     if ((options & PCRE_EXTENDED) != 0) continue;
2317     spaces++;
2318     }
2319
2320   if (c == '#' && (options & PCRE_EXTENDED) != 0)
2321     {
2322     while ((c = *(++ptr)) != 0 && c != '\n');
2323     continue;
2324     }
2325
2326   switch(c)
2327     {
2328     /* A backslashed item may be an escaped "normal" character or a
2329     character type. For a "normal" character, put the pointers and
2330     character back so that tests for whitespace etc. in the input
2331     are done correctly. */
2332
2333     case '\\':
2334       {
2335       const uschar *save_ptr = ptr;
2336       c = check_escape(&ptr, errorptr, bracount, options, FALSE);
2337       if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2338       if (c >= 0)
2339         {
2340         ptr = save_ptr;
2341         c = '\\';
2342         goto NORMAL_CHAR;
2343         }
2344       }
2345     length++;
2346
2347     /* A back reference needs an additional char, plus either one or 5
2348     bytes for a repeat. We also need to keep the value of the highest
2349     back reference. */
2350
2351     if (c <= -ESC_REF)
2352       {
2353       int refnum = -c - ESC_REF;
2354       if (refnum > top_backref) top_backref = refnum;
2355       length++;   /* For single back reference */
2356       if (ptr[1] == '{' && is_counted_repeat(ptr+2))
2357         {
2358         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2359         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2360         if ((min == 0 && (max == 1 || max == -1)) ||
2361           (min == 1 && max == -1))
2362             length++;
2363         else length += 5;
2364         if (ptr[1] == '?') ptr++;
2365         }
2366       }
2367     continue;
2368
2369     case '^':
2370     case '.':
2371     case '$':
2372     case '*':     /* These repeats won't be after brackets; */
2373     case '+':     /* those are handled separately */
2374     case '?':
2375     length++;
2376     continue;
2377
2378     /* This covers the cases of repeats after a single char, metachar, class,
2379     or back reference. */
2380
2381     case '{':
2382     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
2383     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
2384     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2385     if ((min == 0 && (max == 1 || max == -1)) ||
2386       (min == 1 && max == -1))
2387         length++;
2388     else
2389       {
2390       length--;   /* Uncount the original char or metachar */
2391       if (min == 1) length++; else if (min > 0) length += 4;
2392       if (max > 0) length += 4; else length += 2;
2393       }
2394     if (ptr[1] == '?') ptr++;
2395     continue;
2396
2397     /* An alternation contains an offset to the next branch or ket. */
2398     case '|':
2399     length += 3;
2400     continue;
2401
2402     /* A character class uses 33 characters. Don't worry about character types
2403     that aren't allowed in classes - they'll get picked up during the compile.
2404     A character class that contains only one character uses 2 or 3 bytes,
2405     depending on whether it is negated or not. Notice this where we can. */
2406
2407     case '[':
2408     class_charcount = 0;
2409     if (*(++ptr) == '^') ptr++;
2410     do
2411       {
2412       if (*ptr == '\\')
2413         {
2414         int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
2415         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2416         if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2417         }
2418       else class_charcount++;
2419       ptr++;
2420       }
2421     while (*ptr != 0 && *ptr != ']');
2422
2423     /* Repeats for negated single chars are handled by the general code */
2424
2425     if (class_charcount == 1) length += 3; else
2426       {
2427       length += 33;
2428       if (options & PCRE_LOCALE) length++;  /* Add a byte for the localization flag */
2429
2430       /* A repeat needs either 1 or 5 bytes. */
2431
2432       if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
2433         {
2434         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2435         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2436         if ((min == 0 && (max == 1 || max == -1)) ||
2437           (min == 1 && max == -1))
2438             length++;
2439         else length += 5;
2440         if (ptr[1] == '?') ptr++;
2441         }
2442       }
2443     continue;
2444
2445     /* Brackets may be genuine groups or special things */
2446
2447     case '(':
2448
2449     /* Handle special forms of bracket, which all start (? */
2450
2451     if (ptr[1] == '?') switch (c = ptr[2])
2452       {
2453       /* Skip over comments entirely */
2454       case '#':
2455       ptr += 3;
2456       while (*ptr != 0 && *ptr != ')') ptr++;
2457       if (*ptr == 0)
2458         {
2459         *errorptr = ERR18;
2460         goto PCRE_ERROR_RETURN;
2461         }
2462       continue;
2463
2464       /* Non-referencing groups and lookaheads just move the pointer on, and
2465       then behave like a non-special bracket, except that they don't increment
2466       the count of extracting brackets. */
2467
2468       case ':':
2469       case '=':
2470       case '!':
2471       ptr += 2;
2472       break;
2473
2474       case ('P'):
2475         {
2476           int idlen;
2477           switch (*ptr++) {
2478           case ('<'):
2479             idlen = get_group_id(ptr++, '>', errorptr);
2480             if (*errorptr) goto PCRE_ERROR_RETURN;
2481             ptr += idlen+1;
2482             break;
2483           case ('='):
2484             idlen = get_group_id(ptr++, ')', errorptr);
2485             if (*errorptr) goto PCRE_ERROR_RETURN;
2486             ptr += idlen+1;
2487             length++;
2488             break;
2489           }
2490         }
2491         break;
2492
2493       /* Ditto for the "once only" bracket, allowed only if the extra bit
2494       is set. */
2495
2496       case '>':
2497       if ((options & PCRE_EXTRA) != 0)
2498         {
2499         ptr += 2;
2500         break;
2501         }
2502       /* Else fall through */
2503
2504       /* Else loop setting valid options until ) is met. Anything else is an
2505       error. */
2506
2507       default:
2508       ptr += 2;
2509       for (;; ptr++)
2510         {
2511         if ((c = *ptr) == 'i')
2512           {
2513           options |= PCRE_CASELESS;
2514           continue;
2515           }
2516         else if ((c = *ptr) == 'L')
2517           {
2518           options |= PCRE_LOCALE;
2519           continue;
2520           }
2521         else if ((c = *ptr) == 'm')
2522           {
2523           options |= PCRE_MULTILINE;
2524           continue;
2525           }
2526         else if (c == 's')
2527           {
2528           options |= PCRE_DOTALL;
2529           continue;
2530           }
2531         else if (c == 'x')
2532           {
2533           options |= PCRE_EXTENDED;
2534           length -= spaces;          /* Already counted spaces */
2535           continue;
2536           }
2537         else if (c == ')') break;
2538
2539         *errorptr = ERR12;
2540         goto PCRE_ERROR_RETURN;
2541         }
2542       continue;                      /* End of this bracket handling */
2543       }
2544
2545     /* Extracting brackets must be counted so we can process escapes in a
2546     Perlish way. */
2547
2548     else bracount++;
2549
2550     /* Non-special forms of bracket. Save length for computing whole length
2551     at end if there's a repeat that requires duplication of the group. */
2552
2553     if (brastackptr >= sizeof(brastack)/sizeof(int))
2554       {
2555       *errorptr = ERR19;
2556       goto PCRE_ERROR_RETURN;
2557       }
2558
2559     brastack[brastackptr++] = length;
2560     length += 3;
2561     continue;
2562
2563     /* Handle ket. Look for subsequent max/min; for certain sets of values we
2564     have to replicate this bracket up to that many times. If brastackptr is
2565     0 this is an unmatched bracket which will generate an error, but take care
2566     not to try to access brastack[-1]. */
2567
2568     case ')':
2569     length += 3;
2570       {
2571       int minval = 1;
2572       int maxval = 1;
2573       int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
2574
2575       /* Leave ptr at the final char; for read_repeat_counts this happens
2576       automatically; for the others we need an increment. */
2577
2578       if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
2579         {
2580         ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
2581         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2582         }
2583       else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2584       else if (c == '+') { maxval = -1; ptr++; }
2585       else if (c == '?') { minval = 0; ptr++; }
2586
2587       /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2588       if there is a limited maximum we have to replicate up to maxval-1 times
2589       and allow for a BRAZERO item before each optional copy, as we also have
2590       to do before the first copy if the minimum is zero. */
2591
2592       if (minval == 0) length++;
2593         else if (minval > 1) length += (minval - 1) * duplength;
2594       if (maxval > minval) length += (maxval - minval) * (duplength + 1);
2595       }
2596     continue;
2597
2598     /* Non-special character. For a run of such characters the length required
2599     is the number of characters + 2, except that the maximum run length is 255.
2600     We won't get a skipped space or a non-data escape or the start of a #
2601     comment as the first character, so the length can't be zero. */
2602
2603     NORMAL_CHAR:
2604     default:
2605     length += 2;
2606     runlength = 0;
2607     do
2608       {
2609       if ((pcre_ctypes[c] & ctype_space) != 0)
2610         {
2611         if ((options & PCRE_EXTENDED) != 0) continue;
2612         spaces++;
2613         }
2614
2615       if (c == '#' && (options & PCRE_EXTENDED) != 0)
2616         {
2617         while ((c = *(++ptr)) != 0 && c != '\n');
2618         continue;
2619         }
2620
2621       /* Backslash may introduce a data char or a metacharacter; stop the
2622       string before the latter. */
2623
2624       if (c == '\\')
2625         {
2626         const uschar *saveptr = ptr;
2627         c = check_escape(&ptr, errorptr, bracount, options, FALSE);
2628         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2629         if (c < 0) { ptr = saveptr; break; }
2630         }
2631
2632       /* Ordinary character or single-char escape */
2633
2634       runlength++;
2635       }
2636
2637     /* This "while" is the end of the "do" above. */
2638
2639     while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2640
2641     ptr--;
2642     length += runlength;
2643     continue;
2644     }
2645   }
2646
2647 length += 4;    /* For final KET and END */
2648
2649 if (length > 65539)
2650   {
2651   *errorptr = ERR20;
2652   return NULL;
2653   }
2654
2655 /* Compute the size of data block needed and get it, either from malloc or
2656 externally provided function. We specify "code[0]" in the offsetof() expression
2657 rather than just "code", because it has been reported that one broken compiler
2658 fails on "code" because it is also an independent variable. It should make no
2659 difference to the value of the offsetof(). */
2660
2661 size = length + offsetof(real_pcre, code[0]);
2662 re = (real_pcre *)(pcre_malloc)(size+50);
2663
2664 if (re == NULL)
2665   {
2666   *errorptr = ERR21;
2667   return NULL;
2668   }
2669
2670 /* Put in the magic number and the options. */
2671
2672 re->magic_number = MAGIC_NUMBER;
2673 re->options = options;
2674
2675 /* Set up a starting, non-extracting bracket, then compile the expression. On
2676 error, *errorptr will be set non-NULL, so we don't need to look at the result
2677 of the function here. */
2678
2679 ptr = (const uschar *)pattern;
2680 code = re->code;
2681 *code = OP_BRA;
2682 bracount = 0;
2683 (void)compile_regex(options, &bracount, &code, &ptr, errorptr, dictionary);
2684 re->top_bracket = bracount;
2685 re->top_backref = top_backref;
2686
2687 /* If not reached end of pattern on success, there's an excess bracket. */
2688
2689 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2690
2691 /* Fill in the terminating state and check for disastrous overflow, but
2692 if debugging, leave the test till after things are printed out. */
2693
2694 *code++ = OP_END;
2695
2696
2697 #ifndef DEBUG
2698 if (code - re->code > length) *errorptr = ERR23;
2699 #endif
2700
2701 /* Failed to compile */
2702
2703 if (*errorptr != NULL)
2704   {
2705   (pcre_free)(re);
2706   PCRE_ERROR_RETURN:
2707   *erroroffset = ptr - (const uschar *)pattern;
2708   return NULL;
2709   }
2710
2711 /* If the anchored option was not passed, set flag if we can determine that it
2712 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2713 we can determine what the first character has to be, because that speeds up
2714 unanchored matches no end. In the case of multiline matches, an alternative is
2715 to set the PCRE_STARTLINE flag if all branches start with ^. */
2716
2717 if ((options & PCRE_ANCHORED) == 0)
2718   {
2719   if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
2720     re->options |= PCRE_ANCHORED;
2721   else
2722     {
2723     int ch = find_firstchar(re->code);
2724     if (ch >= 0)
2725       {
2726       re->first_char = ch;
2727       re->options |= PCRE_FIRSTSET;
2728       }
2729     else if (is_startline(re->code))
2730       re->options |= PCRE_STARTLINE;
2731     }
2732   }
2733
2734 /* Print out the compiled data for debugging */
2735
2736 #ifdef DEBUG
2737
2738 printf("Length = %d top_bracket = %d top_backref=%d\n",
2739   length, re->top_bracket, re->top_backref);
2740
2741 if (re->options != 0)
2742   {
2743   printf("%s%s%s%s%s%s%s%s\n",
2744     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2745     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2746     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2747     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2748     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2749     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2750     ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2751     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2752   }
2753
2754 if ((re->options & PCRE_FIRSTSET) != 0)
2755   {
2756   if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2757     else printf("First char = \\x%02x\n", re->first_char);
2758   }
2759
2760 code_end = code;
2761 code_base = code = re->code;
2762
2763 while (code < code_end)
2764   {
2765   int charlength;
2766
2767   printf("%3d ", code - code_base);
2768
2769   if (*code >= OP_BRA)
2770     {
2771     printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2772     code += 2;
2773     }
2774
2775   else switch(*code)
2776     {
2777     case OP_CHARS:
2778     charlength = *(++code);
2779     printf("%3d ", charlength);
2780     while (charlength-- > 0)
2781       if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2782     break;
2783
2784     case OP_KETRMAX:
2785     case OP_KETRMIN:
2786     case OP_ALT:
2787     case OP_KET:
2788     case OP_ASSERT:
2789     case OP_ASSERT_NOT:
2790     case OP_ONCE:
2791     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2792     code += 2;
2793     break;
2794
2795     case OP_STAR:
2796     case OP_MINSTAR:
2797     case OP_PLUS:
2798     case OP_MINPLUS:
2799     case OP_QUERY:
2800     case OP_MINQUERY:
2801     case OP_TYPESTAR:
2802     case OP_TYPEMINSTAR:
2803     case OP_TYPEPLUS:
2804     case OP_TYPEMINPLUS:
2805     case OP_TYPEQUERY:
2806     case OP_TYPEMINQUERY:
2807     if (*code >= OP_TYPESTAR)
2808       printf("    %s", OP_names[code[1]]);
2809     else if (isprint(c = code[1])) printf("    %c", c);
2810       else printf("    \\x%02x", c);
2811     printf("%s", OP_names[*code++]);
2812     break;
2813
2814     case OP_EXACT:
2815     case OP_UPTO:
2816     case OP_MINUPTO:
2817     if (isprint(c = code[3])) printf("    %c{", c);
2818       else printf("    \\x%02x{", c);
2819     if (*code != OP_EXACT) printf("0,");
2820     printf("%d}", (code[1] << 8) + code[2]);
2821     if (*code == OP_MINUPTO) printf("?");
2822     code += 3;
2823     break;
2824
2825     case OP_TYPEEXACT:
2826     case OP_TYPEUPTO:
2827     case OP_TYPEMINUPTO:
2828     printf("    %s{", OP_names[code[3]]);
2829     if (*code != OP_TYPEEXACT) printf(",");
2830     printf("%d}", (code[1] << 8) + code[2]);
2831     if (*code == OP_TYPEMINUPTO) printf("?");
2832     code += 3;
2833     break;
2834
2835     case OP_NOT:
2836     if (isprint(c = *(++code))) printf("    [^%c]", c);
2837       else printf("    [^\\x%02x]", c);
2838     break;
2839
2840     case OP_NOTSTAR:
2841     case OP_NOTMINSTAR:
2842     case OP_NOTPLUS:
2843     case OP_NOTMINPLUS:
2844     case OP_NOTQUERY:
2845     case OP_NOTMINQUERY:
2846     if (isprint(c = code[1])) printf("    [^%c]", c);
2847       else printf("    [^\\x%02x]", c);
2848     printf("%s", OP_names[*code++]);
2849     break;
2850
2851     case OP_NOTEXACT:
2852     case OP_NOTUPTO:
2853     case OP_NOTMINUPTO:
2854     if (isprint(c = code[3])) printf("    [^%c]{", c);
2855       else printf("    [^\\x%02x]{", c);
2856     if (*code != OP_NOTEXACT) printf(",");
2857     printf("%d}", (code[1] << 8) + code[2]);
2858     if (*code == OP_NOTMINUPTO) printf("?");
2859     code += 3;
2860     break;
2861
2862     case OP_REF:
2863     printf("    \\%d", *(++code));
2864     code ++;
2865     goto CLASS_REF_REPEAT;
2866
2867     case OP_CLASS:
2868     case OP_NEGCLASS:
2869     case OP_CLASS_L:
2870       {
2871       int i, min, max;
2872
2873       if (*code==OP_CLASS_L)
2874         {
2875           code++;
2876           printf("Locflag = %i ", *code++);
2877           printf("    [");
2878         }
2879       else
2880         {
2881           if (*code++ == OP_CLASS) printf("    [");
2882           else printf("   ^[");
2883         }
2884
2885
2886       for (i = 0; i < 256; i++)
2887         {
2888         if ((code[i/8] & (1 << (i&7))) != 0)
2889           {
2890           int j;
2891           for (j = i+1; j < 256; j++)
2892             if ((code[j/8] & (1 << (j&7))) == 0) break;
2893           if (i == '-' || i == ']') printf("\\");
2894           if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2895           if (--j > i)
2896             {
2897             printf("-");
2898             if (j == '-' || j == ']') printf("\\");
2899             if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2900             }
2901           i = j;
2902           }
2903         }
2904       printf("]");
2905       code += 32;
2906       /*      code ++;*/
2907
2908       CLASS_REF_REPEAT:
2909
2910       switch(*code)
2911         {
2912         case OP_CRSTAR:
2913         case OP_CRMINSTAR:
2914         case OP_CRPLUS:
2915         case OP_CRMINPLUS:
2916         case OP_CRQUERY:
2917         case OP_CRMINQUERY:
2918         printf("%s", OP_names[*code]);
2919         break;
2920
2921         case OP_CRRANGE:
2922         case OP_CRMINRANGE:
2923         min = (code[1] << 8) + code[2];
2924         max = (code[3] << 8) + code[4];
2925         if (max == 0) printf("{%d,}", min);
2926         else printf("{%d,%d}", min, max);
2927         if (*code == OP_CRMINRANGE) printf("?");
2928         code += 4;
2929         break;
2930
2931         default:
2932         code--;
2933         }
2934       }
2935     break;
2936
2937     /* Anything else is just a one-node item */
2938
2939     default:
2940     printf("    %s", OP_names[*code]);
2941     break;
2942     }
2943
2944   code++;
2945   printf("\n");
2946   }
2947 printf("------------------------------------------------------------------\n");
2948
2949 /* This check is done here in the debugging case so that the code that
2950 was compiled can be seen. */
2951
2952 if (code - re->code > length)
2953   {
2954   printf("length=%i, code length=%i\n", length, code-re->code);
2955   *errorptr = ERR23;
2956   (pcre_free)(re);
2957   *erroroffset = ptr - (uschar *)pattern;
2958   return NULL;
2959   }
2960 #endif
2961
2962 return (pcre *)re;
2963 }
2964
2965
2966
2967 /*************************************************
2968 *        Match a character type                  *
2969 *************************************************/
2970
2971 /* Not used in all the places it might be as it's sometimes faster
2972 to put the code inline.
2973
2974 Arguments:
2975   type        the character type
2976   c           the character
2977   dotall      the dotall flag
2978
2979 Returns:      TRUE if character is of the type
2980 */
2981
2982 static BOOL
2983 match_type(int type, int c, BOOL dotall)
2984 {
2985
2986 #ifdef DEBUG
2987 if (isprint(c)) printf("matching subject %c against ", c);
2988   else printf("matching subject \\x%02x against ", c);
2989 printf("%s\n", OP_names[type]);
2990 #endif
2991
2992 switch(type)
2993   {
2994   case OP_ANY:            return dotall || c != '\n';
2995   case OP_NOT_DIGIT:      return (pcre_ctypes[c] & ctype_digit) == 0;
2996   case OP_DIGIT:          return (pcre_ctypes[c] & ctype_digit) != 0;
2997   case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2998   case OP_WHITESPACE:     return (pcre_ctypes[c] & ctype_space) != 0;
2999   case OP_NOT_WORDCHAR:   return (pcre_ctypes[c] & ctype_word) == 0;
3000   case OP_WORDCHAR:       return (pcre_ctypes[c] & ctype_word) != 0;
3001   case OP_NOT_WORDCHAR_L: return (c!='_' && !isalnum(c));
3002   case OP_WORDCHAR_L:     return (c=='_' || isalnum(c));
3003   }
3004 return FALSE;
3005 }
3006
3007
3008
3009 /*************************************************
3010 *          Match a back-reference                *
3011 *************************************************/
3012
3013 /* If a back reference hasn't been set, the match fails.
3014
3015 Arguments:
3016   number      reference number
3017   eptr        points into the subject
3018   length      length to be matched
3019   md          points to match data block
3020
3021 Returns:      TRUE if matched
3022 */
3023
3024 static BOOL
3025 match_ref(int number, register const uschar *eptr, int length, match_data *md)
3026 {
3027 const uschar *p = md->start_subject + md->offset_vector[number];
3028
3029 #ifdef DEBUG
3030 if (eptr >= md->end_subject)
3031   printf("matching subject <null>");
3032 else
3033   {
3034   printf("matching subject ");
3035   pchars(eptr, length, TRUE, md);
3036   }
3037 printf(" against backref ");
3038 pchars(p, length, FALSE, md);
3039 printf("\n");
3040 #endif
3041
3042 /* Always fail if not enough characters left */
3043
3044 if (length > md->end_subject - p) return FALSE;
3045
3046 /* Separate the caseless case for speed */
3047
3048 if (md->caseless)
3049   { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
3050 else
3051   { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3052
3053 return TRUE;
3054 }
3055
3056 static int free_stack(match_data *md)
3057 {
3058 /* Free any stack space that was allocated by the call to match(). */
3059 if (md->off_num)    PyMem_DEL(md->off_num);
3060 if (md->offset_top) PyMem_DEL(md->offset_top);
3061 if (md->r1)         PyMem_DEL(md->r1);
3062 if (md->r2)         PyMem_DEL(md->r2);
3063 if (md->eptr)       PyMem_DEL((char *)md->eptr);
3064 if (md->ecode)      PyMem_DEL((char *)md->ecode);
3065 return 0;
3066 }
3067
3068 static int grow_stack(match_data *md)
3069 {
3070   if (md->length != 0)
3071     {
3072       md->length = md->length + md->length/2;
3073     }
3074   else
3075     {
3076       int string_len = md->end_subject - md->start_subject + 1;
3077       if (string_len < 80) {md->length = string_len; }
3078       else {md->length = 80;}
3079     }
3080   PyMem_RESIZE(md->offset_top, int, md->length);
3081   PyMem_RESIZE(md->eptr, const uschar *, md->length);
3082   PyMem_RESIZE(md->ecode, const uschar *, md->length);
3083   PyMem_RESIZE(md->off_num, int, md->length);
3084   PyMem_RESIZE(md->r1, int, md->length);
3085   PyMem_RESIZE(md->r2, int, md->length);
3086   if (md->offset_top == NULL || md->eptr == NULL || md->ecode == NULL ||
3087       md->off_num == NULL || md->r1 == NULL || md->r2 == NULL)
3088     {
3089       PyErr_NoMemory();
3090       longjmp(md->error_env, 1);
3091     }
3092   return 0;
3093 }
3094
3095
3096 /*************************************************
3097 *         Match from current position            *
3098 *************************************************/
3099
3100 /* On entry ecode points to the first opcode, and eptr to the first character.
3101
3102 Arguments:
3103    eptr        pointer in subject
3104    ecode       position in code
3105    offset_top  current top pointer
3106    md          pointer to "static" info for the match
3107
3108 Returns:       TRUE if matched
3109 */
3110
3111 static BOOL
3112 match(register const uschar *eptr, register const uschar *ecode, int offset_top,
3113   match_data *md)
3114 {
3115   int save_stack_position = md->point;
3116 match_loop:
3117
3118 #define SUCCEED goto succeed
3119 #define FAIL    goto fail
3120
3121 for (;;)
3122   {
3123   int min, max, ctype;
3124   register int i;
3125   register int c;
3126   BOOL minimize = FALSE;
3127
3128   /* Opening bracket. Check the alternative branches in turn, failing if none
3129   match. We have to set the start offset if required and there is space
3130   in the offset vector so that it is available for subsequent back references
3131   if the bracket matches. However, if the bracket fails, we must put back the
3132   previous value of both offsets in case they were set by a previous copy of
3133   the same bracket. Don't worry about setting the flag for the error case here;
3134   that is handled in the code for KET. */
3135
3136   if ((int)*ecode >= OP_BRA)
3137     {
3138     int number = (*ecode - OP_BRA) << 1;
3139     int save_offset1 = 0, save_offset2 = 0;
3140
3141     DPRINTF(("start bracket %d\n", number/2));
3142
3143     if (number > 0 && number < md->offset_end)
3144       {
3145       save_offset1 = md->offset_vector[number];
3146       save_offset2 = md->offset_vector[number+1];
3147       md->offset_vector[number] = eptr - md->start_subject;
3148
3149       DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
3150       }
3151
3152     /* Recurse for all the alternatives. */
3153
3154     do
3155       {
3156       if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
3157       ecode += (ecode[1] << 8) + ecode[2];
3158       }
3159     while (*ecode == OP_ALT);
3160
3161     DPRINTF(("bracket %d failed\n", number/2));
3162
3163     if (number > 0 && number < md->offset_end)
3164       {
3165       md->offset_vector[number] = save_offset1;
3166       md->offset_vector[number+1] = save_offset2;
3167       }
3168
3169     FAIL;
3170     }
3171
3172   /* Other types of node can be handled by a switch */
3173
3174   switch(*ecode)
3175     {
3176     case OP_END:
3177     md->end_match_ptr = eptr;          /* Record where we ended */
3178     md->end_offset_top = offset_top;   /* and how many extracts were taken */
3179     SUCCEED;
3180
3181     /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3182     whole thing doesn't match, so we have to get out via a longjmp(). */
3183
3184     case OP_CUT:
3185     if (match(eptr, ecode+1, offset_top, md)) SUCCEED;
3186     longjmp(md->fail_env, 1);
3187
3188     /* Assertion brackets. Check the alternative branches in turn - the
3189     matching won't pass the KET for an assertion. If any one branch matches,
3190     the assertion is true. */
3191
3192     case OP_ASSERT:
3193     do
3194       {
3195       if (match(eptr, ecode+3, offset_top, md)) break;
3196       ecode += (ecode[1] << 8) + ecode[2];
3197       }
3198     while (*ecode == OP_ALT);
3199     if (*ecode == OP_KET) FAIL;
3200
3201     /* Continue from after the assertion, updating the offsets high water
3202     mark, since extracts may have been taken during the assertion. */
3203
3204     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3205     ecode += 3;
3206     offset_top = md->end_offset_top;
3207     continue;
3208
3209     /* Negative assertion: all branches must fail to match */
3210
3211     case OP_ASSERT_NOT:
3212     do
3213       {
3214       if (match(eptr, ecode+3, offset_top, md)) FAIL;
3215       ecode += (ecode[1] << 8) + ecode[2];
3216       }
3217     while (*ecode == OP_ALT);
3218     ecode += 3;
3219     continue;
3220
3221     /* "Once" brackets are like assertion brackets except that after a match,
3222     the point in the subject string is not moved back. Thus there can never be
3223     a move back into the brackets. Check the alternative branches in turn - the
3224     matching won't pass the KET for this kind of subpattern. If any one branch
3225     matches, we carry on, leaving the subject pointer. */
3226
3227     case OP_ONCE:
3228     do
3229       {
3230       if (match(eptr, ecode+3, offset_top, md)) break;
3231       ecode += (ecode[1] << 8) + ecode[2];
3232       }
3233     while (*ecode == OP_ALT);
3234     if (*ecode == OP_KET) FAIL;
3235
3236     /* Continue as from after the assertion, updating the offsets high water
3237     mark, since extracts may have been taken. */
3238
3239     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3240     ecode += 3;
3241     offset_top = md->end_offset_top;
3242     eptr = md->end_match_ptr;
3243     continue;
3244
3245     /* An alternation is the end of a branch; scan along to find the end of the
3246     bracketed group and go to there. */
3247
3248     case OP_ALT:
3249     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3250     break;
3251
3252     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3253     that it may occur zero times. It may repeat infinitely, or not at all -
3254     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3255     repeat limits are compiled as a number of copies, with the optional ones
3256     preceded by BRAZERO or BRAMINZERO. */
3257
3258     case OP_BRAZERO:
3259       {
3260       const uschar *next = ecode+1;
3261       if (match(eptr, next, offset_top, md)) SUCCEED;
3262       do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3263       ecode = next + 3;
3264       }
3265     break;
3266
3267     case OP_BRAMINZERO:
3268       {
3269       const uschar *next = ecode+1;
3270       do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3271       if (match(eptr, next+3, offset_top, md)) SUCCEED;
3272       ecode++;
3273       }
3274     break;;
3275
3276     /* End of a group, repeated or non-repeating. If we are at the end of
3277     an assertion "group", stop matching and SUCCEED, but record the
3278     current high water mark for use by positive assertions. */
3279
3280     case OP_KET:
3281     case OP_KETRMIN:
3282     case OP_KETRMAX:
3283       {
3284       int number;
3285       const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3286
3287       if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
3288         {
3289         md->end_match_ptr = eptr;      /* For ONCE */
3290         md->end_offset_top = offset_top;
3291         SUCCEED;
3292         }
3293
3294       /* In all other cases we have to check the group number back at the
3295       start and if necessary complete handling an extraction by setting the
3296       final offset and bumping the high water mark. */
3297
3298       number = (*prev - OP_BRA) << 1;
3299
3300       DPRINTF(("end bracket %d\n", number/2));
3301
3302       if (number > 0)
3303         {
3304         if (number >= md->offset_end) md->offset_overflow = TRUE; else
3305           {
3306           md->offset_vector[number+1] = eptr - md->start_subject;
3307           if (offset_top <= number) offset_top = number + 2;
3308           }
3309         }
3310
3311       /* For a non-repeating ket, just advance to the next node and continue at
3312       this level. */
3313
3314       if (*ecode == OP_KET)
3315         {
3316         ecode += 3;
3317         break;
3318         }
3319
3320       /* The repeating kets try the rest of the pattern or restart from the
3321       preceding bracket, in the appropriate order. */
3322
3323       if (*ecode == OP_KETRMIN)
3324         {
3325         const uschar *ptr;
3326         if (match(eptr, ecode+3, offset_top, md)) goto succeed;
3327         /* Handle alternation inside the BRA...KET; push the additional
3328            alternatives onto the stack */
3329         ptr=prev;
3330         do {
3331           ptr += (ptr[1]<<8)+ ptr[2];
3332           if (*ptr==OP_ALT)
3333             {
3334               if (md->length == md->point)
3335                 {
3336                   grow_stack(md);
3337                 }
3338               md->offset_top[md->point] = offset_top;
3339               md->eptr[md->point]       = eptr;
3340               md->ecode[md->point]      = ptr+3;
3341               md->r1[md->point]         = 0;
3342               md->r2[md->point]         = 0;
3343               md->off_num[md->point]    = 0;
3344               md->point++;
3345             }
3346         } while (*ptr==OP_ALT);
3347         ecode=prev+3; goto match_loop;
3348         }
3349       else  /* OP_KETRMAX */
3350         {
3351         const uschar *ptr;
3352         /*int points_pushed=0;*/
3353
3354         /* Push one failure point, that will resume matching at the code after
3355            the KETRMAX opcode. */
3356         if (md->length == md->point)
3357           {
3358             grow_stack(md);
3359           }
3360         md->offset_top[md->point] = offset_top;
3361         md->eptr[md->point]       = eptr;
3362         md->ecode[md->point]      = ecode+3;
3363         md->r1[md->point]         = md->offset_vector[number];
3364         md->r2[md->point]         = md->offset_vector[number+1];
3365         md->off_num[md->point]    = number;
3366         md->point++;
3367
3368         md->offset_vector[number] = eptr - md->start_subject;
3369         /* Handle alternation inside the BRA...KET; push each of the
3370            additional alternatives onto the stack */
3371         ptr=prev;
3372         do {
3373           ptr += (ptr[1]<<8)+ ptr[2];
3374           if (*ptr==OP_ALT)
3375             {
3376               if (md->length == md->point)
3377                 if (md->length == md->point)
3378                   {
3379                     grow_stack(md);
3380                   }
3381               md->offset_top[md->point] = offset_top;
3382               md->eptr[md->point]       = eptr;
3383               md->ecode[md->point]      = ptr+3;
3384               md->r1[md->point]         = 0;
3385               md->r2[md->point]         = 0;
3386               md->off_num[md->point]    = 0;
3387               md->point++;
3388               /*points_pushed++;*/
3389             }
3390         } while (*ptr==OP_ALT);
3391         /* Jump to the first (or only) alternative and resume trying to match */
3392         ecode=prev+3; goto match_loop;
3393         }
3394       }
3395
3396     /* Start of subject unless notbol, or after internal newline if multiline */
3397
3398     case OP_CIRC:
3399     if (md->notbol && eptr == md->start_subject) FAIL;
3400     if (md->multiline)
3401       {
3402       if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
3403       ecode++;
3404       break;
3405       }
3406     /* ... else fall through */
3407
3408     /* Start of subject assertion */
3409
3410     case OP_SOD:
3411     if (eptr != md->start_subject) FAIL;
3412     ecode++;
3413     break;
3414
3415     /* Assert before internal newline if multiline, or before
3416     a terminating newline unless endonly is set, else end of subject unless
3417     noteol is set. */
3418
3419     case OP_DOLL:
3420     if (md->noteol && eptr >= md->end_subject) FAIL;
3421     if (md->multiline)
3422       {
3423       if (eptr < md->end_subject && *eptr != '\n') FAIL;
3424       ecode++;
3425       break;
3426       }
3427     else if (!md->endonly)
3428       {
3429       if (eptr < md->end_subject - 1 ||
3430          (eptr == md->end_subject - 1 && *eptr != '\n')) FAIL;
3431       ecode++;
3432       break;
3433       }
3434     /* ... else fall through */
3435
3436     /* End of subject assertion */
3437
3438     case OP_EOD:
3439     if (eptr < md->end_subject) FAIL;
3440     ecode++;
3441     break;
3442
3443     /* Word boundary assertions */
3444
3445     case OP_NOT_WORD_BOUNDARY:
3446     case OP_WORD_BOUNDARY:
3447       {
3448       BOOL prev_is_word = (eptr != md->start_subject) &&
3449         ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3450       BOOL cur_is_word = (eptr < md->end_subject) &&
3451         ((pcre_ctypes[*eptr] & ctype_word) != 0);
3452       if ((*ecode++ == OP_WORD_BOUNDARY)?
3453            cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3454         FAIL;
3455       }
3456     break;
3457
3458     case OP_NOT_WORD_BOUNDARY_L:
3459     case OP_WORD_BOUNDARY_L:
3460       {
3461         BOOL prev_is_word = (eptr != md->start_subject) &&
3462           (isalnum(eptr[-1]) || eptr[-1]=='_');
3463         BOOL cur_is_word = (eptr < md->end_subject) &&
3464           (isalnum(*eptr) || *eptr=='_');
3465         if ((*ecode++ == OP_WORD_BOUNDARY_L)?
3466             cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3467           FAIL;
3468       }
3469       break;
3470
3471
3472     /* Match a single character type; inline for speed */
3473
3474     case OP_ANY:
3475     if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
3476     if (eptr++ >= md->end_subject) FAIL;
3477     ecode++;
3478     break;
3479
3480     case OP_NOT_DIGIT:
3481     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3482       FAIL;
3483     ecode++;
3484     break;
3485
3486     case OP_DIGIT:
3487     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3488       FAIL;
3489     ecode++;
3490     break;
3491
3492     case OP_NOT_WHITESPACE:
3493     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3494       FAIL;
3495     ecode++;
3496     break;
3497
3498     case OP_WHITESPACE:
3499     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3500       FAIL;
3501     ecode++;
3502     break;
3503
3504     case OP_NOT_WORDCHAR:
3505     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3506       FAIL;
3507     ecode++;
3508     break;
3509
3510     case OP_WORDCHAR:
3511     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3512       FAIL;
3513     ecode++;
3514     break;
3515
3516     case OP_NOT_WORDCHAR_L:
3517     if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ))
3518       FAIL;
3519     eptr++;
3520     ecode++;
3521     break;
3522
3523     case OP_WORDCHAR_L:
3524     if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ))
3525       FAIL;
3526     eptr++;
3527     ecode++;
3528     break;
3529
3530     /* Match a back reference, possibly repeatedly. Look past the end of the
3531     item to see if there is repeat information following. The code is similar
3532     to that for character classes, but repeated for efficiency. Then obey
3533     similar code to character type repeats - written out again for speed.
3534     However, if the referenced string is the empty string, always treat
3535     it as matched, any number of times (otherwise there could be infinite
3536     loops). */
3537
3538     case OP_REF:
3539       {
3540       int length;
3541       int number = ecode[1] << 1;                /* Doubled reference number */
3542       ecode += 2;                                /* Advance past the item */
3543
3544       if (number >= offset_top || md->offset_vector[number] < 0)
3545         {
3546         md->errorcode = PCRE_ERROR_BADREF;
3547         FAIL;
3548         }
3549
3550       length = md->offset_vector[number+1] - md->offset_vector[number];
3551
3552       switch (*ecode)
3553         {
3554         case OP_CRSTAR:
3555         case OP_CRMINSTAR:
3556         case OP_CRPLUS:
3557         case OP_CRMINPLUS:
3558         case OP_CRQUERY:
3559         case OP_CRMINQUERY:
3560         c = *ecode++ - OP_CRSTAR;
3561         minimize = (c & 1) != 0;
3562         min = rep_min[c];                 /* Pick up values from tables; */
3563         max = rep_max[c];                 /* zero for max => infinity */
3564         if (max == 0) max = INT_MAX;
3565         break;
3566
3567         case OP_CRRANGE:
3568         case OP_CRMINRANGE:
3569         minimize = (*ecode == OP_CRMINRANGE);
3570         min = (ecode[1] << 8) + ecode[2];
3571         max = (ecode[3] << 8) + ecode[4];
3572         if (max == 0) max = INT_MAX;
3573         ecode += 5;
3574         break;
3575
3576         default:               /* No repeat follows */
3577         if (!match_ref(number, eptr, length, md)) FAIL;
3578         eptr += length;
3579         continue;              /* With the main loop */
3580         }
3581
3582       /* If the length of the reference is zero, just continue with the
3583       main loop. */
3584
3585       if (length == 0) continue;
3586
3587       /* First, ensure the minimum number of matches are present. We get back
3588       the length of the reference string explicitly rather than passing the
3589       address of eptr, so that eptr can be a register variable. */
3590
3591       for (i = 1; i <= min; i++)
3592         {
3593         if (!match_ref(number, eptr, length, md)) FAIL;
3594         eptr += length;
3595         }
3596
3597       /* If min = max, continue at the same level without recursion.
3598       They are not both allowed to be zero. */
3599
3600       if (min == max) continue;
3601
3602       /* If minimizing, keep trying and advancing the pointer */
3603
3604       if (minimize)
3605         {
3606         for (i = min;; i++)
3607           {
3608           if (match(eptr, ecode, offset_top, md)) SUCCEED;
3609           if (i >= max || !match_ref(number, eptr, length, md))
3610             FAIL;
3611           eptr += length;
3612           }
3613         /* Control never gets here */
3614         }
3615
3616       /* If maximizing, find the longest string and work backwards */
3617
3618       else
3619         {
3620         const uschar *pp = eptr;
3621         for (i = min; i < max; i++)
3622           {
3623           if (!match_ref(number, eptr, length, md)) break;
3624           eptr += length;
3625           }
3626         while (eptr >= pp)
3627           {
3628           if (match(eptr, ecode, offset_top, md)) SUCCEED;
3629           eptr -= length;
3630           }
3631         FAIL;
3632         }
3633       }
3634     /* Control never gets here */
3635
3636     /* Match a character class, possibly repeatedly. Look past the end of the
3637     item to see if there is repeat information following. Then obey similar
3638     code to character type repeats - written out again for speed. If caseless
3639     matching was set at runtime but not at compile time, we have to check both
3640     versions of a character, and we have to behave differently for positive and
3641     negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
3642     treated differently. */
3643
3644     case OP_CLASS:
3645     case OP_NEGCLASS:
3646       {
3647       BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
3648       const uschar *data = ecode + 1;  /* Save for matching */
3649       ecode += 33;                     /* Advance past the item */
3650
3651       switch (*ecode)
3652         {
3653         case OP_CRSTAR:
3654         case OP_CRMINSTAR:
3655         case OP_CRPLUS:
3656         case OP_CRMINPLUS:
3657         case OP_CRQUERY:
3658         case OP_CRMINQUERY:
3659         c = *ecode++ - OP_CRSTAR;
3660         minimize = (c & 1) != 0;
3661         min = rep_min[c];                 /* Pick up values from tables; */
3662         max = rep_max[c];                 /* zero for max => infinity */
3663         if (max == 0) max = INT_MAX;
3664         break;
3665
3666         case OP_CRRANGE:
3667         case OP_CRMINRANGE:
3668         minimize = (*ecode == OP_CRMINRANGE);
3669         min = (ecode[1] << 8) + ecode[2];
3670         max = (ecode[3] << 8) + ecode[4];
3671         if (max == 0) max = INT_MAX;
3672         ecode += 5;
3673         break;
3674
3675         default:               /* No repeat follows */
3676           min = max = 1;
3677           break;
3678         }
3679
3680       /* First, ensure the minimum number of matches are present. */
3681
3682       for (i = 1; i <= min; i++)
3683         {
3684         if (eptr >= md->end_subject) FAIL;
3685         c = *eptr++;
3686
3687         /* Either not runtime caseless, or it was a positive class. For
3688         runtime caseless, continue if either case is in the map. */
3689
3690         if (!nasty_case)
3691           {
3692           if ((data[c/8] & (1 << (c&7))) != 0) continue;
3693           if (md->runtime_caseless)
3694             {
3695             c = pcre_fcc[c];
3696             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3697             }
3698           }
3699
3700         /* Runtime caseless and it was a negative class. Continue only if
3701         both cases are in the map. */
3702
3703         else
3704           {
3705            if ((data[c/8] & (1 << (c&7))) == 0) FAIL;
3706            c = pcre_fcc[c];
3707            if ((data[c/8] & (1 << (c&7))) != 0) continue;
3708            }
3709
3710         FAIL;
3711         }
3712
3713       /* If max == min we can continue with the main loop without the
3714       need to recurse. */
3715
3716       if (min == max) continue;
3717
3718       /* If minimizing, keep testing the rest of the expression and advancing
3719       the pointer while it matches the class. */
3720
3721       if (minimize)
3722         {
3723         for (i = min;; i++)
3724           {
3725           if (match(eptr, ecode, offset_top, md)) SUCCEED;
3726           if (i >= max || eptr >= md->end_subject) FAIL;
3727           c = *eptr++;
3728
3729           /* Either not runtime caseless, or it was a positive class. For
3730           runtime caseless, continue if either case is in the map. */
3731
3732           if (!nasty_case)
3733             {
3734             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3735             if (md->runtime_caseless)
3736               {
3737               c = pcre_fcc[c];
3738               if ((data[c/8] & (1 << (c&7))) != 0) continue;
3739               }
3740             }
3741
3742           /* Runtime caseless and it was a negative class. Continue only if
3743           both cases are in the map. */
3744
3745           else
3746              {
3747              if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
3748              c = pcre_fcc[c];
3749              if ((data[c/8] & (1 << (c&7))) != 0) continue;
3750              }
3751
3752           FAIL;
3753           }
3754         /* Control never gets here */
3755         }
3756
3757       /* If maximizing, find the longest possible run, then work backwards. */
3758
3759       else
3760         {
3761         const uschar *pp = eptr;
3762         for (i = min; i < max; eptr++, i++)
3763           {
3764           if (eptr >= md->end_subject) break;
3765           c = *eptr;
3766
3767           /* Either not runtime caseless, or it was a positive class. For
3768           runtime caseless, continue if either case is in the map. */
3769
3770           if (!nasty_case)
3771             {
3772             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3773             if (md->runtime_caseless)
3774               {
3775               c = pcre_fcc[c];
3776               if ((data[c/8] & (1 << (c&7))) != 0) continue;
3777               }
3778             }
3779
3780           /* Runtime caseless and it was a negative class. Continue only if
3781           both cases are in the map. */
3782
3783           else
3784             {
3785             if ((data[c/8] & (1 << (c&7))) == 0) break;
3786             c = pcre_fcc[c];
3787             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3788             }
3789
3790           break;
3791           }
3792
3793         while (eptr >= pp)
3794           if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3795         FAIL;
3796         }
3797       }
3798     /* Control never gets here */
3799
3800    /* OP_CLASS_L opcode: handles localized character classes */
3801
3802    case OP_CLASS_L:
3803      {
3804       const uschar *data = ecode + 1;  /* Save for matching */
3805       const uschar locale_flag = *data;
3806       ecode++; data++;          /* The localization support adds an extra byte */
3807
3808       ecode += 33;               /* Advance past the item */
3809
3810       switch (*ecode)
3811         {
3812         case OP_CRSTAR:
3813         case OP_CRMINSTAR:
3814         case OP_CRPLUS:
3815         case OP_CRMINPLUS:
3816         case OP_CRQUERY:
3817         case OP_CRMINQUERY:
3818         c = *ecode++ - OP_CRSTAR;
3819         minimize = (c & 1) != 0;
3820         min = rep_min[c];                 /* Pick up values from tables; */
3821         max = rep_max[c];                 /* zero for max => infinity */
3822         if (max == 0) max = INT_MAX;
3823         break;
3824
3825         case OP_CRRANGE:
3826         case OP_CRMINRANGE:
3827         minimize = (*ecode == OP_CRMINRANGE);
3828         min = (ecode[1] << 8) + ecode[2];
3829         max = (ecode[3] << 8) + ecode[4];
3830         if (max == 0) max = INT_MAX;
3831         ecode += 5;
3832         break;
3833
3834         default:               /* No repeat follows */
3835         if (eptr >= md->end_subject) FAIL;
3836         c = *eptr++;
3837         if ((data[c/8] & (1 << (c&7))) != 0) continue;    /* With main loop */
3838         if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3839         if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3840 #if 0
3841         if ( (locale_flag &  4) && isdigit(c) ) continue;    /* Locale \d */
3842         if ( (locale_flag &  8) && !isdigit(c) ) continue;   /* Locale \D */
3843         if ( (locale_flag & 16) && isspace(c) ) continue;    /* Locale \s */
3844         if ( (locale_flag & 32) && !isspace(c) ) continue;   /* Locale \S */
3845 #endif
3846
3847         if (md->runtime_caseless)
3848           {
3849           c = pcre_fcc[c];
3850           if ((data[c/8] & (1 << (c&7))) != 0) continue;  /* With main loop */
3851
3852           if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3853           if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3854           }
3855         FAIL;
3856         }
3857
3858       /* First, ensure the minimum number of matches are present. */
3859
3860       for (i = 1; i <= min; i++)
3861         {
3862         if (eptr >= md->end_subject) FAIL;
3863         c = *eptr++;
3864         if ((data[c/8] & (1 << (c&7))) != 0) continue;
3865         if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3866         if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3867
3868         if (md->runtime_caseless)
3869           {
3870           c = pcre_fcc[c];
3871           if ((data[c/8] & (1 << (c&7))) != 0) continue;
3872           if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3873           if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3874           }
3875         FAIL;
3876         }
3877
3878       /* If max == min we can continue with the main loop without the
3879       need to recurse. */
3880
3881       if (min == max) continue;
3882
3883       /* If minimizing, keep testing the rest of the expression and advancing
3884       the pointer while it matches the class. */
3885
3886       if (minimize)
3887         {
3888         for (i = min;; i++)
3889           {
3890           if (match(eptr, ecode, offset_top, md)) SUCCEED;
3891           if (i >= max || eptr >= md->end_subject) FAIL;
3892           c = *eptr++;
3893           if ((data[c/8] & (1 << (c&7))) != 0) continue;
3894           if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3895           if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3896
3897           if (md->runtime_caseless)
3898             {
3899             c = pcre_fcc[c];
3900             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3901             if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3902             if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3903             }
3904           FAIL;
3905           }
3906         /* Control never gets here */
3907         }
3908
3909       /* If maximizing, find the longest possible run, then work backwards. */
3910
3911       else
3912         {
3913         const uschar *pp = eptr;
3914         for (i = min; i < max; eptr++, i++)
3915           {
3916           if (eptr >= md->end_subject) break;
3917           c = *eptr;
3918           if ((data[c/8] & (1 << (c&7))) != 0) continue;
3919           if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3920           if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3921           if (md->runtime_caseless)
3922             {
3923             c = pcre_fcc[c];
3924             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3925             if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3926             if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3927             }
3928           break;
3929           }
3930
3931         while (eptr >= pp)
3932           if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3933         FAIL;
3934         }
3935       }
3936     /* Control never gets here */
3937
3938     /* Match a run of characters */
3939
3940     case OP_CHARS:
3941       {
3942       register int length = ecode[1];
3943       ecode += 2;
3944
3945 #ifdef DEBUG  /* Sigh. Some compilers never learn. */
3946       if (eptr >= md->end_subject)
3947         printf("matching subject <null> against pattern ");
3948       else
3949         {
3950         printf("matching subject ");
3951         pchars(eptr, length, TRUE, md);
3952         printf(" against pattern ");
3953         }
3954       pchars(ecode, length, FALSE, md);
3955       printf("\n");
3956 #endif
3957
3958       if (length > md->end_subject - eptr) FAIL;
3959       if (md->caseless)
3960         {
3961         while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL;
3962         }
3963       else
3964         {
3965         while (length-- > 0) if (*ecode++ != *eptr++) FAIL;
3966         }
3967       }
3968     break;
3969
3970     /* Match a single character repeatedly; different opcodes share code. */
3971
3972     case OP_EXACT:
3973     min = max = (ecode[1] << 8) + ecode[2];
3974     ecode += 3;
3975     goto REPEATCHAR;
3976
3977     case OP_UPTO:
3978     case OP_MINUPTO:
3979     min = 0;
3980     max = (ecode[1] << 8) + ecode[2];
3981     minimize = *ecode == OP_MINUPTO;
3982     ecode += 3;
3983     goto REPEATCHAR;
3984
3985     case OP_STAR:
3986     case OP_MINSTAR:
3987     case OP_PLUS:
3988     case OP_MINPLUS:
3989     case OP_QUERY:
3990     case OP_MINQUERY:
3991     c = *ecode++ - OP_STAR;
3992     minimize = (c & 1) != 0;
3993     min = rep_min[c];                 /* Pick up values from tables; */
3994     max = rep_max[c];                 /* zero for max => infinity */
3995     if (max == 0) max = INT_MAX;
3996
3997     /* Common code for all repeated single-character matches. We can give
3998     up quickly if there are fewer than the minimum number of characters left in
3999     the subject. */
4000
4001     REPEATCHAR:
4002     if (min > md->end_subject - eptr) FAIL;
4003     c = *ecode++;
4004
4005     /* The code is duplicated for the caseless and caseful cases, for speed,
4006     since matching characters is likely to be quite common. First, ensure the
4007     minimum number of matches are present. If min = max, continue at the same
4008     level without recursing. Otherwise, if minimizing, keep trying the rest of
4009     the expression and advancing one matching character if failing, up to the
4010     maximum. Alternatively, if maximizing, find the maximum number of
4011     characters and work backwards. */
4012
4013     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4014       max, eptr));
4015
4016     if (md->caseless)
4017       {
4018       c = pcre_lcc[c];
4019       for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
4020       if (min == max) continue;
4021       if (minimize)
4022         {
4023         for (i = min;; i++)
4024           {
4025           if (match(eptr, ecode, offset_top, md)) SUCCEED;
4026           if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
4027             FAIL;
4028           }
4029         /* Control never gets here */
4030         }
4031       else
4032         {
4033         const uschar *pp = eptr;
4034         for (i = min; i < max; i++)
4035           {
4036           if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
4037           eptr++;
4038           }
4039         while (eptr >= pp)
4040           if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4041         FAIL;
4042         }
4043       /* Control never gets here */
4044       }
4045
4046     /* Caseful comparisons */
4047
4048     else
4049       {
4050       for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
4051       if (min == max) continue;
4052       if (minimize)
4053         {
4054         for (i = min;; i++)
4055           {
4056           if (match(eptr, ecode, offset_top, md)) SUCCEED;
4057           if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL;
4058           }
4059         /* Control never gets here */
4060         }
4061       else
4062         {
4063         const uschar *pp = eptr;
4064         for (i = min; i < max; i++)
4065           {
4066           if (eptr >= md->end_subject || c != *eptr) break;
4067           eptr++;
4068           }
4069         while (eptr >= pp)
4070          if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4071         FAIL;
4072         }
4073       }
4074     /* Control never gets here */
4075
4076     /* Match a negated single character */
4077
4078     case OP_NOT:
4079     if (eptr >= md->end_subject) FAIL;
4080     ecode++;
4081     if (md->caseless)
4082       {
4083       if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) FAIL;
4084       }
4085     else
4086       {
4087       if (*ecode++ == *eptr++) FAIL;
4088       }
4089     break;
4090
4091     /* Match a negated single character repeatedly. This is almost a repeat of
4092     the code for a repeated single character, but I haven't found a nice way of
4093     commoning these up that doesn't require a test of the positive/negative
4094     option for each character match. Maybe that wouldn't add very much to the
4095     time taken, but character matching *is* what this is all about... */
4096
4097     case OP_NOTEXACT:
4098     min = max = (ecode[1] << 8) + ecode[2];
4099     ecode += 3;
4100     goto REPEATNOTCHAR;
4101
4102     case OP_NOTUPTO:
4103     case OP_NOTMINUPTO:
4104     min = 0;
4105     max = (ecode[1] << 8) + ecode[2];
4106     minimize = *ecode == OP_NOTMINUPTO;
4107     ecode += 3;
4108     goto REPEATNOTCHAR;
4109
4110     case OP_NOTSTAR:
4111     case OP_NOTMINSTAR:
4112     case OP_NOTPLUS:
4113     case OP_NOTMINPLUS:
4114     case OP_NOTQUERY:
4115     case OP_NOTMINQUERY:
4116     c = *ecode++ - OP_NOTSTAR;
4117     minimize = (c & 1) != 0;
4118     min = rep_min[c];                 /* Pick up values from tables; */
4119     max = rep_max[c];                 /* zero for max => infinity */
4120     if (max == 0) max = INT_MAX;
4121
4122     /* Common code for all repeated single-character matches. We can give
4123     up quickly if there are fewer than the minimum number of characters left in
4124     the subject. */
4125
4126     REPEATNOTCHAR:
4127     if (min > md->end_subject - eptr) FAIL;
4128     c = *ecode++;
4129
4130     /* The code is duplicated for the caseless and caseful cases, for speed,
4131     since matching characters is likely to be quite common. First, ensure the
4132     minimum number of matches are present. If min = max, continue at the same
4133     level without recursing. Otherwise, if minimizing, keep trying the rest of
4134     the expression and advancing one matching character if failing, up to the
4135     maximum. Alternatively, if maximizing, find the maximum number of
4136     characters and work backwards. */
4137
4138     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4139       max, eptr));
4140
4141     if (md->caseless)
4142       {
4143       c = pcre_lcc[c];
4144       for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) FAIL;
4145       if (min == max) continue;
4146       if (minimize)
4147         {
4148         for (i = min;; i++)
4149           {
4150           if (match(eptr, ecode, offset_top, md)) SUCCEED;
4151           if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
4152             FAIL;
4153           }
4154         /* Control never gets here */
4155         }
4156       else
4157         {
4158         const uschar *pp = eptr;
4159         for (i = min; i < max; i++)
4160           {
4161           if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
4162           eptr++;
4163           }
4164         while (eptr >= pp)
4165           if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4166         FAIL;
4167         }
4168       /* Control never gets here */
4169       }
4170
4171     /* Caseful comparisons */
4172
4173     else
4174       {
4175       for (i = 1; i <= min; i++) if (c == *eptr++) FAIL;
4176       if (min == max) continue;
4177       if (minimize)
4178         {
4179         for (i = min;; i++)
4180           {
4181           if (match(eptr, ecode, offset_top, md)) SUCCEED;
4182           if (i >= max || eptr >= md->end_subject || c == *eptr++) FAIL;
4183           }
4184         /* Control never gets here */
4185         }
4186       else
4187         {
4188         const uschar *pp = eptr;
4189         for (i = min; i < max; i++)
4190           {
4191           if (eptr >= md->end_subject || c == *eptr) break;
4192           eptr++;
4193           }
4194         while (eptr >= pp)
4195          if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4196         FAIL;
4197         }
4198       }
4199     /* Control never gets here */
4200
4201     /* Match a single character type repeatedly; several different opcodes
4202     share code. This is very similar to the code for single characters, but we
4203     repeat it in the interests of efficiency. */
4204
4205     case OP_TYPEEXACT:
4206     min = max = (ecode[1] << 8) + ecode[2];
4207     minimize = TRUE;
4208     ecode += 3;
4209     goto REPEATTYPE;
4210
4211     case OP_TYPEUPTO:
4212     case OP_TYPEMINUPTO:
4213     min = 0;
4214     max = (ecode[1] << 8) + ecode[2];
4215     minimize = *ecode == OP_TYPEMINUPTO;
4216     ecode += 3;
4217     goto REPEATTYPE;
4218
4219     case OP_TYPESTAR:
4220     case OP_TYPEMINSTAR:
4221     case OP_TYPEPLUS:
4222     case OP_TYPEMINPLUS:
4223     case OP_TYPEQUERY:
4224     case OP_TYPEMINQUERY:
4225     c = *ecode++ - OP_TYPESTAR;
4226     minimize = (c & 1) != 0;
4227     min = rep_min[c];                 /* Pick up values from tables; */
4228     max = rep_max[c];                 /* zero for max => infinity */
4229     if (max == 0) max = INT_MAX;
4230
4231     /* Common code for all repeated single character type matches */
4232
4233     REPEATTYPE:
4234     ctype = *ecode++;      /* Code for the character type */
4235
4236     /* First, ensure the minimum number of matches are present. Use inline
4237     code for maximizing the speed, and do the type test once at the start
4238     (i.e. keep it out of the loop). Also test that there are at least the
4239     minimum number of characters before we start. */
4240
4241     if (min > md->end_subject - eptr) FAIL;
4242     if (min > 0) switch(ctype)
4243       {
4244       case OP_ANY:
4245       if (!md->dotall)
4246         { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
4247       else eptr += min;
4248       break;
4249
4250       case OP_NOT_DIGIT:
4251       for (i = 1; i <= min; i++)
4252         if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
4253       break;
4254
4255       case OP_DIGIT:
4256       for (i = 1; i <= min; i++)
4257         if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
4258       break;
4259
4260       case OP_NOT_WHITESPACE:
4261       for (i = 1; i <= min; i++)
4262         if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
4263       break;
4264
4265       case OP_WHITESPACE:
4266       for (i = 1; i <= min; i++)
4267         if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
4268       break;
4269
4270       case OP_NOT_WORDCHAR:
4271       for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
4272         FAIL;
4273       break;
4274
4275       case OP_WORDCHAR:
4276       for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
4277         FAIL;
4278       break;
4279
4280       case OP_NOT_WORDCHAR_L:
4281       for (i = 1; i <= min; i++, eptr++) if (*eptr=='_' || isalnum(*eptr))
4282         FAIL;
4283       break;
4284
4285       case OP_WORDCHAR_L:
4286       for (i = 1; i <= min; i++, eptr++) if (*eptr!='_' && !isalnum(*eptr))
4287         FAIL;
4288       break;
4289       }
4290
4291     /* If min = max, continue at the same level without recursing */
4292
4293     if (min == max) continue;
4294
4295     /* If minimizing, we have to test the rest of the pattern before each
4296     subsequent match, so inlining isn't much help; just use the function. */
4297
4298     if (minimize)
4299       {
4300       for (i = min;; i++)
4301         {
4302         if (match(eptr, ecode, offset_top, md)) SUCCEED;
4303         if (i >= max || eptr >= md->end_subject ||
4304           !match_type(ctype, *eptr++, md->dotall))
4305             FAIL;
4306         }
4307       /* Control never gets here */
4308       }
4309
4310     /* If maximizing it is worth using inline code for speed, doing the type
4311     test once at the start (i.e. keep it out of the loop). */
4312
4313     else
4314       {
4315       const uschar *pp = eptr;
4316       switch(ctype)
4317         {
4318         case OP_ANY:
4319         if (!md->dotall)
4320           {
4321           for (i = min; i < max; i++)
4322             {
4323             if (eptr >= md->end_subject || *eptr == '\n') break;
4324             eptr++;
4325             }
4326           }
4327         else
4328           {
4329           c = max - min;
4330           if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4331           eptr += c;
4332           }
4333         break;
4334
4335         case OP_NOT_DIGIT:
4336         for (i = min; i < max; i++)
4337           {
4338           if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
4339             break;
4340           eptr++;
4341           }
4342         break;
4343
4344         case OP_DIGIT:
4345         for (i = min; i < max; i++)
4346           {
4347           if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
4348             break;
4349           eptr++;
4350           }
4351         break;
4352
4353         case OP_NOT_WHITESPACE:
4354         for (i = min; i < max; i++)
4355           {
4356           if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
4357             break;
4358           eptr++;
4359           }
4360         break;
4361
4362         case OP_WHITESPACE:
4363         for (i = min; i < max; i++)
4364           {
4365           if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
4366             break;
4367           eptr++;
4368           }
4369         break;
4370
4371         case OP_NOT_WORDCHAR:
4372         for (i = min; i < max; i++)
4373           {
4374           if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
4375             break;
4376           eptr++;
4377           }
4378         break;
4379
4380         case OP_WORDCHAR:
4381         for (i = min; i < max; i++)
4382           {
4383             if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
4384               break;
4385             eptr++;
4386           }
4387         break;
4388         case OP_NOT_WORDCHAR_L:
4389           for (i = min; i < max; i++)
4390          {
4391          if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ) )
4392            break;
4393          eptr++;
4394          }
4395        break;
4396
4397        case OP_WORDCHAR_L:
4398        for (i = min; i < max; i++)
4399          {
4400          if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ) )
4401              break;
4402           eptr++;
4403           }
4404         break;
4405         }
4406
4407       while (eptr >= pp)
4408         if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4409       FAIL;
4410       }
4411     /* Control never gets here */
4412
4413     /* There's been some horrible disaster. */
4414
4415     default:
4416     DPRINTF(("Unknown opcode %d\n", *ecode));
4417     md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4418     FAIL;
4419     }
4420
4421   /* Do not stick any code in here without much thought; it is assumed
4422   that "continue" in the code above comes out to here to repeat the main
4423   loop. */
4424
4425   }             /* End of main loop */
4426 /* Control never reaches here */
4427
4428 fail:
4429  if (md->point > save_stack_position)
4430  {
4431    /* If there are still points remaining on the stack, pop the next one off */
4432    int off_num;
4433
4434    md->point--;
4435    offset_top = md->offset_top[md->point];
4436    eptr       = md->eptr[md->point];
4437    ecode      = md->ecode[md->point];
4438    off_num    = md->off_num[md->point];
4439    md->offset_vector[off_num]   = md->r1[md->point];
4440    md->offset_vector[off_num+1] = md->r2[md->point];
4441    goto match_loop;
4442   }
4443    /* Failure, and nothing left on the stack, so end this function call */
4444
4445  /* Restore the top of the stack to where it was before this function
4446     call.  This lets us use one stack for everything; recursive calls
4447     can push and pop information, and may increase the stack.  When
4448     the call returns, the parent function can resume pushing and
4449     popping wherever it was. */
4450
4451  md->point = save_stack_position;
4452  return FALSE;
4453
4454 succeed:
4455  return TRUE;
4456 }
4457
4458
4459
4460 /*************************************************
4461 *         Segregate setjmp()                     *
4462 *************************************************/
4463
4464 /* The -Wall option of gcc gives warnings for all local variables when setjmp()
4465 is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
4466 hide it in a separate function. This is called only when PCRE_EXTRA is set,
4467 since it's needed only for the extension \X option, and with any luck, a good
4468 compiler will spot the tail recursion and compile it efficiently.
4469
4470 Arguments:
4471    eptr        pointer in subject
4472    ecode       position in code
4473    offset_top  current top pointer
4474    md          pointer to "static" info for the match
4475
4476 Returns:       TRUE if matched
4477 */
4478
4479 static BOOL
4480 match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
4481   match_data *match_block)
4482 {
4483 return setjmp(match_block->fail_env) == 0 &&
4484       match(eptr, ecode, offset_top, match_block);
4485 }
4486
4487
4488
4489 /*************************************************
4490 *         Execute a Regular Expression           *
4491 *************************************************/
4492
4493 /* This function applies a compiled re to a subject string and picks out
4494 portions of the string if it matches. Two elements in the vector are set for
4495 each substring: the offsets to the start and end of the substring.
4496
4497 Arguments:
4498   external_re     points to the compiled expression
4499   external_extra  points to "hints" from pcre_study() or is NULL
4500   subject         points to the subject string
4501   length          length of subject string (may contain binary zeros)
4502   options         option bits
4503   offsets         points to a vector of ints to be filled in with offsets
4504   offsetcount     the number of elements in the vector
4505
4506 Returns:          > 0 => success; value is the number of elements filled in
4507                   = 0 => success, but offsets is not big enough
4508                    -1 => failed to match
4509                  < -1 => some kind of unexpected problem
4510 */
4511
4512 int
4513 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4514   const char *subject, int length, int start_pos, int options,
4515   int *offsets, int offsetcount)
4516 {
4517   /* The "volatile" directives are to make gcc -Wall stop complaining
4518      that these variables can be clobbered by the longjmp.  Hopefully
4519      they won't cost too much performance. */
4520 volatile int resetcount, ocount;
4521 volatile int first_char = -1;
4522 const uschar * volatile start_bits = NULL;
4523 const uschar * volatile start_match = (const uschar *)subject + start_pos;
4524 match_data match_block;
4525 const uschar *end_subject;
4526 const real_pcre *re = (const real_pcre *)external_re;
4527 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4528 volatile BOOL using_temporary_offsets = FALSE;
4529 volatile BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4530 volatile BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4531
4532 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4533
4534 if (re == NULL || subject == NULL ||
4535    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4536 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4537
4538 match_block.start_subject = (const uschar *)subject;
4539 match_block.end_subject = match_block.start_subject + length;
4540 end_subject = match_block.end_subject;
4541
4542 match_block.caseless  = ((re->options | options) & PCRE_CASELESS) != 0;
4543 match_block.runtime_caseless = match_block.caseless &&
4544   (re->options & PCRE_CASELESS) == 0;
4545
4546 match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
4547 match_block.dotall    = ((re->options | options) & PCRE_DOTALL) != 0;
4548 match_block.endonly   = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
4549
4550 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4551 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4552
4553 match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */
4554
4555 /* Set the stack state to empty */
4556   match_block.off_num = match_block.offset_top = NULL;
4557   match_block.r1 = match_block.r2 = NULL;
4558   match_block.eptr = match_block.ecode = NULL;
4559   match_block.point = match_block.length = 0;
4560
4561 /* If the expression has got more back references than the offsets supplied can
4562 hold, we get a temporary bit of working store to use during the matching.
4563 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4564 of 2. */
4565
4566 ocount = offsetcount & (-2);
4567 if (re->top_backref > 0 && re->top_backref >= ocount/2)
4568   {
4569   ocount = re->top_backref * 2 + 2;
4570   match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4571   if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4572   using_temporary_offsets = TRUE;
4573   DPRINTF(("Got memory to hold back references\n"));
4574   }
4575 else match_block.offset_vector = offsets;
4576
4577 match_block.offset_end = ocount;
4578 match_block.offset_overflow = FALSE;
4579
4580 /* Compute the minimum number of offsets that we need to reset each time. Doing
4581 this makes a huge difference to execution time when there aren't many brackets
4582 in the pattern. */
4583
4584 resetcount = 2 + re->top_bracket * 2;
4585 if (resetcount > offsetcount) resetcount = ocount;
4586
4587 /* If MULTILINE is set at exec time but was not set at compile time, and the
4588 anchored flag is set, we must re-check because a setting provoked by ^ in the
4589 pattern is not right in multi-line mode. Calling is_anchored() again here does
4590 the right check, because multiline is now set. If it now yields FALSE, the
4591 expression must have had ^ starting some of its branches. Check to see if
4592 that is true for *all* branches, and if so, set the startline flag. */
4593
4594 if (match_block.multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
4595     !is_anchored(re->code, match_block.multiline))
4596   {
4597   anchored = FALSE;
4598   if (is_startline(re->code)) startline = TRUE;
4599   }
4600
4601 /* Set up the first character to match, if available. The first_char value is
4602 never set for an anchored regular expression, but the anchoring may be forced
4603 at run time, so we have to test for anchoring. The first char may be unset for
4604 an unanchored pattern, of course. If there's no first char and the pattern was
4605 studied, the may be a bitmap of possible first characters. However, we can
4606 use this only if the caseless state of the studying was correct. */
4607
4608 if (!anchored)
4609   {
4610   if ((re->options & PCRE_FIRSTSET) != 0)
4611     {
4612     first_char = re->first_char;
4613     if (match_block.caseless) first_char = pcre_lcc[first_char];
4614     }
4615   else
4616     if (!startline && extra != NULL &&
4617       (extra->options & PCRE_STUDY_MAPPED) != 0 &&
4618       ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
4619         start_bits = extra->start_bits;
4620   }
4621
4622 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4623
4624 do
4625   {
4626   int rc;
4627   register int *iptr = match_block.offset_vector;
4628   register int *iend = iptr + resetcount;
4629
4630   /* Reset the maximum number of extractions we might see. */
4631
4632   while (iptr < iend) *iptr++ = -1;
4633
4634   /* Advance to a unique first char if possible */
4635
4636   if (first_char >= 0)
4637     {
4638     if (match_block.caseless)
4639       while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4640         start_match++;
4641     else
4642       while (start_match < end_subject && *start_match != first_char)
4643         start_match++;
4644     }
4645
4646   /* Or to just after \n for a multiline match if possible */
4647
4648   else if (startline)
4649     {
4650     if (start_match > match_block.start_subject)
4651       {
4652       while (start_match < end_subject && start_match[-1] != '\n')
4653         start_match++;
4654       }
4655     }
4656
4657   /* Or to a non-unique first char */
4658
4659   else if (start_bits != NULL)
4660     {
4661     while (start_match < end_subject)
4662       {
4663       register int c = *start_match;
4664       if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4665       }
4666     }
4667
4668 #ifdef DEBUG  /* Sigh. Some compilers never learn. */
4669   printf(">>>> Match against: ");
4670   pchars(start_match, end_subject - start_match, TRUE, &match_block);
4671   printf("\n");
4672 #endif
4673
4674   /* When a match occurs, substrings will be set for all internal extractions;
4675   we just need to set up the whole thing as substring 0 before returning. If
4676   there were too many extractions, set the return code to zero. In the case
4677   where we had to get some local store to hold offsets for backreferences, copy
4678   those back references that we can. In this case there need not be overflow
4679   if certain parts of the pattern were not used.
4680
4681   Before starting the match, we have to set up a longjmp() target to enable
4682   the "cut" operation to fail a match completely without backtracking. This
4683   is done in a separate function to avoid compiler warnings. We need not do
4684   it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
4685   enabled. */
4686
4687   /* To handle errors such as running out of memory for the failure
4688      stack, we need to save this location via setjmp(), so
4689      error-handling code can call longjmp() to jump out of deeply-nested code. */
4690   if (setjmp(match_block.error_env)==0)
4691     {
4692
4693   if ((re->options & PCRE_EXTRA) != 0)
4694     {
4695     if (!match_with_setjmp(start_match, re->code, 2, &match_block))
4696       continue;
4697     }
4698   else if (!match(start_match, re->code, 2, &match_block)) continue;
4699
4700   /* Copy the offset information from temporary store if necessary */
4701
4702   if (using_temporary_offsets)
4703     {
4704     if (offsetcount >= 4)
4705       {
4706       memcpy(offsets + 2, match_block.offset_vector + 2,
4707         (offsetcount - 2) * sizeof(int));
4708       DPRINTF(("Copied offsets from temporary memory\n"));
4709       }
4710     if (match_block.end_offset_top > offsetcount)
4711       match_block.offset_overflow = TRUE;
4712
4713     DPRINTF(("Freeing temporary memory\n"));
4714     (pcre_free)(match_block.offset_vector);
4715     }
4716
4717   rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4718
4719   if (match_block.offset_end < 2) rc = 0; else
4720     {
4721     offsets[0] = start_match - match_block.start_subject;
4722     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4723     }
4724
4725   DPRINTF((">>>> returning %d\n", rc));
4726   free_stack(&match_block);
4727   return rc;
4728   }  /* End of (if setjmp(match_block.error_env)...) */
4729   free_stack(&match_block);
4730
4731   /* Return an error code; pcremodule.c will preserve the exception */
4732   if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY;
4733   }
4734 while (!anchored &&
4735        match_block.errorcode == PCRE_ERROR_NOMATCH &&
4736        start_match++ < end_subject);
4737
4738 if (using_temporary_offsets)
4739   {
4740   DPRINTF(("Freeing temporary memory\n"));
4741   (pcre_free)(match_block.offset_vector);
4742   }
4743
4744 #ifdef DEBUG
4745 printf(">>>> returning %d\n", match_block.errorcode);
4746 #endif
4747
4748  free_stack(&match_block);
4749  return match_block.errorcode;
4750 }
4751
4752 /* End of pcre.c */