Modules/pypcre.c

   1
   2 /*************************************************
   3 *      Perl-Compatible Regular Expressions       *
   4 *************************************************/
   5
   6 /*   DO NOT EDIT THIS FILE! */
   7
   8 /* This file is automatically written by the merge-files.py script
   9 included with the PCRE distribution for Python; it's produced from
  10 several C files, and code is removed in the process.  If you want to
  11 modify the code or track down bugs, it will be much easier to work
  12 with the code in its original, multiple-file form.  Don't edit this
  13 file by hand, or submit patches to it.
  14
  15 The Python-specific PCRE distribution can be retrieved from
  16        http://starship.skyport.net/crew/amk/regex/
  17
  18 The unmodified original PCRE distribution is available at
  19 ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
  20 written by: Philip Hazel <ph10@cam.ac.uk>
  21
  22 Extensively modified by the Python String-SIG: <string-sig@python.org>
  23 Send bug reports to:                           <string-sig@python.org>
  24 (They'll figure out if it's a bug in PCRE or in the Python-specific
  25 changes.)
  26
  27            Copyright (c) 1997 University of Cambridge
  28
  29 -----------------------------------------------------------------------------
  30 Permission is granted to anyone to use this software for any purpose on any
  31 computer system, and to redistribute it freely, subject to the following
  32 restrictions:
  33
  34 1. This software is distributed in the hope that it will be useful,
  35    but WITHOUT ANY WARRANTY; without even the implied warranty of
  36    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  37
  38 2. The origin of this software must not be misrepresented, either by
  39    explicit claim or by omission.
  40
  41 3. Altered versions must be plainly marked as such, and must not be
  42    misrepresented as being the original software.
  43 -----------------------------------------------------------------------------
  44 */
  45
  46
  47 #define FOR_PYTHON
  48 #include "pcre-int.h"
  49 #include "Python.h"
  50 #include "mymalloc.h"
  51 #include <ctype.h>
  52 #include "graminit.h"
  53
  54 /*************************************************
  55 *      Perl-Compatible Regular Expressions       *
  56 *************************************************/
  57
  58 /* This file is automatically written by the makechartables auxiliary
  59 program. If you edit it by hand, you might like to edit the Makefile to
  60 prevent its ever being regenerated. */
  61
  62 /* This table is a lower casing table. */
  63
  64 unsigned char pcre_lcc[] = {
  65     0,  1,  2,  3,  4,  5,  6,  7,
  66     8,  9, 10, 11, 12, 13, 14, 15,
  67    16, 17, 18, 19, 20, 21, 22, 23,
  68    24, 25, 26, 27, 28, 29, 30, 31,
  69    32, 33, 34, 35, 36, 37, 38, 39,
  70    40, 41, 42, 43, 44, 45, 46, 47,
  71    48, 49, 50, 51, 52, 53, 54, 55,
  72    56, 57, 58, 59, 60, 61, 62, 63,
  73    64, 97, 98, 99,100,101,102,103,
  74   104,105,106,107,108,109,110,111,
  75   112,113,114,115,116,117,118,119,
  76   120,121,122, 91, 92, 93, 94, 95,
  77    96, 97, 98, 99,100,101,102,103,
  78   104,105,106,107,108,109,110,111,
  79   112,113,114,115,116,117,118,119,
  80   120,121,122,123,124,125,126,127,
  81   128,129,130,131,132,133,134,135,
  82   136,137,138,139,140,141,142,143,
  83   144,145,146,147,148,149,150,151,
  84   152,153,154,155,156,157,158,159,
  85   160,161,162,163,164,165,166,167,
  86   168,169,170,171,172,173,174,175,
  87   176,177,178,179,180,181,182,183,
  88   184,185,186,187,188,189,190,191,
  89   192,193,194,195,196,197,198,199,
  90   200,201,202,203,204,205,206,207,
  91   208,209,210,211,212,213,214,215,
  92   216,217,218,219,220,221,222,223,
  93   224,225,226,227,228,229,230,231,
  94   232,233,234,235,236,237,238,239,
  95   240,241,242,243,244,245,246,247,
  96   248,249,250,251,252,253,254,255 };
  97
  98 /* This table is a case flipping table. */
  99
 100 unsigned char pcre_fcc[] = {
 101     0,  1,  2,  3,  4,  5,  6,  7,
 102     8,  9, 10, 11, 12, 13, 14, 15,
 103    16, 17, 18, 19, 20, 21, 22, 23,
 104    24, 25, 26, 27, 28, 29, 30, 31,
 105    32, 33, 34, 35, 36, 37, 38, 39,
 106    40, 41, 42, 43, 44, 45, 46, 47,
 107    48, 49, 50, 51, 52, 53, 54, 55,
 108    56, 57, 58, 59, 60, 61, 62, 63,
 109    64, 97, 98, 99,100,101,102,103,
 110   104,105,106,107,108,109,110,111,
 111   112,113,114,115,116,117,118,119,
 112   120,121,122, 91, 92, 93, 94, 95,
 113    96, 65, 66, 67, 68, 69, 70, 71,
 114    72, 73, 74, 75, 76, 77, 78, 79,
 115    80, 81, 82, 83, 84, 85, 86, 87,
 116    88, 89, 90,123,124,125,126,127,
 117   128,129,130,131,132,133,134,135,
 118   136,137,138,139,140,141,142,143,
 119   144,145,146,147,148,149,150,151,
 120   152,153,154,155,156,157,158,159,
 121   160,161,162,163,164,165,166,167,
 122   168,169,170,171,172,173,174,175,
 123   176,177,178,179,180,181,182,183,
 124   184,185,186,187,188,189,190,191,
 125   192,193,194,195,196,197,198,199,
 126   200,201,202,203,204,205,206,207,
 127   208,209,210,211,212,213,214,215,
 128   216,217,218,219,220,221,222,223,
 129   224,225,226,227,228,229,230,231,
 130   232,233,234,235,236,237,238,239,
 131   240,241,242,243,244,245,246,247,
 132   248,249,250,251,252,253,254,255 };
 133
 134 /* This table contains bit maps for digits, letters, 'word' chars, and
 135 white space. Each map is 32 bytes long and the bits run from the least
 136 significant end of each byte. */
 137
 138 unsigned char pcre_cbits[] = {
 139   0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
 140   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 141   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 142   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 143
 144   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 145   0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
 146   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 147   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 148
 149   0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
 150   0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
 151   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 152   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 153
 154   0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
 155   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 156   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 157   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
 158
 159 /* This table identifies various classes of character by individual bits:
 160   0x01   white space character
 161   0x02   letter
 162   0x04   decimal digit
 163   0x08   hexadecimal digit
 164   0x10   alphanumeric or '_'
 165   0x80   regular expression metacharacter or binary zero
 166 */
 167
 168 unsigned char pcre_ctypes[] = {
 169   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
 170   0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /*   8- 15 */
 171   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
 172   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
 173   0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /*    - '  */
 174   0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /*  ( - /  */
 175   0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /*  0 - 7  */
 176   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /*  8 - ?  */
 177   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  @ - G  */
 178   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  H - O  */
 179   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  P - W  */
 180   0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /*  X - _  */
 181   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  ` - g  */
 182   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  h - o  */
 183   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  p - w  */
 184   0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /*  x -127 */
 185   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
 186   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
 187   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
 188   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
 189   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
 190   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
 191   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
 192   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
 193   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
 194   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
 195   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
 196   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
 197   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
 198   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
 199   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
 200   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
 201
 202 /* End of chartables.c */
 203 /*************************************************
 204 *      Perl-Compatible Regular Expressions       *
 205 *************************************************/
 206
 207 /*
 208 This is a library of functions to support regular expressions whose syntax
 209 and semantics are as close as possible to those of the Perl 5 language. See
 210 the file Tech.Notes for some information on the internals.
 211
 212 Written by: Philip Hazel <ph10@cam.ac.uk>
 213
 214            Copyright (c) 1998 University of Cambridge
 215
 216 -----------------------------------------------------------------------------
 217 Permission is granted to anyone to use this software for any purpose on any
 218 computer system, and to redistribute it freely, subject to the following
 219 restrictions:
 220
 221 1. This software is distributed in the hope that it will be useful,
 222    but WITHOUT ANY WARRANTY; without even the implied warranty of
 223    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 224
 225 2. The origin of this software must not be misrepresented, either by
 226    explicit claim or by omission.
 227
 228 3. Altered versions must be plainly marked as such, and must not be
 229    misrepresented as being the original software.
 230 -----------------------------------------------------------------------------
 231 */
 232
 233
 234 /* Include the internals header, which itself includes Standard C headers plus
 235 the external pcre header. */
 236
 237
 238
 239
 240 /*************************************************
 241 *          Create bitmap of starting chars       *
 242 *************************************************/
 243
 244 /* This function scans a compiled unanchored expression and attempts to build a
 245 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
 246 goes by, we may be able to get more clever at doing this.
 247
 248 Arguments:
 249   code         points to an expression
 250   start_bits   points to a 32-byte table, initialized to 0
 251
 252 Returns:       TRUE if table built, FALSE otherwise
 253 */
 254
 255 static BOOL
 256 set_start_bits(const uschar *code, uschar *start_bits)
 257 {
 258 register int c;
 259 volatile int dummy;
 260
 261 do
 262   {
 263   const uschar *tcode = code + 3;
 264   BOOL try_next = TRUE;
 265
 266   while (try_next)
 267     {
 268     try_next = FALSE;
 269
 270     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
 271       {
 272       if (!set_start_bits(tcode, start_bits)) return FALSE;
 273       }
 274
 275     else switch(*tcode)
 276       {
 277       default:
 278       return FALSE;
 279
 280       /* BRAZERO does the bracket, but carries on. */
 281
 282       case OP_BRAZERO:
 283       case OP_BRAMINZERO:
 284       if (!set_start_bits(++tcode, start_bits)) return FALSE;
 285       dummy = 1;
 286       do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
 287       tcode += 3;
 288       try_next = TRUE;
 289       break;
 290
 291       /* Single-char * or ? sets the bit and tries the next item */
 292
 293       case OP_STAR:
 294       case OP_MINSTAR:
 295       case OP_QUERY:
 296       case OP_MINQUERY:
 297       start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
 298       tcode += 2;
 299       try_next = TRUE;
 300       break;
 301
 302       /* Single-char upto sets the bit and tries the next */
 303
 304       case OP_UPTO:
 305       case OP_MINUPTO:
 306       start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));
 307       tcode += 4;
 308       try_next = TRUE;
 309       break;
 310
 311       /* At least one single char sets the bit and stops */
 312
 313       case OP_EXACT:       /* Fall through */
 314       tcode++;
 315
 316       case OP_CHARS:       /* Fall through */
 317       tcode++;
 318
 319       case OP_PLUS:
 320       case OP_MINPLUS:
 321       start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
 322       break;
 323
 324       /* Single character type sets the bits and stops */
 325
 326       case OP_NOT_DIGIT:
 327       for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
 328       break;
 329
 330       case OP_DIGIT:
 331       for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
 332       break;
 333
 334       case OP_NOT_WHITESPACE:
 335       for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
 336       break;
 337
 338       case OP_WHITESPACE:
 339       for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
 340       break;
 341
 342       case OP_NOT_WORDCHAR:
 343       for (c = 0; c < 32; c++)
 344         start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
 345       break;
 346
 347       case OP_WORDCHAR:
 348       for (c = 0; c < 32; c++)
 349         start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
 350       break;
 351
 352       /* One or more character type fudges the pointer and restarts, knowing
 353       it will hit a single character type and stop there. */
 354
 355       case OP_TYPEPLUS:
 356       case OP_TYPEMINPLUS:
 357       tcode++;
 358       try_next = TRUE;
 359       break;
 360
 361       case OP_TYPEEXACT:
 362       tcode += 3;
 363       try_next = TRUE;
 364       break;
 365
 366       /* Zero or more repeats of character types set the bits and then
 367       try again. */
 368
 369       case OP_TYPEUPTO:
 370       case OP_TYPEMINUPTO:
 371       tcode += 2;               /* Fall through */
 372
 373       case OP_TYPESTAR:
 374       case OP_TYPEMINSTAR:
 375       case OP_TYPEQUERY:
 376       case OP_TYPEMINQUERY:
 377       switch(tcode[1])
 378         {
 379         case OP_NOT_DIGIT:
 380         for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
 381         break;
 382
 383         case OP_DIGIT:
 384         for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
 385         break;
 386
 387         case OP_NOT_WHITESPACE:
 388         for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
 389         break;
 390
 391         case OP_WHITESPACE:
 392         for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
 393         break;
 394
 395         case OP_NOT_WORDCHAR:
 396         for (c = 0; c < 32; c++)
 397           start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
 398         break;
 399
 400         case OP_WORDCHAR:
 401         for (c = 0; c < 32; c++)
 402           start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
 403         break;
 404         }
 405
 406       tcode += 2;
 407       try_next = TRUE;
 408       break;
 409
 410       /* Character class: set the bits and either carry on or not,
 411       according to the repeat count. */
 412
 413       case OP_CLASS:
 414       case OP_NEGCLASS:
 415         {
 416         tcode++;
 417         for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
 418         tcode += 32;
 419         switch (*tcode)
 420           {
 421           case OP_CRSTAR:
 422           case OP_CRMINSTAR:
 423           case OP_CRQUERY:
 424           case OP_CRMINQUERY:
 425           tcode++;
 426           try_next = TRUE;
 427           break;
 428
 429           case OP_CRRANGE:
 430           case OP_CRMINRANGE:
 431           if (((tcode[1] << 8) + tcode[2]) == 0)
 432             {
 433             tcode += 5;
 434             try_next = TRUE;
 435             }
 436           break;
 437           }
 438         }
 439       break; /* End of class handling */
 440
 441       }      /* End of switch */
 442     }        /* End of try_next loop */
 443
 444   code += (code[1] << 8) + code[2];   /* Advance to next branch */
 445   }
 446 while (*code == OP_ALT);
 447 return TRUE;
 448 }
 449
 450
 451
 452 /*************************************************
 453 *          Study a compiled expression           *
 454 *************************************************/
 455
 456 /* This function is handed a compiled expression that it must study to produce
 457 information that will speed up the matching. It returns a pcre_extra block
 458 which then gets handed back to pcre_exec().
 459
 460 Arguments:
 461   re        points to the compiled expression
 462   options   contains option bits
 463   errorptr  points to where to place error messages;
 464             set NULL unless error
 465
 466 Returns:    pointer to a pcre_extra block,
 467             NULL on error or if no optimization possible
 468 */
 469
 470 pcre_extra *
 471 pcre_study(const pcre *external_re, int options, const char **errorptr)
 472 {
 473 BOOL caseless;
 474 uschar start_bits[32];
 475 real_pcre_extra *extra;
 476 const real_pcre *re = (const real_pcre *)external_re;
 477
 478 *errorptr = NULL;
 479
 480 if (re == NULL || re->magic_number != MAGIC_NUMBER)
 481   {
 482   *errorptr = "argument is not a compiled regular expression";
 483   return NULL;
 484   }
 485
 486 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
 487   {
 488   *errorptr = "unknown or incorrect option bit(s) set";
 489   return NULL;
 490   }
 491
 492 /* Caseless can either be from the compiled regex or from options. */
 493
 494 caseless = ((re->options | options) & PCRE_CASELESS) != 0;
 495
 496 /* For an anchored pattern, or an unchored pattern that has a first char, or a
 497 multiline pattern that matches only at "line starts", no further processing at
 498 present. */
 499
 500 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
 501   return NULL;
 502
 503 /* See if we can find a fixed set of initial characters for the pattern. */
 504
 505 memset(start_bits, 0, 32 * sizeof(uschar));
 506 if (!set_start_bits(re->code, start_bits)) return NULL;
 507
 508 /* If this studying is caseless, scan the created bit map and duplicate the
 509 bits for any letters. */
 510
 511 if (caseless)
 512   {
 513   register int c;
 514   for (c = 0; c < 256; c++)
 515     {
 516     if ((start_bits[c/8] & (1 << (c&7))) != 0 &&
 517         (pcre_ctypes[c] & ctype_letter) != 0)
 518       {
 519       int d = pcre_fcc[c];
 520       start_bits[d/8] |= (1 << (d&7));
 521       }
 522     }
 523   }
 524
 525 /* Get an "extra" block and put the information therein. */
 526
 527 extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
 528
 529 if (extra == NULL)
 530   {
 531   *errorptr = "failed to get memory";
 532   return NULL;
 533   }
 534
 535 extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
 536 memcpy(extra->start_bits, start_bits, sizeof(start_bits));
 537
 538 return (pcre_extra *)extra;
 539 }
 540
 541 /* End of study.c */
 542 /*************************************************
 543 *      Perl-Compatible Regular Expressions       *
 544 *************************************************/
 545
 546 /*
 547 This is a library of functions to support regular expressions whose syntax
 548 and semantics are as close as possible to those of the Perl 5 language. See
 549 the file Tech.Notes for some information on the internals.
 550
 551 Written by: Philip Hazel <ph10@cam.ac.uk>
 552
 553            Copyright (c) 1998 University of Cambridge
 554
 555 -----------------------------------------------------------------------------
 556 Permission is granted to anyone to use this software for any purpose on any
 557 computer system, and to redistribute it freely, subject to the following
 558 restrictions:
 559
 560 1. This software is distributed in the hope that it will be useful,
 561    but WITHOUT ANY WARRANTY; without even the implied warranty of
 562    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 563
 564 2. The origin of this software must not be misrepresented, either by
 565    explicit claim or by omission.
 566
 567 3. Altered versions must be plainly marked as such, and must not be
 568    misrepresented as being the original software.
 569 -----------------------------------------------------------------------------
 570 */
 571
 572
 573 /* Define DEBUG to get debugging output on stdout. */
 574
 575 /* #define DEBUG */
 576
 577 /* Use a macro for debugging printing, 'cause that eliminates the the use
 578 of #ifdef inline, and there are *still* stupid compilers about that don't like
 579 indented pre-processor statements. I suppose it's only been 10 years... */
 580
 581 #ifdef DEBUG
 582 #define DPRINTF(p) printf p
 583 #else
 584 #define DPRINTF(p) /*nothing*/
 585 #endif
 586
 587 /* Include the internals header, which itself includes Standard C headers plus
 588 the external pcre header. */
 589
 590
 591
 592
 593 #ifndef Py_eval_input
 594 /* For Python 1.4, graminit.h has to be explicitly included */
 595 #define Py_eval_input eval_input
 596
 597 #endif /* FOR_PYTHON */
 598
 599 /* Allow compilation as C++ source code, should anybody want to do that. */
 600
 601 #ifdef __cplusplus
 602 #define class pcre_class
 603 #endif
 604
 605
 606 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
 607
 608 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
 609 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
 610
 611 /* Text forms of OP_ values and things, for debugging (not all used) */
 612
 613 #ifdef DEBUG
 614 static const char *OP_names[] = {
 615   "End", "\\A", "\\B", "\\b", "\\D", "\\d",
 616   "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
 617   "localized \\B", "localized \\b", "localized \\W", "localized \\w",
 618   "^", "$", "Any", "chars",
 619   "not",
 620   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
 621   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
 622   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
 623   "*", "*?", "+", "+?", "?", "??", "{", "{",
 624   "class", "negclass", "classL", "Ref",
 625   "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
 626   "Brazero", "Braminzero", "Bra"
 627 };
 628 #endif
 629
 630 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
 631 are simple data values; negative values are for special things like \d and so
 632 on. Zero means further processing is needed (for things like \x), or the escape
 633 is invalid. */
 634
 635 static const short int escapes[] = {
 636     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
 637     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
 638   '@', -ESC_A, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */
 639     0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
 640     0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
 641     0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
 642   '`',      7, -ESC_b,      0, -ESC_d,      0,   '\f',      0,   /* ` - g */
 643     0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */
 644     0,      0,   '\r', -ESC_s,   '\t',      0,   '\v', -ESC_w,   /* p - w */
 645     0,      0,      0                                            /* x - z */
 646 };
 647
 648 /* Definition to allow mutual recursion */
 649
 650 static BOOL
 651 compile_regex(int, int *, uschar **, const uschar **, const char **,
 652               PyObject *);
 653
 654 /* Structure for passing "static" information around between the functions
 655 doing the matching, so that they are thread-safe. */
 656
 657 typedef struct match_data {
 658   int    errorcode;             /* As it says */
 659   int   *offset_vector;         /* Offset vector */
 660   int    offset_end;            /* One past the end */
 661   BOOL   offset_overflow;       /* Set if too many extractions */
 662   BOOL   caseless;              /* Case-independent flag */
 663   BOOL   runtime_caseless;      /* Caseless forced at run time */
 664   BOOL   multiline;             /* Multiline flag */
 665   BOOL   notbol;                /* NOTBOL flag */
 666   BOOL   noteol;                /* NOTEOL flag */
 667   BOOL   dotall;                /* Dot matches any char */
 668   BOOL   endonly;               /* Dollar not before final \n */
 669   const uschar *start_subject;  /* Start of the subject string */
 670   const uschar *end_subject;    /* End of the subject string */
 671   jmp_buf fail_env;             /* Environment for longjump() break out */
 672   const uschar *end_match_ptr;  /* Subject position at end match */
 673   int     end_offset_top;       /* Highwater mark at end of match */
 674   jmp_buf error_env;          /* For longjmp() if an error occurs deep inside a
 675                                    matching operation */
 676   int    length;                /* Length of the allocated stacks */
 677   int    point;                 /* Point to add next item pushed onto stacks */
 678   /* Pointers to the 6 stacks */
 679   int *off_num, *offset_top, *r1, *r2;
 680   const uschar **eptr, **ecode;
 681 } match_data;
 682
 683
 684
 685 /*************************************************
 686 *               Global variables                 *
 687 *************************************************/
 688
 689 /* PCRE is thread-clean and doesn't use any global variables in the normal
 690 sense. However, it calls memory allocation and free functions via the two
 691 indirections below, which are can be changed by the caller, but are shared
 692 between all threads. */
 693
 694 void *(*pcre_malloc)(size_t) = malloc;
 695 void  (*pcre_free)(void *) = free;
 696
 697
 698
 699
 700 /*************************************************
 701 *          Return version string                 *
 702 *************************************************/
 703
 704 const char *
 705 pcre_version(void)
 706 {
 707 return PCRE_VERSION;
 708 }
 709
 710
 711
 712
 713 /*************************************************
 714 *       Return info about a compiled pattern     *
 715 *************************************************/
 716
 717 /* This function picks potentially useful data out of the private
 718 structure.
 719
 720 Arguments:
 721   external_re   points to compiled code
 722   optptr        where to pass back the options
 723   first_char    where to pass back the first character,
 724                 or -1 if multiline and all branches start ^,
 725                 or -2 otherwise
 726
 727 Returns:        number of identifying extraction brackets
 728                 or negative values on error
 729 */
 730
 731 int
 732 pcre_info(const pcre *external_re, int *optptr, int *first_char)
 733 {
 734 const real_pcre *re = (real_pcre *)external_re;
 735 if (re == NULL) return PCRE_ERROR_NULL;
 736 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
 737 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
 738 if (first_char != NULL)
 739   *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
 740      ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
 741 return re->top_bracket;
 742 }
 743
 744
 745
 746
 747 #ifdef DEBUG
 748 /*************************************************
 749 *        Debugging function to print chars       *
 750 *************************************************/
 751
 752 /* Print a sequence of chars in printable format, stopping at the end of the
 753 subject if the requested.
 754
 755 Arguments:
 756   p           points to characters
 757   length      number to print
 758   is_subject  TRUE if printing from within md->start_subject
 759   md          pointer to matching data block, if is_subject is TRUE
 760
 761 Returns:     nothing
 762 */
 763
 764 static void
 765 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
 766 {
 767 int c;
 768 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
 769 while (length-- > 0)
 770   if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
 771 }
 772 #endif
 773
 774
 775
 776
 777 /*************************************************
 778 *         Check subpattern for empty operand     *
 779 *************************************************/
 780
 781 /* This function checks a bracketed subpattern to see if any of the paths
 782 through it could match an empty string. This is used to diagnose an error if
 783 such a subpattern is followed by a quantifier with an unlimited upper bound.
 784
 785 Argument:
 786   code      points to the opening bracket
 787
 788 Returns:    TRUE or FALSE
 789 */
 790
 791 static BOOL
 792 could_be_empty(uschar *code)
 793 {
 794 do {
 795   uschar *cc = code + 3;
 796
 797   /* Scan along the opcodes for this branch; as soon as we find something
 798   that matches a non-empty string, break out and advance to test the next
 799   branch. If we get to the end of the branch, return TRUE for the whole
 800   sub-expression. */
 801
 802   for (;;)
 803     {
 804     /* Test an embedded subpattern; if it could not be empty, break the
 805     loop. Otherwise carry on in the branch. */
 806
 807     if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
 808       {
 809       if (!could_be_empty(cc)) break;
 810       do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
 811       cc += 3;
 812       }
 813
 814     else switch (*cc)
 815       {
 816       /* Reached end of a branch: the subpattern may match the empty string */
 817
 818       case OP_ALT:
 819       case OP_KET:
 820       case OP_KETRMAX:
 821       case OP_KETRMIN:
 822       return TRUE;
 823
 824       /* Skip over entire bracket groups with zero lower bound */
 825
 826       case OP_BRAZERO:
 827       case OP_BRAMINZERO:
 828       cc++;
 829       /* Fall through */
 830
 831       /* Skip over assertive subpatterns */
 832
 833       case OP_ASSERT:
 834       case OP_ASSERT_NOT:
 835       do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
 836       cc += 3;
 837       break;
 838
 839       /* Skip over things that don't match chars */
 840
 841       case OP_SOD:
 842       case OP_EOD:
 843       case OP_CIRC:
 844       case OP_DOLL:
 845       case OP_NOT_WORD_BOUNDARY:
 846       case OP_WORD_BOUNDARY:
 847       case OP_NOT_WORD_BOUNDARY_L:
 848       case OP_WORD_BOUNDARY_L:
 849       cc++;
 850       break;
 851
 852       /* Skip over simple repeats with zero lower bound */
 853
 854       case OP_STAR:
 855       case OP_MINSTAR:
 856       case OP_QUERY:
 857       case OP_MINQUERY:
 858       case OP_NOTSTAR:
 859       case OP_NOTMINSTAR:
 860       case OP_NOTQUERY:
 861       case OP_NOTMINQUERY:
 862       case OP_TYPESTAR:
 863       case OP_TYPEMINSTAR:
 864       case OP_TYPEQUERY:
 865       case OP_TYPEMINQUERY:
 866       cc += 2;
 867       break;
 868
 869       /* Skip over UPTOs (lower bound is zero) */
 870
 871       case OP_UPTO:
 872       case OP_MINUPTO:
 873       case OP_TYPEUPTO:
 874       case OP_TYPEMINUPTO:
 875       cc += 4;
 876       break;
 877
 878       /* Check a class or a back reference for a zero minimum */
 879
 880       case OP_CLASS:
 881       case OP_NEGCLASS:
 882       case OP_REF:
 883       case OP_CLASS_L:
 884         switch(*cc)
 885           {
 886           case (OP_REF):    cc += 2; break;
 887           case (OP_CLASS): case (OP_NEGCLASS): cc += 1+32; break;
 888           case (OP_CLASS_L): cc += 1+1+32; break;
 889           }
 890
 891       switch (*cc)
 892         {
 893         case OP_CRSTAR:
 894         case OP_CRMINSTAR:
 895         case OP_CRQUERY:
 896         case OP_CRMINQUERY:
 897         cc++;
 898         break;
 899
 900         case OP_CRRANGE:
 901         case OP_CRMINRANGE:
 902         if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
 903         cc += 3;
 904         break;
 905
 906         default:
 907         goto NEXT_BRANCH;
 908         }
 909       break;
 910
 911       /* Anything else matches at least one character */
 912
 913       default:
 914       goto NEXT_BRANCH;
 915       }
 916     }
 917
 918   NEXT_BRANCH:
 919   code += (code[1] << 8) + code[2];
 920   }
 921 while (*code == OP_ALT);
 922
 923 /* No branches match the empty string */
 924
 925 return FALSE;
 926 }
 927
 928 /* Determine the length of a group ID in an expression like
 929    (?P<foo_123>...)
 930 Arguments:
 931   ptr        pattern position pointer (say that 3 times fast)
 932   finalchar  the character that will mark the end of the ID
 933   errorptr   points to the pointer to the error message
 934 */
 935
 936 static int
 937 get_group_id(const uschar *ptr, char finalchar, const char **errorptr)
 938 {
 939   const uschar *start = ptr;
 940
 941   /* If the first character is not in \w, or is in \w but is a digit,
 942      report an error */
 943   if (!(pcre_ctypes[*ptr] & ctype_word) ||
 944       (pcre_ctypes[*ptr++] & ctype_digit))
 945     {
 946       *errorptr = "(?P identifier must start with a letter or underscore";
 947       return 0;
 948     }
 949
 950   /* Increment ptr until we either hit a null byte, the desired
 951      final character, or a non-word character */
 952   for(; (*ptr != 0) && (*ptr != finalchar) &&
 953         (pcre_ctypes[*ptr] & ctype_word); ptr++)
 954     {
 955       /* Empty loop body */
 956     }
 957   if (*ptr==finalchar)
 958     return ptr-start;
 959   if (*ptr==0)
 960     {
 961       *errorptr = "unterminated (?P identifier";
 962       return 0;
 963     }
 964   *errorptr = "illegal character in (?P identifier";
 965   return 0;
 966 }
 967
 968 /*************************************************
 969 *            Handle escapes                      *
 970 *************************************************/
 971
 972 /* This function is called when a \ has been encountered. It either returns a
 973 positive value for a simple escape such as \n, or a negative value which
 974 encodes one of the more complicated things such as \d. On entry, ptr is
 975 pointing at the \. On exit, it is on the final character of the escape
 976 sequence.
 977
 978 Arguments:
 979   ptrptr     points to the pattern position pointer
 980   errorptr   points to the pointer to the error message
 981   bracount   number of previous extracting brackets
 982   options    the options bits
 983   isclass    TRUE if inside a character class
 984
 985 Returns:     zero or positive => a data character
 986              negative => a special escape sequence
 987              on error, errorptr is set
 988 */
 989
 990 static int
 991 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
 992              int options, BOOL isclass)
 993 {
 994 const uschar *ptr = *ptrptr;
 995 int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */
 996 int i;
 997
 998 if (c == 0) *errorptr = ERR1;
 999
1000 /* Digits or letters may have special meaning; all others are literals. */
1001
1002 else if (c < '0' || c > 'z') {}
1003
1004 /* Do an initial lookup in a table. A non-zero result is something that can be
1005 returned immediately. Otherwise further processing may be required. */
1006
1007 else if ((i = escapes[c - '0']) != 0) c = i;
1008
1009 /* Escapes that need further processing, or are illegal. */
1010
1011 else
1012   {
1013
1014   switch (c)
1015     {
1016     /* The handling of escape sequences consisting of a string of digits
1017     starting with one that is not zero is not straightforward. By experiment,
1018     the way Perl works seems to be as follows:
1019
1020     Outside a character class, the digits are read as a decimal number. If the
1021     number is less than 10, or if there are that many previous extracting
1022     left brackets, then it is a back reference. Otherwise, up to three octal
1023     digits are read to form an escaped byte. Thus \123 is likely to be octal
1024     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
1025     value is greater than 377, the least significant 8 bits are taken. Inside a
1026     character class, \ followed by a digit is always an octal number. */
1027
1028     case '1': case '2': case '3': case '4': case '5':
1029     case '6': case '7': case '8': case '9':
1030
1031     {
1032       /* PYTHON: Try to compute an octal value for a character */
1033       for(c=0, i=0; ptr[i]!=0 && i<3; i++)
1034         {
1035           if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
1036             c = c * 8 + ptr[i]-'0';
1037           else
1038             break; /* Non-octal character--break out of the loop */
1039         }
1040       /* It's a character if there were exactly 3 octal digits, or if
1041          we're inside a character class and there was at least one
1042          octal digit. */
1043       if ( (i == 3) || (isclass && i!=0) )
1044         {
1045           ptr += i-1;
1046           break;
1047         }
1048       c = ptr[0]; /* Restore the first character after the \ */
1049       c -= '0'; i = 1;
1050       while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
1051         {
1052           c = c * 10 + ptr[1] - '0';
1053           ptr++; i++;
1054         }
1055       if (c > 255 - ESC_REF) *errorptr = "back reference too big";
1056       c = -(ESC_REF + c);
1057     }
1058   break;
1059
1060     /* \0 always starts an octal number, but we may drop through to here with a
1061     larger first octal digit */
1062
1063     case '0':
1064     c -= '0';
1065     while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
1066       ptr[1] != '8' && ptr[1] != '9')
1067         c = c * 8 + *(++ptr) - '0';
1068     break;
1069
1070     /* Special escapes not starting with a digit are straightforward */
1071
1072     case 'x':
1073   c = 0;
1074   while ( (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
1075     {
1076     ptr++;
1077     c = c * 16 + pcre_lcc[*ptr] -
1078       (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
1079     c &= 255;
1080     }
1081   break;
1082
1083
1084     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1085     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1086     for Perl compatibility, it is a literal. */
1087
1088     default:
1089     if ((options & PCRE_EXTRA) != 0) switch(c)
1090       {
1091       case 'X':
1092       c = -ESC_X;      /* This could be a lookup if it ever got into Perl */
1093       break;
1094
1095       default:
1096       *errorptr = ERR3;
1097       break;
1098       }
1099     break;
1100     }
1101   }
1102
1103 *ptrptr = ptr;
1104 return c;
1105 }
1106
1107
1108
1109 /*************************************************
1110 *            Check for counted repeat            *
1111 *************************************************/
1112
1113 /* This function is called when a '{' is encountered in a place where it might
1114 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1115 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1116 where the ddds are digits.
1117
1118 Arguments:
1119   p         pointer to the first char after '{'
1120
1121 Returns:    TRUE or FALSE
1122 */
1123
1124 static BOOL
1125 is_counted_repeat(const uschar *p)
1126 {
1127 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1128 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1129 if (*p == '}') return TRUE;
1130
1131 if (*p++ != ',') return FALSE;
1132 if (*p == '}') return TRUE;
1133
1134 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1135 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1136 return (*p == '}');
1137 }
1138
1139
1140
1141 /*************************************************
1142 *         Read repeat counts                     *
1143 *************************************************/
1144
1145 /* Read an item of the form {n,m} and return the values. This is called only
1146 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1147 so the syntax is guaranteed to be correct, but we need to check the values.
1148
1149 Arguments:
1150   p          pointer to first char after '{'
1151   minp       pointer to int for min
1152   maxp       pointer to int for max
1153              returned as -1 if no max
1154   errorptr   points to pointer to error message
1155
1156 Returns:     pointer to '}' on success;
1157              current ptr on error, with errorptr set
1158 */
1159
1160 static const uschar *
1161 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1162 {
1163 int min = 0;
1164 int max = -1;
1165
1166 while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1167
1168 if (*p == '}') max = min; else
1169   {
1170   if (*(++p) != '}')
1171     {
1172     max = 0;
1173     while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1174     if (max < min)
1175       {
1176       *errorptr = ERR4;
1177       return p;
1178       }
1179     }
1180   }
1181
1182 /* Do paranoid checks, then fill in the required variables, and pass back the
1183 pointer to the terminating '}'. */
1184
1185 if (min > 65535 || max > 65535)
1186   *errorptr = ERR5;
1187 else
1188   {
1189   *minp = min;
1190   *maxp = max;
1191   }
1192 return p;
1193 }
1194
1195
1196
1197 /*************************************************
1198 *           Compile one branch                   *
1199 *************************************************/
1200
1201 /* Scan the pattern, compiling it into the code vector.
1202
1203 Arguments:
1204   options    the option bits
1205   bracket    points to number of brackets used
1206   code       points to the pointer to the current code point
1207   ptrptr     points to the current pattern pointer
1208   errorptr   points to pointer to error message
1209
1210 Returns:     TRUE on success
1211              FALSE, with *errorptr set on error
1212 */
1213
1214 static BOOL
1215 compile_branch(int options, int *brackets, uschar **codeptr,
1216                const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
1217 {
1218 int repeat_type, op_type;
1219 int repeat_min, repeat_max;
1220 int bravalue, length;
1221 int greedy_default, greedy_non_default;
1222 register int c;
1223 register uschar *code = *codeptr;
1224 const uschar *ptr = *ptrptr;
1225 const uschar *oldptr;
1226 uschar *previous = NULL;
1227 uschar class[32];
1228 uschar *class_flag;  /* Pointer to the single-byte flag for OP_CLASS_L */
1229
1230 /* Set up the default and non-default settings for greediness */
1231
1232 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1233 greedy_non_default = greedy_default ^ 1;
1234
1235 /* Switch on next character until the end of the branch */
1236
1237 for (;; ptr++)
1238   {
1239   BOOL negate_class;
1240   int  class_charcount;
1241   int  class_lastchar;
1242
1243   c = *ptr;
1244   if ((options & PCRE_EXTENDED) != 0)
1245     {
1246     if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1247     if (c == '#')
1248       {
1249       while ((c = *(++ptr)) != 0 && c != '\n');
1250       continue;
1251       }
1252     }
1253
1254   switch(c)
1255     {
1256     /* The branch terminates at end of string, |, or ). */
1257
1258     case 0:
1259     case '|':
1260     case ')':
1261     *codeptr = code;
1262     *ptrptr = ptr;
1263     return TRUE;
1264
1265     /* Handle single-character metacharacters */
1266
1267     case '^':
1268     previous = NULL;
1269     *code++ = OP_CIRC;
1270     break;
1271
1272     case '$':
1273     previous = NULL;
1274     *code++ = OP_DOLL;
1275     break;
1276
1277     case '.':
1278     previous = code;
1279     *code++ = OP_ANY;
1280     break;
1281
1282     /* Character classes. These always build a 32-byte bitmap of the permitted
1283     characters, except in the special case where there is only one character.
1284     For negated classes, we build the map as usual, then invert it at the end.
1285     */
1286
1287     case '[':
1288     previous = code;
1289     if (options & PCRE_LOCALE)
1290       {
1291         *code++ = OP_CLASS_L;
1292         /* Set the flag for localized classes (like \w) to 0 */
1293         class_flag = code;
1294         *class_flag = 0;
1295       }
1296     else
1297       {
1298         *code++ = OP_CLASS;
1299         class_flag = NULL;
1300       }
1301
1302     /* If the first character is '^', set the negation flag, and use a
1303     different opcode. This only matters if caseless matching is specified at
1304     runtime. */
1305
1306     if ((c = *(++ptr)) == '^')
1307       {
1308       negate_class = TRUE;
1309       if (*(code-1)==OP_CLASS) *(code-1) = OP_NEGCLASS;
1310       c = *(++ptr);
1311       }
1312     else negate_class = FALSE;
1313
1314     /* Keep a count of chars so that we can optimize the case of just a single
1315     character. */
1316
1317     class_charcount = 0;
1318     class_lastchar = -1;
1319
1320     /* Initialize the 32-char bit map to all zeros. We have to build the
1321     map in a temporary bit of store, in case the class contains only 1
1322     character, because in that case the compiled code doesn't use the
1323     bit map. */
1324
1325     memset(class, 0, 32 * sizeof(uschar));
1326
1327     /* Process characters until ] is reached. By writing this as a "do" it
1328     means that an initial ] is taken as a data character. */
1329
1330     do
1331       {
1332       if (c == 0)
1333         {
1334         *errorptr = ERR6;
1335         goto FAILED;
1336         }
1337
1338       /* Backslash may introduce a single character, or it may introduce one
1339       of the specials, which just set a flag. Escaped items are checked for
1340       validity in the pre-compiling pass. The sequence \b is a special case.
1341       Inside a class (and only there) it is treated as backspace. Elsewhere
1342       it marks a word boundary. Other escapes have preset maps ready to
1343       or into the one we are building. We assume they have more than one
1344       character in them, so set class_count bigger than one. */
1345
1346       if (c == '\\')
1347         {
1348         c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1349         if (-c == ESC_b) c = '\b';
1350         else if (c < 0)
1351           {
1352           class_charcount = 10;
1353           switch (-c)
1354             {
1355             case ESC_d:
1356               {
1357                 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
1358               }
1359             continue;
1360
1361             case ESC_D:
1362               {
1363                 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
1364               }
1365             continue;
1366
1367             case ESC_w:
1368             if (options & PCRE_LOCALE)
1369               {
1370                 *class_flag |= 1;
1371               }
1372             else
1373               {
1374                 for (c = 0; c < 32; c++)
1375                   class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1376               }
1377             continue;
1378
1379             case ESC_W:
1380             if (options & PCRE_LOCALE)
1381               {
1382                 *class_flag |= 2;
1383               }
1384             else
1385               {
1386                 for (c = 0; c < 32; c++)
1387                   class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1388               }
1389             continue;
1390
1391             case ESC_s:
1392               {
1393                 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
1394               }
1395             continue;
1396
1397             case ESC_S:
1398               {
1399                 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
1400               }
1401             continue;
1402
1403             default:
1404             *errorptr = ERR7;
1405             goto FAILED;
1406             }
1407           }
1408         /* Fall through if single character */
1409         }
1410
1411       /* A single character may be followed by '-' to form a range. However,
1412       Perl does not permit ']' to be the end of the range. A '-' character
1413       here is treated as a literal. */
1414
1415       if (ptr[1] == '-' && ptr[2] != ']')
1416         {
1417         int d;
1418         ptr += 2;
1419         d = *ptr;
1420
1421         if (d == 0)
1422           {
1423           *errorptr = ERR6;
1424           goto FAILED;
1425           }
1426
1427         /* The second part of a range can be a single-character escape, but
1428         not any of the other escapes. */
1429
1430         if (d == '\\')
1431           {
1432           d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1433           if (d < 0)
1434             {
1435             if (d == -ESC_b) d = '\b'; else
1436               {
1437               *errorptr = ERR7;
1438               goto FAILED;
1439               }
1440             }
1441           }
1442
1443         if (d < c)
1444           {
1445           *errorptr = ERR8;
1446           goto FAILED;
1447           }
1448
1449         for (; c <= d; c++)
1450           {
1451           class[c/8] |= (1 << (c&7));
1452           if ((options & PCRE_CASELESS) != 0)
1453             {
1454             int uc = pcre_fcc[c];           /* flip case */
1455             class[uc/8] |= (1 << (uc&7));
1456             }
1457           class_charcount++;                /* in case a one-char range */
1458           class_lastchar = c;
1459           }
1460         continue;   /* Go get the next char in the class */
1461         }
1462
1463       /* Handle a lone single character - we can get here for a normal
1464       non-escape char, or after \ that introduces a single character. */
1465
1466       class [c/8] |= (1 << (c&7));
1467       if ((options & PCRE_CASELESS) != 0)
1468         {
1469         c = pcre_fcc[c];   /* flip case */
1470         class[c/8] |= (1 << (c&7));
1471         }
1472       class_charcount++;
1473       class_lastchar = c;
1474       }
1475
1476     /* Loop until ']' reached; the check for end of string happens inside the
1477     loop. This "while" is the end of the "do" above. */
1478
1479     while ((c = *(++ptr)) != ']');
1480
1481     /* If class_charcount is 1 and class_lastchar is not negative, we saw
1482     precisely one character. This doesn't need the whole 32-byte bit map.
1483     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1484     it's negative. */
1485
1486     if (class_charcount == 1 && class_lastchar >= 0)
1487       {
1488       if (negate_class)
1489         {
1490         code[-1] = OP_NOT;
1491         }
1492       else
1493         {
1494         code[-1] = OP_CHARS;
1495         *code++ = 1;
1496         }
1497       *code++ = class_lastchar;
1498       }
1499
1500     /* Otherwise, negate the 32-byte map if necessary, and copy it into
1501     the code vector. */
1502
1503     else
1504       {
1505         /* If this is a localized opcode, bump the code pointer up */
1506         if (class_flag) code++;
1507       if (negate_class)
1508         {
1509           if (class_flag) *class_flag = (*class_flag) ^ 63;
1510           for (c = 0; c < 32; c++) code[c] = ~class[c];
1511         }
1512       else
1513         memcpy(code, class, 32);
1514       code += 32;
1515       }
1516     break;
1517
1518     /* Various kinds of repeat */
1519
1520     case '{':
1521     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1522     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
1523     if (*errorptr != NULL) goto FAILED;
1524     goto REPEAT;
1525
1526     case '*':
1527     repeat_min = 0;
1528     repeat_max = -1;
1529     goto REPEAT;
1530
1531     case '+':
1532     repeat_min = 1;
1533     repeat_max = -1;
1534     goto REPEAT;
1535
1536     case '?':
1537     repeat_min = 0;
1538     repeat_max = 1;
1539
1540     REPEAT:
1541     if (previous == NULL)
1542       {
1543       *errorptr = ERR9;
1544       goto FAILED;
1545       }
1546
1547     /* If the next character is '?' this is a minimizing repeat, by default,
1548     but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1549     next character. */
1550
1551     if (ptr[1] == '?')
1552       { repeat_type = greedy_non_default; ptr++; }
1553     else repeat_type = greedy_default;
1554
1555     /* If the maximum is zero then the minimum must also be zero; Perl allows
1556     this case, so we do too - by simply omitting the item altogether. */
1557
1558     if (repeat_max == 0) code = previous;
1559
1560     /* If previous was a string of characters, chop off the last one and use it
1561     as the subject of the repeat. If there was only one character, we can
1562     abolish the previous item altogether. */
1563
1564     else if (*previous == OP_CHARS)
1565       {
1566       int len = previous[1];
1567       if (len == 1)
1568         {
1569         c = previous[2];
1570         code = previous;
1571         }
1572       else
1573         {
1574         c = previous[len+1];
1575         previous[1]--;
1576         code--;
1577         }
1578       op_type = 0;                 /* Use single-char op codes */
1579       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
1580       }
1581
1582     /* If previous was a single negated character ([^a] or similar), we use
1583     one of the special opcodes, replacing it. The code is shared with single-
1584     character repeats by adding a suitable offset into repeat_type. */
1585
1586     else if ((int)*previous == OP_NOT)
1587       {
1588       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
1589       c = previous[1];
1590       code = previous;
1591       goto OUTPUT_SINGLE_REPEAT;
1592       }
1593
1594     /* If previous was a character type match (\d or similar), abolish it and
1595     create a suitable repeat item. The code is shared with single-character
1596     repeats by adding a suitable offset into repeat_type. */
1597
1598     else if ((int)*previous < OP_CIRC || *previous == OP_ANY)
1599       {
1600       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
1601       c = *previous;
1602       code = previous;
1603
1604       OUTPUT_SINGLE_REPEAT:
1605       repeat_type += op_type;      /* Combine both values for many cases */
1606
1607       /* A minimum of zero is handled either as the special case * or ?, or as
1608       an UPTO, with the maximum given. */
1609
1610       if (repeat_min == 0)
1611         {
1612         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1613           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1614         else
1615           {
1616           *code++ = OP_UPTO + repeat_type;
1617           *code++ = repeat_max >> 8;
1618           *code++ = (repeat_max & 255);
1619           }
1620         }
1621
1622       /* The case {1,} is handled as the special case + */
1623
1624       else if (repeat_min == 1 && repeat_max == -1)
1625         *code++ = OP_PLUS + repeat_type;
1626
1627       /* The case {n,n} is just an EXACT, while the general case {n,m} is
1628       handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1629
1630       else
1631         {
1632         if (repeat_min != 1)
1633           {
1634           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
1635           *code++ = repeat_min >> 8;
1636           *code++ = (repeat_min & 255);
1637           }
1638
1639         /* If the mininum is 1 and the previous item was a character string,
1640         we either have to put back the item that got cancelled if the string
1641         length was 1, or add the character back onto the end of a longer
1642         string. For a character type nothing need be done; it will just get
1643         put back naturally. Note that the final character is always going to
1644         get added below. */
1645
1646         else if (*previous == OP_CHARS)
1647           {
1648           if (code == previous) code += 2; else previous[1]++;
1649           }
1650
1651         /*  For a single negated character we also have to put back the
1652         item that got cancelled. */
1653
1654         else if (*previous == OP_NOT) code++;
1655
1656         /* If the maximum is unlimited, insert an OP_STAR. */
1657
1658         if (repeat_max < 0)
1659           {
1660           *code++ = c;
1661           *code++ = OP_STAR + repeat_type;
1662           }
1663
1664         /* Else insert an UPTO if the max is greater than the min. */
1665
1666         else if (repeat_max != repeat_min)
1667           {
1668           *code++ = c;
1669           repeat_max -= repeat_min;
1670           *code++ = OP_UPTO + repeat_type;
1671           *code++ = repeat_max >> 8;
1672           *code++ = (repeat_max & 255);
1673           }
1674         }
1675
1676       /* The character or character type itself comes last in all cases. */
1677
1678       *code++ = c;
1679       }
1680
1681     /* If previous was a character class or a back reference, we put the repeat
1682     stuff after it. */
1683
1684     else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1685              *previous==OP_CLASS_L || *previous == OP_REF)
1686       {
1687       if (repeat_min == 0 && repeat_max == -1)
1688         *code++ = OP_CRSTAR + repeat_type;
1689       else if (repeat_min == 1 && repeat_max == -1)
1690         *code++ = OP_CRPLUS + repeat_type;
1691       else if (repeat_min == 0 && repeat_max == 1)
1692         *code++ = OP_CRQUERY + repeat_type;
1693       else
1694         {
1695         *code++ = OP_CRRANGE + repeat_type;
1696         *code++ = repeat_min >> 8;
1697         *code++ = repeat_min & 255;
1698         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
1699         *code++ = repeat_max >> 8;
1700         *code++ = repeat_max & 255;
1701         }
1702       }
1703
1704     /* If previous was a bracket group, we may have to replicate it in certain
1705     cases. If the maximum repeat count is unlimited, check that the bracket
1706     group cannot match the empty string, and diagnose an error if it can. */
1707
1708     else if ((int)*previous >= OP_BRA)
1709       {
1710       int i;
1711       int len = code - previous;
1712
1713       if (repeat_max == -1 && could_be_empty(previous))
1714         {
1715         *errorptr = ERR10;
1716         goto FAILED;
1717         }
1718
1719       /* If the minimum is greater than zero, and the maximum is unlimited or
1720       equal to the minimum, the first copy remains where it is, and is
1721       replicated up to the minimum number of times. This case includes the +
1722       repeat, but of course no replication is needed in that case. */
1723
1724       if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1725         {
1726         for (i = 1; i < repeat_min; i++)
1727           {
1728           memcpy(code, previous, len);
1729           code += len;
1730           }
1731         }
1732
1733       /* If the minimum is zero, stick BRAZERO in front of the first copy.
1734       Then, if there is a fixed upper limit, replicated up to that many times,
1735       sticking BRAZERO in front of all the optional ones. */
1736
1737       else
1738         {
1739         if (repeat_min == 0)
1740           {
1741           memmove(previous+1, previous, len);
1742           code++;
1743           *previous++ = OP_BRAZERO + repeat_type;
1744           }
1745
1746         for (i = 1; i < repeat_min; i++)
1747           {
1748           memcpy(code, previous, len);
1749           code += len;
1750           }
1751
1752         for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1753           {
1754           *code++ = OP_BRAZERO + repeat_type;
1755           memcpy(code, previous, len);
1756           code += len;
1757           }
1758         }
1759
1760       /* If the maximum is unlimited, set a repeater in the final copy. */
1761
1762       if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1763       }
1764
1765     /* Else there's some kind of shambles */
1766
1767     else
1768       {
1769       *errorptr = ERR11;
1770       goto FAILED;
1771       }
1772
1773     /* In all case we no longer have a previous item. */
1774
1775     previous = NULL;
1776     break;
1777
1778
1779     /* Start of nested bracket sub-expression, or comment or lookahead.
1780     First deal with special things that can come after a bracket; all are
1781     introduced by ?, and the appearance of any of them means that this is not a
1782     referencing group. They were checked for validity in the first pass over
1783     the string, so we don't have to check for syntax errors here.  */
1784
1785     case '(':
1786     previous = code;              /* Only real brackets can be repeated */
1787     if (*(++ptr) == '?')
1788       {
1789       bravalue = OP_BRA;
1790
1791       switch (*(++ptr))
1792         {
1793         case '#':
1794         case 'i':
1795         case 'L':
1796         case 'm':
1797         case 's':
1798         case 'x':
1799         ptr++;
1800         while (*ptr != ')') ptr++;
1801         previous = NULL;
1802         continue;
1803
1804         case ':':                 /* Non-extracting bracket */
1805         ptr++;
1806         break;
1807
1808         case '=':                 /* Assertions can't be repeated */
1809         bravalue = OP_ASSERT;
1810         ptr++;
1811         previous = NULL;
1812         break;
1813
1814         case '!':
1815         bravalue = OP_ASSERT_NOT;
1816         ptr++;
1817         previous = NULL;
1818         break;
1819
1820         case ('P'):
1821           ptr++;
1822           if (*ptr=='<')
1823             {
1824               /* (?P<groupname>...) */
1825               int idlen;
1826               PyObject *string, *intobj;
1827
1828               ptr++;
1829               idlen = get_group_id(ptr, '>', errorptr);
1830               if (*errorptr) {
1831                 goto FAILED;
1832               }
1833               string = PyString_FromStringAndSize((char*)ptr, idlen);
1834               intobj = PyInt_FromLong( brackets[0] + 1 );
1835               if (intobj == NULL || string == NULL)
1836                 {
1837                   Py_XDECREF(string);
1838                   Py_XDECREF(intobj);
1839                   *errorptr = "exception raised";
1840                   goto FAILED;
1841                 }
1842               PyDict_SetItem(dictionary, string, intobj);
1843               Py_DECREF(string); Py_DECREF(intobj); /* XXX DECREF commented out! */
1844               ptr += idlen+1;  /* Point to rest of expression */
1845               goto do_grouping_bracket;
1846             }
1847           if (*ptr=='=')
1848             {
1849               /* (?P=groupname) */
1850               int idlen, refnum;
1851               PyObject *string, *intobj;
1852
1853               ptr++;
1854               idlen = get_group_id(ptr, ')', errorptr);
1855               if (*errorptr) {
1856                 goto FAILED;
1857               }
1858               string = PyString_FromStringAndSize((char *)ptr, idlen);
1859               if (string==NULL) {
1860                   *errorptr = "exception raised";
1861                   goto FAILED;
1862                 }
1863               intobj = PyDict_GetItem(dictionary, string);
1864               if (intobj==NULL) {
1865                 Py_DECREF(string);
1866                 *errorptr = "?P= group identifier isn't defined";
1867                 goto FAILED;
1868               }
1869
1870               refnum = PyInt_AsLong(intobj);
1871               Py_DECREF(string);
1872               /* The caller doesn't own the reference to the value
1873                  returned from PyDict_GetItem, so intobj is not
1874                  DECREF'ed. */
1875
1876               *code++ = OP_REF;
1877               *code++ = refnum;
1878               /* The continue will cause the top-level for() loop to
1879                  be resumed, so ptr will be immediately incremented.
1880                  Therefore, the following line adds just idlen, not
1881                  idlen+1 */
1882               ptr += idlen;
1883               continue;
1884             }
1885           /* The character after ?P is neither < nor =, so
1886              report an error.  Add more Python-extensions here. */
1887           *errorptr="unknown after (?P";
1888           goto FAILED;
1889
1890         case '>':                         /* "Match once" brackets */
1891         if ((options & PCRE_EXTRA) != 0)  /* Not yet standard */
1892           {
1893           bravalue = OP_ONCE;
1894           ptr++;
1895           previous = NULL;
1896           break;
1897           }
1898         /* Else fall through */
1899
1900         default:
1901         *errorptr = ERR12;
1902         goto FAILED;
1903         }
1904       }
1905
1906     /* Else we have a referencing group */
1907
1908     else
1909       {
1910       do_grouping_bracket:
1911       if (++(*brackets) > EXTRACT_MAX)
1912         {
1913         *errorptr = ERR13;
1914         goto FAILED;
1915         }
1916       bravalue = OP_BRA + *brackets;
1917       }
1918
1919     /* Process nested bracketed re; at end pointer is on the bracket. We copy
1920     code into a non-register variable in order to be able to pass its address
1921     because some compilers complain otherwise. */
1922
1923     *code = bravalue;
1924       {
1925       uschar *mcode = code;
1926       if (!compile_regex(options, brackets, &mcode, &ptr, errorptr, dictionary))
1927         goto FAILED;
1928       code = mcode;
1929       }
1930
1931     if (*ptr != ')')
1932       {
1933       *errorptr = ERR14;
1934       goto FAILED;
1935       }
1936     break;
1937
1938     /* Check \ for being a real metacharacter; if not, fall through and handle
1939     it as a data character at the start of a string. Escape items are checked
1940     for validity in the pre-compiling pass. */
1941
1942     case '\\':
1943     oldptr = ptr;
1944     c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1945
1946     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1947     are arranged to be the negation of the corresponding OP_values. For the
1948     back references, the values are ESC_REF plus the reference number. Only
1949     back references and those types that consume a character may be repeated.
1950     We can test for values between ESC_b and ESC_Z for the latter; this may
1951     have to change if any new ones are ever created. */
1952
1953     if (c < 0)
1954       {
1955       if (-c >= ESC_REF)
1956         {
1957         int refnum = -c - ESC_REF;
1958         if (*brackets < refnum)
1959           {
1960           *errorptr = ERR15;
1961           goto FAILED;
1962           }
1963         previous = code;
1964         *code++ = OP_REF;
1965         *code++ = refnum;
1966         }
1967       else
1968         {
1969         previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1970         if ( (options & PCRE_LOCALE) != 0)
1971           {
1972             switch (c)
1973               {
1974                 case (-ESC_b): c = -OP_WORD_BOUNDARY_L; break;
1975                 case (-ESC_B): c = -OP_NOT_WORD_BOUNDARY_L; break;
1976                 case (-ESC_w): c = -OP_WORDCHAR_L; break;
1977                 case (-ESC_W): c = -OP_NOT_WORDCHAR_L; break;
1978               }
1979           }
1980         *code++ = -c;
1981         }
1982       continue;
1983       }
1984
1985     /* Data character: Reset and fall through */
1986
1987     ptr = oldptr;
1988     c = '\\';
1989
1990     /* Handle a run of data characters until a metacharacter is encountered.
1991     The first character is guaranteed not to be whitespace or # when the
1992     extended flag is set. */
1993
1994     NORMAL_CHAR:
1995     default:
1996     previous = code;
1997     *code = OP_CHARS;
1998     code += 2;
1999     length = 0;
2000
2001     do
2002       {
2003       if ((options & PCRE_EXTENDED) != 0)
2004         {
2005         if ((pcre_ctypes[c] & ctype_space) != 0) continue;
2006         if (c == '#')
2007           {
2008           while ((c = *(++ptr)) != 0 && c != '\n');
2009           if (c == 0) break;
2010           continue;
2011           }
2012         }
2013
2014       /* Backslash may introduce a data char or a metacharacter. Escaped items
2015       are checked for validity in the pre-compiling pass. Stop the string
2016       before a metaitem. */
2017
2018       if (c == '\\')
2019         {
2020         oldptr = ptr;
2021         c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
2022         if (c < 0) { ptr = oldptr; break; }
2023         }
2024
2025       /* Ordinary character or single-char escape */
2026
2027       *code++ = c;
2028       length++;
2029       }
2030
2031     /* This "while" is the end of the "do" above. */
2032
2033     while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2034
2035     /* Compute the length and set it in the data vector, and advance to
2036     the next state. */
2037
2038     previous[1] = length;
2039     if (length < 255) ptr--;
2040     break;
2041     }
2042   }                   /* end of big loop */
2043
2044 /* Control never reaches here by falling through, only by a goto for all the
2045 error states. Pass back the position in the pattern so that it can be displayed
2046 to the user for diagnosing the error. */
2047
2048 FAILED:
2049 *ptrptr = ptr;
2050 return FALSE;
2051 }
2052
2053
2054
2055
2056 /*************************************************
2057 *     Compile sequence of alternatives           *
2058 *************************************************/
2059
2060 /* On entry, ptr is pointing past the bracket character, but on return
2061 it points to the closing bracket, or vertical bar, or end of string.
2062 The code variable is pointing at the byte into which the BRA operator has been
2063 stored.
2064
2065 Argument:
2066   options   the option bits
2067   brackets  -> int containing the number of extracting brackets used
2068   codeptr   -> the address of the current code pointer
2069   ptrptr    -> the address of the current pattern pointer
2070   errorptr  -> pointer to error message
2071
2072 Returns:    TRUE on success
2073 */
2074
2075 static BOOL
2076 compile_regex(int options, int *brackets, uschar **codeptr,
2077   const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
2078 {
2079 const uschar *ptr = *ptrptr;
2080 uschar *code = *codeptr;
2081 uschar *start_bracket = code;
2082
2083 for (;;)
2084   {
2085   int length;
2086   uschar *last_branch = code;
2087
2088   code += 3;
2089   if (!compile_branch(options, brackets, &code, &ptr, errorptr, dictionary))
2090     {
2091     *ptrptr = ptr;
2092     return FALSE;
2093     }
2094
2095   /* Fill in the length of the last branch */
2096
2097   length = code - last_branch;
2098   last_branch[1] = length >> 8;
2099   last_branch[2] = length & 255;
2100
2101   /* Reached end of expression, either ')' or end of pattern. Insert a
2102   terminating ket and the length of the whole bracketed item, and return,
2103   leaving the pointer at the terminating char. */
2104
2105   if (*ptr != '|')
2106     {
2107     length = code - start_bracket;
2108     *code++ = OP_KET;
2109     *code++ = length >> 8;
2110     *code++ = length & 255;
2111     *codeptr = code;
2112     *ptrptr = ptr;
2113     return TRUE;
2114     }
2115
2116   /* Another branch follows; insert an "or" node and advance the pointer. */
2117
2118   *code = OP_ALT;
2119   ptr++;
2120   }
2121 /* Control never reaches here */
2122 }
2123
2124
2125
2126 /*************************************************
2127 *          Check for anchored expression         *
2128 *************************************************/
2129
2130 /* Try to find out if this is an anchored regular expression. Consider each
2131 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2132 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2133 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2134 counts, since OP_CIRC can match in the middle.
2135
2136 A branch is also implicitly anchored if it starts with .* because that will try
2137 the rest of the pattern at all possible matching points, so there is no point
2138 trying them again.
2139
2140 Argument:  points to start of expression (the bracket)
2141 Returns:   TRUE or FALSE
2142 */
2143
2144 static BOOL
2145 is_anchored(register const uschar *code, BOOL multiline)
2146 {
2147 do {
2148    int op = (int)code[3];
2149    if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
2150      { if (!is_anchored(code+3, multiline)) return FALSE; }
2151    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2152      { if (code[4] != OP_ANY) return FALSE; }
2153    else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
2154    code += (code[1] << 8) + code[2];
2155    }
2156 while (*code == OP_ALT);
2157 return TRUE;
2158 }
2159
2160
2161
2162 /*************************************************
2163 *     Check for start with \n line expression    *
2164 *************************************************/
2165
2166 /* This is called for multiline expressions to try to find out if every branch
2167 starts with ^ so that "first char" processing can be done to speed things up.
2168
2169 Argument:  points to start of expression (the bracket)
2170 Returns:   TRUE or FALSE
2171 */
2172
2173 static BOOL
2174 is_startline(const uschar *code)
2175 {
2176 do {
2177    if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2178      { if (!is_startline(code+3)) return FALSE; }
2179    else if (code[3] != OP_CIRC) return FALSE;
2180    code += (code[1] << 8) + code[2];
2181    }
2182 while (*code == OP_ALT);
2183 return TRUE;
2184 }
2185
2186
2187
2188 /*************************************************
2189 *          Check for fixed first char            *
2190 *************************************************/
2191
2192 /* Try to find out if there is a fixed first character. This is called for
2193 unanchored expressions, as it speeds up their processing quite considerably.
2194 Consider each alternative branch. If they all start with the same char, or with
2195 a bracket all of whose alternatives start with the same char (recurse ad lib),
2196 then we return that char, otherwise -1.
2197
2198 Argument:  points to start of expression (the bracket)
2199 Returns:   -1 or the fixed first char
2200 */
2201
2202 static int
2203 find_firstchar(uschar *code)
2204 {
2205 register int c = -1;
2206 do
2207   {
2208   register int charoffset = 4;
2209
2210   if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2211     {
2212     register int d;
2213     if ((d = find_firstchar(code+3)) < 0) return -1;
2214     if (c < 0) c = d; else if (c != d) return -1;
2215     }
2216
2217   else switch(code[3])
2218     {
2219     default:
2220     return -1;
2221
2222     case OP_EXACT:       /* Fall through */
2223     charoffset++;
2224
2225     case OP_CHARS:       /* Fall through */
2226     charoffset++;
2227
2228     case OP_PLUS:
2229     case OP_MINPLUS:
2230     if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
2231     break;
2232     }
2233   code += (code[1] << 8) + code[2];
2234   }
2235 while (*code == OP_ALT);
2236 return c;
2237 }
2238
2239
2240
2241 /*************************************************
2242 *        Compile a Regular Expression            *
2243 *************************************************/
2244
2245 /* This function takes a string and returns a pointer to a block of store
2246 holding a compiled version of the expression.
2247
2248 Arguments:
2249   pattern      the regular expression
2250   options      various option bits
2251   errorptr     pointer to pointer to error text
2252   erroroffset  ptr offset in pattern where error was detected
2253
2254 Returns:       pointer to compiled data block, or NULL on error,
2255                with errorptr and erroroffset set
2256 */
2257
2258 pcre *
2259 pcre_compile(const char *pattern, int options, const char **errorptr,
2260              int *erroroffset, PyObject *dictionary)
2261 {
2262 real_pcre *re;
2263 int spaces = 0;
2264 int length = 3;      /* For initial BRA plus length */
2265 int runlength;
2266 int c, size;
2267 int bracount = 0;
2268 int brastack[200];
2269 int top_backref = 0;
2270 unsigned int brastackptr = 0;
2271 uschar *code;
2272 const uschar *ptr;
2273
2274 #ifdef DEBUG
2275 uschar *code_base, *code_end;
2276 #endif
2277
2278 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2279 can do is just return NULL. */
2280
2281 if (errorptr == NULL) return NULL;
2282 *errorptr = NULL;
2283
2284 /* However, we can give a message for this error */
2285
2286 if (erroroffset == NULL)
2287   {
2288   *errorptr = ERR16;
2289   return NULL;
2290   }
2291 *erroroffset = 0;
2292
2293 if ((options & ~PUBLIC_OPTIONS) != 0)
2294   {
2295   *errorptr = ERR17;
2296   return NULL;
2297   }
2298
2299 DPRINTF(("------------------------------------------------------------------\n"));
2300 DPRINTF(("%s\n", pattern));
2301
2302 /* The first thing to do is to make a pass over the pattern to compute the
2303 amount of store required to hold the compiled code. This does not have to be
2304 perfect as long as errors are overestimates. At the same time we can detect any
2305 internal flag settings. Make an attempt to correct for any counted white space
2306 if an "extended" flag setting appears late in the pattern. We can't be so
2307 clever for #-comments. */
2308
2309 ptr = (const uschar *)(pattern - 1);
2310 while ((c = *(++ptr)) != 0)
2311   {
2312   int min, max;
2313   int class_charcount;
2314
2315   if ((pcre_ctypes[c] & ctype_space) != 0)
2316     {
2317     if ((options & PCRE_EXTENDED) != 0) continue;
2318     spaces++;
2319     }
2320
2321   if (c == '#' && (options & PCRE_EXTENDED) != 0)
2322     {
2323     while ((c = *(++ptr)) != 0 && c != '\n');
2324     continue;
2325     }
2326
2327   switch(c)
2328     {
2329     /* A backslashed item may be an escaped "normal" character or a
2330     character type. For a "normal" character, put the pointers and
2331     character back so that tests for whitespace etc. in the input
2332     are done correctly. */
2333
2334     case '\\':
2335       {
2336       const uschar *save_ptr = ptr;
2337       c = check_escape(&ptr, errorptr, bracount, options, FALSE);
2338       if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2339       if (c >= 0)
2340         {
2341         ptr = save_ptr;
2342         c = '\\';
2343         goto NORMAL_CHAR;
2344         }
2345       }
2346     length++;
2347
2348     /* A back reference needs an additional char, plus either one or 5
2349     bytes for a repeat. We also need to keep the value of the highest
2350     back reference. */
2351
2352     if (c <= -ESC_REF)
2353       {
2354       int refnum = -c - ESC_REF;
2355       if (refnum > top_backref) top_backref = refnum;
2356       length++;   /* For single back reference */
2357       if (ptr[1] == '{' && is_counted_repeat(ptr+2))
2358         {
2359         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2360         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2361         if ((min == 0 && (max == 1 || max == -1)) ||
2362           (min == 1 && max == -1))
2363             length++;
2364         else length += 5;
2365         if (ptr[1] == '?') ptr++;
2366         }
2367       }
2368     continue;
2369
2370     case '^':
2371     case '.':
2372     case '$':
2373     case '*':     /* These repeats won't be after brackets; */
2374     case '+':     /* those are handled separately */
2375     case '?':
2376     length++;
2377     continue;
2378
2379     /* This covers the cases of repeats after a single char, metachar, class,
2380     or back reference. */
2381
2382     case '{':
2383     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
2384     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
2385     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2386     if ((min == 0 && (max == 1 || max == -1)) ||
2387       (min == 1 && max == -1))
2388         length++;
2389     else
2390       {
2391       length--;   /* Uncount the original char or metachar */
2392       if (min == 1) length++; else if (min > 0) length += 4;
2393       if (max > 0) length += 4; else length += 2;
2394       }
2395     if (ptr[1] == '?') ptr++;
2396     continue;
2397
2398     /* An alternation contains an offset to the next branch or ket. */
2399     case '|':
2400     length += 3;
2401     continue;
2402
2403     /* A character class uses 33 characters. Don't worry about character types
2404     that aren't allowed in classes - they'll get picked up during the compile.
2405     A character class that contains only one character uses 2 or 3 bytes,
2406     depending on whether it is negated or not. Notice this where we can. */
2407
2408     case '[':
2409     class_charcount = 0;
2410     if (*(++ptr) == '^') ptr++;
2411     do
2412       {
2413       if (*ptr == '\\')
2414         {
2415         int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
2416         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2417         if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2418         }
2419       else class_charcount++;
2420       ptr++;
2421       }
2422     while (*ptr != 0 && *ptr != ']');
2423
2424     /* Repeats for negated single chars are handled by the general code */
2425
2426     if (class_charcount == 1) length += 3; else
2427       {
2428       length += 33;
2429       if (options & PCRE_LOCALE) length++;  /* Add a byte for the localization flag */
2430
2431       /* A repeat needs either 1 or 5 bytes. */
2432
2433       if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
2434         {
2435         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2436         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2437         if ((min == 0 && (max == 1 || max == -1)) ||
2438           (min == 1 && max == -1))
2439             length++;
2440         else length += 5;
2441         if (ptr[1] == '?') ptr++;
2442         }
2443       }
2444     continue;
2445
2446     /* Brackets may be genuine groups or special things */
2447
2448     case '(':
2449
2450     /* Handle special forms of bracket, which all start (? */
2451
2452     if (ptr[1] == '?') switch (c = ptr[2])
2453       {
2454       /* Skip over comments entirely */
2455       case '#':
2456       ptr += 3;
2457       while (*ptr != 0 && *ptr != ')') ptr++;
2458       if (*ptr == 0)
2459         {
2460         *errorptr = ERR18;
2461         goto PCRE_ERROR_RETURN;
2462         }
2463       continue;
2464
2465       /* Non-referencing groups and lookaheads just move the pointer on, and
2466       then behave like a non-special bracket, except that they don't increment
2467       the count of extracting brackets. */
2468
2469       case ':':
2470       case '=':
2471       case '!':
2472       ptr += 2;
2473       break;
2474
2475       case ('P'):
2476         {
2477           int idlen;
2478           switch (*ptr++) {
2479           case ('<'):
2480             idlen = get_group_id(ptr++, '>', errorptr);
2481             if (*errorptr) goto PCRE_ERROR_RETURN;
2482             ptr += idlen+1;
2483             break;
2484           case ('='):
2485             idlen = get_group_id(ptr++, ')', errorptr);
2486             if (*errorptr) goto PCRE_ERROR_RETURN;
2487             ptr += idlen+1;
2488             length++;
2489             break;
2490           }
2491         }
2492         break;
2493
2494       /* Ditto for the "once only" bracket, allowed only if the extra bit
2495       is set. */
2496
2497       case '>':
2498       if ((options & PCRE_EXTRA) != 0)
2499         {
2500         ptr += 2;
2501         break;
2502         }
2503       /* Else fall through */
2504
2505       /* Else loop setting valid options until ) is met. Anything else is an
2506       error. */
2507
2508       default:
2509       ptr += 2;
2510       for (;; ptr++)
2511         {
2512         if ((c = *ptr) == 'i')
2513           {
2514           options |= PCRE_CASELESS;
2515           continue;
2516           }
2517         else if ((c = *ptr) == 'L')
2518           {
2519           options |= PCRE_LOCALE;
2520           continue;
2521           }
2522         else if ((c = *ptr) == 'm')
2523           {
2524           options |= PCRE_MULTILINE;
2525           continue;
2526           }
2527         else if (c == 's')
2528           {
2529           options |= PCRE_DOTALL;
2530           continue;
2531           }
2532         else if (c == 'x')
2533           {
2534           options |= PCRE_EXTENDED;
2535           length -= spaces;          /* Already counted spaces */
2536           continue;
2537           }
2538         else if (c == ')') break;
2539
2540         *errorptr = ERR12;
2541         goto PCRE_ERROR_RETURN;
2542         }
2543       continue;                      /* End of this bracket handling */
2544       }
2545
2546     /* Extracting brackets must be counted so we can process escapes in a
2547     Perlish way. */
2548
2549     else bracount++;
2550
2551     /* Non-special forms of bracket. Save length for computing whole length
2552     at end if there's a repeat that requires duplication of the group. */
2553
2554     if (brastackptr >= sizeof(brastack)/sizeof(int))
2555       {
2556       *errorptr = ERR19;
2557       goto PCRE_ERROR_RETURN;
2558       }
2559
2560     brastack[brastackptr++] = length;
2561     length += 3;
2562     continue;
2563
2564     /* Handle ket. Look for subsequent max/min; for certain sets of values we
2565     have to replicate this bracket up to that many times. If brastackptr is
2566     0 this is an unmatched bracket which will generate an error, but take care
2567     not to try to access brastack[-1]. */
2568
2569     case ')':
2570     length += 3;
2571       {
2572       int minval = 1;
2573       int maxval = 1;
2574       int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
2575
2576       /* Leave ptr at the final char; for read_repeat_counts this happens
2577       automatically; for the others we need an increment. */
2578
2579       if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
2580         {
2581         ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
2582         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2583         }
2584       else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2585       else if (c == '+') { maxval = -1; ptr++; }
2586       else if (c == '?') { minval = 0; ptr++; }
2587
2588       /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2589       if there is a limited maximum we have to replicate up to maxval-1 times
2590       and allow for a BRAZERO item before each optional copy, as we also have
2591       to do before the first copy if the minimum is zero. */
2592
2593       if (minval == 0) length++;
2594         else if (minval > 1) length += (minval - 1) * duplength;
2595       if (maxval > minval) length += (maxval - minval) * (duplength + 1);
2596       }
2597     continue;
2598
2599     /* Non-special character. For a run of such characters the length required
2600     is the number of characters + 2, except that the maximum run length is 255.
2601     We won't get a skipped space or a non-data escape or the start of a #
2602     comment as the first character, so the length can't be zero. */
2603
2604     NORMAL_CHAR:
2605     default:
2606     length += 2;
2607     runlength = 0;
2608     do
2609       {
2610       if ((pcre_ctypes[c] & ctype_space) != 0)
2611         {
2612         if ((options & PCRE_EXTENDED) != 0) continue;
2613         spaces++;
2614         }
2615
2616       if (c == '#' && (options & PCRE_EXTENDED) != 0)
2617         {
2618         while ((c = *(++ptr)) != 0 && c != '\n');
2619         continue;
2620         }
2621
2622       /* Backslash may introduce a data char or a metacharacter; stop the
2623       string before the latter. */
2624
2625       if (c == '\\')
2626         {
2627         const uschar *saveptr = ptr;
2628         c = check_escape(&ptr, errorptr, bracount, options, FALSE);
2629         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2630         if (c < 0) { ptr = saveptr; break; }
2631         }
2632
2633       /* Ordinary character or single-char escape */
2634
2635       runlength++;
2636       }
2637
2638     /* This "while" is the end of the "do" above. */
2639
2640     while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2641
2642     ptr--;
2643     length += runlength;
2644     continue;
2645     }
2646   }
2647
2648 length += 4;    /* For final KET and END */
2649
2650 if (length > 65539)
2651   {
2652   *errorptr = ERR20;
2653   return NULL;
2654   }
2655
2656 /* Compute the size of data block needed and get it, either from malloc or
2657 externally provided function. We specify "code[0]" in the offsetof() expression
2658 rather than just "code", because it has been reported that one broken compiler
2659 fails on "code" because it is also an independent variable. It should make no
2660 difference to the value of the offsetof(). */
2661
2662 size = length + offsetof(real_pcre, code[0]);
2663 re = (real_pcre *)(pcre_malloc)(size+50);
2664
2665 if (re == NULL)
2666   {
2667   *errorptr = ERR21;
2668   return NULL;
2669   }
2670
2671 /* Put in the magic number and the options. */
2672
2673 re->magic_number = MAGIC_NUMBER;
2674 re->options = options;
2675
2676 /* Set up a starting, non-extracting bracket, then compile the expression. On
2677 error, *errorptr will be set non-NULL, so we don't need to look at the result
2678 of the function here. */
2679
2680 ptr = (const uschar *)pattern;
2681 code = re->code;
2682 *code = OP_BRA;
2683 bracount = 0;
2684 (void)compile_regex(options, &bracount, &code, &ptr, errorptr, dictionary);
2685 re->top_bracket = bracount;
2686 re->top_backref = top_backref;
2687
2688 /* If not reached end of pattern on success, there's an excess bracket. */
2689
2690 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2691
2692 /* Fill in the terminating state and check for disastrous overflow, but
2693 if debugging, leave the test till after things are printed out. */
2694
2695 *code++ = OP_END;
2696
2697
2698 #ifndef DEBUG
2699 if (code - re->code > length) *errorptr = ERR23;
2700 #endif
2701
2702 /* Failed to compile */
2703
2704 if (*errorptr != NULL)
2705   {
2706   (pcre_free)(re);
2707   PCRE_ERROR_RETURN:
2708   *erroroffset = ptr - (const uschar *)pattern;
2709   return NULL;
2710   }
2711
2712 /* If the anchored option was not passed, set flag if we can determine that it
2713 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2714 we can determine what the first character has to be, because that speeds up
2715 unanchored matches no end. In the case of multiline matches, an alternative is
2716 to set the PCRE_STARTLINE flag if all branches start with ^. */
2717
2718 if ((options & PCRE_ANCHORED) == 0)
2719   {
2720   if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
2721     re->options |= PCRE_ANCHORED;
2722   else
2723     {
2724     int ch = find_firstchar(re->code);
2725     if (ch >= 0)
2726       {
2727       re->first_char = ch;
2728       re->options |= PCRE_FIRSTSET;
2729       }
2730     else if (is_startline(re->code))
2731       re->options |= PCRE_STARTLINE;
2732     }
2733   }
2734
2735 /* Print out the compiled data for debugging */
2736
2737 #ifdef DEBUG
2738
2739 printf("Length = %d top_bracket = %d top_backref=%d\n",
2740   length, re->top_bracket, re->top_backref);
2741
2742 if (re->options != 0)
2743   {
2744   printf("%s%s%s%s%s%s%s%s\n",
2745     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2746     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2747     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2748     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2749     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2750     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2751     ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2752     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2753   }
2754
2755 if ((re->options & PCRE_FIRSTSET) != 0)
2756   {
2757   if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2758     else printf("First char = \\x%02x\n", re->first_char);
2759   }
2760
2761 code_end = code;
2762 code_base = code = re->code;
2763
2764 while (code < code_end)
2765   {
2766   int charlength;
2767
2768   printf("%3d ", code - code_base);
2769
2770   if (*code >= OP_BRA)
2771     {
2772     printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2773     code += 2;
2774     }
2775
2776   else switch(*code)
2777     {
2778     case OP_CHARS:
2779     charlength = *(++code);
2780     printf("%3d ", charlength);
2781     while (charlength-- > 0)
2782       if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2783     break;
2784
2785     case OP_KETRMAX:
2786     case OP_KETRMIN:
2787     case OP_ALT:
2788     case OP_KET:
2789     case OP_ASSERT:
2790     case OP_ASSERT_NOT:
2791     case OP_ONCE:
2792     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2793     code += 2;
2794     break;
2795
2796     case OP_STAR:
2797     case OP_MINSTAR:
2798     case OP_PLUS:
2799     case OP_MINPLUS:
2800     case OP_QUERY:
2801     case OP_MINQUERY:
2802     case OP_TYPESTAR:
2803     case OP_TYPEMINSTAR:
2804     case OP_TYPEPLUS:
2805     case OP_TYPEMINPLUS:
2806     case OP_TYPEQUERY:
2807     case OP_TYPEMINQUERY:
2808     if (*code >= OP_TYPESTAR)
2809       printf("    %s", OP_names[code[1]]);
2810     else if (isprint(c = code[1])) printf("    %c", c);
2811       else printf("    \\x%02x", c);
2812     printf("%s", OP_names[*code++]);
2813     break;
2814
2815     case OP_EXACT:
2816     case OP_UPTO:
2817     case OP_MINUPTO:
2818     if (isprint(c = code[3])) printf("    %c{", c);
2819       else printf("    \\x%02x{", c);
2820     if (*code != OP_EXACT) printf("0,");
2821     printf("%d}", (code[1] << 8) + code[2]);
2822     if (*code == OP_MINUPTO) printf("?");
2823     code += 3;
2824     break;
2825
2826     case OP_TYPEEXACT:
2827     case OP_TYPEUPTO:
2828     case OP_TYPEMINUPTO:
2829     printf("    %s{", OP_names[code[3]]);
2830     if (*code != OP_TYPEEXACT) printf(",");
2831     printf("%d}", (code[1] << 8) + code[2]);
2832     if (*code == OP_TYPEMINUPTO) printf("?");
2833     code += 3;
2834     break;
2835
2836     case OP_NOT:
2837     if (isprint(c = *(++code))) printf("    [^%c]", c);
2838       else printf("    [^\\x%02x]", c);
2839     break;
2840
2841     case OP_NOTSTAR:
2842     case OP_NOTMINSTAR:
2843     case OP_NOTPLUS:
2844     case OP_NOTMINPLUS:
2845     case OP_NOTQUERY:
2846     case OP_NOTMINQUERY:
2847     if (isprint(c = code[1])) printf("    [^%c]", c);
2848       else printf("    [^\\x%02x]", c);
2849     printf("%s", OP_names[*code++]);
2850     break;
2851
2852     case OP_NOTEXACT:
2853     case OP_NOTUPTO:
2854     case OP_NOTMINUPTO:
2855     if (isprint(c = code[3])) printf("    [^%c]{", c);
2856       else printf("    [^\\x%02x]{", c);
2857     if (*code != OP_NOTEXACT) printf(",");
2858     printf("%d}", (code[1] << 8) + code[2]);
2859     if (*code == OP_NOTMINUPTO) printf("?");
2860     code += 3;
2861     break;
2862
2863     case OP_REF:
2864     printf("    \\%d", *(++code));
2865     code ++;
2866     goto CLASS_REF_REPEAT;
2867
2868     case OP_CLASS:
2869     case OP_NEGCLASS:
2870     case OP_CLASS_L:
2871       {
2872       int i, min, max;
2873
2874       if (*code==OP_CLASS_L)
2875         {
2876           code++;
2877           printf("Locflag = %i ", *code++);
2878           printf("    [");
2879         }
2880       else
2881         {
2882           if (*code++ == OP_CLASS) printf("    [");
2883           else printf("   ^[");
2884         }
2885
2886
2887       for (i = 0; i < 256; i++)
2888         {
2889         if ((code[i/8] & (1 << (i&7))) != 0)
2890           {
2891           int j;
2892           for (j = i+1; j < 256; j++)
2893             if ((code[j/8] & (1 << (j&7))) == 0) break;
2894           if (i == '-' || i == ']') printf("\\");
2895           if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2896           if (--j > i)
2897             {
2898             printf("-");
2899             if (j == '-' || j == ']') printf("\\");
2900             if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2901             }
2902           i = j;
2903           }
2904         }
2905       printf("]");
2906       code += 32;
2907       /*      code ++;*/
2908
2909       CLASS_REF_REPEAT:
2910
2911       switch(*code)
2912         {
2913         case OP_CRSTAR:
2914         case OP_CRMINSTAR:
2915         case OP_CRPLUS:
2916         case OP_CRMINPLUS:
2917         case OP_CRQUERY:
2918         case OP_CRMINQUERY:
2919         printf("%s", OP_names[*code]);
2920         break;
2921
2922         case OP_CRRANGE:
2923         case OP_CRMINRANGE:
2924         min = (code[1] << 8) + code[2];
2925         max = (code[3] << 8) + code[4];
2926         if (max == 0) printf("{%d,}", min);
2927         else printf("{%d,%d}", min, max);
2928         if (*code == OP_CRMINRANGE) printf("?");
2929         code += 4;
2930         break;
2931
2932         default:
2933         code--;
2934         }
2935       }
2936     break;
2937
2938     /* Anything else is just a one-node item */
2939
2940     default:
2941     printf("    %s", OP_names[*code]);
2942     break;
2943     }
2944
2945   code++;
2946   printf("\n");
2947   }
2948 printf("------------------------------------------------------------------\n");
2949
2950 /* This check is done here in the debugging case so that the code that
2951 was compiled can be seen. */
2952
2953 if (code - re->code > length)
2954   {
2955   printf("length=%i, code length=%i\n", length, code-re->code);
2956   *errorptr = ERR23;
2957   (pcre_free)(re);
2958   *erroroffset = ptr - (uschar *)pattern;
2959   return NULL;
2960   }
2961 #endif
2962
2963 return (pcre *)re;
2964 }
2965
2966
2967
2968 /*************************************************
2969 *        Match a character type                  *
2970 *************************************************/
2971
2972 /* Not used in all the places it might be as it's sometimes faster
2973 to put the code inline.
2974
2975 Arguments:
2976   type        the character type
2977   c           the character
2978   dotall      the dotall flag
2979
2980 Returns:      TRUE if character is of the type
2981 */
2982
2983 static BOOL
2984 match_type(int type, int c, BOOL dotall)
2985 {
2986
2987 #ifdef DEBUG
2988 if (isprint(c)) printf("matching subject %c against ", c);
2989   else printf("matching subject \\x%02x against ", c);
2990 printf("%s\n", OP_names[type]);
2991 #endif
2992
2993 switch(type)
2994   {
2995   case OP_ANY:            return dotall || c != '\n';
2996   case OP_NOT_DIGIT:      return (pcre_ctypes[c] & ctype_digit) == 0;
2997   case OP_DIGIT:          return (pcre_ctypes[c] & ctype_digit) != 0;
2998   case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2999   case OP_WHITESPACE:     return (pcre_ctypes[c] & ctype_space) != 0;
3000   case OP_NOT_WORDCHAR:   return (pcre_ctypes[c] & ctype_word) == 0;
3001   case OP_WORDCHAR:       return (pcre_ctypes[c] & ctype_word) != 0;
3002   case OP_NOT_WORDCHAR_L: return (c!='_' && !isalnum(c));
3003   case OP_WORDCHAR_L:     return (c=='_' || isalnum(c));
3004   }
3005 return FALSE;
3006 }
3007
3008
3009
3010 /*************************************************
3011 *          Match a back-reference                *
3012 *************************************************/
3013
3014 /* If a back reference hasn't been set, the match fails.
3015
3016 Arguments:
3017   number      reference number
3018   eptr        points into the subject
3019   length      length to be matched
3020   md          points to match data block
3021
3022 Returns:      TRUE if matched
3023 */
3024
3025 static BOOL
3026 match_ref(int number, register const uschar *eptr, int length, match_data *md)
3027 {
3028 const uschar *p = md->start_subject + md->offset_vector[number];
3029
3030 #ifdef DEBUG
3031 if (eptr >= md->end_subject)
3032   printf("matching subject <null>");
3033 else
3034   {
3035   printf("matching subject ");
3036   pchars(eptr, length, TRUE, md);
3037   }
3038 printf(" against backref ");
3039 pchars(p, length, FALSE, md);
3040 printf("\n");
3041 #endif
3042
3043 /* Always fail if not enough characters left */
3044
3045 if (length > md->end_subject - p) return FALSE;
3046
3047 /* Separate the caseless case for speed */
3048
3049 if (md->caseless)
3050   { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
3051 else
3052   { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3053
3054 return TRUE;
3055 }
3056
3057 static int free_stack(match_data *md)
3058 {
3059 /* Free any stack space that was allocated by the call to match(). */
3060 if (md->off_num)    free(md->off_num);
3061 if (md->offset_top) free(md->offset_top);
3062 if (md->r1)         free(md->r1);
3063 if (md->r2)         free(md->r2);
3064 if (md->eptr)       free((char *)md->eptr);
3065 if (md->ecode)      free((char *)md->ecode);
3066 return 0;
3067 }
3068
3069 static int grow_stack(match_data *md)
3070 {
3071   if (md->length != 0)
3072     {
3073       md->length = md->length + md->length/2;
3074     }
3075   else
3076     {
3077       int string_len = md->end_subject - md->start_subject + 1;
3078       if (string_len < 80) {md->length = string_len; }
3079       else {md->length = 80;}
3080     }
3081   PyMem_RESIZE(md->offset_top, int, md->length);
3082   PyMem_RESIZE(md->eptr, const uschar *, md->length);
3083   PyMem_RESIZE(md->ecode, const uschar *, md->length);
3084   PyMem_RESIZE(md->off_num, int, md->length);
3085   PyMem_RESIZE(md->r1, int, md->length);
3086   PyMem_RESIZE(md->r2, int, md->length);
3087   if (md->offset_top == NULL || md->eptr == NULL || md->ecode == NULL ||
3088       md->off_num == NULL || md->r1 == NULL || md->r2 == NULL)
3089     {
3090       PyErr_NoMemory();
3091       longjmp(md->error_env, 1);
3092     }
3093   return 0;
3094 }
3095
3096
3097 /*************************************************
3098 *         Match from current position            *
3099 *************************************************/
3100
3101 /* On entry ecode points to the first opcode, and eptr to the first character.
3102
3103 Arguments:
3104    eptr        pointer in subject
3105    ecode       position in code
3106    offset_top  current top pointer
3107    md          pointer to "static" info for the match
3108
3109 Returns:       TRUE if matched
3110 */
3111
3112 static BOOL
3113 match(register const uschar *eptr, register const uschar *ecode, int offset_top,
3114   match_data *md)
3115 {
3116   int save_stack_position = md->point;
3117 match_loop:
3118
3119 #define SUCCEED goto succeed
3120 #define FAIL    goto fail
3121
3122 for (;;)
3123   {
3124   int min, max, ctype;
3125   register int i;
3126   register int c;
3127   BOOL minimize = FALSE;
3128
3129   /* Opening bracket. Check the alternative branches in turn, failing if none
3130   match. We have to set the start offset if required and there is space
3131   in the offset vector so that it is available for subsequent back references
3132   if the bracket matches. However, if the bracket fails, we must put back the
3133   previous value of both offsets in case they were set by a previous copy of
3134   the same bracket. Don't worry about setting the flag for the error case here;
3135   that is handled in the code for KET. */
3136
3137   if ((int)*ecode >= OP_BRA)
3138     {
3139     int number = (*ecode - OP_BRA) << 1;
3140     int save_offset1 = 0, save_offset2 = 0;
3141
3142     DPRINTF(("start bracket %d\n", number/2));
3143
3144     if (number > 0 && number < md->offset_end)
3145       {
3146       save_offset1 = md->offset_vector[number];
3147       save_offset2 = md->offset_vector[number+1];
3148       md->offset_vector[number] = eptr - md->start_subject;
3149
3150       DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
3151       }
3152
3153     /* Recurse for all the alternatives. */
3154
3155     do
3156       {
3157       if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
3158       ecode += (ecode[1] << 8) + ecode[2];
3159       }
3160     while (*ecode == OP_ALT);
3161
3162     DPRINTF(("bracket %d failed\n", number/2));
3163
3164     if (number > 0 && number < md->offset_end)
3165       {
3166       md->offset_vector[number] = save_offset1;
3167       md->offset_vector[number+1] = save_offset2;
3168       }
3169
3170     FAIL;
3171     }
3172
3173   /* Other types of node can be handled by a switch */
3174
3175   switch(*ecode)
3176     {
3177     case OP_END:
3178     md->end_match_ptr = eptr;          /* Record where we ended */
3179     md->end_offset_top = offset_top;   /* and how many extracts were taken */
3180     SUCCEED;
3181
3182     /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3183     whole thing doesn't match, so we have to get out via a longjmp(). */
3184
3185     case OP_CUT:
3186     if (match(eptr, ecode+1, offset_top, md)) SUCCEED;
3187     longjmp(md->fail_env, 1);
3188
3189     /* Assertion brackets. Check the alternative branches in turn - the
3190     matching won't pass the KET for an assertion. If any one branch matches,
3191     the assertion is true. */
3192
3193     case OP_ASSERT:
3194     do
3195       {
3196       if (match(eptr, ecode+3, offset_top, md)) break;
3197       ecode += (ecode[1] << 8) + ecode[2];
3198       }
3199     while (*ecode == OP_ALT);
3200     if (*ecode == OP_KET) FAIL;
3201
3202     /* Continue from after the assertion, updating the offsets high water
3203     mark, since extracts may have been taken during the assertion. */
3204
3205     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3206     ecode += 3;
3207     offset_top = md->end_offset_top;
3208     continue;
3209
3210     /* Negative assertion: all branches must fail to match */
3211
3212     case OP_ASSERT_NOT:
3213     do
3214       {
3215       if (match(eptr, ecode+3, offset_top, md)) FAIL;
3216       ecode += (ecode[1] << 8) + ecode[2];
3217       }
3218     while (*ecode == OP_ALT);
3219     ecode += 3;
3220     continue;
3221
3222     /* "Once" brackets are like assertion brackets except that after a match,
3223     the point in the subject string is not moved back. Thus there can never be
3224     a move back into the brackets. Check the alternative branches in turn - the
3225     matching won't pass the KET for this kind of subpattern. If any one branch
3226     matches, we carry on, leaving the subject pointer. */
3227
3228     case OP_ONCE:
3229     do
3230       {
3231       if (match(eptr, ecode+3, offset_top, md)) break;
3232       ecode += (ecode[1] << 8) + ecode[2];
3233       }
3234     while (*ecode == OP_ALT);
3235     if (*ecode == OP_KET) FAIL;
3236
3237     /* Continue as from after the assertion, updating the offsets high water
3238     mark, since extracts may have been taken. */
3239
3240     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3241     ecode += 3;
3242     offset_top = md->end_offset_top;
3243     eptr = md->end_match_ptr;
3244     continue;
3245
3246     /* An alternation is the end of a branch; scan along to find the end of the
3247     bracketed group and go to there. */
3248
3249     case OP_ALT:
3250     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3251     break;
3252
3253     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3254     that it may occur zero times. It may repeat infinitely, or not at all -
3255     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3256     repeat limits are compiled as a number of copies, with the optional ones
3257     preceded by BRAZERO or BRAMINZERO. */
3258
3259     case OP_BRAZERO:
3260       {
3261       const uschar *next = ecode+1;
3262       if (match(eptr, next, offset_top, md)) SUCCEED;
3263       do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3264       ecode = next + 3;
3265       }
3266     break;
3267
3268     case OP_BRAMINZERO:
3269       {
3270       const uschar *next = ecode+1;
3271       do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3272       if (match(eptr, next+3, offset_top, md)) SUCCEED;
3273       ecode++;
3274       }
3275     break;;
3276
3277     /* End of a group, repeated or non-repeating. If we are at the end of
3278     an assertion "group", stop matching and SUCCEED, but record the
3279     current high water mark for use by positive assertions. */
3280
3281     case OP_KET:
3282     case OP_KETRMIN:
3283     case OP_KETRMAX:
3284       {
3285       int number;
3286       const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3287
3288       if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
3289         {
3290         md->end_match_ptr = eptr;      /* For ONCE */
3291         md->end_offset_top = offset_top;
3292         SUCCEED;
3293         }
3294
3295       /* In all other cases we have to check the group number back at the
3296       start and if necessary complete handling an extraction by setting the
3297       final offset and bumping the high water mark. */
3298
3299       number = (*prev - OP_BRA) << 1;
3300
3301       DPRINTF(("end bracket %d\n", number/2));
3302
3303       if (number > 0)
3304         {
3305         if (number >= md->offset_end) md->offset_overflow = TRUE; else
3306           {
3307           md->offset_vector[number+1] = eptr - md->start_subject;
3308           if (offset_top <= number) offset_top = number + 2;
3309           }
3310         }
3311
3312       /* For a non-repeating ket, just advance to the next node and continue at
3313       this level. */
3314
3315       if (*ecode == OP_KET)
3316         {
3317         ecode += 3;
3318         break;
3319         }
3320
3321       /* The repeating kets try the rest of the pattern or restart from the
3322       preceding bracket, in the appropriate order. */
3323
3324       if (*ecode == OP_KETRMIN)
3325         {
3326         const uschar *ptr;
3327         if (match(eptr, ecode+3, offset_top, md)) goto succeed;
3328         /* Handle alternation inside the BRA...KET; push the additional
3329            alternatives onto the stack */
3330         ptr=prev;
3331         do {
3332           ptr += (ptr[1]<<8)+ ptr[2];
3333           if (*ptr==OP_ALT)
3334             {
3335               if (md->length == md->point)
3336                 {
3337                   grow_stack(md);
3338                 }
3339               md->offset_top[md->point] = offset_top;
3340               md->eptr[md->point]       = eptr;
3341               md->ecode[md->point]      = ptr+3;
3342               md->r1[md->point]         = 0;
3343               md->r2[md->point]         = 0;
3344               md->off_num[md->point]    = 0;
3345               md->point++;
3346             }
3347         } while (*ptr==OP_ALT);
3348         ecode=prev+3; goto match_loop;
3349         }
3350       else  /* OP_KETRMAX */
3351         {
3352         const uschar *ptr;
3353         /*int points_pushed=0;*/
3354
3355         /* Push one failure point, that will resume matching at the code after
3356            the KETRMAX opcode. */
3357         if (md->length == md->point)
3358           {
3359             grow_stack(md);
3360           }
3361         md->offset_top[md->point] = offset_top;
3362         md->eptr[md->point]       = eptr;
3363         md->ecode[md->point]      = ecode+3;
3364         md->r1[md->point]         = md->offset_vector[number];
3365         md->r2[md->point]         = md->offset_vector[number+1];
3366         md->off_num[md->point]    = number;
3367         md->point++;
3368
3369         md->offset_vector[number] = eptr - md->start_subject;
3370         /* Handle alternation inside the BRA...KET; push each of the
3371            additional alternatives onto the stack */
3372         ptr=prev;
3373         do {
3374           ptr += (ptr[1]<<8)+ ptr[2];
3375           if (*ptr==OP_ALT)
3376             {
3377               if (md->length == md->point)
3378                 if (md->length == md->point)
3379                   {
3380                     grow_stack(md);
3381                   }
3382               md->offset_top[md->point] = offset_top;
3383               md->eptr[md->point]       = eptr;
3384               md->ecode[md->point]      = ptr+3;
3385               md->r1[md->point]         = 0;
3386               md->r2[md->point]         = 0;
3387               md->off_num[md->point]    = 0;
3388               md->point++;
3389               /*points_pushed++;*/
3390             }
3391         } while (*ptr==OP_ALT);
3392         /* Jump to the first (or only) alternative and resume trying to match */
3393         ecode=prev+3; goto match_loop;
3394         }
3395       }
3396
3397     /* Start of subject unless notbol, or after internal newline if multiline */
3398
3399     case OP_CIRC:
3400     if (md->notbol && eptr == md->start_subject) FAIL;
3401     if (md->multiline)
3402       {
3403       if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
3404       ecode++;
3405       break;
3406       }
3407     /* ... else fall through */
3408
3409     /* Start of subject assertion */
3410
3411     case OP_SOD:
3412     if (eptr != md->start_subject) FAIL;
3413     ecode++;
3414     break;
3415
3416     /* Assert before internal newline if multiline, or before
3417     a terminating newline unless endonly is set, else end of subject unless
3418     noteol is set. */
3419
3420     case OP_DOLL:
3421     if (md->noteol && eptr >= md->end_subject) FAIL;
3422     if (md->multiline)
3423       {
3424       if (eptr < md->end_subject && *eptr != '\n') FAIL;
3425       ecode++;
3426       break;
3427       }
3428     else if (!md->endonly)
3429       {
3430       if (eptr < md->end_subject - 1 ||
3431          (eptr == md->end_subject - 1 && *eptr != '\n')) FAIL;
3432       ecode++;
3433       break;
3434       }
3435     /* ... else fall through */
3436
3437     /* End of subject assertion */
3438
3439     case OP_EOD:
3440     if (eptr < md->end_subject) FAIL;
3441     ecode++;
3442     break;
3443
3444     /* Word boundary assertions */
3445
3446     case OP_NOT_WORD_BOUNDARY:
3447     case OP_WORD_BOUNDARY:
3448       {
3449       BOOL prev_is_word = (eptr != md->start_subject) &&
3450         ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3451       BOOL cur_is_word = (eptr < md->end_subject) &&
3452         ((pcre_ctypes[*eptr] & ctype_word) != 0);
3453       if ((*ecode++ == OP_WORD_BOUNDARY)?
3454            cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3455         FAIL;
3456       }
3457     break;
3458
3459     case OP_NOT_WORD_BOUNDARY_L:
3460     case OP_WORD_BOUNDARY_L:
3461       {
3462         BOOL prev_is_word = (eptr != md->start_subject) &&
3463           (isalnum(eptr[-1]) || eptr[-1]=='_');
3464         BOOL cur_is_word = (eptr < md->end_subject) &&
3465           (isalnum(*eptr) || *eptr=='_');
3466         if ((*ecode++ == OP_WORD_BOUNDARY_L)?
3467             cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3468           FAIL;
3469       }
3470       break;
3471
3472
3473     /* Match a single character type; inline for speed */
3474
3475     case OP_ANY:
3476     if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
3477     if (eptr++ >= md->end_subject) FAIL;
3478     ecode++;
3479     break;
3480
3481     case OP_NOT_DIGIT:
3482     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3483       FAIL;
3484     ecode++;
3485     break;
3486
3487     case OP_DIGIT:
3488     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3489       FAIL;
3490     ecode++;
3491     break;
3492
3493     case OP_NOT_WHITESPACE:
3494     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3495       FAIL;
3496     ecode++;
3497     break;
3498
3499     case OP_WHITESPACE:
3500     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3501       FAIL;
3502     ecode++;
3503     break;
3504
3505     case OP_NOT_WORDCHAR:
3506     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3507       FAIL;
3508     ecode++;
3509     break;
3510
3511     case OP_WORDCHAR:
3512     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3513       FAIL;
3514     ecode++;
3515     break;
3516
3517     case OP_NOT_WORDCHAR_L:
3518     if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ))
3519       FAIL;
3520     eptr++;
3521     ecode++;
3522     break;
3523
3524     case OP_WORDCHAR_L:
3525     if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ))
3526       FAIL;
3527     eptr++;
3528     ecode++;
3529     break;
3530
3531     /* Match a back reference, possibly repeatedly. Look past the end of the
3532     item to see if there is repeat information following. The code is similar
3533     to that for character classes, but repeated for efficiency. Then obey
3534     similar code to character type repeats - written out again for speed.
3535     However, if the referenced string is the empty string, always treat
3536     it as matched, any number of times (otherwise there could be infinite
3537     loops). */
3538
3539     case OP_REF:
3540       {
3541       int length;
3542       int number = ecode[1] << 1;                /* Doubled reference number */
3543       ecode += 2;                                /* Advance past the item */
3544
3545       if (number >= offset_top || md->offset_vector[number] < 0)
3546         {
3547         md->errorcode = PCRE_ERROR_BADREF;
3548         FAIL;
3549         }
3550
3551       length = md->offset_vector[number+1] - md->offset_vector[number];
3552
3553       switch (*ecode)
3554         {
3555         case OP_CRSTAR:
3556         case OP_CRMINSTAR:
3557         case OP_CRPLUS:
3558         case OP_CRMINPLUS:
3559         case OP_CRQUERY:
3560         case OP_CRMINQUERY:
3561         c = *ecode++ - OP_CRSTAR;
3562         minimize = (c & 1) != 0;
3563         min = rep_min[c];                 /* Pick up values from tables; */
3564         max = rep_max[c];                 /* zero for max => infinity */
3565         if (max == 0) max = INT_MAX;
3566         break;
3567
3568         case OP_CRRANGE:
3569         case OP_CRMINRANGE:
3570         minimize = (*ecode == OP_CRMINRANGE);
3571         min = (ecode[1] << 8) + ecode[2];
3572         max = (ecode[3] << 8) + ecode[4];
3573         if (max == 0) max = INT_MAX;
3574         ecode += 5;
3575         break;
3576
3577         default:               /* No repeat follows */
3578         if (!match_ref(number, eptr, length, md)) FAIL;
3579         eptr += length;
3580         continue;              /* With the main loop */
3581         }
3582
3583       /* If the length of the reference is zero, just continue with the
3584       main loop. */
3585
3586       if (length == 0) continue;
3587
3588       /* First, ensure the minimum number of matches are present. We get back
3589       the length of the reference string explicitly rather than passing the
3590       address of eptr, so that eptr can be a register variable. */
3591
3592       for (i = 1; i <= min; i++)
3593         {
3594         if (!match_ref(number, eptr, length, md)) FAIL;
3595         eptr += length;
3596         }
3597
3598       /* If min = max, continue at the same level without recursion.
3599       They are not both allowed to be zero. */
3600
3601       if (min == max) continue;
3602
3603       /* If minimizing, keep trying and advancing the pointer */
3604
3605       if (minimize)
3606         {
3607         for (i = min;; i++)
3608           {
3609           if (match(eptr, ecode, offset_top, md)) SUCCEED;
3610           if (i >= max || !match_ref(number, eptr, length, md))
3611             FAIL;
3612           eptr += length;
3613           }
3614         /* Control never gets here */
3615         }
3616
3617       /* If maximizing, find the longest string and work backwards */
3618
3619       else
3620         {
3621         const uschar *pp = eptr;
3622         for (i = min; i < max; i++)
3623           {
3624           if (!match_ref(number, eptr, length, md)) break;
3625           eptr += length;
3626           }
3627         while (eptr >= pp)
3628           {
3629           if (match(eptr, ecode, offset_top, md)) SUCCEED;
3630           eptr -= length;
3631           }
3632         FAIL;
3633         }
3634       }
3635     /* Control never gets here */
3636
3637     /* Match a character class, possibly repeatedly. Look past the end of the
3638     item to see if there is repeat information following. Then obey similar
3639     code to character type repeats - written out again for speed. If caseless
3640     matching was set at runtime but not at compile time, we have to check both
3641     versions of a character, and we have to behave differently for positive and
3642     negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
3643     treated differently. */
3644
3645     case OP_CLASS:
3646     case OP_NEGCLASS:
3647       {
3648       BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
3649       const uschar *data = ecode + 1;  /* Save for matching */
3650       ecode += 33;                     /* Advance past the item */
3651
3652       switch (*ecode)
3653         {
3654         case OP_CRSTAR:
3655         case OP_CRMINSTAR:
3656         case OP_CRPLUS:
3657         case OP_CRMINPLUS:
3658         case OP_CRQUERY:
3659         case OP_CRMINQUERY:
3660         c = *ecode++ - OP_CRSTAR;
3661         minimize = (c & 1) != 0;
3662         min = rep_min[c];                 /* Pick up values from tables; */
3663         max = rep_max[c];                 /* zero for max => infinity */
3664         if (max == 0) max = INT_MAX;
3665         break;
3666
3667         case OP_CRRANGE:
3668         case OP_CRMINRANGE:
3669         minimize = (*ecode == OP_CRMINRANGE);
3670         min = (ecode[1] << 8) + ecode[2];
3671         max = (ecode[3] << 8) + ecode[4];
3672         if (max == 0) max = INT_MAX;
3673         ecode += 5;
3674         break;
3675
3676         default:               /* No repeat follows */
3677           min = max = 1;
3678           break;
3679         }
3680
3681       /* First, ensure the minimum number of matches are present. */
3682
3683       for (i = 1; i <= min; i++)
3684         {
3685         if (eptr >= md->end_subject) FAIL;
3686         c = *eptr++;
3687
3688         /* Either not runtime caseless, or it was a positive class. For
3689         runtime caseless, continue if either case is in the map. */
3690
3691         if (!nasty_case)
3692           {
3693           if ((data[c/8] & (1 << (c&7))) != 0) continue;
3694           if (md->runtime_caseless)
3695             {
3696             c = pcre_fcc[c];
3697             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3698             }
3699           }
3700
3701         /* Runtime caseless and it was a negative class. Continue only if
3702         both cases are in the map. */
3703
3704         else
3705           {
3706            if ((data[c/8] & (1 << (c&7))) == 0) FAIL;
3707            c = pcre_fcc[c];
3708            if ((data[c/8] & (1 << (c&7))) != 0) continue;
3709            }
3710
3711         FAIL;
3712         }
3713
3714       /* If max == min we can continue with the main loop without the
3715       need to recurse. */
3716
3717       if (min == max) continue;
3718
3719       /* If minimizing, keep testing the rest of the expression and advancing
3720       the pointer while it matches the class. */
3721
3722       if (minimize)
3723         {
3724         for (i = min;; i++)
3725           {
3726           if (match(eptr, ecode, offset_top, md)) SUCCEED;
3727           if (i >= max || eptr >= md->end_subject) FAIL;
3728           c = *eptr++;
3729
3730           /* Either not runtime caseless, or it was a positive class. For
3731           runtime caseless, continue if either case is in the map. */
3732
3733           if (!nasty_case)
3734             {
3735             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3736             if (md->runtime_caseless)
3737               {
3738               c = pcre_fcc[c];
3739               if ((data[c/8] & (1 << (c&7))) != 0) continue;
3740               }
3741             }
3742
3743           /* Runtime caseless and it was a negative class. Continue only if
3744           both cases are in the map. */
3745
3746           else
3747              {
3748              if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
3749              c = pcre_fcc[c];
3750              if ((data[c/8] & (1 << (c&7))) != 0) continue;
3751              }
3752
3753           FAIL;
3754           }
3755         /* Control never gets here */
3756         }
3757
3758       /* If maximizing, find the longest possible run, then work backwards. */
3759
3760       else
3761         {
3762         const uschar *pp = eptr;
3763         for (i = min; i < max; eptr++, i++)
3764           {
3765           if (eptr >= md->end_subject) break;
3766           c = *eptr;
3767
3768           /* Either not runtime caseless, or it was a positive class. For
3769           runtime caseless, continue if either case is in the map. */
3770
3771           if (!nasty_case)
3772             {
3773             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3774             if (md->runtime_caseless)
3775               {
3776               c = pcre_fcc[c];
3777               if ((data[c/8] & (1 << (c&7))) != 0) continue;
3778               }
3779             }
3780
3781           /* Runtime caseless and it was a negative class. Continue only if
3782           both cases are in the map. */
3783
3784           else
3785             {
3786             if ((data[c/8] & (1 << (c&7))) == 0) break;
3787             c = pcre_fcc[c];
3788             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3789             }
3790
3791           break;
3792           }
3793
3794         while (eptr >= pp)
3795           if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3796         FAIL;
3797         }
3798       }
3799     /* Control never gets here */
3800
3801    /* OP_CLASS_L opcode: handles localized character classes */
3802
3803    case OP_CLASS_L:
3804      {
3805       const uschar *data = ecode + 1;  /* Save for matching */
3806       const uschar locale_flag = *data;
3807       ecode++; data++;          /* The localization support adds an extra byte */
3808
3809       ecode += 33;               /* Advance past the item */
3810
3811       switch (*ecode)
3812         {
3813         case OP_CRSTAR:
3814         case OP_CRMINSTAR:
3815         case OP_CRPLUS:
3816         case OP_CRMINPLUS:
3817         case OP_CRQUERY:
3818         case OP_CRMINQUERY:
3819         c = *ecode++ - OP_CRSTAR;
3820         minimize = (c & 1) != 0;
3821         min = rep_min[c];                 /* Pick up values from tables; */
3822         max = rep_max[c];                 /* zero for max => infinity */
3823         if (max == 0) max = INT_MAX;
3824         break;
3825
3826         case OP_CRRANGE:
3827         case OP_CRMINRANGE:
3828         minimize = (*ecode == OP_CRMINRANGE);
3829         min = (ecode[1] << 8) + ecode[2];
3830         max = (ecode[3] << 8) + ecode[4];
3831         if (max == 0) max = INT_MAX;
3832         ecode += 5;
3833         break;
3834
3835         default:               /* No repeat follows */
3836         if (eptr >= md->end_subject) FAIL;
3837         c = *eptr++;
3838         if ((data[c/8] & (1 << (c&7))) != 0) continue;    /* With main loop */
3839         if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3840         if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3841 #if 0
3842         if ( (locale_flag &  4) && isdigit(c) ) continue;    /* Locale \d */
3843         if ( (locale_flag &  8) && !isdigit(c) ) continue;   /* Locale \D */
3844         if ( (locale_flag & 16) && isspace(c) ) continue;    /* Locale \s */
3845         if ( (locale_flag & 32) && !isspace(c) ) continue;   /* Locale \S */
3846 #endif
3847
3848         if (md->runtime_caseless)
3849           {
3850           c = pcre_fcc[c];
3851           if ((data[c/8] & (1 << (c&7))) != 0) continue;  /* With main loop */
3852
3853           if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3854           if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3855           }
3856         FAIL;
3857         }
3858
3859       /* First, ensure the minimum number of matches are present. */
3860
3861       for (i = 1; i <= min; i++)
3862         {
3863         if (eptr >= md->end_subject) FAIL;
3864         c = *eptr++;
3865         if ((data[c/8] & (1 << (c&7))) != 0) continue;
3866         if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3867         if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3868
3869         if (md->runtime_caseless)
3870           {
3871           c = pcre_fcc[c];
3872           if ((data[c/8] & (1 << (c&7))) != 0) continue;
3873           if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3874           if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3875           }
3876         FAIL;
3877         }
3878
3879       /* If max == min we can continue with the main loop without the
3880       need to recurse. */
3881
3882       if (min == max) continue;
3883
3884       /* If minimizing, keep testing the rest of the expression and advancing
3885       the pointer while it matches the class. */
3886
3887       if (minimize)
3888         {
3889         for (i = min;; i++)
3890           {
3891           if (match(eptr, ecode, offset_top, md)) SUCCEED;
3892           if (i >= max || eptr >= md->end_subject) FAIL;
3893           c = *eptr++;
3894           if ((data[c/8] & (1 << (c&7))) != 0) continue;
3895           if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3896           if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3897
3898           if (md->runtime_caseless)
3899             {
3900             c = pcre_fcc[c];
3901             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3902             if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3903             if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3904             }
3905           FAIL;
3906           }
3907         /* Control never gets here */
3908         }
3909
3910       /* If maximizing, find the longest possible run, then work backwards. */
3911
3912       else
3913         {
3914         const uschar *pp = eptr;
3915         for (i = min; i < max; eptr++, i++)
3916           {
3917           if (eptr >= md->end_subject) break;
3918           c = *eptr;
3919           if ((data[c/8] & (1 << (c&7))) != 0) continue;
3920           if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3921           if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3922           if (md->runtime_caseless)
3923             {
3924             c = pcre_fcc[c];
3925             if ((data[c/8] & (1 << (c&7))) != 0) continue;
3926             if ( (locale_flag &  1) && (isalnum(c) || c=='_') ) continue;   /* Locale \w */
3927             if ( (locale_flag &  2) && (!isalnum(c) && c!='_') ) continue;   /* Locale \W */
3928             }
3929           break;
3930           }
3931
3932         while (eptr >= pp)
3933           if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3934         FAIL;
3935         }
3936       }
3937     /* Control never gets here */
3938
3939     /* Match a run of characters */
3940
3941     case OP_CHARS:
3942       {
3943       register int length = ecode[1];
3944       ecode += 2;
3945
3946 #ifdef DEBUG  /* Sigh. Some compilers never learn. */
3947       if (eptr >= md->end_subject)
3948         printf("matching subject <null> against pattern ");
3949       else
3950         {
3951         printf("matching subject ");
3952         pchars(eptr, length, TRUE, md);
3953         printf(" against pattern ");
3954         }
3955       pchars(ecode, length, FALSE, md);
3956       printf("\n");
3957 #endif
3958
3959       if (length > md->end_subject - eptr) FAIL;
3960       if (md->caseless)
3961         {
3962         while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL;
3963         }
3964       else
3965         {
3966         while (length-- > 0) if (*ecode++ != *eptr++) FAIL;
3967         }
3968       }
3969     break;
3970
3971     /* Match a single character repeatedly; different opcodes share code. */
3972
3973     case OP_EXACT:
3974     min = max = (ecode[1] << 8) + ecode[2];
3975     ecode += 3;
3976     goto REPEATCHAR;
3977
3978     case OP_UPTO:
3979     case OP_MINUPTO:
3980     min = 0;
3981     max = (ecode[1] << 8) + ecode[2];
3982     minimize = *ecode == OP_MINUPTO;
3983     ecode += 3;
3984     goto REPEATCHAR;
3985
3986     case OP_STAR:
3987     case OP_MINSTAR:
3988     case OP_PLUS:
3989     case OP_MINPLUS:
3990     case OP_QUERY:
3991     case OP_MINQUERY:
3992     c = *ecode++ - OP_STAR;
3993     minimize = (c & 1) != 0;
3994     min = rep_min[c];                 /* Pick up values from tables; */
3995     max = rep_max[c];                 /* zero for max => infinity */
3996     if (max == 0) max = INT_MAX;
3997
3998     /* Common code for all repeated single-character matches. We can give
3999     up quickly if there are fewer than the minimum number of characters left in
4000     the subject. */
4001
4002     REPEATCHAR:
4003     if (min > md->end_subject - eptr) FAIL;
4004     c = *ecode++;
4005
4006     /* The code is duplicated for the caseless and caseful cases, for speed,
4007     since matching characters is likely to be quite common. First, ensure the
4008     minimum number of matches are present. If min = max, continue at the same
4009     level without recursing. Otherwise, if minimizing, keep trying the rest of
4010     the expression and advancing one matching character if failing, up to the
4011     maximum. Alternatively, if maximizing, find the maximum number of
4012     characters and work backwards. */
4013
4014     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4015       max, eptr));
4016
4017     if (md->caseless)
4018       {
4019       c = pcre_lcc[c];
4020       for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
4021       if (min == max) continue;
4022       if (minimize)
4023         {
4024         for (i = min;; i++)
4025           {
4026           if (match(eptr, ecode, offset_top, md)) SUCCEED;
4027           if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
4028             FAIL;
4029           }
4030         /* Control never gets here */
4031         }
4032       else
4033         {
4034         const uschar *pp = eptr;
4035         for (i = min; i < max; i++)
4036           {
4037           if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
4038           eptr++;
4039           }
4040         while (eptr >= pp)
4041           if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4042         FAIL;
4043         }
4044       /* Control never gets here */
4045       }
4046
4047     /* Caseful comparisons */
4048
4049     else
4050       {
4051       for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
4052       if (min == max) continue;
4053       if (minimize)
4054         {
4055         for (i = min;; i++)
4056           {
4057           if (match(eptr, ecode, offset_top, md)) SUCCEED;
4058           if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL;
4059           }
4060         /* Control never gets here */
4061         }
4062       else
4063         {
4064         const uschar *pp = eptr;
4065         for (i = min; i < max; i++)
4066           {
4067           if (eptr >= md->end_subject || c != *eptr) break;
4068           eptr++;
4069           }
4070         while (eptr >= pp)
4071          if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4072         FAIL;
4073         }
4074       }
4075     /* Control never gets here */
4076
4077     /* Match a negated single character */
4078
4079     case OP_NOT:
4080     if (eptr >= md->end_subject) FAIL;
4081     ecode++;
4082     if (md->caseless)
4083       {
4084       if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) FAIL;
4085       }
4086     else
4087       {
4088       if (*ecode++ == *eptr++) FAIL;
4089       }
4090     break;
4091
4092     /* Match a negated single character repeatedly. This is almost a repeat of
4093     the code for a repeated single character, but I haven't found a nice way of
4094     commoning these up that doesn't require a test of the positive/negative
4095     option for each character match. Maybe that wouldn't add very much to the
4096     time taken, but character matching *is* what this is all about... */
4097
4098     case OP_NOTEXACT:
4099     min = max = (ecode[1] << 8) + ecode[2];
4100     ecode += 3;
4101     goto REPEATNOTCHAR;
4102
4103     case OP_NOTUPTO:
4104     case OP_NOTMINUPTO:
4105     min = 0;
4106     max = (ecode[1] << 8) + ecode[2];
4107     minimize = *ecode == OP_NOTMINUPTO;
4108     ecode += 3;
4109     goto REPEATNOTCHAR;
4110
4111     case OP_NOTSTAR:
4112     case OP_NOTMINSTAR:
4113     case OP_NOTPLUS:
4114     case OP_NOTMINPLUS:
4115     case OP_NOTQUERY:
4116     case OP_NOTMINQUERY:
4117     c = *ecode++ - OP_NOTSTAR;
4118     minimize = (c & 1) != 0;
4119     min = rep_min[c];                 /* Pick up values from tables; */
4120     max = rep_max[c];                 /* zero for max => infinity */
4121     if (max == 0) max = INT_MAX;
4122
4123     /* Common code for all repeated single-character matches. We can give
4124     up quickly if there are fewer than the minimum number of characters left in
4125     the subject. */
4126
4127     REPEATNOTCHAR:
4128     if (min > md->end_subject - eptr) FAIL;
4129     c = *ecode++;
4130
4131     /* The code is duplicated for the caseless and caseful cases, for speed,
4132     since matching characters is likely to be quite common. First, ensure the
4133     minimum number of matches are present. If min = max, continue at the same
4134     level without recursing. Otherwise, if minimizing, keep trying the rest of
4135     the expression and advancing one matching character if failing, up to the
4136     maximum. Alternatively, if maximizing, find the maximum number of
4137     characters and work backwards. */
4138
4139     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4140       max, eptr));
4141
4142     if (md->caseless)
4143       {
4144       c = pcre_lcc[c];
4145       for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) FAIL;
4146       if (min == max) continue;
4147       if (minimize)
4148         {
4149         for (i = min;; i++)
4150           {
4151           if (match(eptr, ecode, offset_top, md)) SUCCEED;
4152           if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
4153             FAIL;
4154           }
4155         /* Control never gets here */
4156         }
4157       else
4158         {
4159         const uschar *pp = eptr;
4160         for (i = min; i < max; i++)
4161           {
4162           if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
4163           eptr++;
4164           }
4165         while (eptr >= pp)
4166           if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4167         FAIL;
4168         }
4169       /* Control never gets here */
4170       }
4171
4172     /* Caseful comparisons */
4173
4174     else
4175       {
4176       for (i = 1; i <= min; i++) if (c == *eptr++) FAIL;
4177       if (min == max) continue;
4178       if (minimize)
4179         {
4180         for (i = min;; i++)
4181           {
4182           if (match(eptr, ecode, offset_top, md)) SUCCEED;
4183           if (i >= max || eptr >= md->end_subject || c == *eptr++) FAIL;
4184           }
4185         /* Control never gets here */
4186         }
4187       else
4188         {
4189         const uschar *pp = eptr;
4190         for (i = min; i < max; i++)
4191           {
4192           if (eptr >= md->end_subject || c == *eptr) break;
4193           eptr++;
4194           }
4195         while (eptr >= pp)
4196          if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4197         FAIL;
4198         }
4199       }
4200     /* Control never gets here */
4201
4202     /* Match a single character type repeatedly; several different opcodes
4203     share code. This is very similar to the code for single characters, but we
4204     repeat it in the interests of efficiency. */
4205
4206     case OP_TYPEEXACT:
4207     min = max = (ecode[1] << 8) + ecode[2];
4208     minimize = TRUE;
4209     ecode += 3;
4210     goto REPEATTYPE;
4211
4212     case OP_TYPEUPTO:
4213     case OP_TYPEMINUPTO:
4214     min = 0;
4215     max = (ecode[1] << 8) + ecode[2];
4216     minimize = *ecode == OP_TYPEMINUPTO;
4217     ecode += 3;
4218     goto REPEATTYPE;
4219
4220     case OP_TYPESTAR:
4221     case OP_TYPEMINSTAR:
4222     case OP_TYPEPLUS:
4223     case OP_TYPEMINPLUS:
4224     case OP_TYPEQUERY:
4225     case OP_TYPEMINQUERY:
4226     c = *ecode++ - OP_TYPESTAR;
4227     minimize = (c & 1) != 0;
4228     min = rep_min[c];                 /* Pick up values from tables; */
4229     max = rep_max[c];                 /* zero for max => infinity */
4230     if (max == 0) max = INT_MAX;
4231
4232     /* Common code for all repeated single character type matches */
4233
4234     REPEATTYPE:
4235     ctype = *ecode++;      /* Code for the character type */
4236
4237     /* First, ensure the minimum number of matches are present. Use inline
4238     code for maximizing the speed, and do the type test once at the start
4239     (i.e. keep it out of the loop). Also test that there are at least the
4240     minimum number of characters before we start. */
4241
4242     if (min > md->end_subject - eptr) FAIL;
4243     if (min > 0) switch(ctype)
4244       {
4245       case OP_ANY:
4246       if (!md->dotall)
4247         { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
4248       else eptr += min;
4249       break;
4250
4251       case OP_NOT_DIGIT:
4252       for (i = 1; i <= min; i++)
4253         if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
4254       break;
4255
4256       case OP_DIGIT:
4257       for (i = 1; i <= min; i++)
4258         if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
4259       break;
4260
4261       case OP_NOT_WHITESPACE:
4262       for (i = 1; i <= min; i++)
4263         if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
4264       break;
4265
4266       case OP_WHITESPACE:
4267       for (i = 1; i <= min; i++)
4268         if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
4269       break;
4270
4271       case OP_NOT_WORDCHAR:
4272       for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
4273         FAIL;
4274       break;
4275
4276       case OP_WORDCHAR:
4277       for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
4278         FAIL;
4279       break;
4280
4281       case OP_NOT_WORDCHAR_L:
4282       for (i = 1; i <= min; i++, eptr++) if (*eptr=='_' || isalnum(*eptr))
4283         FAIL;
4284       break;
4285
4286       case OP_WORDCHAR_L:
4287       for (i = 1; i <= min; i++, eptr++) if (*eptr!='_' && !isalnum(*eptr))
4288         FAIL;
4289       break;
4290       }
4291
4292     /* If min = max, continue at the same level without recursing */
4293
4294     if (min == max) continue;
4295
4296     /* If minimizing, we have to test the rest of the pattern before each
4297     subsequent match, so inlining isn't much help; just use the function. */
4298
4299     if (minimize)
4300       {
4301       for (i = min;; i++)
4302         {
4303         if (match(eptr, ecode, offset_top, md)) SUCCEED;
4304         if (i >= max || eptr >= md->end_subject ||
4305           !match_type(ctype, *eptr++, md->dotall))
4306             FAIL;
4307         }
4308       /* Control never gets here */
4309       }
4310
4311     /* If maximizing it is worth using inline code for speed, doing the type
4312     test once at the start (i.e. keep it out of the loop). */
4313
4314     else
4315       {
4316       const uschar *pp = eptr;
4317       switch(ctype)
4318         {
4319         case OP_ANY:
4320         if (!md->dotall)
4321           {
4322           for (i = min; i < max; i++)
4323             {
4324             if (eptr >= md->end_subject || *eptr == '\n') break;
4325             eptr++;
4326             }
4327           }
4328         else
4329           {
4330           c = max - min;
4331           if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4332           eptr += c;
4333           }
4334         break;
4335
4336         case OP_NOT_DIGIT:
4337         for (i = min; i < max; i++)
4338           {
4339           if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
4340             break;
4341           eptr++;
4342           }
4343         break;
4344
4345         case OP_DIGIT:
4346         for (i = min; i < max; i++)
4347           {
4348           if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
4349             break;
4350           eptr++;
4351           }
4352         break;
4353
4354         case OP_NOT_WHITESPACE:
4355         for (i = min; i < max; i++)
4356           {
4357           if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
4358             break;
4359           eptr++;
4360           }
4361         break;
4362
4363         case OP_WHITESPACE:
4364         for (i = min; i < max; i++)
4365           {
4366           if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
4367             break;
4368           eptr++;
4369           }
4370         break;
4371
4372         case OP_NOT_WORDCHAR:
4373         for (i = min; i < max; i++)
4374           {
4375           if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
4376             break;
4377           eptr++;
4378           }
4379         break;
4380
4381         case OP_WORDCHAR:
4382         for (i = min; i < max; i++)
4383           {
4384             if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
4385               break;
4386             eptr++;
4387           }
4388         break;
4389         case OP_NOT_WORDCHAR_L:
4390           for (i = min; i < max; i++)
4391          {
4392          if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ) )
4393            break;
4394          eptr++;
4395          }
4396        break;
4397
4398        case OP_WORDCHAR_L:
4399        for (i = min; i < max; i++)
4400          {
4401          if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ) )
4402              break;
4403           eptr++;
4404           }
4405         break;
4406         }
4407
4408       while (eptr >= pp)
4409         if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4410       FAIL;
4411       }
4412     /* Control never gets here */
4413
4414     /* There's been some horrible disaster. */
4415
4416     default:
4417     DPRINTF(("Unknown opcode %d\n", *ecode));
4418     md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4419     FAIL;
4420     }
4421
4422   /* Do not stick any code in here without much thought; it is assumed
4423   that "continue" in the code above comes out to here to repeat the main
4424   loop. */
4425
4426   }             /* End of main loop */
4427 /* Control never reaches here */
4428
4429 fail:
4430  if (md->point > save_stack_position)
4431  {
4432    /* If there are still points remaining on the stack, pop the next one off */
4433    int off_num;
4434
4435    md->point--;
4436    offset_top = md->offset_top[md->point];
4437    eptr       = md->eptr[md->point];
4438    ecode      = md->ecode[md->point];
4439    off_num    = md->off_num[md->point];
4440    md->offset_vector[off_num]   = md->r1[md->point];
4441    md->offset_vector[off_num+1] = md->r2[md->point];
4442    goto match_loop;
4443   }
4444    /* Failure, and nothing left on the stack, so end this function call */
4445
4446  /* Restore the top of the stack to where it was before this function
4447     call.  This lets us use one stack for everything; recursive calls
4448     can push and pop information, and may increase the stack.  When
4449     the call returns, the parent function can resume pushing and
4450     popping wherever it was. */
4451
4452  md->point = save_stack_position;
4453  return FALSE;
4454
4455 succeed:
4456  return TRUE;
4457 }
4458
4459
4460
4461 /*************************************************
4462 *         Segregate setjmp()                     *
4463 *************************************************/
4464
4465 /* The -Wall option of gcc gives warnings for all local variables when setjmp()
4466 is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
4467 hide it in a separate function. This is called only when PCRE_EXTRA is set,
4468 since it's needed only for the extension \X option, and with any luck, a good
4469 compiler will spot the tail recursion and compile it efficiently.
4470
4471 Arguments:
4472    eptr        pointer in subject
4473    ecode       position in code
4474    offset_top  current top pointer
4475    md          pointer to "static" info for the match
4476
4477 Returns:       TRUE if matched
4478 */
4479
4480 static BOOL
4481 match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
4482   match_data *match_block)
4483 {
4484 return setjmp(match_block->fail_env) == 0 &&
4485       match(eptr, ecode, offset_top, match_block);
4486 }
4487
4488
4489
4490 /*************************************************
4491 *         Execute a Regular Expression           *
4492 *************************************************/
4493
4494 /* This function applies a compiled re to a subject string and picks out
4495 portions of the string if it matches. Two elements in the vector are set for
4496 each substring: the offsets to the start and end of the substring.
4497
4498 Arguments:
4499   external_re     points to the compiled expression
4500   external_extra  points to "hints" from pcre_study() or is NULL
4501   subject         points to the subject string
4502   length          length of subject string (may contain binary zeros)
4503   options         option bits
4504   offsets         points to a vector of ints to be filled in with offsets
4505   offsetcount     the number of elements in the vector
4506
4507 Returns:          > 0 => success; value is the number of elements filled in
4508                   = 0 => success, but offsets is not big enough
4509                    -1 => failed to match
4510                  < -1 => some kind of unexpected problem
4511 */
4512
4513 int
4514 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4515   const char *subject, int length, int start_pos, int options,
4516   int *offsets, int offsetcount)
4517 {
4518   /* The "volatile" directives are to make gcc -Wall stop complaining
4519      that these variables can be clobbered by the longjmp.  Hopefully
4520      they won't cost too much performance. */
4521 volatile int resetcount, ocount;
4522 volatile int first_char = -1;
4523 match_data match_block;
4524 const uschar *start_bits = NULL;
4525 const uschar *start_match = (const uschar *)subject + start_pos;
4526 const uschar *end_subject;
4527 const real_pcre *re = (const real_pcre *)external_re;
4528 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4529 volatile BOOL using_temporary_offsets = FALSE;
4530 volatile BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4531 volatile BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4532
4533 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4534
4535 if (re == NULL || subject == NULL ||
4536    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4537 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4538
4539 match_block.start_subject = (const uschar *)subject;
4540 match_block.end_subject = match_block.start_subject + length;
4541 end_subject = match_block.end_subject;
4542
4543 match_block.caseless  = ((re->options | options) & PCRE_CASELESS) != 0;
4544 match_block.runtime_caseless = match_block.caseless &&
4545   (re->options & PCRE_CASELESS) == 0;
4546
4547 match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
4548 match_block.dotall    = ((re->options | options) & PCRE_DOTALL) != 0;
4549 match_block.endonly   = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
4550
4551 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4552 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4553
4554 match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */
4555
4556 /* Set the stack state to empty */
4557   match_block.off_num = match_block.offset_top = NULL;
4558   match_block.r1 = match_block.r2 = NULL;
4559   match_block.eptr = match_block.ecode = NULL;
4560   match_block.point = match_block.length = 0;
4561
4562 /* If the expression has got more back references than the offsets supplied can
4563 hold, we get a temporary bit of working store to use during the matching.
4564 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4565 of 2. */
4566
4567 ocount = offsetcount & (-2);
4568 if (re->top_backref > 0 && re->top_backref >= ocount/2)
4569   {
4570   ocount = re->top_backref * 2 + 2;
4571   match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4572   if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4573   using_temporary_offsets = TRUE;
4574   DPRINTF(("Got memory to hold back references\n"));
4575   }
4576 else match_block.offset_vector = offsets;
4577
4578 match_block.offset_end = ocount;
4579 match_block.offset_overflow = FALSE;
4580
4581 /* Compute the minimum number of offsets that we need to reset each time. Doing
4582 this makes a huge difference to execution time when there aren't many brackets
4583 in the pattern. */
4584
4585 resetcount = 2 + re->top_bracket * 2;
4586 if (resetcount > offsetcount) resetcount = ocount;
4587
4588 /* If MULTILINE is set at exec time but was not set at compile time, and the
4589 anchored flag is set, we must re-check because a setting provoked by ^ in the
4590 pattern is not right in multi-line mode. Calling is_anchored() again here does
4591 the right check, because multiline is now set. If it now yields FALSE, the
4592 expression must have had ^ starting some of its branches. Check to see if
4593 that is true for *all* branches, and if so, set the startline flag. */
4594
4595 if (match_block.multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
4596     !is_anchored(re->code, match_block.multiline))
4597   {
4598   anchored = FALSE;
4599   if (is_startline(re->code)) startline = TRUE;
4600   }
4601
4602 /* Set up the first character to match, if available. The first_char value is
4603 never set for an anchored regular expression, but the anchoring may be forced
4604 at run time, so we have to test for anchoring. The first char may be unset for
4605 an unanchored pattern, of course. If there's no first char and the pattern was
4606 studied, the may be a bitmap of possible first characters. However, we can
4607 use this only if the caseless state of the studying was correct. */
4608
4609 if (!anchored)
4610   {
4611   if ((re->options & PCRE_FIRSTSET) != 0)
4612     {
4613     first_char = re->first_char;
4614     if (match_block.caseless) first_char = pcre_lcc[first_char];
4615     }
4616   else
4617     if (!startline && extra != NULL &&
4618       (extra->options & PCRE_STUDY_MAPPED) != 0 &&
4619       ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
4620         start_bits = extra->start_bits;
4621   }
4622
4623 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4624
4625 do
4626   {
4627   int rc;
4628   register int *iptr = match_block.offset_vector;
4629   register int *iend = iptr + resetcount;
4630
4631   /* Reset the maximum number of extractions we might see. */
4632
4633   while (iptr < iend) *iptr++ = -1;
4634
4635   /* Advance to a unique first char if possible */
4636
4637   if (first_char >= 0)
4638     {
4639     if (match_block.caseless)
4640       while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4641         start_match++;
4642     else
4643       while (start_match < end_subject && *start_match != first_char)
4644         start_match++;
4645     }
4646
4647   /* Or to just after \n for a multiline match if possible */
4648
4649   else if (startline)
4650     {
4651     if (start_match > match_block.start_subject)
4652       {
4653       while (start_match < end_subject && start_match[-1] != '\n')
4654         start_match++;
4655       }
4656     }
4657
4658   /* Or to a non-unique first char */
4659
4660   else if (start_bits != NULL)
4661     {
4662     while (start_match < end_subject)
4663       {
4664       register int c = *start_match;
4665       if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4666       }
4667     }
4668
4669 #ifdef DEBUG  /* Sigh. Some compilers never learn. */
4670   printf(">>>> Match against: ");
4671   pchars(start_match, end_subject - start_match, TRUE, &match_block);
4672   printf("\n");
4673 #endif
4674
4675   /* When a match occurs, substrings will be set for all internal extractions;
4676   we just need to set up the whole thing as substring 0 before returning. If
4677   there were too many extractions, set the return code to zero. In the case
4678   where we had to get some local store to hold offsets for backreferences, copy
4679   those back references that we can. In this case there need not be overflow
4680   if certain parts of the pattern were not used.
4681
4682   Before starting the match, we have to set up a longjmp() target to enable
4683   the "cut" operation to fail a match completely without backtracking. This
4684   is done in a separate function to avoid compiler warnings. We need not do
4685   it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
4686   enabled. */
4687
4688   /* To handle errors such as running out of memory for the failure
4689      stack, we need to save this location via setjmp(), so
4690      error-handling code can call longjmp() to jump out of deeply-nested code. */
4691   if (setjmp(match_block.error_env)==0)
4692     {
4693
4694   if ((re->options & PCRE_EXTRA) != 0)
4695     {
4696     if (!match_with_setjmp(start_match, re->code, 2, &match_block))
4697       continue;
4698     }
4699   else if (!match(start_match, re->code, 2, &match_block)) continue;
4700
4701   /* Copy the offset information from temporary store if necessary */
4702
4703   if (using_temporary_offsets)
4704     {
4705     if (offsetcount >= 4)
4706       {
4707       memcpy(offsets + 2, match_block.offset_vector + 2,
4708         (offsetcount - 2) * sizeof(int));
4709       DPRINTF(("Copied offsets from temporary memory\n"));
4710       }
4711     if (match_block.end_offset_top > offsetcount)
4712       match_block.offset_overflow = TRUE;
4713
4714     DPRINTF(("Freeing temporary memory\n"));
4715     (pcre_free)(match_block.offset_vector);
4716     }
4717
4718   rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4719
4720   if (match_block.offset_end < 2) rc = 0; else
4721     {
4722     offsets[0] = start_match - match_block.start_subject;
4723     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4724     }
4725
4726   DPRINTF((">>>> returning %d\n", rc));
4727   free_stack(&match_block);
4728   return rc;
4729   }  /* End of (if setjmp(match_block.error_env)...) */
4730   free_stack(&match_block);
4731
4732   /* Return an error code; pcremodule.c will preserve the exception */
4733   if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY;
4734   }
4735 while (!anchored &&
4736        match_block.errorcode == PCRE_ERROR_NOMATCH &&
4737        start_match++ < end_subject);
4738
4739 if (using_temporary_offsets)
4740   {
4741   DPRINTF(("Freeing temporary memory\n"));
4742   (pcre_free)(match_block.offset_vector);
4743   }
4744
4745 #ifdef DEBUG
4746 printf(">>>> returning %d\n", match_block.errorcode);
4747 #endif
4748
4749  free_stack(&match_block);
4750  return match_block.errorcode;
4751 }
4752
4753 /* End of pcre.c */