glib/pcre/pcre_dfa_exec.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language (but see
   7 below for why this module is different).
   8
   9                        Written by Philip Hazel
  10            Copyright (c) 1997-2012 University of Cambridge
  11
  12 -----------------------------------------------------------------------------
  13 Redistribution and use in source and binary forms, with or without
  14 modification, are permitted provided that the following conditions are met:
  15
  16     * Redistributions of source code must retain the above copyright notice,
  17       this list of conditions and the following disclaimer.
  18
  19     * Redistributions in binary form must reproduce the above copyright
  20       notice, this list of conditions and the following disclaimer in the
  21       documentation and/or other materials provided with the distribution.
  22
  23     * Neither the name of the University of Cambridge nor the names of its
  24       contributors may be used to endorse or promote products derived from
  25       this software without specific prior written permission.
  26
  27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  37 POSSIBILITY OF SUCH DAMAGE.
  38 -----------------------------------------------------------------------------
  39 */
  40
  41 /* This module contains the external function pcre_dfa_exec(), which is an
  42 alternative matching function that uses a sort of DFA algorithm (not a true
  43 FSM). This is NOT Perl-compatible, but it has advantages in certain
  44 applications. */
  45
  46
  47 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
  48 the performance of his patterns greatly. I could not use it as it stood, as it
  49 was not thread safe, and made assumptions about pattern sizes. Also, it caused
  50 test 7 to loop, and test 9 to crash with a segfault.
  51
  52 The issue is the check for duplicate states, which is done by a simple linear
  53 search up the state list. (Grep for "duplicate" below to find the code.) For
  54 many patterns, there will never be many states active at one time, so a simple
  55 linear search is fine. In patterns that have many active states, it might be a
  56 bottleneck. The suggested code used an indexing scheme to remember which states
  57 had previously been used for each character, and avoided the linear search when
  58 it knew there was no chance of a duplicate. This was implemented when adding
  59 states to the state lists.
  60
  61 I wrote some thread-safe, not-limited code to try something similar at the time
  62 of checking for duplicates (instead of when adding states), using index vectors
  63 on the stack. It did give a 13% improvement with one specially constructed
  64 pattern for certain subject strings, but on other strings and on many of the
  65 simpler patterns in the test suite it did worse. The major problem, I think,
  66 was the extra time to initialize the index. This had to be done for each call
  67 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
  68 only once - I suspect this was the cause of the problems with the tests.)
  69
  70 Overall, I concluded that the gains in some cases did not outweigh the losses
  71 in others, so I abandoned this code. */
  72
  73
  74
  75 #include "config.h"
  76
  77 #define NLBLOCK md             /* Block containing newline information */
  78 #define PSSTART start_subject  /* Field containing processed string start */
  79 #define PSEND   end_subject    /* Field containing processed string end */
  80
  81 #include "pcre_internal.h"
  82
  83
  84 /* For use to indent debugging output */
  85
  86 #define SP "                   "
  87
  88
  89 /*************************************************
  90 *      Code parameters and static tables         *
  91 *************************************************/
  92
  93 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
  94 into others, under special conditions. A gap of 20 between the blocks should be
  95 enough. The resulting opcodes don't have to be less than 256 because they are
  96 never stored, so we push them well clear of the normal opcodes. */
  97
  98 #define OP_PROP_EXTRA       300   /* added when the type is OP_PROP/OP_NOTPROP */
  99 #define OP_EXTUNI_EXTRA     320   /* added when the type is OP_EXTUNI */
 100 #define OP_ANYNL_EXTRA      340   /* added when the type is OP_ANYNL */
 101 #define OP_HSPACE_EXTRA     360   /* added for OP_HSPACE/OP_NOT_HSPACE */
 102 #define OP_VSPACE_EXTRA     380   /* added for OP_VSPACE/OP_NOT_VSPACE */
 103
 104
 105 /* This table identifies those opcodes that are followed immediately by a
 106 character that is to be tested in some way. This makes it possible to
 107 centralize the loading of these characters. In the case of Type * etc, the
 108 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
 109 small value. Non-zero values in the table are the offsets from the opcode where
 110 the character is to be found. ***NOTE*** If the start of this table is
 111 modified, the three tables that follow must also be modified. */
 112
 113 static const pcre_uint8 coptable[] = {
 114   0,                             /* End (0 = no character to load)         */
 115   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
 116   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
 117   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
 118   0, 0,                          /* \P, \p                                 */
 119   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
 120   0,                             /* \X                                     */
 121   0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
 122   1,                             /* Char (1 = char follows the opcode)     */
 123   1,                             /* Chari                                  */
 124   1,                             /* not                                    */
 125   1,                             /* noti                                   */
 126   /* Positive single-char repeats                                          */
 127   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
 128   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
 129   1+IMM2_SIZE,                   /* exact                                  */
 130   1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
 131   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
 132   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
 133   1+IMM2_SIZE,                   /* exact I                                */
 134   1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
 135   /* Negative single-char repeats - only for chars < 256                   */
 136   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
 137   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
 138   1+IMM2_SIZE,                   /* NOT exact                              */
 139   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
 140   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
 141   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
 142   1+IMM2_SIZE,                   /* NOT exact I                            */
 143   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
 144   /* Positive type repeats                                                 */
 145   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
 146   1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
 147   1+IMM2_SIZE,                   /* Type exact                             */
 148   1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
 149   /* Character class & ref repeats                                         */
 150   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
 151   0, 0,                          /* CRRANGE, CRMINRANGE                    */
 152   0,                             /* CLASS                                  */
 153   0,                             /* NCLASS                                 */
 154   0,                             /* XCLASS - variable length               */
 155   0,                             /* REF                                    */
 156   0,                             /* REFI                                   */
 157   0,                             /* RECURSE                                */
 158   0,                             /* CALLOUT                                */
 159   0,                             /* Alt                                    */
 160   0,                             /* Ket                                    */
 161   0,                             /* KetRmax                                */
 162   0,                             /* KetRmin                                */
 163   0,                             /* KetRpos                                */
 164   0,                             /* Reverse                                */
 165   0,                             /* Assert                                 */
 166   0,                             /* Assert not                             */
 167   0,                             /* Assert behind                          */
 168   0,                             /* Assert behind not                      */
 169   0, 0,                          /* ONCE, ONCE_NC                          */
 170   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
 171   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
 172   0, 0,                          /* CREF, NCREF                            */
 173   0, 0,                          /* RREF, NRREF                            */
 174   0,                             /* DEF                                    */
 175   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
 176   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
 177   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
 178   0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
 179   0, 0                           /* CLOSE, SKIPZERO                        */
 180 };
 181
 182 /* This table identifies those opcodes that inspect a character. It is used to
 183 remember the fact that a character could have been inspected when the end of
 184 the subject is reached. ***NOTE*** If the start of this table is modified, the
 185 two tables that follow must also be modified. */
 186
 187 static const pcre_uint8 poptable[] = {
 188   0,                             /* End (0 = inspects no character)        */
 189   0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
 190   1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
 191   1, 1, 1,                       /* Any, AllAny, Anybyte                   */
 192   1, 1,                          /* \P, \p                                 */
 193   1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
 194   1,                             /* \X                                     */
 195   0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
 196   1,                             /* Char (1 = inspects a character)        */
 197   1,                             /* Chari                                  */
 198   1,                             /* not                                    */
 199   1,                             /* noti                                   */
 200   /* Positive single-char repeats                                          */
 201   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
 202   1, 1, 1,                       /* upto, minupto, exact                   */
 203   1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
 204   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
 205   1, 1, 1,                       /* upto I, minupto I, exact I             */
 206   1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
 207   /* Negative single-char repeats - only for chars < 256                   */
 208   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
 209   1, 1, 1,                       /* NOT upto, minupto, exact               */
 210   1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
 211   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
 212   1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
 213   1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
 214   /* Positive type repeats                                                 */
 215   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
 216   1, 1, 1,                       /* Type upto, minupto, exact              */
 217   1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
 218   /* Character class & ref repeats                                         */
 219   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
 220   1, 1,                          /* CRRANGE, CRMINRANGE                    */
 221   1,                             /* CLASS                                  */
 222   1,                             /* NCLASS                                 */
 223   1,                             /* XCLASS - variable length               */
 224   0,                             /* REF                                    */
 225   0,                             /* REFI                                   */
 226   0,                             /* RECURSE                                */
 227   0,                             /* CALLOUT                                */
 228   0,                             /* Alt                                    */
 229   0,                             /* Ket                                    */
 230   0,                             /* KetRmax                                */
 231   0,                             /* KetRmin                                */
 232   0,                             /* KetRpos                                */
 233   0,                             /* Reverse                                */
 234   0,                             /* Assert                                 */
 235   0,                             /* Assert not                             */
 236   0,                             /* Assert behind                          */
 237   0,                             /* Assert behind not                      */
 238   0, 0,                          /* ONCE, ONCE_NC                          */
 239   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
 240   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
 241   0, 0,                          /* CREF, NCREF                            */
 242   0, 0,                          /* RREF, NRREF                            */
 243   0,                             /* DEF                                    */
 244   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
 245   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
 246   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
 247   0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
 248   0, 0                           /* CLOSE, SKIPZERO                        */
 249 };
 250
 251 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
 252 and \w */
 253
 254 static const pcre_uint8 toptable1[] = {
 255   0, 0, 0, 0, 0, 0,               /* presumably End, \A, \G, \K, \B, \b as in coptable - confirm */
 256   ctype_digit, ctype_digit,       /* \D, \d */
 257   ctype_space, ctype_space,       /* \S, \s */
 258   ctype_word,  ctype_word,        /* \W, \w */
 259   0, 0                            /* OP_ANY, OP_ALLANY */
 260 };
 261
 262 static const pcre_uint8 toptable2[] = {
 263   0, 0, 0, 0, 0, 0,               /* presumably End, \A, \G, \K, \B, \b as in coptable - confirm */
 264   ctype_digit, 0,                 /* \D, \d */
 265   ctype_space, 0,                 /* \S, \s */
 266   ctype_word,  0,                 /* \W, \w */
 267   1, 1                            /* OP_ANY, OP_ALLANY */
 268 };
 269
 270
 271 /* Structure for holding data about a particular state, which is in effect the
 272 current data for an active path through the match tree. It must consist
 273 entirely of ints because the working vector we are passed, and which we put
 274 these structures in, is a vector of ints. */
 275
 276 typedef struct stateblock {
 277   int offset;                     /* Offset to opcode from start_code; negative = held off */
 278   int count;                      /* Count for repeats */
 279   int data;                       /* Some use extra data, e.g. characters left to skip */
 280 } stateblock;
 281
 282 #define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))  /* workspace ints per state */
 283
 284
 285 #ifdef PCRE_DEBUG
 286 /*************************************************
 287 *             Print character string             *
 288 *************************************************/
 289
 290 /* Debug helper: print a string, showing unprintable items as \x escapes.
 291 Values outside 0..255 are never passed to isprint(), which would be undefined.
 292 Arguments:
 293   p            points to string
 294   length       number of pcre_uchar items to print
 295   f            where to print
 296
 297 Returns:       nothing
 298 */
 299
 300 static void
 301 pchars(const pcre_uchar *p, int length, FILE *f)
 302 {
 303 int c;
 304 while (length-- > 0)
 305   {
 306   if ((c = *(p++)) >= 0 && c < 256 && isprint(c))   /* range check first */
 307     fprintf(f, "%c", c);
 308   else
 309     fprintf(f, "\\x%02x", c);
 310   }
 311 }
 312 #endif
 313
 314
 315
 316 /*************************************************
 317 *    Execute a Regular Expression - DFA engine   *
 318 *************************************************/
 319
 320 /* This internal function applies a compiled pattern to a subject string,
 321 starting at a given point, using a DFA engine. This function is called from the
 322 external one, possibly multiple times if the pattern is not anchored. The
 323 function calls itself recursively for some kinds of subpattern.
 324
 325 Arguments:
 326   md                the match_data block with fixed information
 327   this_start_code   the opening bracket of this subexpression's code
 328   current_subject   where we currently are in the subject string
 329   start_offset      start offset in the subject string
 330   offsets           vector to contain the matching string offsets
 331   offsetcount       size of same
 332   workspace         vector of workspace
 333   wscount           size of same
 334   rlevel            function call recursion level
 335
 336 Returns:            > 0 => number of match offset pairs placed in offsets
 337                     = 0 => offsets overflowed; longest matches are present
 338                      -1 => failed to match
 339                    < -1 => some kind of unexpected problem
 340
 341 The following macros are used for adding states to the two state vectors (one
 342 for the current character, one for the following character). */
 343
 344 #define ADD_ACTIVE(x,y) /* append state (offset x, count y) to the active list */ \
 345   if (active_count++ < wscount) /* wscount = most states a list can hold */ \
 346     { \
 347     next_active_state->offset = (x); \
 348     next_active_state->count  = (y); \
 349     next_active_state++; /* the data field is not written here */ \
 350     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
 351     } \
 352   else return PCRE_ERROR_DFA_WSSIZE /* list full: returns from the calling function */
 353
 354 #define ADD_ACTIVE_DATA(x,y,z) /* as ADD_ACTIVE, but also stores z in data */ \
 355   if (active_count++ < wscount) /* wscount = most states a list can hold */ \
 356     { \
 357     next_active_state->offset = (x); \
 358     next_active_state->count  = (y); \
 359     next_active_state->data   = (z); \
 360     next_active_state++; \
 361     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
 362     } \
 363   else return PCRE_ERROR_DFA_WSSIZE /* list full: returns from the calling function */
 364
 365 #define ADD_NEW(x,y) /* append state (offset x, count y) to the new list */ \
 366   if (new_count++ < wscount) /* wscount = most states a list can hold */ \
 367     { \
 368     next_new_state->offset = (x); \
 369     next_new_state->count  = (y); \
 370     next_new_state++; /* the data field is not written here */ \
 371     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
 372     } \
 373   else return PCRE_ERROR_DFA_WSSIZE /* list full: returns from the calling function */
 374
 375 #define ADD_NEW_DATA(x,y,z) /* as ADD_NEW, but also stores z in data */ \
 376   if (new_count++ < wscount) /* wscount = most states a list can hold */ \
 377     { \
 378     next_new_state->offset = (x); \
 379     next_new_state->count  = (y); \
 380     next_new_state->data   = (z); \
 381     next_new_state++; \
 382     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
 383       (x), (y), (z), __LINE__)); \
 384     } \
 385   else return PCRE_ERROR_DFA_WSSIZE /* list full: returns from the calling function */
 386
 387 /* And now, here is the code */
 388
 389 static int
 390 internal_dfa_exec(
 391   dfa_match_data *md,
 392   const pcre_uchar *this_start_code,
 393   const pcre_uchar *current_subject,
 394   int start_offset,
 395   int *offsets,
 396   int offsetcount,
 397   int *workspace,
 398   int wscount,
 399   int  rlevel)
 400 {
 401 stateblock *active_states, *new_states, *temp_states;
 402 stateblock *next_active_state, *next_new_state;
 403
 404 const pcre_uint8 *ctypes, *lcc, *fcc;
 405 const pcre_uchar *ptr;
 406 const pcre_uchar *end_code, *first_op;
 407
 408 dfa_recursion_info new_recursive;
 409
 410 int active_count, new_count, match_count;
 411
 412 /* Some fields in the md block are frequently referenced, so we load them into
 413 independent variables in the hope that this will perform better. */
 414
 415 const pcre_uchar *start_subject = md->start_subject;
 416 const pcre_uchar *end_subject = md->end_subject;
 417 const pcre_uchar *start_code = md->start_code;
 418
 419 #ifdef SUPPORT_UTF
 420 BOOL utf = (md->poptions & PCRE_UTF8) != 0;
 421 #else
 422 BOOL utf = FALSE;
 423 #endif
 424
 425 BOOL reset_could_continue = FALSE;
 426
 427 rlevel++;
 428 offsetcount &= (-2);
 429
 430 wscount -= 2;
 431 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
 432           (2 * INTS_PER_STATEBLOCK);
 433
 434 DPRINTF(("\n%.*s---------------------\n"
 435   "%.*sCall to internal_dfa_exec f=%d\n",
 436   rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
 437
 438 ctypes = md->tables + ctypes_offset;
 439 lcc = md->tables + lcc_offset;
 440 fcc = md->tables + fcc_offset;
 441
 442 match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
 443
 444 active_states = (stateblock *)(workspace + 2);
 445 next_new_state = new_states = active_states + wscount;
 446 new_count = 0;
 447
 448 first_op = this_start_code + 1 + LINK_SIZE +
 449   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
 450     *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
 451     ? IMM2_SIZE:0);
 452
 453 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
 454 the alternative states onto the list, and find out where the end is. This
 455 makes it possible to use this function recursively, when we want to stop at a
 456 matching internal ket rather than at the end.
 457
 458 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
 459 a backward assertion. In that case, we have to find out the maximum amount to
 460 move back, and set up each alternative appropriately. */
 461
 462 if (*first_op == OP_REVERSE)
 463   {
 464   int max_back = 0;
 465   int gone_back;
 466
 467   end_code = this_start_code;
 468   do
 469     {
 470     int back = GET(end_code, 2+LINK_SIZE);
 471     if (back > max_back) max_back = back;
 472     end_code += GET(end_code, 1);
 473     }
 474   while (*end_code == OP_ALT);
 475
 476   /* If we can't go back the amount required for the longest lookbehind
 477   pattern, go back as far as we can; some alternatives may still be viable. */
 478
 479 #ifdef SUPPORT_UTF
 480   /* In character mode we have to step back character by character */
 481
 482   if (utf)
 483     {
 484     for (gone_back = 0; gone_back < max_back; gone_back++)
 485       {
 486       if (current_subject <= start_subject) break;
 487       current_subject--;
 488       ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
 489       }
 490     }
 491   else
 492 #endif
 493
 494   /* In byte-mode we can do this quickly. */
 495
 496     {
 497     gone_back = (current_subject - max_back < start_subject)?
 498       (int)(current_subject - start_subject) : max_back;
 499     current_subject -= gone_back;
 500     }
 501
 502   /* Save the earliest consulted character */
 503
 504   if (current_subject < md->start_used_ptr)
 505     md->start_used_ptr = current_subject;
 506
 507   /* Now we can process the individual branches. */
 508
 509   end_code = this_start_code;
 510   do
 511     {
 512     int back = GET(end_code, 2+LINK_SIZE);
 513     if (back <= gone_back)
 514       {
 515       int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
 516       ADD_NEW_DATA(-bstate, 0, gone_back - back);
 517       }
 518     end_code += GET(end_code, 1);
 519     }
 520   while (*end_code == OP_ALT);
 521  }
 522
 523 /* This is the code for a "normal" subpattern (not a backward assertion). The
 524 start of a whole pattern is always one of these. If we are at the top level,
 525 we may be asked to restart matching from the same point that we reached for a
 526 previous partial match. We still have to scan through the top-level branches to
 527 find the end state. */
 528
 529 else
 530   {
 531   end_code = this_start_code;
 532
 533   /* Restarting */
 534
 535   if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
 536     {
 537     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
 538     new_count = workspace[1];
 539     if (!workspace[0])
 540       memcpy(new_states, active_states, new_count * sizeof(stateblock));
 541     }
 542
 543   /* Not restarting */
 544
 545   else
 546     {
 547     int length = 1 + LINK_SIZE +
 548       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
 549         *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
 550         ? IMM2_SIZE:0);
 551     do
 552       {
 553       ADD_NEW((int)(end_code - start_code + length), 0);
 554       end_code += GET(end_code, 1);
 555       length = 1 + LINK_SIZE;
 556       }
 557     while (*end_code == OP_ALT);
 558     }
 559   }
 560
 561 workspace[0] = 0;    /* Bit indicating which vector is current */
 562
 563 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
 564
 565 /* Loop for scanning the subject */
 566
 567 ptr = current_subject;
 568 for (;;)
 569   {
 570   int i, j;
 571   int clen, dlen;
 572   unsigned int c, d;
 573   int forced_fail = 0;
 574   BOOL partial_newline = FALSE;
 575   BOOL could_continue = reset_could_continue;
 576   reset_could_continue = FALSE;
 577
 578   /* Make the new state list into the active state list and empty the
 579   new state list. */
 580
 581   temp_states = active_states;
 582   active_states = new_states;
 583   new_states = temp_states;
 584   active_count = new_count;
 585   new_count = 0;
 586
 587   workspace[0] ^= 1;              /* Remember for the restarting feature */
 588   workspace[1] = active_count;
 589
 590 #ifdef PCRE_DEBUG
 591   printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
 592   pchars(ptr, STRLEN_UC(ptr), stdout);
 593   printf("\"\n");
 594
 595   printf("%.*sActive states: ", rlevel*2-2, SP);
 596   for (i = 0; i < active_count; i++)
 597     printf("%d/%d ", active_states[i].offset, active_states[i].count);
 598   printf("\n");
 599 #endif
 600
 601   /* Set the pointers for adding new states */
 602
 603   next_active_state = active_states + active_count;
 604   next_new_state = new_states;
 605
 606   /* Load the current character from the subject outside the loop, as many
 607   different states may want to look at it, and we assume that at least one
 608   will. */
 609
 610   if (ptr < end_subject)
 611     {
 612     clen = 1;        /* Number of data items in the character */
 613 #ifdef SUPPORT_UTF
 614     if (utf) { GETCHARLEN(c, ptr, clen); } else
 615 #endif  /* SUPPORT_UTF */
 616     c = *ptr;
 617     }
 618   else
 619     {
 620     clen = 0;        /* This indicates the end of the subject */
 621     c = NOTACHAR;    /* This value should never actually be used */
 622     }
 623
 624   /* Scan up the active states and act on each one. The result of an action
 625   may be to add more states to the currently active list (e.g. on hitting a
 626   parenthesis) or it may be to put states on the new list, for considering
 627   when we move the character pointer on. */
 628
 629   for (i = 0; i < active_count; i++)
 630     {
 631     stateblock *current_state = active_states + i;
 632     BOOL caseless = FALSE;
 633     const pcre_uchar *code;
 634     int state_offset = current_state->offset;
 635     int count, codevalue, rrc;
 636
 637 #ifdef PCRE_DEBUG
 638     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
 639     if (clen == 0) printf("EOL\n");
 640       else if (c > 32 && c < 127) printf("'%c'\n", c);
 641         else printf("0x%02x\n", c);
 642 #endif
 643
 644     /* A negative offset is a special case meaning "hold off going to this
 645     (negated) state until the number of characters in the data field have
 646     been skipped". If the could_continue flag was passed over from a previous
 647     state, arrange for it to be passed on. */
 648
 649     if (state_offset < 0)
 650       {
 651       if (current_state->data > 0)
 652         {
 653         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
 654         ADD_NEW_DATA(state_offset, current_state->count,
 655           current_state->data - 1);
 656         if (could_continue) reset_could_continue = TRUE;
 657         continue;
 658         }
 659       else
 660         {
 661         current_state->offset = state_offset = -state_offset;
 662         }
 663       }
 664
 665     /* Check for a duplicate state with the same count, and skip if found.
 666     See the note at the head of this module about the possibility of improving
 667     performance here. */
 668
 669     for (j = 0; j < i; j++)
 670       {
 671       if (active_states[j].offset == state_offset &&
 672           active_states[j].count == current_state->count)
 673         {
 674         DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
 675         goto NEXT_ACTIVE_STATE;
 676         }
 677       }
 678
 679     /* The state offset is the offset to the opcode */
 680
 681     code = start_code + state_offset;
 682     codevalue = *code;
 683
 684     /* If this opcode inspects a character, but we are at the end of the
 685     subject, remember the fact for use when testing for a partial match. */
 686
 687     if (clen == 0 && poptable[codevalue] != 0)
 688       could_continue = TRUE;
 689
 690     /* If this opcode is followed by an inline character, load it. It is
 691     tempting to test for the presence of a subject character here, but that
 692     is wrong, because sometimes zero repetitions of the subject are
 693     permitted.
 694
 695     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
 696     argument that is not a data character - but is always one byte long because
 697     the values are small. We have to take special action to deal with \P, \p,
 698     \H, \h, \V, \v, \R and \X in this case. To keep the other cases fast, convert
 699     these ones to new opcodes. */
 700
 701     if (coptable[codevalue] > 0)
 702       {
 703       dlen = 1;
 704 #ifdef SUPPORT_UTF
 705       if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
 706 #endif  /* SUPPORT_UTF */
 707       d = code[coptable[codevalue]];
 708       if (codevalue >= OP_TYPESTAR)
 709         {
 710         switch(d)
 711           {
 712           case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
 713           case OP_NOTPROP:
 714           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
 715           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
 716           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
 717           case OP_NOT_HSPACE:
 718           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
 719           case OP_NOT_VSPACE:
 720           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
 721           default: break;
 722           }
 723         }
 724       }
 725     else
 726       {
 727       dlen = 0;         /* Not strictly necessary, but compilers moan */
 728       d = NOTACHAR;     /* if these variables are not set. */
 729       }
 730
 731
 732     /* Now process the individual opcodes */
 733
 734     switch (codevalue)
 735       {
 736 /* ========================================================================== */
 737       /* These cases are never obeyed. This is a fudge that causes a compile-
 738       time error if the vectors coptable or poptable, which are indexed by
 739       opcode, are not the correct length. It seems to be the only way to do
 740       such a check at compile time, as the sizeof() operator does not work
 741       in the C preprocessor. */
 742
 743       case OP_TABLE_LENGTH:
 744       case OP_TABLE_LENGTH +
 745         ((sizeof(coptable) == OP_TABLE_LENGTH) &&
 746          (sizeof(poptable) == OP_TABLE_LENGTH)):
 747       break;
 748
 749 /* ========================================================================== */
 750       /* Reached a closing bracket. If not at the end of the pattern, carry
 751       on with the next opcode. For repeating opcodes, also add the repeat
 752       state. Note that KETRPOS will always be encountered at the end of the
 753       subpattern, because the possessive subpattern repeats are always handled
 754       using recursive calls. Thus, it never adds any new states.
 755
 756       At the end of the (sub)pattern, unless we have an empty string and
 757       PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
 758       start of the subject, save the match data, shifting up all previous
 759       matches so we always have the longest first. */
 760
 761       case OP_KET:
 762       case OP_KETRMIN:
 763       case OP_KETRMAX:
 764       case OP_KETRPOS:
 765       if (code != end_code)
 766         {
 767         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
 768         if (codevalue != OP_KET)
 769           {
 770           ADD_ACTIVE(state_offset - GET(code, 1), 0);
 771           }
 772         }
 773       else
 774         {
 775         if (ptr > current_subject ||
 776             ((md->moptions & PCRE_NOTEMPTY) == 0 &&
 777               ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
 778                 current_subject > start_subject + md->start_offset)))
 779           {
 780           if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
 781             else if (match_count > 0 && ++match_count * 2 > offsetcount)
 782               match_count = 0;
 783           count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
 784           if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
 785           if (offsetcount >= 2)
 786             {
 787             offsets[0] = (int)(current_subject - start_subject);
 788             offsets[1] = (int)(ptr - start_subject);
 789             DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
 790               offsets[1] - offsets[0], (char *)current_subject));
 791             }
 792           if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
 793             {
 794             DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
 795               "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
 796               match_count, rlevel*2-2, SP));
 797             return match_count;
 798             }
 799           }
 800         }
 801       break;
 802
 803 /* ========================================================================== */
 804       /* These opcodes add to the current list of states without looking
 805       at the current character. */
 806
 807       /*-----------------------------------------------------------------*/
 808       case OP_ALT:
 809       do { code += GET(code, 1); } while (*code == OP_ALT);
 810       ADD_ACTIVE((int)(code - start_code), 0);
 811       break;
 812
 813       /*-----------------------------------------------------------------*/
 814       case OP_BRA:
 815       case OP_SBRA:
 816       do
 817         {
 818         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
 819         code += GET(code, 1);
 820         }
 821       while (*code == OP_ALT);
 822       break;
 823
 824       /*-----------------------------------------------------------------*/
 825       case OP_CBRA:
 826       case OP_SCBRA:
 827       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
 828       code += GET(code, 1);
 829       while (*code == OP_ALT)
 830         {
 831         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
 832         code += GET(code, 1);
 833         }
 834       break;
 835
 836       /*-----------------------------------------------------------------*/
 837       case OP_BRAZERO:
 838       case OP_BRAMINZERO:
 839       ADD_ACTIVE(state_offset + 1, 0);
 840       code += 1 + GET(code, 2);
 841       while (*code == OP_ALT) code += GET(code, 1);
 842       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
 843       break;
 844
 845       /*-----------------------------------------------------------------*/
 846       case OP_SKIPZERO:
 847       code += 1 + GET(code, 2);
 848       while (*code == OP_ALT) code += GET(code, 1);
 849       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
 850       break;
 851
 852       /*-----------------------------------------------------------------*/
 853       case OP_CIRC:
 854       if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
 855         { ADD_ACTIVE(state_offset + 1, 0); }
 856       break;
 857
 858       /*-----------------------------------------------------------------*/
 859       case OP_CIRCM:
 860       if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
 861           (ptr != end_subject && WAS_NEWLINE(ptr)))
 862         { ADD_ACTIVE(state_offset + 1, 0); }
 863       break;
 864
 865       /*-----------------------------------------------------------------*/
 866       case OP_EOD:
 867       if (ptr >= end_subject)
 868         {
 869         if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
 870           could_continue = TRUE;
 871         else { ADD_ACTIVE(state_offset + 1, 0); }
 872         }
 873       break;
 874
 875       /*-----------------------------------------------------------------*/
 876       case OP_SOD:
 877       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
 878       break;
 879
 880       /*-----------------------------------------------------------------*/
 881       case OP_SOM:
 882       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
 883       break;
 884
 885
 886 /* ========================================================================== */
 887       /* These opcodes inspect the next subject character, and sometimes
 888       the previous one as well, but do not have an argument. The variable
 889       clen contains the length of the current character and is zero if we are
 890       at the end of the subject. */
 891
 892       /*-----------------------------------------------------------------*/
 893       case OP_ANY:
 894       if (clen > 0 && !IS_NEWLINE(ptr))
 895         {
 896         if (ptr + 1 >= md->end_subject &&
 897             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
 898             NLBLOCK->nltype == NLTYPE_FIXED &&
 899             NLBLOCK->nllen == 2 &&
 900             c == NLBLOCK->nl[0])
 901           {
 902           could_continue = partial_newline = TRUE;
 903           }
 904         else
 905           {
 906           ADD_NEW(state_offset + 1, 0);
 907           }
 908         }
 909       break;
 910
 911       /*-----------------------------------------------------------------*/
 912       case OP_ALLANY:
 913       if (clen > 0)
 914         { ADD_NEW(state_offset + 1, 0); }
 915       break;
 916
 917       /*-----------------------------------------------------------------*/
 918       case OP_EODN:
 919       if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
 920         could_continue = TRUE;
 921       else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
 922         { ADD_ACTIVE(state_offset + 1, 0); }
 923       break;
 924
 925       /*-----------------------------------------------------------------*/
 926       case OP_DOLL:
 927       if ((md->moptions & PCRE_NOTEOL) == 0)
 928         {
 929         if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
 930           could_continue = TRUE;
 931         else if (clen == 0 ||
 932             ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
 933                (ptr == end_subject - md->nllen)
 934             ))
 935           { ADD_ACTIVE(state_offset + 1, 0); }
 936         else if (ptr + 1 >= md->end_subject &&
 937                  (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
 938                  NLBLOCK->nltype == NLTYPE_FIXED &&
 939                  NLBLOCK->nllen == 2 &&
 940                  c == NLBLOCK->nl[0])
 941           {
 942           if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
 943             {
 944             reset_could_continue = TRUE;
 945             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
 946             }
 947           else could_continue = partial_newline = TRUE;
 948           }
 949         }
 950       break;
 951
 952       /*-----------------------------------------------------------------*/
 953       case OP_DOLLM:
 954       if ((md->moptions & PCRE_NOTEOL) == 0)
 955         {
 956         if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
 957           could_continue = TRUE;
 958         else if (clen == 0 ||
 959             ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
 960           { ADD_ACTIVE(state_offset + 1, 0); }
 961         else if (ptr + 1 >= md->end_subject &&
 962                  (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
 963                  NLBLOCK->nltype == NLTYPE_FIXED &&
 964                  NLBLOCK->nllen == 2 &&
 965                  c == NLBLOCK->nl[0])
 966           {
 967           if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
 968             {
 969             reset_could_continue = TRUE;
 970             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
 971             }
 972           else could_continue = partial_newline = TRUE;
 973           }
 974         }
 975       else if (IS_NEWLINE(ptr))
 976         { ADD_ACTIVE(state_offset + 1, 0); }
 977       break;
 978
 979       /*-----------------------------------------------------------------*/
 980
 981       case OP_DIGIT:
 982       case OP_WHITESPACE:
 983       case OP_WORDCHAR:
 984       if (clen > 0 && c < 256 &&
 985             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
 986         { ADD_NEW(state_offset + 1, 0); }
 987       break;
 988
 989       /*-----------------------------------------------------------------*/
 990       case OP_NOT_DIGIT:
 991       case OP_NOT_WHITESPACE:
 992       case OP_NOT_WORDCHAR:
 993       if (clen > 0 && (c >= 256 ||
 994             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
 995         { ADD_NEW(state_offset + 1, 0); }
 996       break;
 997
 998       /*-----------------------------------------------------------------*/
 999       case OP_WORD_BOUNDARY:
1000       case OP_NOT_WORD_BOUNDARY:
1001         {
1002         int left_word, right_word;
1003
1004         if (ptr > start_subject)
1005           {
1006           const pcre_uchar *temp = ptr - 1;
1007           if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1008 #ifdef SUPPORT_UTF
1009           if (utf) { BACKCHAR(temp); }
1010 #endif
1011           GETCHARTEST(d, temp);
1012 #ifdef SUPPORT_UCP
1013           if ((md->poptions & PCRE_UCP) != 0)
1014             {
1015             if (d == '_') left_word = TRUE; else
1016               {
1017               int cat = UCD_CATEGORY(d);
1018               left_word = (cat == ucp_L || cat == ucp_N);
1019               }
1020             }
1021           else
1022 #endif
1023           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1024           }
1025         else left_word = FALSE;
1026
1027         if (clen > 0)
1028           {
1029 #ifdef SUPPORT_UCP
1030           if ((md->poptions & PCRE_UCP) != 0)
1031             {
1032             if (c == '_') right_word = TRUE; else
1033               {
1034               int cat = UCD_CATEGORY(c);
1035               right_word = (cat == ucp_L || cat == ucp_N);
1036               }
1037             }
1038           else
1039 #endif
1040           right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1041           }
1042         else right_word = FALSE;
1043
1044         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1045           { ADD_ACTIVE(state_offset + 1, 0); }
1046         }
1047       break;
1048
1049
1050       /*-----------------------------------------------------------------*/
1051       /* Check the next character by Unicode property. We will get here only
1052       if the support is in the binary; otherwise a compile-time error occurs.
1053       */
1054
1055 #ifdef SUPPORT_UCP
1056       case OP_PROP:
1057       case OP_NOTPROP:
1058       if (clen > 0)
1059         {
1060         BOOL OK;
1061         const pcre_uint8 chartype = UCD_CHARTYPE(c);
1062         switch(code[1])
1063           {
1064           case PT_ANY:
1065           OK = TRUE;
1066           break;
1067
1068           case PT_LAMP:
1069           OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1070                chartype == ucp_Lt;
1071           break;
1072
1073           case PT_GC:
1074           OK = PRIV(ucp_gentype)[chartype] == code[2];
1075           break;
1076
1077           case PT_PC:
1078           OK = chartype == code[2];
1079           break;
1080
1081           case PT_SC:
1082           OK = UCD_SCRIPT(c) == code[2];
1083           break;
1084
1085           /* These are specials for combination cases. */
1086
1087           case PT_ALNUM:
1088           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1089                PRIV(ucp_gentype)[chartype] == ucp_N;
1090           break;
1091
1092           case PT_SPACE:    /* Perl space */
1093           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1094                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1095           break;
1096
1097           case PT_PXSPACE:  /* POSIX space */
1098           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1099                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1100                c == CHAR_FF || c == CHAR_CR;
1101           break;
1102
1103           case PT_WORD:
1104           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1105                PRIV(ucp_gentype)[chartype] == ucp_N ||
1106                c == CHAR_UNDERSCORE;
1107           break;
1108
1109           /* Should never occur, but keep compilers from grumbling. */
1110
1111           default:
1112           OK = codevalue != OP_PROP;
1113           break;
1114           }
1115
1116         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1117         }
1118       break;
1119 #endif
1120
1121
1122
1123 /* ========================================================================== */
1124       /* These opcodes likewise inspect the subject character, but have an
1125       argument that is not a data character. It is one of these opcodes:
1126       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1127       OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1128
1129       case OP_TYPEPLUS:
1130       case OP_TYPEMINPLUS:
1131       case OP_TYPEPOSPLUS:
1132       count = current_state->count;  /* Already matched */
1133       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1134       if (clen > 0)
1135         {
1136         if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1137             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1138             NLBLOCK->nltype == NLTYPE_FIXED &&
1139             NLBLOCK->nllen == 2 &&
1140             c == NLBLOCK->nl[0])
1141           {
1142           could_continue = partial_newline = TRUE;
1143           }
1144         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1145             (c < 256 &&
1146               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1147               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1148           {
1149           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1150             {
1151             active_count--;            /* Remove non-match possibility */
1152             next_active_state--;
1153             }
1154           count++;
1155           ADD_NEW(state_offset, count);
1156           }
1157         }
1158       break;
1159
1160       /*-----------------------------------------------------------------*/
1161       case OP_TYPEQUERY:
1162       case OP_TYPEMINQUERY:
1163       case OP_TYPEPOSQUERY:
1164       ADD_ACTIVE(state_offset + 2, 0);
1165       if (clen > 0)
1166         {
1167         if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1168             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1169             NLBLOCK->nltype == NLTYPE_FIXED &&
1170             NLBLOCK->nllen == 2 &&
1171             c == NLBLOCK->nl[0])
1172           {
1173           could_continue = partial_newline = TRUE;
1174           }
1175         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1176             (c < 256 &&
1177               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1178               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1179           {
1180           if (codevalue == OP_TYPEPOSQUERY)
1181             {
1182             active_count--;            /* Remove non-match possibility */
1183             next_active_state--;
1184             }
1185           ADD_NEW(state_offset + 2, 0);
1186           }
1187         }
1188       break;
1189
1190       /*-----------------------------------------------------------------*/
1191       case OP_TYPESTAR:
1192       case OP_TYPEMINSTAR:
1193       case OP_TYPEPOSSTAR:
1194       ADD_ACTIVE(state_offset + 2, 0);
1195       if (clen > 0)
1196         {
1197         if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1198             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1199             NLBLOCK->nltype == NLTYPE_FIXED &&
1200             NLBLOCK->nllen == 2 &&
1201             c == NLBLOCK->nl[0])
1202           {
1203           could_continue = partial_newline = TRUE;
1204           }
1205         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1206             (c < 256 &&
1207               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1208               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1209           {
1210           if (codevalue == OP_TYPEPOSSTAR)
1211             {
1212             active_count--;            /* Remove non-match possibility */
1213             next_active_state--;
1214             }
1215           ADD_NEW(state_offset, 0);
1216           }
1217         }
1218       break;
1219
1220       /*-----------------------------------------------------------------*/
1221       case OP_TYPEEXACT:
1222       count = current_state->count;  /* Number already matched */
1223       if (clen > 0)
1224         {
1225         if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1226             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1227             NLBLOCK->nltype == NLTYPE_FIXED &&
1228             NLBLOCK->nllen == 2 &&
1229             c == NLBLOCK->nl[0])
1230           {
1231           could_continue = partial_newline = TRUE;
1232           }
1233         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1234             (c < 256 &&
1235               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1236               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1237           {
1238           if (++count >= GET2(code, 1))
1239             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1240           else
1241             { ADD_NEW(state_offset, count); }
1242           }
1243         }
1244       break;
1245
1246       /*-----------------------------------------------------------------*/
1247       case OP_TYPEUPTO:
1248       case OP_TYPEMINUPTO:
1249       case OP_TYPEPOSUPTO:
1250       ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1251       count = current_state->count;  /* Number already matched */
1252       if (clen > 0)
1253         {
1254         if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1255             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1256             NLBLOCK->nltype == NLTYPE_FIXED &&
1257             NLBLOCK->nllen == 2 &&
1258             c == NLBLOCK->nl[0])
1259           {
1260           could_continue = partial_newline = TRUE;
1261           }
1262         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1263             (c < 256 &&
1264               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1265               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1266           {
1267           if (codevalue == OP_TYPEPOSUPTO)
1268             {
1269             active_count--;           /* Remove non-match possibility */
1270             next_active_state--;
1271             }
1272           if (++count >= GET2(code, 1))
1273             { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1274           else
1275             { ADD_NEW(state_offset, count); }
1276           }
1277         }
1278       break;
1279
1280 /* ========================================================================== */
1281       /* These are virtual opcodes that are used when something like
1282       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_HSPACE,
1283       OP_NOT_HSPACE, OP_VSPACE, or OP_NOT_VSPACE as its argument. This keeps the
1284       code above fast for the other cases. The argument is in the d variable. */
1285
1286 #ifdef SUPPORT_UCP
1287       case OP_PROP_EXTRA + OP_TYPEPLUS:
1288       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1289       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1290       count = current_state->count;           /* Already matched */
1291       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1292       if (clen > 0)
1293         {
1294         BOOL OK;
1295         const pcre_uint8 chartype = UCD_CHARTYPE(c);
1296         switch(code[2])
1297           {
1298           case PT_ANY:
1299           OK = TRUE;
1300           break;
1301
1302           case PT_LAMP:
1303           OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1304             chartype == ucp_Lt;
1305           break;
1306
1307           case PT_GC:
1308           OK = PRIV(ucp_gentype)[chartype] == code[3];
1309           break;
1310
1311           case PT_PC:
1312           OK = chartype == code[3];
1313           break;
1314
1315           case PT_SC:
1316           OK = UCD_SCRIPT(c) == code[3];
1317           break;
1318
1319           /* These are specials for combination cases. */
1320
1321           case PT_ALNUM:
1322           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1323                PRIV(ucp_gentype)[chartype] == ucp_N;
1324           break;
1325
1326           case PT_SPACE:    /* Perl space */
1327           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1328                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1329           break;
1330
1331           case PT_PXSPACE:  /* POSIX space */
1332           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1333                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1334                c == CHAR_FF || c == CHAR_CR;
1335           break;
1336
1337           case PT_WORD:
1338           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1339                PRIV(ucp_gentype)[chartype] == ucp_N ||
1340                c == CHAR_UNDERSCORE;
1341           break;
1342
1343           /* Should never occur, but keep compilers from grumbling. */
1344
1345           default:
1346           OK = codevalue != OP_PROP;
1347           break;
1348           }
1349
1350         if (OK == (d == OP_PROP))
1351           {
1352           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1353             {
1354             active_count--;           /* Remove non-match possibility */
1355             next_active_state--;
1356             }
1357           count++;
1358           ADD_NEW(state_offset, count);
1359           }
1360         }
1361       break;
1362
1363       /*-----------------------------------------------------------------*/
1364       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1365       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1366       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1367       count = current_state->count;  /* Already matched */
1368       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1369       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1370         {
1371         const pcre_uchar *nptr = ptr + clen;
1372         int ncount = 0;
1373         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1374           {
1375           active_count--;           /* Remove non-match possibility */
1376           next_active_state--;
1377           }
1378         while (nptr < end_subject)
1379           {
1380           int nd;
1381           int ndlen = 1;
1382           GETCHARLEN(nd, nptr, ndlen);
1383           if (UCD_CATEGORY(nd) != ucp_M) break;
1384           ncount++;
1385           nptr += ndlen;
1386           }
1387         count++;
1388         ADD_NEW_DATA(-state_offset, count, ncount);
1389         }
1390       break;
1391 #endif
1392
1393       /*-----------------------------------------------------------------*/
1394       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1395       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1396       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1397       count = current_state->count;  /* Already matched */
1398       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1399       if (clen > 0)
1400         {
1401         int ncount = 0;
1402         switch (c)
1403           {
1404           case 0x000b:
1405           case 0x000c:
1406           case 0x0085:
1407           case 0x2028:
1408           case 0x2029:
1409           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1410           goto ANYNL01;
1411
1412           case 0x000d:
1413           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1414           /* Fall through */
1415
1416           ANYNL01:
1417           case 0x000a:
1418           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1419             {
1420             active_count--;           /* Remove non-match possibility */
1421             next_active_state--;
1422             }
1423           count++;
1424           ADD_NEW_DATA(-state_offset, count, ncount);
1425           break;
1426
1427           default:
1428           break;
1429           }
1430         }
1431       break;
1432
1433       /*-----------------------------------------------------------------*/
1434       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1435       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1436       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1437       count = current_state->count;  /* Already matched */
1438       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1439       if (clen > 0)
1440         {
1441         BOOL OK;
1442         switch (c)
1443           {
1444           case 0x000a:
1445           case 0x000b:
1446           case 0x000c:
1447           case 0x000d:
1448           case 0x0085:
1449           case 0x2028:
1450           case 0x2029:
1451           OK = TRUE;
1452           break;
1453
1454           default:
1455           OK = FALSE;
1456           break;
1457           }
1458
1459         if (OK == (d == OP_VSPACE))
1460           {
1461           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1462             {
1463             active_count--;           /* Remove non-match possibility */
1464             next_active_state--;
1465             }
1466           count++;
1467           ADD_NEW_DATA(-state_offset, count, 0);
1468           }
1469         }
1470       break;
1471
1472       /*-----------------------------------------------------------------*/
1473       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1474       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1475       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1476       count = current_state->count;  /* Already matched */
1477       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1478       if (clen > 0)
1479         {
1480         BOOL OK;
1481         switch (c)
1482           {
1483           case 0x09:      /* HT */
1484           case 0x20:      /* SPACE */
1485           case 0xa0:      /* NBSP */
1486           case 0x1680:    /* OGHAM SPACE MARK */
1487           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1488           case 0x2000:    /* EN QUAD */
1489           case 0x2001:    /* EM QUAD */
1490           case 0x2002:    /* EN SPACE */
1491           case 0x2003:    /* EM SPACE */
1492           case 0x2004:    /* THREE-PER-EM SPACE */
1493           case 0x2005:    /* FOUR-PER-EM SPACE */
1494           case 0x2006:    /* SIX-PER-EM SPACE */
1495           case 0x2007:    /* FIGURE SPACE */
1496           case 0x2008:    /* PUNCTUATION SPACE */
1497           case 0x2009:    /* THIN SPACE */
1498           case 0x200A:    /* HAIR SPACE */
1499           case 0x202f:    /* NARROW NO-BREAK SPACE */
1500           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1501           case 0x3000:    /* IDEOGRAPHIC SPACE */
1502           OK = TRUE;
1503           break;
1504
1505           default:
1506           OK = FALSE;
1507           break;
1508           }
1509
1510         if (OK == (d == OP_HSPACE))
1511           {
1512           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1513             {
1514             active_count--;           /* Remove non-match possibility */
1515             next_active_state--;
1516             }
1517           count++;
1518           ADD_NEW_DATA(-state_offset, count, 0);
1519           }
1520         }
1521       break;
1522
1523       /*-----------------------------------------------------------------*/
1524 #ifdef SUPPORT_UCP
1525       case OP_PROP_EXTRA + OP_TYPEQUERY:
1526       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1527       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1528       count = 4;
1529       goto QS1;
1530
1531       case OP_PROP_EXTRA + OP_TYPESTAR:
1532       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1533       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1534       count = 0;
1535
1536       QS1:
1537
1538       ADD_ACTIVE(state_offset + 4, 0);
1539       if (clen > 0)
1540         {
1541         BOOL OK;
1542         const pcre_uint8 chartype = UCD_CHARTYPE(c);
1543         switch(code[2])
1544           {
1545           case PT_ANY:
1546           OK = TRUE;
1547           break;
1548
1549           case PT_LAMP:
1550           OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1551             chartype == ucp_Lt;
1552           break;
1553
1554           case PT_GC:
1555           OK = PRIV(ucp_gentype)[chartype] == code[3];
1556           break;
1557
1558           case PT_PC:
1559           OK = chartype == code[3];
1560           break;
1561
1562           case PT_SC:
1563           OK = UCD_SCRIPT(c) == code[3];
1564           break;
1565
1566           /* These are specials for combination cases. */
1567
1568           case PT_ALNUM:
1569           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1570                PRIV(ucp_gentype)[chartype] == ucp_N;
1571           break;
1572
1573           case PT_SPACE:    /* Perl space */
1574           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1575                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1576           break;
1577
1578           case PT_PXSPACE:  /* POSIX space */
1579           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1580                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1581                c == CHAR_FF || c == CHAR_CR;
1582           break;
1583
1584           case PT_WORD:
1585           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1586                PRIV(ucp_gentype)[chartype] == ucp_N ||
1587                c == CHAR_UNDERSCORE;
1588           break;
1589
1590           /* Should never occur, but keep compilers from grumbling. */
1591
1592           default:
1593           OK = codevalue != OP_PROP;
1594           break;
1595           }
1596
1597         if (OK == (d == OP_PROP))
1598           {
1599           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1600               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1601             {
1602             active_count--;           /* Remove non-match possibility */
1603             next_active_state--;
1604             }
1605           ADD_NEW(state_offset + count, 0);
1606           }
1607         }
1608       break;
1609
1610       /*-----------------------------------------------------------------*/
1611       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1612       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1613       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1614       count = 2;
1615       goto QS2;
1616
1617       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1618       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1619       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1620       count = 0;
1621
1622       QS2:
1623
1624       ADD_ACTIVE(state_offset + 2, 0);
1625       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1626         {
1627         const pcre_uchar *nptr = ptr + clen;
1628         int ncount = 0;
1629         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1630             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1631           {
1632           active_count--;           /* Remove non-match possibility */
1633           next_active_state--;
1634           }
1635         while (nptr < end_subject)
1636           {
1637           int nd;
1638           int ndlen = 1;
1639           GETCHARLEN(nd, nptr, ndlen);
1640           if (UCD_CATEGORY(nd) != ucp_M) break;
1641           ncount++;
1642           nptr += ndlen;
1643           }
1644         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1645         }
1646       break;
1647 #endif
1648
1649       /*-----------------------------------------------------------------*/
1650       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1651       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1652       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1653       count = 2;
1654       goto QS3;
1655
1656       case OP_ANYNL_EXTRA + OP_TYPESTAR:
1657       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1658       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1659       count = 0;
1660
1661       QS3:
1662       ADD_ACTIVE(state_offset + 2, 0);
1663       if (clen > 0)
1664         {
1665         int ncount = 0;
1666         switch (c)
1667           {
1668           case 0x000b:
1669           case 0x000c:
1670           case 0x0085:
1671           case 0x2028:
1672           case 0x2029:
1673           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1674           goto ANYNL02;
1675
1676           case 0x000d:
1677           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1678           /* Fall through */
1679
1680           ANYNL02:
1681           case 0x000a:
1682           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1683               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1684             {
1685             active_count--;           /* Remove non-match possibility */
1686             next_active_state--;
1687             }
1688           ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1689           break;
1690
1691           default:
1692           break;
1693           }
1694         }
1695       break;
1696
1697       /*-----------------------------------------------------------------*/
1698       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1699       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1700       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1701       count = 2;
1702       goto QS4;
1703
1704       case OP_VSPACE_EXTRA + OP_TYPESTAR:
1705       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1706       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1707       count = 0;
1708
1709       QS4:
1710       ADD_ACTIVE(state_offset + 2, 0);
1711       if (clen > 0)
1712         {
1713         BOOL OK;
1714         switch (c)
1715           {
1716           case 0x000a:
1717           case 0x000b:
1718           case 0x000c:
1719           case 0x000d:
1720           case 0x0085:
1721           case 0x2028:
1722           case 0x2029:
1723           OK = TRUE;
1724           break;
1725
1726           default:
1727           OK = FALSE;
1728           break;
1729           }
1730         if (OK == (d == OP_VSPACE))
1731           {
1732           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1733               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1734             {
1735             active_count--;           /* Remove non-match possibility */
1736             next_active_state--;
1737             }
1738           ADD_NEW_DATA(-(state_offset + count), 0, 0);
1739           }
1740         }
1741       break;
1742
1743       /*-----------------------------------------------------------------*/
1744       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1745       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1746       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1747       count = 2;
1748       goto QS5;
1749
1750       case OP_HSPACE_EXTRA + OP_TYPESTAR:
1751       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1752       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1753       count = 0;
1754
1755       QS5:
1756       ADD_ACTIVE(state_offset + 2, 0);
1757       if (clen > 0)
1758         {
1759         BOOL OK;
1760         switch (c)
1761           {
1762           case 0x09:      /* HT */
1763           case 0x20:      /* SPACE */
1764           case 0xa0:      /* NBSP */
1765           case 0x1680:    /* OGHAM SPACE MARK */
1766           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1767           case 0x2000:    /* EN QUAD */
1768           case 0x2001:    /* EM QUAD */
1769           case 0x2002:    /* EN SPACE */
1770           case 0x2003:    /* EM SPACE */
1771           case 0x2004:    /* THREE-PER-EM SPACE */
1772           case 0x2005:    /* FOUR-PER-EM SPACE */
1773           case 0x2006:    /* SIX-PER-EM SPACE */
1774           case 0x2007:    /* FIGURE SPACE */
1775           case 0x2008:    /* PUNCTUATION SPACE */
1776           case 0x2009:    /* THIN SPACE */
1777           case 0x200A:    /* HAIR SPACE */
1778           case 0x202f:    /* NARROW NO-BREAK SPACE */
1779           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1780           case 0x3000:    /* IDEOGRAPHIC SPACE */
1781           OK = TRUE;
1782           break;
1783
1784           default:
1785           OK = FALSE;
1786           break;
1787           }
1788
1789         if (OK == (d == OP_HSPACE))
1790           {
1791           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1792               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1793             {
1794             active_count--;           /* Remove non-match possibility */
1795             next_active_state--;
1796             }
1797           ADD_NEW_DATA(-(state_offset + count), 0, 0);
1798           }
1799         }
1800       break;
1801
1802       /*-----------------------------------------------------------------*/
1803 #ifdef SUPPORT_UCP
1804       case OP_PROP_EXTRA + OP_TYPEEXACT:
1805       case OP_PROP_EXTRA + OP_TYPEUPTO:
1806       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1807       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1808       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1809         { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1810       count = current_state->count;  /* Number already matched */
1811       if (clen > 0)
1812         {
1813         BOOL OK;
1814         const pcre_uint8 chartype = UCD_CHARTYPE(c);
1815         switch(code[1 + IMM2_SIZE + 1])
1816           {
1817           case PT_ANY:
1818           OK = TRUE;
1819           break;
1820
1821           case PT_LAMP:
1822           OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1823             chartype == ucp_Lt;
1824           break;
1825
1826           case PT_GC:
1827           OK = PRIV(ucp_gentype)[chartype] == code[1 + IMM2_SIZE + 2];
1828           break;
1829
1830           case PT_PC:
1831           OK = chartype == code[1 + IMM2_SIZE + 2];
1832           break;
1833
1834           case PT_SC:
1835           OK = UCD_SCRIPT(c) == code[1 + IMM2_SIZE + 2];
1836           break;
1837
1838           /* These are specials for combination cases. */
1839
1840           case PT_ALNUM:
1841           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1842                PRIV(ucp_gentype)[chartype] == ucp_N;
1843           break;
1844
1845           case PT_SPACE:    /* Perl space */
1846           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1847                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1848           break;
1849
1850           case PT_PXSPACE:  /* POSIX space */
1851           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1852                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1853                c == CHAR_FF || c == CHAR_CR;
1854           break;
1855
1856           case PT_WORD:
1857           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1858                PRIV(ucp_gentype)[chartype] == ucp_N ||
1859                c == CHAR_UNDERSCORE;
1860           break;
1861
1862           /* Should never occur, but keep compilers from grumbling. */
1863
1864           default:
1865           OK = codevalue != OP_PROP;
1866           break;
1867           }
1868
1869         if (OK == (d == OP_PROP))
1870           {
1871           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1872             {
1873             active_count--;           /* Remove non-match possibility */
1874             next_active_state--;
1875             }
1876           if (++count >= GET2(code, 1))
1877             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1878           else
1879             { ADD_NEW(state_offset, count); }
1880           }
1881         }
1882       break;
1883
1884       /*-----------------------------------------------------------------*/
1885       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1886       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1887       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1888       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1889       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1890         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1891       count = current_state->count;  /* Number already matched */
1892       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1893         {
1894         const pcre_uchar *nptr = ptr + clen;
1895         int ncount = 0;
1896         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1897           {
1898           active_count--;           /* Remove non-match possibility */
1899           next_active_state--;
1900           }
1901         while (nptr < end_subject)
1902           {
1903           int nd;
1904           int ndlen = 1;
1905           GETCHARLEN(nd, nptr, ndlen);
1906           if (UCD_CATEGORY(nd) != ucp_M) break;
1907           ncount++;
1908           nptr += ndlen;
1909           }
1910         if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1911             reset_could_continue = TRUE;
1912         if (++count >= GET2(code, 1))
1913           { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1914         else
1915           { ADD_NEW_DATA(-state_offset, count, ncount); }
1916         }
1917       break;
1918 #endif
1919
1920       /*-----------------------------------------------------------------*/
1921       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1922       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1923       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1924       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1925       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1926         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1927       count = current_state->count;  /* Number already matched */
1928       if (clen > 0)
1929         {
1930         int ncount = 0;
1931         switch (c)
1932           {
1933           case 0x000b:
1934           case 0x000c:
1935           case 0x0085:
1936           case 0x2028:
1937           case 0x2029:
1938           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1939           goto ANYNL03;
1940
1941           case 0x000d:
1942           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1943           /* Fall through */
1944
1945           ANYNL03:
1946           case 0x000a:
1947           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1948             {
1949             active_count--;           /* Remove non-match possibility */
1950             next_active_state--;
1951             }
1952           if (++count >= GET2(code, 1))
1953             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1954           else
1955             { ADD_NEW_DATA(-state_offset, count, ncount); }
1956           break;
1957
1958           default:
1959           break;
1960           }
1961         }
1962       break;
1963
1964       /*-----------------------------------------------------------------*/
1965       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1966       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1967       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1968       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1969       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1970         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1971       count = current_state->count;  /* Number already matched */
1972       if (clen > 0)
1973         {
1974         BOOL OK;
1975         switch (c)
1976           {
1977           case 0x000a:
1978           case 0x000b:
1979           case 0x000c:
1980           case 0x000d:
1981           case 0x0085:
1982           case 0x2028:
1983           case 0x2029:
1984           OK = TRUE;
1985           break;
1986
1987           default:
1988           OK = FALSE;
1989           }
1990
1991         if (OK == (d == OP_VSPACE))
1992           {
1993           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1994             {
1995             active_count--;           /* Remove non-match possibility */
1996             next_active_state--;
1997             }
1998           if (++count >= GET2(code, 1))
1999             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2000           else
2001             { ADD_NEW_DATA(-state_offset, count, 0); }
2002           }
2003         }
2004       break;
2005
2006       /*-----------------------------------------------------------------*/
2007       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2008       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2009       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2010       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2011       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2012         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2013       count = current_state->count;  /* Number already matched */
2014       if (clen > 0)
2015         {
2016         BOOL OK;
2017         switch (c)
2018           {
2019           case 0x09:      /* HT */
2020           case 0x20:      /* SPACE */
2021           case 0xa0:      /* NBSP */
2022           case 0x1680:    /* OGHAM SPACE MARK */
2023           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2024           case 0x2000:    /* EN QUAD */
2025           case 0x2001:    /* EM QUAD */
2026           case 0x2002:    /* EN SPACE */
2027           case 0x2003:    /* EM SPACE */
2028           case 0x2004:    /* THREE-PER-EM SPACE */
2029           case 0x2005:    /* FOUR-PER-EM SPACE */
2030           case 0x2006:    /* SIX-PER-EM SPACE */
2031           case 0x2007:    /* FIGURE SPACE */
2032           case 0x2008:    /* PUNCTUATION SPACE */
2033           case 0x2009:    /* THIN SPACE */
2034           case 0x200A:    /* HAIR SPACE */
2035           case 0x202f:    /* NARROW NO-BREAK SPACE */
2036           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2037           case 0x3000:    /* IDEOGRAPHIC SPACE */
2038           OK = TRUE;
2039           break;
2040
2041           default:
2042           OK = FALSE;
2043           break;
2044           }
2045
2046         if (OK == (d == OP_HSPACE))
2047           {
2048           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2049             {
2050             active_count--;           /* Remove non-match possibility */
2051             next_active_state--;
2052             }
2053           if (++count >= GET2(code, 1))
2054             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2055           else
2056             { ADD_NEW_DATA(-state_offset, count, 0); }
2057           }
2058         }
2059       break;
2060
2061 /* ========================================================================== */
2062       /* These opcodes are followed by a character that is usually compared
2063       to the current subject character; it is loaded into d. We still get
2064       here even if there is no subject character, because in some cases zero
2065       repetitions are permitted. */
2066
2067       /*-----------------------------------------------------------------*/
2068       case OP_CHAR:
2069       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2070       break;
2071
2072       /*-----------------------------------------------------------------*/
2073       case OP_CHARI:
2074       if (clen == 0) break;              /* No subject character to compare */
2075
2076 #ifdef SUPPORT_UTF
2077       if (utf)
2078         {
2079         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2080           {
2081           unsigned int othercase;        /* Other case of the subject character c */
2082           if (c < 128)
2083             othercase = fcc[c];
2084           else
2085             /* If we have Unicode property support, we can use it to test the
2086             other case of the character. */
2087 #ifdef SUPPORT_UCP
2088             othercase = UCD_OTHERCASE(c);
2089 #else
2090             othercase = NOTACHAR;        /* Without UCP no other case is looked up */
2091 #endif
2092
2093           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2094           }
2095         }
2096       else
2097 #endif  /* SUPPORT_UTF */
2098       /* Not UTF mode */
2099         {
2100         if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2101           { ADD_NEW(state_offset + 2, 0); }  /* NOTE(review): presumably dlen == 1 here, making +2 the same as dlen + 1 - confirm */
2102         }
2103       break;
2104
2105
2106 #ifdef SUPPORT_UCP
2107       /*-----------------------------------------------------------------*/
2108       /* This is a tricky one because it can match more than one character.
2109       Find out how many characters to skip, and then set up a negative state
2110       to wait for them to pass before continuing. */
2111       /* The lookahead reads into its own variable (nd) so that c is not changed. */
2112       case OP_EXTUNI:
2113       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2114         {
2115         const pcre_uchar *nptr = ptr + clen;
2116         int ncount = 0;                  /* Number of following ucp_M characters */
2117         while (nptr < end_subject)
2118           {
2119           int nd, nclen = 1;             /* NOTE(review): c looks shared with other states at this position - confirm */
2120           GETCHARLEN(nd, nptr, nclen);
2121           if (UCD_CATEGORY(nd) != ucp_M) break;
2122           ncount++;
2123           nptr += nclen;
2124           }
2125         if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2126             reset_could_continue = TRUE;
2127         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2128         }
2129       break;
2130 #endif
2131
2132       /*-----------------------------------------------------------------*/
2133       /* This is tricky, like EXTUNI, because it too can match more than one
2134       character (when CR is followed by LF). In this case, set up a negative
2135       state to wait for one character to pass before continuing. */
2136
2137       case OP_ANYNL:
2138       if (clen > 0) switch(c)
2139         {
2140         case 0x000b:
2141         case 0x000c:
2142         case 0x0085:
2143         case 0x2028:
2144         case 0x2029:
2145         if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;  /* These do not match when the flag is set */
2146         /* Fall through */
2147         case 0x000a:
2148         ADD_NEW(state_offset + 1, 0);
2149         break;
2150
2151         case 0x000d:
2152         if (ptr + 1 >= end_subject)      /* Nothing follows the CR in the subject */
2153           {
2154           ADD_NEW(state_offset + 1, 0);
2155           if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2156             reset_could_continue = TRUE;  /* NOTE(review): presumably because an LF might follow in later input - confirm */
2157           }
2158         else if (ptr[1] == 0x0a)
2159           {
2160           ADD_NEW_DATA(-(state_offset + 1), 0, 1);  /* CR LF: negative state, data 1 = one character to wait for */
2161           }
2162         else
2163           {
2164           ADD_NEW(state_offset + 1, 0);  /* CR not followed by LF matches on its own */
2165           }
2166         break;
2167         }
2168       break;
2169
2170       /*-----------------------------------------------------------------*/
2171       case OP_NOT_VSPACE:
2172       if (clen > 0) switch(c)
2173         {
2174         case 0x000a:
2175         case 0x000b:
2176         case 0x000c:
2177         case 0x000d:
2178         case 0x0085:
2179         case 0x2028:
2180         case 0x2029:
2181         break;
2182
2183         default:
2184         ADD_NEW(state_offset + 1, 0);
2185         break;
2186         }
2187       break;
2188
2189       /*-----------------------------------------------------------------*/
2190       case OP_VSPACE:
2191       if (clen > 0) switch(c)
2192         {
2193         case 0x000a:
2194         case 0x000b:
2195         case 0x000c:
2196         case 0x000d:
2197         case 0x0085:
2198         case 0x2028:
2199         case 0x2029:
2200         ADD_NEW(state_offset + 1, 0);
2201         break;
2202
2203         default: break;
2204         }
2205       break;
2206
2207       /*-----------------------------------------------------------------*/
2208       case OP_NOT_HSPACE:
2209       if (clen > 0) switch(c)
2210         {
2211         case 0x09:      /* HT */
2212         case 0x20:      /* SPACE */
2213         case 0xa0:      /* NBSP */
2214         case 0x1680:    /* OGHAM SPACE MARK */
2215         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2216         case 0x2000:    /* EN QUAD */
2217         case 0x2001:    /* EM QUAD */
2218         case 0x2002:    /* EN SPACE */
2219         case 0x2003:    /* EM SPACE */
2220         case 0x2004:    /* THREE-PER-EM SPACE */
2221         case 0x2005:    /* FOUR-PER-EM SPACE */
2222         case 0x2006:    /* SIX-PER-EM SPACE */
2223         case 0x2007:    /* FIGURE SPACE */
2224         case 0x2008:    /* PUNCTUATION SPACE */
2225         case 0x2009:    /* THIN SPACE */
2226         case 0x200A:    /* HAIR SPACE */
2227         case 0x202f:    /* NARROW NO-BREAK SPACE */
2228         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2229         case 0x3000:    /* IDEOGRAPHIC SPACE */
2230         break;
2231
2232         default:
2233         ADD_NEW(state_offset + 1, 0);
2234         break;
2235         }
2236       break;
2237
2238       /*-----------------------------------------------------------------*/
2239       case OP_HSPACE:
2240       if (clen > 0) switch(c)
2241         {
2242         case 0x09:      /* HT */
2243         case 0x20:      /* SPACE */
2244         case 0xa0:      /* NBSP */
2245         case 0x1680:    /* OGHAM SPACE MARK */
2246         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2247         case 0x2000:    /* EN QUAD */
2248         case 0x2001:    /* EM QUAD */
2249         case 0x2002:    /* EN SPACE */
2250         case 0x2003:    /* EM SPACE */
2251         case 0x2004:    /* THREE-PER-EM SPACE */
2252         case 0x2005:    /* FOUR-PER-EM SPACE */
2253         case 0x2006:    /* SIX-PER-EM SPACE */
2254         case 0x2007:    /* FIGURE SPACE */
2255         case 0x2008:    /* PUNCTUATION SPACE */
2256         case 0x2009:    /* THIN SPACE */
2257         case 0x200A:    /* HAIR SPACE */
2258         case 0x202f:    /* NARROW NO-BREAK SPACE */
2259         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2260         case 0x3000:    /* IDEOGRAPHIC SPACE */
2261         ADD_NEW(state_offset + 1, 0);
2262         break;
2263         }
2264       break;
2265
2266       /*-----------------------------------------------------------------*/
2267       /* Match a negated single character casefully. */
2268
2269       case OP_NOT:
2270       if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2271       break;
2272
2273       /*-----------------------------------------------------------------*/
2274       /* Match a negated single character caselessly: a new state is added only
2275       when c differs from both d and the other case of d. */
2276       case OP_NOTI:
2277       if (clen > 0)
2278         {
2279         unsigned int otherd = NOTACHAR;  /* Stays NOTACHAR when utf, d >= 128 and UCP is not compiled in */
2280 #ifdef SUPPORT_UTF
2281         if (utf && d >= 128)
2282           {
2283 #ifdef SUPPORT_UCP
2284           otherd = UCD_OTHERCASE(d);
2285 #endif  /* SUPPORT_UCP */
2286           }
2287         else
2288 #endif  /* SUPPORT_UTF */
2289         otherd = TABLE_GET(d, fcc, d);
2290         if (c != d && c != otherd)
2291           { ADD_NEW(state_offset + dlen + 1, 0); }
2292         }
2293       break;
2294
2295       /*-----------------------------------------------------------------*/
2296       case OP_PLUSI:
2297       case OP_MINPLUSI:
2298       case OP_POSPLUSI:
2299       case OP_NOTPLUSI:
2300       case OP_NOTMINPLUSI:
2301       case OP_NOTPOSPLUSI:
2302       caseless = TRUE;
2303       codevalue -= OP_STARI - OP_STAR;
2304
2305       /* Fall through */
2306       case OP_PLUS:
2307       case OP_MINPLUS:
2308       case OP_POSPLUS:
2309       case OP_NOTPLUS:
2310       case OP_NOTMINPLUS:
2311       case OP_NOTPOSPLUS:
2312       count = current_state->count;  /* Already matched */
2313       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2314       if (clen > 0)
2315         {
2316         unsigned int otherd = NOTACHAR;
2317         if (caseless)
2318           {
2319 #ifdef SUPPORT_UTF
2320           if (utf && d >= 128)
2321             {
2322 #ifdef SUPPORT_UCP
2323             otherd = UCD_OTHERCASE(d);
2324 #endif  /* SUPPORT_UCP */
2325             }
2326           else
2327 #endif  /* SUPPORT_UTF */
2328           otherd = TABLE_GET(d, fcc, d);
2329           }
2330         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2331           {
2332           if (count > 0 &&
2333               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2334             {
2335             active_count--;             /* Remove non-match possibility */
2336             next_active_state--;
2337             }
2338           count++;
2339           ADD_NEW(state_offset, count);
2340           }
2341         }
2342       break;
2343
2344       /*-----------------------------------------------------------------*/
2345       case OP_QUERYI:
2346       case OP_MINQUERYI:
2347       case OP_POSQUERYI:
2348       case OP_NOTQUERYI:
2349       case OP_NOTMINQUERYI:
2350       case OP_NOTPOSQUERYI:
2351       caseless = TRUE;
2352       codevalue -= OP_STARI - OP_STAR;
2353       /* Fall through */
2354       case OP_QUERY:
2355       case OP_MINQUERY:
2356       case OP_POSQUERY:
2357       case OP_NOTQUERY:
2358       case OP_NOTMINQUERY:
2359       case OP_NOTPOSQUERY:
2360       ADD_ACTIVE(state_offset + dlen + 1, 0);
2361       if (clen > 0)
2362         {
2363         unsigned int otherd = NOTACHAR;
2364         if (caseless)
2365           {
2366 #ifdef SUPPORT_UTF
2367           if (utf && d >= 128)
2368             {
2369 #ifdef SUPPORT_UCP
2370             otherd = UCD_OTHERCASE(d);
2371 #endif  /* SUPPORT_UCP */
2372             }
2373           else
2374 #endif  /* SUPPORT_UTF */
2375           otherd = TABLE_GET(d, fcc, d);
2376           }
2377         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2378           {
2379           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2380             {
2381             active_count--;            /* Remove non-match possibility */
2382             next_active_state--;
2383             }
2384           ADD_NEW(state_offset + dlen + 1, 0);
2385           }
2386         }
2387       break;
2388
2389       /*-----------------------------------------------------------------*/
2390       case OP_STARI:
2391       case OP_MINSTARI:
2392       case OP_POSSTARI:
2393       case OP_NOTSTARI:
2394       case OP_NOTMINSTARI:
2395       case OP_NOTPOSSTARI:
2396       caseless = TRUE;
2397       codevalue -= OP_STARI - OP_STAR;
2398       /* Fall through */
2399       case OP_STAR:
2400       case OP_MINSTAR:
2401       case OP_POSSTAR:
2402       case OP_NOTSTAR:
2403       case OP_NOTMINSTAR:
2404       case OP_NOTPOSSTAR:
2405       ADD_ACTIVE(state_offset + dlen + 1, 0);
2406       if (clen > 0)
2407         {
2408         unsigned int otherd = NOTACHAR;
2409         if (caseless)
2410           {
2411 #ifdef SUPPORT_UTF
2412           if (utf && d >= 128)
2413             {
2414 #ifdef SUPPORT_UCP
2415             otherd = UCD_OTHERCASE(d);
2416 #endif  /* SUPPORT_UCP */
2417             }
2418           else
2419 #endif  /* SUPPORT_UTF */
2420           otherd = TABLE_GET(d, fcc, d);
2421           }
2422         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2423           {
2424           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2425             {
2426             active_count--;            /* Remove non-match possibility */
2427             next_active_state--;
2428             }
2429           ADD_NEW(state_offset, 0);
2430           }
2431         }
2432       break;
2433
2434       /*-----------------------------------------------------------------*/
2435       case OP_EXACTI:
2436       case OP_NOTEXACTI:
2437       caseless = TRUE;
2438       codevalue -= OP_STARI - OP_STAR;
2439       /* Fall through */
2440       case OP_EXACT:
2441       case OP_NOTEXACT:
2442       count = current_state->count;  /* Number already matched */
2443       if (clen > 0)
2444         {
2445         unsigned int otherd = NOTACHAR;
2446         if (caseless)
2447           {
2448 #ifdef SUPPORT_UTF
2449           if (utf && d >= 128)
2450             {
2451 #ifdef SUPPORT_UCP
2452             otherd = UCD_OTHERCASE(d);
2453 #endif  /* SUPPORT_UCP */
2454             }
2455           else
2456 #endif  /* SUPPORT_UTF */
2457           otherd = TABLE_GET(d, fcc, d);
2458           }
2459         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2460           {
2461           if (++count >= GET2(code, 1))
2462             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2463           else
2464             { ADD_NEW(state_offset, count); }
2465           }
2466         }
2467       break;
2468
2469       /*-----------------------------------------------------------------*/
2470       case OP_UPTOI:
2471       case OP_MINUPTOI:
2472       case OP_POSUPTOI:
2473       case OP_NOTUPTOI:
2474       case OP_NOTMINUPTOI:
2475       case OP_NOTPOSUPTOI:
2476       caseless = TRUE;
2477       codevalue -= OP_STARI - OP_STAR;
2478       /* Fall through */
2479       case OP_UPTO:
2480       case OP_MINUPTO:
2481       case OP_POSUPTO:
2482       case OP_NOTUPTO:
2483       case OP_NOTMINUPTO:
2484       case OP_NOTPOSUPTO:
2485       ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2486       count = current_state->count;  /* Number already matched */
2487       if (clen > 0)
2488         {
2489         unsigned int otherd = NOTACHAR;
2490         if (caseless)
2491           {
2492 #ifdef SUPPORT_UTF
2493           if (utf && d >= 128)
2494             {
2495 #ifdef SUPPORT_UCP
2496             otherd = UCD_OTHERCASE(d);
2497 #endif  /* SUPPORT_UCP */
2498             }
2499           else
2500 #endif  /* SUPPORT_UTF */
2501           otherd = TABLE_GET(d, fcc, d);
2502           }
2503         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2504           {
2505           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2506             {
2507             active_count--;             /* Remove non-match possibility */
2508             next_active_state--;
2509             }
2510           if (++count >= GET2(code, 1))
2511             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2512           else
2513             { ADD_NEW(state_offset, count); }
2514           }
2515         }
2516       break;
2517
2518
2519 /* ========================================================================== */
2520       /* These are the class-handling opcodes */
2521
2522       case OP_CLASS:
2523       case OP_NCLASS:
2524       case OP_XCLASS:
2525         {
2526         BOOL isinclass = FALSE;
2527         int next_state_offset;
2528         const pcre_uchar *ecode;
2529
2530         /* For a simple class, there is always just a 32-byte table, and we
2531         can set isinclass from it. */
2532
2533         if (codevalue != OP_XCLASS)
2534           {
2535           ecode = code + 1 + (32 / sizeof(pcre_uchar));
2536           if (clen > 0)
2537             {
2538             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2539               ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2540             }
2541           }
2542
2543         /* An extended class may have a table or a list of single characters,
2544         ranges, or both, and it may be positive or negative. There's a
2545         function that sorts all this out. */
2546
2547         else
2548          {
2549          ecode = code + GET(code, 1);
2550          if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2551          }
2552
2553         /* At this point, isinclass is set for all kinds of class, and ecode
2554         points to the byte after the end of the class. If there is a
2555         quantifier, this is where it will be. */
2556
2557         next_state_offset = (int)(ecode - start_code);
2558
2559         switch (*ecode)
2560           {
2561           case OP_CRSTAR:
2562           case OP_CRMINSTAR:
2563           ADD_ACTIVE(next_state_offset + 1, 0);
2564           if (isinclass) { ADD_NEW(state_offset, 0); }
2565           break;
2566
2567           case OP_CRPLUS:
2568           case OP_CRMINPLUS:
2569           count = current_state->count;  /* Already matched */
2570           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2571           if (isinclass) { count++; ADD_NEW(state_offset, count); }
2572           break;
2573
2574           case OP_CRQUERY:
2575           case OP_CRMINQUERY:
2576           ADD_ACTIVE(next_state_offset + 1, 0);
2577           if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2578           break;
2579
2580           case OP_CRRANGE:
2581           case OP_CRMINRANGE:
2582           count = current_state->count;  /* Already matched */
2583           if (count >= GET2(ecode, 1))
2584             { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2585           if (isinclass)
2586             {
2587             int max = GET2(ecode, 1 + IMM2_SIZE);
2588             if (++count >= max && max != 0)   /* Max 0 => no limit */
2589               { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2590             else
2591               { ADD_NEW(state_offset, count); }
2592             }
2593           break;
2594
2595           default:
2596           if (isinclass) { ADD_NEW(next_state_offset, 0); }
2597           break;
2598           }
2599         }
2600       break;
2601
2602 /* ========================================================================== */
2603       /* These are the opcodes for fancy brackets of various kinds. We have
2604       to use recursion in order to handle them. The "always failing" assertion
2605       (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2606       though the other "backtracking verbs" are not supported. */
2607
2608       case OP_FAIL:
2609       forced_fail++;    /* Count FAILs for multiple states */
2610       break;
2611
2612       case OP_ASSERT:
2613       case OP_ASSERT_NOT:
2614       case OP_ASSERTBACK:
2615       case OP_ASSERTBACK_NOT:
2616         {
2617         int rc;
2618         int local_offsets[2];
2619         int local_workspace[1000];
2620         const pcre_uchar *endasscode = code + GET(code, 1);
2621
2622         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2623
2624         rc = internal_dfa_exec(
2625           md,                                   /* static match data */
2626           code,                                 /* this subexpression's code */
2627           ptr,                                  /* where we currently are */
2628           (int)(ptr - start_subject),           /* start offset */
2629           local_offsets,                        /* offset vector */
2630           sizeof(local_offsets)/sizeof(int),    /* size of same */
2631           local_workspace,                      /* workspace vector */
2632           sizeof(local_workspace)/sizeof(int),  /* size of same */
2633           rlevel);                              /* function recursion level */
2634
2635         if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2636         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2637             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2638         }
2639       break;
2640
2641       /*-----------------------------------------------------------------*/
2642       case OP_COND:
2643       case OP_SCOND:
2644         {
2645         int local_offsets[1000];
2646         int local_workspace[1000];
2647         int codelink = GET(code, 1);
2648         int condcode;
2649
2650         /* Because of the way auto-callout works during compile, a callout item
2651         is inserted between OP_COND and an assertion condition. This does not
2652         happen for the other conditions. */
2653
2654         if (code[LINK_SIZE+1] == OP_CALLOUT)
2655           {
2656           rrc = 0;
2657           if (PUBL(callout) != NULL)
2658             {
2659             PUBL(callout_block) cb;
2660             cb.version          = 1;   /* Version 1 of the callout block */
2661             cb.callout_number   = code[LINK_SIZE+2];
2662             cb.offset_vector    = offsets;
2663 #ifdef COMPILE_PCRE8
2664             cb.subject          = (PCRE_SPTR)start_subject;
2665 #else
2666             cb.subject          = (PCRE_SPTR16)start_subject;
2667 #endif
2668             cb.subject_length   = (int)(end_subject - start_subject);
2669             cb.start_match      = (int)(current_subject - start_subject);
2670             cb.current_position = (int)(ptr - start_subject);
2671             cb.pattern_position = GET(code, LINK_SIZE + 3);
2672             cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2673             cb.capture_top      = 1;
2674             cb.capture_last     = -1;
2675             cb.callout_data     = md->callout_data;
2676             cb.mark             = NULL;   /* No (*MARK) support */
2677             if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2678             }
2679           if (rrc > 0) break;                      /* Fail this thread */
2680           code += PRIV(OP_lengths)[OP_CALLOUT];    /* Skip callout data */
2681           }
2682
2683         condcode = code[LINK_SIZE+1];
2684
2685         /* Back reference conditions are not supported */
2686
2687         if (condcode == OP_CREF || condcode == OP_NCREF)
2688           return PCRE_ERROR_DFA_UCOND;
2689
2690         /* The DEFINE condition is always false */
2691
2692         if (condcode == OP_DEF)
2693           { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2694
2695         /* The only supported version of OP_RREF is for the value RREF_ANY,
2696         which means "test if in any recursion". We can't test for specifically
2697         recursed groups. */
2698
2699         else if (condcode == OP_RREF || condcode == OP_NRREF)
2700           {
2701           int value = GET2(code, LINK_SIZE + 2);
2702           if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2703           if (md->recursive != NULL)
2704             { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2705           else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2706           }
2707
2708         /* Otherwise, the condition is an assertion */
2709
2710         else
2711           {
2712           int rc;
2713           const pcre_uchar *asscode = code + LINK_SIZE + 1;
2714           const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2715
2716           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2717
2718           rc = internal_dfa_exec(
2719             md,                                   /* fixed match data */
2720             asscode,                              /* this subexpression's code */
2721             ptr,                                  /* where we currently are */
2722             (int)(ptr - start_subject),           /* start offset */
2723             local_offsets,                        /* offset vector */
2724             sizeof(local_offsets)/sizeof(int),    /* size of same */
2725             local_workspace,                      /* workspace vector */
2726             sizeof(local_workspace)/sizeof(int),  /* size of same */
2727             rlevel);                              /* function recursion level */
2728
2729           if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2730           if ((rc >= 0) ==
2731                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2732             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2733           else
2734             { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2735           }
2736         }
2737       break;
2738
2739       /*-----------------------------------------------------------------*/
2740       case OP_RECURSE:
2741         {
2742         dfa_recursion_info *ri;
2743         int local_offsets[1000];
2744         int local_workspace[1000];
2745         const pcre_uchar *callpat = start_code + GET(code, 1);
2746         int recno = (callpat == md->start_code)? 0 :
2747           GET2(callpat, 1 + LINK_SIZE);
2748         int rc;
2749
2750         DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2751
2752         /* Check for repeating a recursion without advancing the subject
2753         pointer. This should catch convoluted mutual recursions. (Some simple
2754         cases are caught at compile time.) */
2755
2756         for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2757           if (recno == ri->group_num && ptr == ri->subject_position)
2758             return PCRE_ERROR_RECURSELOOP;
2759
2760         /* Remember this recursion and where we started it so as to
2761         catch infinite loops. */
2762
2763         new_recursive.group_num = recno;
2764         new_recursive.subject_position = ptr;
2765         new_recursive.prevrec = md->recursive;
2766         md->recursive = &new_recursive;
2767
2768         rc = internal_dfa_exec(
2769           md,                                   /* fixed match data */
2770           callpat,                              /* this subexpression's code */
2771           ptr,                                  /* where we currently are */
2772           (int)(ptr - start_subject),           /* start offset */
2773           local_offsets,                        /* offset vector */
2774           sizeof(local_offsets)/sizeof(int),    /* size of same */
2775           local_workspace,                      /* workspace vector */
2776           sizeof(local_workspace)/sizeof(int),  /* size of same */
2777           rlevel);                              /* function recursion level */
2778
2779         md->recursive = new_recursive.prevrec;  /* Done this recursion */
2780
2781         DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2782           rc));
2783
2784         /* Ran out of internal offsets */
2785
2786         if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2787
2788         /* For each successful matched substring, set up the next state with a
2789         count of characters to skip before trying it. Note that the count is in
2790         characters, not bytes. */
2791
2792         if (rc > 0)
2793           {
2794           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2795             {
2796             int charcount = local_offsets[rc+1] - local_offsets[rc];
2797 #ifdef SUPPORT_UTF
2798             if (utf)
2799               {
2800               const pcre_uchar *p = start_subject + local_offsets[rc];
2801               const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2802               while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2803               }
2804 #endif
2805             if (charcount > 0)
2806               {
2807               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2808               }
2809             else
2810               {
2811               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2812               }
2813             }
2814           }
2815         else if (rc != PCRE_ERROR_NOMATCH) return rc;
2816         }
2817       break;
2818
2819       /*-----------------------------------------------------------------*/
2820       case OP_BRAPOS:
2821       case OP_SBRAPOS:
2822       case OP_CBRAPOS:
2823       case OP_SCBRAPOS:
2824       case OP_BRAPOSZERO:
2825         {
2826         int charcount, matched_count;
2827         const pcre_uchar *local_ptr = ptr;
2828         BOOL allow_zero;
2829
2830         if (codevalue == OP_BRAPOSZERO)
2831           {
2832           allow_zero = TRUE;
2833           codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2834           }
2835         else allow_zero = FALSE;
2836
2837         /* Loop to match the subpattern as many times as possible as if it were
2838         a complete pattern. */
2839
2840         for (matched_count = 0;; matched_count++)
2841           {
2842           int local_offsets[2];
2843           int local_workspace[1000];
2844
2845           int rc = internal_dfa_exec(
2846             md,                                   /* fixed match data */
2847             code,                                 /* this subexpression's code */
2848             local_ptr,                            /* where we currently are */
2849             (int)(ptr - start_subject),           /* start offset */
2850             local_offsets,                        /* offset vector */
2851             sizeof(local_offsets)/sizeof(int),    /* size of same */
2852             local_workspace,                      /* workspace vector */
2853             sizeof(local_workspace)/sizeof(int),  /* size of same */
2854             rlevel);                              /* function recursion level */
2855
2856           /* Failed to match */
2857
2858           if (rc < 0)
2859             {
2860             if (rc != PCRE_ERROR_NOMATCH) return rc;
2861             break;
2862             }
2863
2864           /* Matched: break the loop if zero characters matched. */
2865
2866           charcount = local_offsets[1] - local_offsets[0];
2867           if (charcount == 0) break;
2868           local_ptr += charcount;    /* Advance temporary position ptr */
2869           }
2870
2871         /* At this point we have matched the subpattern matched_count
2872         times, and local_ptr is pointing to the character after the end of the
2873         last match. */
2874
2875         if (matched_count > 0 || allow_zero)
2876           {
2877           const pcre_uchar *end_subpattern = code;
2878           int next_state_offset;
2879
2880           do { end_subpattern += GET(end_subpattern, 1); }
2881             while (*end_subpattern == OP_ALT);
2882           next_state_offset =
2883             (int)(end_subpattern - start_code + LINK_SIZE + 1);
2884
2885           /* Optimization: if there are no more active states, and there
2886           are no new states yet set up, then skip over the subject string
2887           right here, to save looping. Otherwise, set up the new state to swing
2888           into action when the end of the matched substring is reached. */
2889
2890           if (i + 1 >= active_count && new_count == 0)
2891             {
2892             ptr = local_ptr;
2893             clen = 0;
2894             ADD_NEW(next_state_offset, 0);
2895             }
2896           else
2897             {
2898             const pcre_uchar *p = ptr;
2899             const pcre_uchar *pp = local_ptr;
2900             charcount = (int)(pp - p);
2901 #ifdef SUPPORT_UTF
2902             if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2903 #endif
2904             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2905             }
2906           }
2907         }
2908       break;
2909
2910       /*-----------------------------------------------------------------*/
2911       case OP_ONCE:
2912       case OP_ONCE_NC:
2913         {
2914         int local_offsets[2];
2915         int local_workspace[1000];
2916
2917         int rc = internal_dfa_exec(
2918           md,                                   /* fixed match data */
2919           code,                                 /* this subexpression's code */
2920           ptr,                                  /* where we currently are */
2921           (int)(ptr - start_subject),           /* start offset */
2922           local_offsets,                        /* offset vector */
2923           sizeof(local_offsets)/sizeof(int),    /* size of same */
2924           local_workspace,                      /* workspace vector */
2925           sizeof(local_workspace)/sizeof(int),  /* size of same */
2926           rlevel);                              /* function recursion level */
2927
2928         if (rc >= 0)
2929           {
2930           const pcre_uchar *end_subpattern = code;
2931           int charcount = local_offsets[1] - local_offsets[0];
2932           int next_state_offset, repeat_state_offset;
2933
2934           do { end_subpattern += GET(end_subpattern, 1); }
2935             while (*end_subpattern == OP_ALT);
2936           next_state_offset =
2937             (int)(end_subpattern - start_code + LINK_SIZE + 1);
2938
2939           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2940           arrange for the repeat state also to be added to the relevant list.
2941           Calculate the offset, or set -1 for no repeat. */
2942
2943           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2944                                  *end_subpattern == OP_KETRMIN)?
2945             (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2946
2947           /* If we have matched an empty string, add the next state at the
2948           current character pointer. This is important so that the duplicate
2949           checking kicks in, which is what breaks infinite loops that match an
2950           empty string. */
2951
2952           if (charcount == 0)
2953             {
2954             ADD_ACTIVE(next_state_offset, 0);
2955             }
2956
2957           /* Optimization: if there are no more active states, and there
2958           are no new states yet set up, then skip over the subject string
2959           right here, to save looping. Otherwise, set up the new state to swing
2960           into action when the end of the matched substring is reached. */
2961
2962           else if (i + 1 >= active_count && new_count == 0)
2963             {
2964             ptr += charcount;
2965             clen = 0;
2966             ADD_NEW(next_state_offset, 0);
2967
2968             /* If we are adding a repeat state at the new character position,
2969             we must fudge things so that it is the only current state.
2970             Otherwise, it might be a duplicate of one we processed before, and
2971             that would cause it to be skipped. */
2972
2973             if (repeat_state_offset >= 0)
2974               {
2975               next_active_state = active_states;
2976               active_count = 0;
2977               i = -1;
2978               ADD_ACTIVE(repeat_state_offset, 0);
2979               }
2980             }
2981           else
2982             {
2983 #ifdef SUPPORT_UTF
2984             if (utf)
2985               {
2986               const pcre_uchar *p = start_subject + local_offsets[0];
2987               const pcre_uchar *pp = start_subject + local_offsets[1];
2988               while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2989               }
2990 #endif
2991             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2992             if (repeat_state_offset >= 0)
2993               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2994             }
2995           }
2996         else if (rc != PCRE_ERROR_NOMATCH) return rc;
2997         }
2998       break;
2999
3000
3001 /* ========================================================================== */
3002       /* Handle callouts */
3003
3004       case OP_CALLOUT:
3005       rrc = 0;
3006       if (PUBL(callout) != NULL)
3007         {
3008         PUBL(callout_block) cb;
3009         cb.version          = 1;   /* Version 1 of the callout block */
3010         cb.callout_number   = code[1];
3011         cb.offset_vector    = offsets;
3012 #ifdef COMPILE_PCRE8
3013         cb.subject          = (PCRE_SPTR)start_subject;
3014 #else
3015         cb.subject          = (PCRE_SPTR16)start_subject;
3016 #endif
3017         cb.subject_length   = (int)(end_subject - start_subject);
3018         cb.start_match      = (int)(current_subject - start_subject);
3019         cb.current_position = (int)(ptr - start_subject);
3020         cb.pattern_position = GET(code, 2);
3021         cb.next_item_length = GET(code, 2 + LINK_SIZE);
3022         cb.capture_top      = 1;
3023         cb.capture_last     = -1;
3024         cb.callout_data     = md->callout_data;
3025         cb.mark             = NULL;   /* No (*MARK) support */
3026         if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
3027         }
3028       if (rrc == 0)
3029         { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3030       break;
3031
3032
3033 /* ========================================================================== */
3034       default:        /* Unsupported opcode */
3035       return PCRE_ERROR_DFA_UITEM;
3036       }
3037
3038     NEXT_ACTIVE_STATE: continue;
3039
3040     }      /* End of loop scanning active states */
3041
3042   /* We have finished the processing at the current subject character. If no
3043   new states have been set for the next character, we have found all the
3044   matches that we are going to find. If we are at the top level and partial
3045   matching has been requested, check for appropriate conditions.
3046
3047   The "forced_fail" variable counts the number of (*F) encountered for the
3048   character. If it is equal to the original active_count (saved in
3049   workspace[1]) it means that (*F) was found on every active state. In this
3050   case we don't want to give a partial match.
3051
3052   The "could_continue" variable is true if a state could have continued but
3053   for the fact that the end of the subject was reached. */
3054
3055   if (new_count <= 0)
3056     {
3057     if (rlevel == 1 &&                               /* Top level, and */
3058         could_continue &&                            /* Some could go on, and */
3059         forced_fail != workspace[1] &&               /* Not all forced fail & */
3060         (                                            /* either... */
3061         (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
3062         ||                                           /* or... */
3063         ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3064          match_count < 0)                            /* no matches */
3065         ) &&                                         /* And... */
3066         (
3067         partial_newline ||                           /* Either partial NL */
3068           (                                          /* or ... */
3069           ptr >= end_subject &&                /* End of subject and */
3070           ptr > md->start_used_ptr)            /* Inspected non-empty string */
3071           )
3072         )
3073       {
3074       if (offsetcount >= 2)
3075         {
3076         offsets[0] = (int)(md->start_used_ptr - start_subject);
3077         offsets[1] = (int)(end_subject - start_subject);
3078         }
3079       match_count = PCRE_ERROR_PARTIAL;
3080       }
3081
3082     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3083       "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3084       rlevel*2-2, SP));
3085     break;        /* In effect, "return", but see the comment below */
3086     }
3087
3088   /* One or more states are active for the next character. */
3089
3090   ptr += clen;    /* Advance to next subject character */
3091   }               /* Loop to move along the subject string */
3092
3093 /* Control gets here from "break" a few lines above. We do it this way because
3094 if we use "return" above, we have compiler trouble. Some compilers warn if
3095 there's nothing here because they think the function doesn't return a value. On
3096 the other hand, if we put a dummy statement here, some more clever compilers
3097 complain that it can't be reached. Sigh. */
3098
3099 return match_count;
3100 }
3101
3102
3103
3104
3105 /*************************************************
3106 *    Execute a Regular Expression - DFA engine   *
3107 *************************************************/
3108
3109 /* This external function applies a compiled re to a subject string using a DFA
3110 engine. This function calls the internal function multiple times if the pattern
3111 is not anchored.
3112
3113 Arguments:
3114   argument_re     points to the compiled expression
3115   extra_data      points to extra data or is NULL
3116   subject         points to the subject string
3117   length          length of subject string (may contain binary zeros)
3118   start_offset    where to start in the subject string
3119   options         option bits
3120   offsets         vector of match offsets
3121   offsetcount     size of same
3122   workspace       workspace vector
3123   wscount         size of same
3124
3125 Returns:          > 0 => number of match offset pairs placed in offsets
3126                   = 0 => offsets overflowed; longest matches are present
3127                    -1 => failed to match
3128                  < -1 => some kind of unexpected problem
3129 */
3130
3131 #ifdef COMPILE_PCRE8
3132 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3133 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3134   const char *subject, int length, int start_offset, int options, int *offsets,
3135   int offsetcount, int *workspace, int wscount)
3136 #else
3137 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3138 pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3139   PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3140   int offsetcount, int *workspace, int wscount)
3141 #endif
3142 {
3143 REAL_PCRE *re = (REAL_PCRE *)argument_re;
3144 dfa_match_data match_block;
3145 dfa_match_data *md = &match_block;
3146 BOOL utf, anchored, startline, firstline;
3147 const pcre_uchar *current_subject, *end_subject;
3148 const pcre_study_data *study = NULL;
3149
3150 const pcre_uchar *req_char_ptr;
3151 const pcre_uint8 *start_bits = NULL;
3152 BOOL has_first_char = FALSE;
3153 BOOL has_req_char = FALSE;
3154 pcre_uchar first_char = 0;
3155 pcre_uchar first_char2 = 0;
3156 pcre_uchar req_char = 0;
3157 pcre_uchar req_char2 = 0;
3158 int newline;
3159
3160 /* Plausibility checks */
3161
3162 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3163 if (re == NULL || subject == NULL || workspace == NULL ||
3164    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3165 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3166 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3167 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3168
3169 /* Check that the first field in the block is the magic number. If it is not,
3170 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3171 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3172 means that the pattern is likely compiled with different endianness. */
3173
3174 if (re->magic_number != MAGIC_NUMBER)
3175   return re->magic_number == REVERSED_MAGIC_NUMBER?
3176     PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3177 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3178
3179 /* If restarting after a partial match, do some sanity checks on the contents
3180 of the workspace. */
3181
3182 if ((options & PCRE_DFA_RESTART) != 0)
3183   {
3184   if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3185     workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3186       return PCRE_ERROR_DFA_BADRESTART;
3187   }
3188
3189 /* Set up study, callout, and table data */
3190
3191 md->tables = re->tables;
3192 md->callout_data = NULL;
3193
3194 if (extra_data != NULL)
3195   {
3196   unsigned int flags = extra_data->flags;
3197   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3198     study = (const pcre_study_data *)extra_data->study_data;
3199   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3200   if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3201     return PCRE_ERROR_DFA_UMLIMIT;
3202   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3203     md->callout_data = extra_data->callout_data;
3204   if ((flags & PCRE_EXTRA_TABLES) != 0)
3205     md->tables = extra_data->tables;
3206   }
3207
3208 /* Set some local values */
3209
3210 current_subject = (const pcre_uchar *)subject + start_offset;
3211 end_subject = (const pcre_uchar *)subject + length;
3212 req_char_ptr = current_subject - 1;
3213
3214 #ifdef SUPPORT_UTF
3215 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3216 utf = (re->options & PCRE_UTF8) != 0;
3217 #else
3218 utf = FALSE;
3219 #endif
3220
3221 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3222   (re->options & PCRE_ANCHORED) != 0;
3223
3224 /* The remaining fixed data for passing around. */
3225
3226 md->start_code = (const pcre_uchar *)argument_re +
3227     re->name_table_offset + re->name_count * re->name_entry_size;
3228 md->start_subject = (const pcre_uchar *)subject;
3229 md->end_subject = end_subject;
3230 md->start_offset = start_offset;
3231 md->moptions = options;
3232 md->poptions = re->options;
3233
3234 /* If the BSR option is not set at match time, copy what was set
3235 at compile time. */
3236
3237 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3238   {
3239   if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3240     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3241 #ifdef BSR_ANYCRLF
3242   else md->moptions |= PCRE_BSR_ANYCRLF;
3243 #endif
3244   }
3245
3246 /* Handle different types of newline. The three bits give eight cases. If
3247 nothing is set at run time, whatever was used at compile time applies. */
3248
3249 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3250          PCRE_NEWLINE_BITS)
3251   {
3252   case 0: newline = NEWLINE; break;   /* Compile-time default */
3253   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3254   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3255   case PCRE_NEWLINE_CR+
3256        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3257   case PCRE_NEWLINE_ANY: newline = -1; break;
3258   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3259   default: return PCRE_ERROR_BADNEWLINE;
3260   }
3261
3262 if (newline == -2)
3263   {
3264   md->nltype = NLTYPE_ANYCRLF;
3265   }
3266 else if (newline < 0)
3267   {
3268   md->nltype = NLTYPE_ANY;
3269   }
3270 else
3271   {
3272   md->nltype = NLTYPE_FIXED;
3273   if (newline > 255)
3274     {
3275     md->nllen = 2;
3276     md->nl[0] = (newline >> 8) & 255;
3277     md->nl[1] = newline & 255;
3278     }
3279   else
3280     {
3281     md->nllen = 1;
3282     md->nl[0] = newline;
3283     }
3284   }
3285
3286 /* Check a UTF string if required. Pass back the error offset and error code
3287 for an invalid string if the offsets vector has at least two elements. */
3288
3289 #ifdef SUPPORT_UTF
3290 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3291   {
3292   int erroroffset;
3293   int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3294   if (errorcode != 0)
3295     {
3296     if (offsetcount >= 2)
3297       {
3298       offsets[0] = erroroffset;
3299       offsets[1] = errorcode;
3300       }
3301     return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3302       PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3303     }
3304   if (start_offset > 0 && start_offset < length &&
3305         NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3306     return PCRE_ERROR_BADUTF8_OFFSET;
3307   }
3308 #endif
3309
3310 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3311 is a feature that makes it possible to save compiled regex and re-use them
3312 in other programs later. */
3313
3314 if (md->tables == NULL) md->tables = PRIV(default_tables);
3315
3316 /* The "must be at the start of a line" flags are used in a loop when finding
3317 where to start. */
3318
3319 startline = (re->flags & PCRE_STARTLINE) != 0;
3320 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3321
3322 /* Set up the first character to match, if available. The first_byte value is
3323 never set for an anchored regular expression, but the anchoring may be forced
3324 at run time, so we have to test for anchoring. The first char may be unset for
3325 an unanchored pattern, of course. If there's no first char and the pattern was
3326 studied, there may be a bitmap of possible first characters. */
3327
3328 if (!anchored)
3329   {
3330   if ((re->flags & PCRE_FIRSTSET) != 0)
3331     {
3332     has_first_char = TRUE;
3333     first_char = first_char2 = (pcre_uchar)(re->first_char);
3334     if ((re->flags & PCRE_FCH_CASELESS) != 0)
3335       {
3336       first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3337 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3338       if (utf && first_char > 127)
3339         first_char2 = UCD_OTHERCASE(first_char);
3340 #endif
3341       }
3342     }
3343   else
3344     {
3345     if (!startline && study != NULL &&
3346          (study->flags & PCRE_STUDY_MAPPED) != 0)
3347       start_bits = study->start_bits;
3348     }
3349   }
3350
3351 /* For anchored or unanchored matches, there may be a "last known required
3352 character" set. */
3353
3354 if ((re->flags & PCRE_REQCHSET) != 0)
3355   {
3356   has_req_char = TRUE;
3357   req_char = req_char2 = (pcre_uchar)(re->req_char);
3358   if ((re->flags & PCRE_RCH_CASELESS) != 0)
3359     {
3360     req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3361 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3362     if (utf && req_char > 127)
3363       req_char2 = UCD_OTHERCASE(req_char);
3364 #endif
3365     }
3366   }
3367
3368 /* Call the main matching function, looping for a non-anchored regex after a
3369 failed match. If not restarting, perform certain optimizations at the start of
3370 a match. */
3371
3372 for (;;)
3373   {
3374   int rc;
3375
3376   if ((options & PCRE_DFA_RESTART) == 0)
3377     {
3378     const pcre_uchar *save_end_subject = end_subject;
3379
3380     /* If firstline is TRUE, the start of the match is constrained to the first
3381     line of a multiline string. Implement this by temporarily adjusting
3382     end_subject so that we stop scanning at a newline. If the match fails at
3383     the newline, later code breaks this loop. */
3384
3385     if (firstline)
3386       {
3387       PCRE_PUCHAR t = current_subject;
3388 #ifdef SUPPORT_UTF
3389       if (utf)
3390         {
3391         while (t < md->end_subject && !IS_NEWLINE(t))
3392           {
3393           t++;
3394           ACROSSCHAR(t < end_subject, *t, t++);
3395           }
3396         }
3397       else
3398 #endif
3399       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3400       end_subject = t;
3401       }
3402
3403     /* There are some optimizations that avoid running the match if a known
3404     starting point is not found. However, there is an option that disables
3405     these, for testing and for ensuring that all callouts do actually occur.
3406     The option can be set in the regex by (*NO_START_OPT) or passed in
3407     match-time options. */
3408
3409     if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3410       {
3411       /* Advance to a known first char. */
3412
3413       if (has_first_char)
3414         {
3415         if (first_char != first_char2)
3416           while (current_subject < end_subject &&
3417               *current_subject != first_char && *current_subject != first_char2)
3418             current_subject++;
3419         else
3420           while (current_subject < end_subject &&
3421                  *current_subject != first_char)
3422             current_subject++;
3423         }
3424
3425       /* Or to just after a linebreak for a multiline match if possible */
3426
3427       else if (startline)
3428         {
3429         if (current_subject > md->start_subject + start_offset)
3430           {
3431 #ifdef SUPPORT_UTF
3432           if (utf)
3433             {
3434             while (current_subject < end_subject &&
3435                    !WAS_NEWLINE(current_subject))
3436               {
3437               current_subject++;
3438               ACROSSCHAR(current_subject < end_subject, *current_subject,
3439                 current_subject++);
3440               }
3441             }
3442           else
3443 #endif
3444           while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3445             current_subject++;
3446
3447           /* If we have just passed a CR and the newline option is ANY or
3448           ANYCRLF, and we are now at a LF, advance the match position by one
3449           more character. */
3450
3451           if (current_subject[-1] == CHAR_CR &&
3452                (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3453                current_subject < end_subject &&
3454                *current_subject == CHAR_NL)
3455             current_subject++;
3456           }
3457         }
3458
3459       /* Or to a non-unique first char after study */
3460
3461       else if (start_bits != NULL)
3462         {
3463         while (current_subject < end_subject)
3464           {
3465           unsigned int c = *current_subject;
3466 #ifndef COMPILE_PCRE8
3467           if (c > 255) c = 255;
3468 #endif
3469           if ((start_bits[c/8] & (1 << (c&7))) == 0)
3470             {
3471             current_subject++;
3472 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3473             /* In non 8-bit mode, the iteration will stop for
3474             characters > 255 at the beginning or not stop at all. */
3475             if (utf)
3476               ACROSSCHAR(current_subject < end_subject, *current_subject,
3477                 current_subject++);
3478 #endif
3479             }
3480           else break;
3481           }
3482         }
3483       }
3484
3485     /* Restore fudged end_subject */
3486
3487     end_subject = save_end_subject;
3488
3489     /* The following two optimizations are disabled for partial matching or if
3490     disabling is explicitly requested (and of course, by the test above, this
3491     code is not obeyed when restarting after a partial match). */
3492
3493     if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3494         (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3495       {
3496       /* If the pattern was studied, a minimum subject length may be set. This
3497       is a lower bound; no actual string of that length may actually match the
3498       pattern. Although the value is, strictly, in characters, we treat it as
3499       bytes to avoid spending too much time in this optimization. */
3500
3501       if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3502           (pcre_uint32)(end_subject - current_subject) < study->minlength)
3503         return PCRE_ERROR_NOMATCH;
3504
3505       /* If req_char is set, we know that that character must appear in the
3506       subject for the match to succeed. If the first character is set, req_char
3507       must be later in the subject; otherwise the test starts at the match
3508       point. This optimization can save a huge amount of work in patterns with
3509       nested unlimited repeats that aren't going to match. Writing separate
3510       code for cased/caseless versions makes it go faster, as does using an
3511       autoincrement and backing off on a match.
3512
3513       HOWEVER: when the subject string is very, very long, searching to its end
3514       can take a long time, and give bad performance on quite ordinary
3515       patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3516       string... so we don't do this when the string is sufficiently long. */
3517
3518       if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3519         {
3520         PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3521
3522         /* We don't need to repeat the search if we haven't yet reached the
3523         place we found it at last time. */
3524
3525         if (p > req_char_ptr)
3526           {
3527           if (req_char != req_char2)
3528             {
3529             while (p < end_subject)
3530               {
3531               int pp = *p++;
3532               if (pp == req_char || pp == req_char2) { p--; break; }
3533               }
3534             }
3535           else
3536             {
3537             while (p < end_subject)
3538               {
3539               if (*p++ == req_char) { p--; break; }
3540               }
3541             }
3542
3543           /* If we can't find the required character, break the matching loop,
3544           which will cause a return of PCRE_ERROR_NOMATCH. */
3545
3546           if (p >= end_subject) break;
3547
3548           /* If we have found the required character, save the point where we
3549           found it, so that we don't search again next time round the loop if
3550           the start hasn't passed this character yet. */
3551
3552           req_char_ptr = p;
3553           }
3554         }
3555       }
3556     }   /* End of optimizations that are done when not restarting */
3557
3558   /* OK, now we can do the business */
3559
3560   md->start_used_ptr = current_subject;
3561   md->recursive = NULL;
3562
3563   rc = internal_dfa_exec(
3564     md,                                /* fixed match data */
3565     md->start_code,                    /* this subexpression's code */
3566     current_subject,                   /* where we currently are */
3567     start_offset,                      /* start offset in subject */
3568     offsets,                           /* offset vector */
3569     offsetcount,                       /* size of same */
3570     workspace,                         /* workspace vector */
3571     wscount,                           /* size of same */
3572     0);                                /* function recurse level */
3573
3574   /* Anything other than "no match" means we are done, always; otherwise, carry
3575   on only if not anchored. */
3576
3577   if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3578
3579   /* Advance to the next subject character unless we are at the end of a line
3580   and firstline is set. */
3581
3582   if (firstline && IS_NEWLINE(current_subject)) break;
3583   current_subject++;
3584 #ifdef SUPPORT_UTF
3585   if (utf)
3586     {
3587     ACROSSCHAR(current_subject < end_subject, *current_subject,
3588       current_subject++);
3589     }
3590 #endif
3591   if (current_subject > end_subject) break;
3592
3593   /* If we have just passed a CR and we are now at a LF, and the pattern does
3594   not contain any explicit matches for \r or \n, and the newline option is CRLF
3595   or ANY or ANYCRLF, advance the match position by one more character. */
3596
3597   if (current_subject[-1] == CHAR_CR &&
3598       current_subject < end_subject &&
3599       *current_subject == CHAR_NL &&
3600       (re->flags & PCRE_HASCRORLF) == 0 &&
3601         (md->nltype == NLTYPE_ANY ||
3602          md->nltype == NLTYPE_ANYCRLF ||
3603          md->nllen == 2))
3604     current_subject++;
3605
3606   }   /* "Bumpalong" loop */
3607
3608 return PCRE_ERROR_NOMATCH;
3609 }
3610
3611 /* End of pcre_dfa_exec.c */