glib/pcre/pcre_study.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language.
   7
   8                        Written by Philip Hazel
   9            Copyright (c) 1997-2012 University of Cambridge
  10
  11 -----------------------------------------------------------------------------
  12 Redistribution and use in source and binary forms, with or without
  13 modification, are permitted provided that the following conditions are met:
  14
  15     * Redistributions of source code must retain the above copyright notice,
  16       this list of conditions and the following disclaimer.
  17
  18     * Redistributions in binary form must reproduce the above copyright
  19       notice, this list of conditions and the following disclaimer in the
  20       documentation and/or other materials provided with the distribution.
  21
  22     * Neither the name of the University of Cambridge nor the names of its
  23       contributors may be used to endorse or promote products derived from
  24       this software without specific prior written permission.
  25
  26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36 POSSIBILITY OF SUCH DAMAGE.
  37 -----------------------------------------------------------------------------
  38 */
  39
  40
  41 /* This module contains the external function pcre_study(), along with local
  42 supporting functions. */
  43
  44
  45 #include "config.h"
  46
  47 #include "pcre_internal.h"
  48
  49 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
  50
  51 /* Returns from set_start_bits() */
  52
  53 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
  54
  55
  56
  57 /*************************************************
  58 *   Find the minimum subject length for a group  *
  59 *************************************************/
  60
  61 /* Scan a parenthesized group and compute the minimum length of subject that
  62 is needed to match it. This is a lower bound; it does not mean there is a
  63 string of that length that matches. In UTF8 mode, the result is in characters
  64 rather than bytes.
  65
  66 Arguments:
  67   code            pointer to start of group (the bracket)
  68   startcode       pointer to start of the whole pattern
  69   options         the compiling options
  70   int             RECURSE depth
  71
  72 Returns:   the minimum length
  73            -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
  74            -2 internal error (missing capturing bracket)
  75            -3 internal error (opcode not listed)
  76 */
  77
  78 static int
  79 find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options,
  80   int recurse_depth)
  81 {
  82 int length = -1;
  83 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
  84 BOOL utf = (options & PCRE_UTF8) != 0;
  85 BOOL had_recurse = FALSE;
  86 int branchlength = 0;
  87 pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
  88
  89 if (*code == OP_CBRA || *code == OP_SCBRA ||
  90     *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
  91
  92 /* Scan along the opcodes for this branch. If we get to the end of the
  93 branch, check the length against that of the other branches. */
  94
  95 for (;;)
  96   {
  97   int d, min;
  98   pcre_uchar *cs, *ce;
  99   int op = *cc;
 100
 101   switch (op)
 102     {
 103     case OP_COND:
 104     case OP_SCOND:
 105
 106     /* If there is only one branch in a condition, the implied branch has zero
 107     length, so we don't add anything. This covers the DEFINE "condition"
 108     automatically. */
 109
 110     cs = cc + GET(cc, 1);
 111     if (*cs != OP_ALT)
 112       {
 113       cc = cs + 1 + LINK_SIZE;
 114       break;
 115       }
 116
 117     /* Otherwise we can fall through and treat it the same as any other
 118     subpattern. */
 119
 120     case OP_CBRA:
 121     case OP_SCBRA:
 122     case OP_BRA:
 123     case OP_SBRA:
 124     case OP_CBRAPOS:
 125     case OP_SCBRAPOS:
 126     case OP_BRAPOS:
 127     case OP_SBRAPOS:
 128     case OP_ONCE:
 129     case OP_ONCE_NC:
 130     d = find_minlength(cc, startcode, options, recurse_depth);
 131     if (d < 0) return d;
 132     branchlength += d;
 133     do cc += GET(cc, 1); while (*cc == OP_ALT);
 134     cc += 1 + LINK_SIZE;
 135     break;
 136
 137     /* ACCEPT makes things far too complicated; we have to give up. */
 138
 139     case OP_ACCEPT:
 140     case OP_ASSERT_ACCEPT:
 141     return -1;
 142
 143     /* Reached end of a branch; if it's a ket it is the end of a nested
 144     call. If it's ALT it is an alternation in a nested call. If it is END it's
 145     the end of the outer call. All can be handled by the same code. If an
 146     ACCEPT was previously encountered, use the length that was in force at that
 147     time, and pass back the shortest ACCEPT length. */
 148
 149     case OP_ALT:
 150     case OP_KET:
 151     case OP_KETRMAX:
 152     case OP_KETRMIN:
 153     case OP_KETRPOS:
 154     case OP_END:
 155     if (length < 0 || (!had_recurse && branchlength < length))
 156       length = branchlength;
 157     if (op != OP_ALT) return length;
 158     cc += 1 + LINK_SIZE;
 159     branchlength = 0;
 160     had_recurse = FALSE;
 161     break;
 162
 163     /* Skip over assertive subpatterns */
 164
 165     case OP_ASSERT:
 166     case OP_ASSERT_NOT:
 167     case OP_ASSERTBACK:
 168     case OP_ASSERTBACK_NOT:
 169     do cc += GET(cc, 1); while (*cc == OP_ALT);
 170     /* Fall through */
 171
 172     /* Skip over things that don't match chars */
 173
 174     case OP_REVERSE:
 175     case OP_CREF:
 176     case OP_NCREF:
 177     case OP_RREF:
 178     case OP_NRREF:
 179     case OP_DEF:
 180     case OP_CALLOUT:
 181     case OP_SOD:
 182     case OP_SOM:
 183     case OP_EOD:
 184     case OP_EODN:
 185     case OP_CIRC:
 186     case OP_CIRCM:
 187     case OP_DOLL:
 188     case OP_DOLLM:
 189     case OP_NOT_WORD_BOUNDARY:
 190     case OP_WORD_BOUNDARY:
 191     cc += PRIV(OP_lengths)[*cc];
 192     break;
 193
 194     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
 195
 196     case OP_BRAZERO:
 197     case OP_BRAMINZERO:
 198     case OP_BRAPOSZERO:
 199     case OP_SKIPZERO:
 200     cc += PRIV(OP_lengths)[*cc];
 201     do cc += GET(cc, 1); while (*cc == OP_ALT);
 202     cc += 1 + LINK_SIZE;
 203     break;
 204
 205     /* Handle literal characters and + repetitions */
 206
 207     case OP_CHAR:
 208     case OP_CHARI:
 209     case OP_NOT:
 210     case OP_NOTI:
 211     case OP_PLUS:
 212     case OP_PLUSI:
 213     case OP_MINPLUS:
 214     case OP_MINPLUSI:
 215     case OP_POSPLUS:
 216     case OP_POSPLUSI:
 217     case OP_NOTPLUS:
 218     case OP_NOTPLUSI:
 219     case OP_NOTMINPLUS:
 220     case OP_NOTMINPLUSI:
 221     case OP_NOTPOSPLUS:
 222     case OP_NOTPOSPLUSI:
 223     branchlength++;
 224     cc += 2;
 225 #ifdef SUPPORT_UTF
 226     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 227 #endif
 228     break;
 229
 230     case OP_TYPEPLUS:
 231     case OP_TYPEMINPLUS:
 232     case OP_TYPEPOSPLUS:
 233     branchlength++;
 234     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
 235     break;
 236
 237     /* Handle exact repetitions. The count is already in characters, but we
 238     need to skip over a multibyte character in UTF8 mode.  */
 239
 240     case OP_EXACT:
 241     case OP_EXACTI:
 242     case OP_NOTEXACT:
 243     case OP_NOTEXACTI:
 244     branchlength += GET2(cc,1);
 245     cc += 2 + IMM2_SIZE;
 246 #ifdef SUPPORT_UTF
 247     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 248 #endif
 249     break;
 250
 251     case OP_TYPEEXACT:
 252     branchlength += GET2(cc,1);
 253     cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
 254       || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
 255     break;
 256
 257     /* Handle single-char non-literal matchers */
 258
 259     case OP_PROP:
 260     case OP_NOTPROP:
 261     cc += 2;
 262     /* Fall through */
 263
 264     case OP_NOT_DIGIT:
 265     case OP_DIGIT:
 266     case OP_NOT_WHITESPACE:
 267     case OP_WHITESPACE:
 268     case OP_NOT_WORDCHAR:
 269     case OP_WORDCHAR:
 270     case OP_ANY:
 271     case OP_ALLANY:
 272     case OP_EXTUNI:
 273     case OP_HSPACE:
 274     case OP_NOT_HSPACE:
 275     case OP_VSPACE:
 276     case OP_NOT_VSPACE:
 277     branchlength++;
 278     cc++;
 279     break;
 280
 281     /* "Any newline" might match two characters, but it also might match just
 282     one. */
 283
 284     case OP_ANYNL:
 285     branchlength += 1;
 286     cc++;
 287     break;
 288
 289     /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
 290     non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
 291     appear, but leave the code, just in case.) */
 292
 293     case OP_ANYBYTE:
 294 #ifdef SUPPORT_UTF
 295     if (utf) return -1;
 296 #endif
 297     branchlength++;
 298     cc++;
 299     break;
 300
 301     /* For repeated character types, we have to test for \p and \P, which have
 302     an extra two bytes of parameters. */
 303
 304     case OP_TYPESTAR:
 305     case OP_TYPEMINSTAR:
 306     case OP_TYPEQUERY:
 307     case OP_TYPEMINQUERY:
 308     case OP_TYPEPOSSTAR:
 309     case OP_TYPEPOSQUERY:
 310     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
 311     cc += PRIV(OP_lengths)[op];
 312     break;
 313
 314     case OP_TYPEUPTO:
 315     case OP_TYPEMINUPTO:
 316     case OP_TYPEPOSUPTO:
 317     if (cc[1 + IMM2_SIZE] == OP_PROP
 318       || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
 319     cc += PRIV(OP_lengths)[op];
 320     break;
 321
 322     /* Check a class for variable quantification */
 323
 324 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 325     case OP_XCLASS:
 326     cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
 327     /* Fall through */
 328 #endif
 329
 330     case OP_CLASS:
 331     case OP_NCLASS:
 332     cc += PRIV(OP_lengths)[OP_CLASS];
 333
 334     switch (*cc)
 335       {
 336       case OP_CRPLUS:
 337       case OP_CRMINPLUS:
 338       branchlength++;
 339       /* Fall through */
 340
 341       case OP_CRSTAR:
 342       case OP_CRMINSTAR:
 343       case OP_CRQUERY:
 344       case OP_CRMINQUERY:
 345       cc++;
 346       break;
 347
 348       case OP_CRRANGE:
 349       case OP_CRMINRANGE:
 350       branchlength += GET2(cc,1);
 351       cc += 1 + 2 * IMM2_SIZE;
 352       break;
 353
 354       default:
 355       branchlength++;
 356       break;
 357       }
 358     break;
 359
 360     /* Backreferences and subroutine calls are treated in the same way: we find
 361     the minimum length for the subpattern. A recursion, however, causes an
 362     a flag to be set that causes the length of this branch to be ignored. The
 363     logic is that a recursion can only make sense if there is another
 364     alternation that stops the recursing. That will provide the minimum length
 365     (when no recursion happens). A backreference within the group that it is
 366     referencing behaves in the same way.
 367
 368     If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
 369     matches an empty string (by default it causes a matching failure), so in
 370     that case we must set the minimum length to zero. */
 371
 372     case OP_REF:
 373     case OP_REFI:
 374     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
 375       {
 376       ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
 377       if (cs == NULL) return -2;
 378       do ce += GET(ce, 1); while (*ce == OP_ALT);
 379       if (cc > cs && cc < ce)
 380         {
 381         d = 0;
 382         had_recurse = TRUE;
 383         }
 384       else
 385         {
 386         d = find_minlength(cs, startcode, options, recurse_depth);
 387         }
 388       }
 389     else d = 0;
 390     cc += 1 + IMM2_SIZE;
 391
 392     /* Handle repeated back references */
 393
 394     switch (*cc)
 395       {
 396       case OP_CRSTAR:
 397       case OP_CRMINSTAR:
 398       case OP_CRQUERY:
 399       case OP_CRMINQUERY:
 400       min = 0;
 401       cc++;
 402       break;
 403
 404       case OP_CRPLUS:
 405       case OP_CRMINPLUS:
 406       min = 1;
 407       cc++;
 408       break;
 409
 410       case OP_CRRANGE:
 411       case OP_CRMINRANGE:
 412       min = GET2(cc, 1);
 413       cc += 1 + 2 * IMM2_SIZE;
 414       break;
 415
 416       default:
 417       min = 1;
 418       break;
 419       }
 420
 421     branchlength += min * d;
 422     break;
 423
 424     /* We can easily detect direct recursion, but not mutual recursion. This is
 425     caught by a recursion depth count. */
 426
 427     case OP_RECURSE:
 428     cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
 429     do ce += GET(ce, 1); while (*ce == OP_ALT);
 430     if ((cc > cs && cc < ce) || recurse_depth > 10)
 431       had_recurse = TRUE;
 432     else
 433       {
 434       branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);
 435       }
 436     cc += 1 + LINK_SIZE;
 437     break;
 438
 439     /* Anything else does not or need not match a character. We can get the
 440     item's length from the table, but for those that can match zero occurrences
 441     of a character, we must take special action for UTF-8 characters. As it
 442     happens, the "NOT" versions of these opcodes are used at present only for
 443     ASCII characters, so they could be omitted from this list. However, in
 444     future that may change, so we include them here so as not to leave a
 445     gotcha for a future maintainer. */
 446
 447     case OP_UPTO:
 448     case OP_UPTOI:
 449     case OP_NOTUPTO:
 450     case OP_NOTUPTOI:
 451     case OP_MINUPTO:
 452     case OP_MINUPTOI:
 453     case OP_NOTMINUPTO:
 454     case OP_NOTMINUPTOI:
 455     case OP_POSUPTO:
 456     case OP_POSUPTOI:
 457     case OP_NOTPOSUPTO:
 458     case OP_NOTPOSUPTOI:
 459
 460     case OP_STAR:
 461     case OP_STARI:
 462     case OP_NOTSTAR:
 463     case OP_NOTSTARI:
 464     case OP_MINSTAR:
 465     case OP_MINSTARI:
 466     case OP_NOTMINSTAR:
 467     case OP_NOTMINSTARI:
 468     case OP_POSSTAR:
 469     case OP_POSSTARI:
 470     case OP_NOTPOSSTAR:
 471     case OP_NOTPOSSTARI:
 472
 473     case OP_QUERY:
 474     case OP_QUERYI:
 475     case OP_NOTQUERY:
 476     case OP_NOTQUERYI:
 477     case OP_MINQUERY:
 478     case OP_MINQUERYI:
 479     case OP_NOTMINQUERY:
 480     case OP_NOTMINQUERYI:
 481     case OP_POSQUERY:
 482     case OP_POSQUERYI:
 483     case OP_NOTPOSQUERY:
 484     case OP_NOTPOSQUERYI:
 485
 486     cc += PRIV(OP_lengths)[op];
 487 #ifdef SUPPORT_UTF
 488     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 489 #endif
 490     break;
 491
 492     /* Skip these, but we need to add in the name length. */
 493
 494     case OP_MARK:
 495     case OP_PRUNE_ARG:
 496     case OP_SKIP_ARG:
 497     case OP_THEN_ARG:
 498     cc += PRIV(OP_lengths)[op] + cc[1];
 499     break;
 500
 501     /* The remaining opcodes are just skipped over. */
 502
 503     case OP_CLOSE:
 504     case OP_COMMIT:
 505     case OP_FAIL:
 506     case OP_PRUNE:
 507     case OP_SET_SOM:
 508     case OP_SKIP:
 509     case OP_THEN:
 510     cc += PRIV(OP_lengths)[op];
 511     break;
 512
 513     /* This should not occur: we list all opcodes explicitly so that when
 514     new ones get added they are properly considered. */
 515
 516     default:
 517     return -3;
 518     }
 519   }
 520 /* Control never gets here */
 521 }
 522
 523
 524
 525 /*************************************************
 526 *      Set a bit and maybe its alternate case    *
 527 *************************************************/
 528
 529 /* Given a character, set its first byte's bit in the table, and also the
 530 corresponding bit for the other version of a letter if we are caseless. In
 531 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
 532 when Unicode property support is available.
 533
 534 Arguments:
 535   start_bits    points to the bit map
 536   p             points to the character
 537   caseless      the caseless flag
 538   cd            the block with char table pointers
 539   utf           TRUE for UTF-8 / UTF-16 mode
 540
 541 Returns:        pointer after the character
 542 */
 543
 544 static const pcre_uchar *
 545 set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
 546   compile_data *cd, BOOL utf)
 547 {
 548 unsigned int c = *p;
 549
 550 #ifdef COMPILE_PCRE8
 551 SET_BIT(c);
 552
 553 #ifdef SUPPORT_UTF
 554 if (utf && c > 127)
 555   {
 556   GETCHARINC(c, p);
 557 #ifdef SUPPORT_UCP
 558   if (caseless)
 559     {
 560     pcre_uchar buff[6];
 561     c = UCD_OTHERCASE(c);
 562     (void)PRIV(ord2utf)(c, buff);
 563     SET_BIT(buff[0]);
 564     }
 565 #endif
 566   return p;
 567   }
 568 #endif
 569
 570 /* Not UTF-8 mode, or character is less than 127. */
 571
 572 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
 573 return p + 1;
 574 #endif
 575
 576 #ifdef COMPILE_PCRE16
 577 if (c > 0xff)
 578   {
 579   c = 0xff;
 580   caseless = FALSE;
 581   }
 582 SET_BIT(c);
 583
 584 #ifdef SUPPORT_UTF
 585 if (utf && c > 127)
 586   {
 587   GETCHARINC(c, p);
 588 #ifdef SUPPORT_UCP
 589   if (caseless)
 590     {
 591     c = UCD_OTHERCASE(c);
 592     if (c > 0xff)
 593       c = 0xff;
 594     SET_BIT(c);
 595     }
 596 #endif
 597   return p;
 598   }
 599 #endif
 600
 601 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
 602 return p + 1;
 603 #endif
 604 }
 605
 606
 607
 608 /*************************************************
 609 *     Set bits for a positive character type     *
 610 *************************************************/
 611
 612 /* This function sets starting bits for a character type. In UTF-8 mode, we can
 613 only do a direct setting for bytes less than 128, as otherwise there can be
 614 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
 615 environment, the tables will only recognize ASCII characters anyway, but in at
 616 least one Windows environment, some higher bytes bits were set in the tables.
 617 So we deal with that case by considering the UTF-8 encoding.
 618
 619 Arguments:
 620   start_bits     the starting bitmap
 621   cbit type      the type of character wanted
 622   table_limit    32 for non-UTF-8; 16 for UTF-8
 623   cd             the block with char table pointers
 624
 625 Returns:         nothing
 626 */
 627
 628 static void
 629 set_type_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
 630   compile_data *cd)
 631 {
 632 int c;
 633 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
 634 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
 635 if (table_limit == 32) return;
 636 for (c = 128; c < 256; c++)
 637   {
 638   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
 639     {
 640     pcre_uchar buff[6];
 641     (void)PRIV(ord2utf)(c, buff);
 642     SET_BIT(buff[0]);
 643     }
 644   }
 645 #endif
 646 }
 647
 648
 649 /*************************************************
 650 *     Set bits for a negative character type     *
 651 *************************************************/
 652
 653 /* This function sets starting bits for a negative character type such as \D.
 654 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
 655 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
 656 Unlike in the positive case, where we can set appropriate starting bits for
 657 specific high-valued UTF-8 characters, in this case we have to set the bits for
 658 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
 659 0xc0 (192) for simplicity.
 660
 661 Arguments:
 662   start_bits     the starting bitmap
 663   cbit type      the type of character wanted
 664   table_limit    32 for non-UTF-8; 16 for UTF-8
 665   cd             the block with char table pointers
 666
 667 Returns:         nothing
 668 */
 669
 670 static void
 671 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
 672   compile_data *cd)
 673 {
 674 int c;
 675 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
 676 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
 677 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
 678 #endif
 679 }
 680
 681
 682
 683 /*************************************************
 684 *          Create bitmap of starting bytes       *
 685 *************************************************/
 686
 687 /* This function scans a compiled unanchored expression recursively and
 688 attempts to build a bitmap of the set of possible starting bytes. As time goes
 689 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
 690 useful for parenthesized groups in patterns such as (a*)b where the group
 691 provides some optional starting bytes but scanning must continue at the outer
 692 level to find at least one mandatory byte. At the outermost level, this
 693 function fails unless the result is SSB_DONE.
 694
 695 Arguments:
 696   code         points to an expression
 697   start_bits   points to a 32-byte table, initialized to 0
 698   utf          TRUE if in UTF-8 / UTF-16 mode
 699   cd           the block with char table pointers
 700
 701 Returns:       SSB_FAIL     => Failed to find any starting bytes
 702                SSB_DONE     => Found mandatory starting bytes
 703                SSB_CONTINUE => Found optional starting bytes
 704                SSB_UNKNOWN  => Hit an unrecognized opcode
 705 */
 706
 707 static int
 708 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
 709   compile_data *cd)
 710 {
 711 int c;
 712 int yield = SSB_DONE;
 713 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
 714 int table_limit = utf? 16:32;
 715 #else
 716 int table_limit = 32;
 717 #endif
 718
 719 #if 0
 720 /* ========================================================================= */
 721 /* The following comment and code was inserted in January 1999. In May 2006,
 722 when it was observed to cause compiler warnings about unused values, I took it
 723 out again. If anybody is still using OS/2, they will have to put it back
 724 manually. */
 725
 726 /* This next statement and the later reference to dummy are here in order to
 727 trick the optimizer of the IBM C compiler for OS/2 into generating correct
 728 code. Apparently IBM isn't going to fix the problem, and we would rather not
 729 disable optimization (in this module it actually makes a big difference, and
 730 the pcre module can use all the optimization it can get). */
 731
 732 volatile int dummy;
 733 /* ========================================================================= */
 734 #endif
 735
 736 do
 737   {
 738   BOOL try_next = TRUE;
 739   const pcre_uchar *tcode = code + 1 + LINK_SIZE;
 740
 741   if (*code == OP_CBRA || *code == OP_SCBRA ||
 742       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
 743
 744   while (try_next)    /* Loop for items in this branch */
 745     {
 746     int rc;
 747
 748     switch(*tcode)
 749       {
 750       /* If we reach something we don't understand, it means a new opcode has
 751       been created that hasn't been added to this code. Hopefully this problem
 752       will be discovered during testing. */
 753
 754       default:
 755       return SSB_UNKNOWN;
 756
 757       /* Fail for a valid opcode that implies no starting bits. */
 758
 759       case OP_ACCEPT:
 760       case OP_ASSERT_ACCEPT:
 761       case OP_ALLANY:
 762       case OP_ANY:
 763       case OP_ANYBYTE:
 764       case OP_CIRC:
 765       case OP_CIRCM:
 766       case OP_CLOSE:
 767       case OP_COMMIT:
 768       case OP_COND:
 769       case OP_CREF:
 770       case OP_DEF:
 771       case OP_DOLL:
 772       case OP_DOLLM:
 773       case OP_END:
 774       case OP_EOD:
 775       case OP_EODN:
 776       case OP_EXTUNI:
 777       case OP_FAIL:
 778       case OP_MARK:
 779       case OP_NCREF:
 780       case OP_NOT:
 781       case OP_NOTEXACT:
 782       case OP_NOTEXACTI:
 783       case OP_NOTI:
 784       case OP_NOTMINPLUS:
 785       case OP_NOTMINPLUSI:
 786       case OP_NOTMINQUERY:
 787       case OP_NOTMINQUERYI:
 788       case OP_NOTMINSTAR:
 789       case OP_NOTMINSTARI:
 790       case OP_NOTMINUPTO:
 791       case OP_NOTMINUPTOI:
 792       case OP_NOTPLUS:
 793       case OP_NOTPLUSI:
 794       case OP_NOTPOSPLUS:
 795       case OP_NOTPOSPLUSI:
 796       case OP_NOTPOSQUERY:
 797       case OP_NOTPOSQUERYI:
 798       case OP_NOTPOSSTAR:
 799       case OP_NOTPOSSTARI:
 800       case OP_NOTPOSUPTO:
 801       case OP_NOTPOSUPTOI:
 802       case OP_NOTPROP:
 803       case OP_NOTQUERY:
 804       case OP_NOTQUERYI:
 805       case OP_NOTSTAR:
 806       case OP_NOTSTARI:
 807       case OP_NOTUPTO:
 808       case OP_NOTUPTOI:
 809       case OP_NOT_HSPACE:
 810       case OP_NOT_VSPACE:
 811       case OP_NRREF:
 812       case OP_PROP:
 813       case OP_PRUNE:
 814       case OP_PRUNE_ARG:
 815       case OP_RECURSE:
 816       case OP_REF:
 817       case OP_REFI:
 818       case OP_REVERSE:
 819       case OP_RREF:
 820       case OP_SCOND:
 821       case OP_SET_SOM:
 822       case OP_SKIP:
 823       case OP_SKIP_ARG:
 824       case OP_SOD:
 825       case OP_SOM:
 826       case OP_THEN:
 827       case OP_THEN_ARG:
 828 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 829       case OP_XCLASS:
 830 #endif
 831       return SSB_FAIL;
 832
 833       /* We can ignore word boundary tests. */
 834
 835       case OP_WORD_BOUNDARY:
 836       case OP_NOT_WORD_BOUNDARY:
 837       tcode++;
 838       break;
 839
 840       /* If we hit a bracket or a positive lookahead assertion, recurse to set
 841       bits from within the subpattern. If it can't find anything, we have to
 842       give up. If it finds some mandatory character(s), we are done for this
 843       branch. Otherwise, carry on scanning after the subpattern. */
 844
 845       case OP_BRA:
 846       case OP_SBRA:
 847       case OP_CBRA:
 848       case OP_SCBRA:
 849       case OP_BRAPOS:
 850       case OP_SBRAPOS:
 851       case OP_CBRAPOS:
 852       case OP_SCBRAPOS:
 853       case OP_ONCE:
 854       case OP_ONCE_NC:
 855       case OP_ASSERT:
 856       rc = set_start_bits(tcode, start_bits, utf, cd);
 857       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
 858       if (rc == SSB_DONE) try_next = FALSE; else
 859         {
 860         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
 861         tcode += 1 + LINK_SIZE;
 862         }
 863       break;
 864
 865       /* If we hit ALT or KET, it means we haven't found anything mandatory in
 866       this branch, though we might have found something optional. For ALT, we
 867       continue with the next alternative, but we have to arrange that the final
 868       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
 869       return SSB_CONTINUE: if this is the top level, that indicates failure,
 870       but after a nested subpattern, it causes scanning to continue. */
 871
 872       case OP_ALT:
 873       yield = SSB_CONTINUE;
 874       try_next = FALSE;
 875       break;
 876
 877       case OP_KET:
 878       case OP_KETRMAX:
 879       case OP_KETRMIN:
 880       case OP_KETRPOS:
 881       return SSB_CONTINUE;
 882
 883       /* Skip over callout */
 884
 885       case OP_CALLOUT:
 886       tcode += 2 + 2*LINK_SIZE;
 887       break;
 888
 889       /* Skip over lookbehind and negative lookahead assertions */
 890
 891       case OP_ASSERT_NOT:
 892       case OP_ASSERTBACK:
 893       case OP_ASSERTBACK_NOT:
 894       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
 895       tcode += 1 + LINK_SIZE;
 896       break;
 897
 898       /* BRAZERO does the bracket, but carries on. */
 899
 900       case OP_BRAZERO:
 901       case OP_BRAMINZERO:
 902       case OP_BRAPOSZERO:
 903       rc = set_start_bits(++tcode, start_bits, utf, cd);
 904       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
 905 /* =========================================================================
 906       See the comment at the head of this function concerning the next line,
 907       which was an old fudge for the benefit of OS/2.
 908       dummy = 1;
 909   ========================================================================= */
 910       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
 911       tcode += 1 + LINK_SIZE;
 912       break;
 913
 914       /* SKIPZERO skips the bracket. */
 915
 916       case OP_SKIPZERO:
 917       tcode++;
 918       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
 919       tcode += 1 + LINK_SIZE;
 920       break;
 921
 922       /* Single-char * or ? sets the bit and tries the next item */
 923
 924       case OP_STAR:
 925       case OP_MINSTAR:
 926       case OP_POSSTAR:
 927       case OP_QUERY:
 928       case OP_MINQUERY:
 929       case OP_POSQUERY:
 930       tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
 931       break;
 932
 933       case OP_STARI:
 934       case OP_MINSTARI:
 935       case OP_POSSTARI:
 936       case OP_QUERYI:
 937       case OP_MINQUERYI:
 938       case OP_POSQUERYI:
 939       tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
 940       break;
 941
 942       /* Single-char upto sets the bit and tries the next */
 943
 944       case OP_UPTO:
 945       case OP_MINUPTO:
 946       case OP_POSUPTO:
 947       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
 948       break;
 949
 950       case OP_UPTOI:
 951       case OP_MINUPTOI:
 952       case OP_POSUPTOI:
 953       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
 954       break;
 955
 956       /* At least one single char sets the bit and stops */
 957
 958       case OP_EXACT:
 959       tcode += IMM2_SIZE;
 960       /* Fall through */
 961       case OP_CHAR:
 962       case OP_PLUS:
 963       case OP_MINPLUS:
 964       case OP_POSPLUS:
 965       (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
 966       try_next = FALSE;
 967       break;
 968
 969       case OP_EXACTI:
 970       tcode += IMM2_SIZE;
 971       /* Fall through */
 972       case OP_CHARI:
 973       case OP_PLUSI:
 974       case OP_MINPLUSI:
 975       case OP_POSPLUSI:
 976       (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
 977       try_next = FALSE;
 978       break;
 979
 980       /* Special spacing and line-terminating items. These recognize specific
 981       lists of characters. The difference between VSPACE and ANYNL is that the
 982       latter can match the two-character CRLF sequence, but that is not
 983       relevant for finding the first character, so their code here is
 984       identical. */
 985
 986       case OP_HSPACE:
 987       SET_BIT(0x09);
 988       SET_BIT(0x20);
 989 #ifdef SUPPORT_UTF
 990       if (utf)
 991         {
 992 #ifdef COMPILE_PCRE8
 993         SET_BIT(0xC2);  /* For U+00A0 */
 994         SET_BIT(0xE1);  /* For U+1680, U+180E */
 995         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
 996         SET_BIT(0xE3);  /* For U+3000 */
 997 #endif
 998 #ifdef COMPILE_PCRE16
 999         SET_BIT(0xA0);
1000         SET_BIT(0xFF);  /* For characters > 255 */
1001 #endif
1002         }
1003       else
1004 #endif /* SUPPORT_UTF */
1005         {
1006         SET_BIT(0xA0);
1007 #ifdef COMPILE_PCRE16
1008         SET_BIT(0xFF);  /* For characters > 255 */
1009 #endif
1010         }
1011       try_next = FALSE;
1012       break;
1013
1014       case OP_ANYNL:
1015       case OP_VSPACE:
1016       SET_BIT(0x0A);
1017       SET_BIT(0x0B);
1018       SET_BIT(0x0C);
1019       SET_BIT(0x0D);
1020 #ifdef SUPPORT_UTF
1021       if (utf)
1022         {
1023 #ifdef COMPILE_PCRE8
1024         SET_BIT(0xC2);  /* For U+0085 */
1025         SET_BIT(0xE2);  /* For U+2028, U+2029 */
1026 #endif
1027 #ifdef COMPILE_PCRE16
1028         SET_BIT(0x85);
1029         SET_BIT(0xFF);  /* For characters > 255 */
1030 #endif
1031         }
1032       else
1033 #endif /* SUPPORT_UTF */
1034         {
1035         SET_BIT(0x85);
1036 #ifdef COMPILE_PCRE16
1037         SET_BIT(0xFF);  /* For characters > 255 */
1038 #endif
1039         }
1040       try_next = FALSE;
1041       break;
1042
1043       /* Single character types set the bits and stop. Note that if PCRE_UCP
1044       is set, we do not see these op codes because \d etc are converted to
1045       properties. Therefore, these apply in the case when only characters less
1046       than 256 are recognized to match the types. */
1047
1048       case OP_NOT_DIGIT:
1049       set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1050       try_next = FALSE;
1051       break;
1052
1053       case OP_DIGIT:
1054       set_type_bits(start_bits, cbit_digit, table_limit, cd);
1055       try_next = FALSE;
1056       break;
1057
1058       /* The cbit_space table has vertical tab as whitespace; we have to
1059       ensure it is set as not whitespace. */
1060
1061       case OP_NOT_WHITESPACE:
1062       set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1063       start_bits[1] |= 0x08;
1064       try_next = FALSE;
1065       break;
1066
1067       /* The cbit_space table has vertical tab as whitespace; we have to
1068       not set it from the table. */
1069
1070       case OP_WHITESPACE:
1071       c = start_bits[1];    /* Save in case it was already set */
1072       set_type_bits(start_bits, cbit_space, table_limit, cd);
1073       start_bits[1] = (start_bits[1] & ~0x08) | c;
1074       try_next = FALSE;
1075       break;
1076
1077       case OP_NOT_WORDCHAR:
1078       set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1079       try_next = FALSE;
1080       break;
1081
1082       case OP_WORDCHAR:
1083       set_type_bits(start_bits, cbit_word, table_limit, cd);
1084       try_next = FALSE;
1085       break;
1086
1087       /* One or more character type fudges the pointer and restarts, knowing
1088       it will hit a single character type and stop there. */
1089
1090       case OP_TYPEPLUS:
1091       case OP_TYPEMINPLUS:
1092       case OP_TYPEPOSPLUS:
1093       tcode++;
1094       break;
1095
1096       case OP_TYPEEXACT:
1097       tcode += 1 + IMM2_SIZE;
1098       break;
1099
1100       /* Zero or more repeats of character types set the bits and then
1101       try again. */
1102
1103       case OP_TYPEUPTO:
1104       case OP_TYPEMINUPTO:
1105       case OP_TYPEPOSUPTO:
1106       tcode += IMM2_SIZE;  /* Fall through */
1107
1108       case OP_TYPESTAR:
1109       case OP_TYPEMINSTAR:
1110       case OP_TYPEPOSSTAR:
1111       case OP_TYPEQUERY:
1112       case OP_TYPEMINQUERY:
1113       case OP_TYPEPOSQUERY:
1114       switch(tcode[1])
1115         {
1116         default:
1117         case OP_ANY:
1118         case OP_ALLANY:
1119         return SSB_FAIL;
1120
1121         case OP_HSPACE:
1122         SET_BIT(0x09);
1123         SET_BIT(0x20);
1124 #ifdef SUPPORT_UTF
1125         if (utf)
1126           {
1127 #ifdef COMPILE_PCRE8
1128           SET_BIT(0xC2);  /* For U+00A0 */
1129           SET_BIT(0xE1);  /* For U+1680, U+180E */
1130           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1131           SET_BIT(0xE3);  /* For U+3000 */
1132 #endif
1133 #ifdef COMPILE_PCRE16
1134           SET_BIT(0xA0);
1135           SET_BIT(0xFF);  /* For characters > 255 */
1136 #endif
1137           }
1138         else
1139 #endif /* SUPPORT_UTF */
1140           SET_BIT(0xA0);
1141         break;
1142
1143         case OP_ANYNL:
1144         case OP_VSPACE:
1145         SET_BIT(0x0A);
1146         SET_BIT(0x0B);
1147         SET_BIT(0x0C);
1148         SET_BIT(0x0D);
1149 #ifdef SUPPORT_UTF
1150         if (utf)
1151           {
1152 #ifdef COMPILE_PCRE8
1153           SET_BIT(0xC2);  /* For U+0085 */
1154           SET_BIT(0xE2);  /* For U+2028, U+2029 */
1155 #endif
1156 #ifdef COMPILE_PCRE16
1157           SET_BIT(0x85);
1158           SET_BIT(0xFF);  /* For characters > 255 */
1159 #endif
1160           }
1161         else
1162 #endif /* SUPPORT_UTF */
1163           SET_BIT(0x85);
1164         break;
1165
1166         case OP_NOT_DIGIT:
1167         set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1168         break;
1169
1170         case OP_DIGIT:
1171         set_type_bits(start_bits, cbit_digit, table_limit, cd);
1172         break;
1173
1174         /* The cbit_space table has vertical tab as whitespace; we have to
1175         ensure it gets set as not whitespace. */
1176
1177         case OP_NOT_WHITESPACE:
1178         set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1179         start_bits[1] |= 0x08;
1180         break;
1181
1182         /* The cbit_space table has vertical tab as whitespace; we have to
1183         avoid setting it. */
1184
1185         case OP_WHITESPACE:
1186         c = start_bits[1];    /* Save in case it was already set */
1187         set_type_bits(start_bits, cbit_space, table_limit, cd);
1188         start_bits[1] = (start_bits[1] & ~0x08) | c;
1189         break;
1190
1191         case OP_NOT_WORDCHAR:
1192         set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1193         break;
1194
1195         case OP_WORDCHAR:
1196         set_type_bits(start_bits, cbit_word, table_limit, cd);
1197         break;
1198         }
1199
1200       tcode += 2;
1201       break;
1202
1203       /* Character class where all the information is in a bit map: set the
1204       bits and either carry on or not, according to the repeat count. If it was
1205       a negative class, and we are operating with UTF-8 characters, any byte
1206       with a value >= 0xc4 is a potentially valid starter because it starts a
1207       character with a value > 255. */
1208
1209       case OP_NCLASS:
1210 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1211       if (utf)
1212         {
1213         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
1214         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
1215         }
1216 #endif
1217 #ifdef COMPILE_PCRE16
1218       SET_BIT(0xFF);                         /* For characters > 255 */
1219 #endif
1220       /* Fall through */
1221
1222       case OP_CLASS:
1223         {
1224         pcre_uint8 *map;
1225         tcode++;
1226         map = (pcre_uint8 *)tcode;
1227
1228         /* In UTF-8 mode, the bits in a bit map correspond to character
1229         values, not to byte values. However, the bit map we are constructing is
1230         for byte values. So we have to do a conversion for characters whose
1231         value is > 127. In fact, there are only two possible starting bytes for
1232         characters in the range 128 - 255. */
1233
1234 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1235         if (utf)
1236           {
1237           for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1238           for (c = 128; c < 256; c++)
1239             {
1240             if ((map[c/8] && (1 << (c&7))) != 0)
1241               {
1242               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
1243               start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
1244               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
1245               }
1246             }
1247           }
1248         else
1249 #endif
1250           {
1251           /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1252           for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1253           }
1254
1255         /* Advance past the bit map, and act on what follows. For a zero
1256         minimum repeat, continue; otherwise stop processing. */
1257
1258         tcode += 32 / sizeof(pcre_uchar);
1259         switch (*tcode)
1260           {
1261           case OP_CRSTAR:
1262           case OP_CRMINSTAR:
1263           case OP_CRQUERY:
1264           case OP_CRMINQUERY:
1265           tcode++;
1266           break;
1267
1268           case OP_CRRANGE:
1269           case OP_CRMINRANGE:
1270           if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1271             else try_next = FALSE;
1272           break;
1273
1274           default:
1275           try_next = FALSE;
1276           break;
1277           }
1278         }
1279       break; /* End of bitmap class handling */
1280
1281       }      /* End of switch */
1282     }        /* End of try_next loop */
1283
1284   code += GET(code, 1);   /* Advance to next branch */
1285   }
1286 while (*code == OP_ALT);
1287 return yield;
1288 }
1289
1290
1291
1292
1293
1294 /*************************************************
1295 *          Study a compiled expression           *
1296 *************************************************/
1297
1298 /* This function is handed a compiled expression that it must study to produce
1299 information that will speed up the matching. It returns a pcre[16]_extra block
1300 which then gets handed back to pcre_exec().
1301
1302 Arguments:
1303   re        points to the compiled expression
1304   options   contains option bits
1305   errorptr  points to where to place error messages;
1306             set NULL unless error
1307
1308 Returns:    pointer to a pcre[16]_extra block, with study_data filled in and
1309               the appropriate flags set;
1310             NULL on error or if no optimization possible
1311 */
1312
1313 #ifdef COMPILE_PCRE8
1314 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1315 pcre_study(const pcre *external_re, int options, const char **errorptr)
1316 #else
1317 PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1318 pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1319 #endif
1320 {
1321 int min;
1322 BOOL bits_set = FALSE;
1323 pcre_uint8 start_bits[32];
1324 PUBL(extra) *extra = NULL;
1325 pcre_study_data *study;
1326 const pcre_uint8 *tables;
1327 pcre_uchar *code;
1328 compile_data compile_block;
1329 const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1330
1331 *errorptr = NULL;
1332
1333 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1334   {
1335   *errorptr = "argument is not a compiled regular expression";
1336   return NULL;
1337   }
1338
1339 if ((re->flags & PCRE_MODE) == 0)
1340   {
1341 #ifdef COMPILE_PCRE8
1342   *errorptr = "argument is compiled in 16 bit mode";
1343 #else
1344   *errorptr = "argument is compiled in 8 bit mode";
1345 #endif
1346   return NULL;
1347   }
1348
1349 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1350   {
1351   *errorptr = "unknown or incorrect option bit(s) set";
1352   return NULL;
1353   }
1354
1355 code = (pcre_uchar *)re + re->name_table_offset +
1356   (re->name_count * re->name_entry_size);
1357
1358 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1359 a multiline pattern that matches only at "line starts", there is no point in
1360 seeking a list of starting bytes. */
1361
1362 if ((re->options & PCRE_ANCHORED) == 0 &&
1363     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1364   {
1365   int rc;
1366
1367   /* Set the character tables in the block that is passed around */
1368
1369   tables = re->tables;
1370
1371 #ifdef COMPILE_PCRE8
1372   if (tables == NULL)
1373     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1374     (void *)(&tables));
1375 #else
1376   if (tables == NULL)
1377     (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1378     (void *)(&tables));
1379 #endif
1380
1381   compile_block.lcc = tables + lcc_offset;
1382   compile_block.fcc = tables + fcc_offset;
1383   compile_block.cbits = tables + cbits_offset;
1384   compile_block.ctypes = tables + ctypes_offset;
1385
1386   /* See if we can find a fixed set of initial characters for the pattern. */
1387
1388   memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1389   rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1390     &compile_block);
1391   bits_set = rc == SSB_DONE;
1392   if (rc == SSB_UNKNOWN)
1393     {
1394     *errorptr = "internal error: opcode not recognized";
1395     return NULL;
1396     }
1397   }
1398
1399 /* Find the minimum length of subject string. */
1400
1401 switch(min = find_minlength(code, code, re->options, 0))
1402   {
1403   case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1404   case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1405   default: break;
1406   }
1407
1408 /* If a set of starting bytes has been identified, or if the minimum length is
1409 greater than zero, or if JIT optimization has been requested, get a
1410 pcre[16]_extra block and a pcre_study_data block. The study data is put in the
1411 latter, which is pointed to by the former, which may also get additional data
1412 set later by the calling program. At the moment, the size of pcre_study_data
1413 is fixed. We nevertheless save it in a field for returning via the
1414 pcre_fullinfo() function so that if it becomes variable in the future,
1415 we don't have to change that code. */
1416
1417 if (bits_set || min > 0
1418 #ifdef SUPPORT_JIT
1419     || (options & (PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE
1420                  | PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE)) != 0
1421 #endif
1422   )
1423   {
1424   extra = (PUBL(extra) *)(PUBL(malloc))
1425     (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1426   if (extra == NULL)
1427     {
1428     *errorptr = "failed to get memory";
1429     return NULL;
1430     }
1431
1432   study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1433   extra->flags = PCRE_EXTRA_STUDY_DATA;
1434   extra->study_data = study;
1435
1436   study->size = sizeof(pcre_study_data);
1437   study->flags = 0;
1438
1439   /* Set the start bits always, to avoid unset memory errors if the
1440   study data is written to a file, but set the flag only if any of the bits
1441   are set, to save time looking when none are. */
1442
1443   if (bits_set)
1444     {
1445     study->flags |= PCRE_STUDY_MAPPED;
1446     memcpy(study->start_bits, start_bits, sizeof(start_bits));
1447     }
1448   else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1449
1450 #ifdef PCRE_DEBUG
1451   if (bits_set)
1452     {
1453     pcre_uint8 *ptr = start_bits;
1454     int i;
1455
1456     printf("Start bits:\n");
1457     for (i = 0; i < 32; i++)
1458       printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1459     }
1460 #endif
1461
1462   /* Always set the minlength value in the block, because the JIT compiler
1463   makes use of it. However, don't set the bit unless the length is greater than
1464   zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1465   checking the zero case. */
1466
1467   if (min > 0)
1468     {
1469     study->flags |= PCRE_STUDY_MINLEN;
1470     study->minlength = min;
1471     }
1472   else study->minlength = 0;
1473
1474   /* If JIT support was compiled and requested, attempt the JIT compilation.
1475   If no starting bytes were found, and the minimum length is zero, and JIT
1476   compilation fails, abandon the extra block and return NULL. */
1477
1478 #ifdef SUPPORT_JIT
1479   extra->executable_jit = NULL;
1480   if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1481     PRIV(jit_compile)(re, extra, JIT_COMPILE);
1482   if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1483     PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1484   if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1485     PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1486
1487   if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0)
1488     {
1489 #ifdef COMPILE_PCRE8
1490     pcre_free_study(extra);
1491 #endif
1492 #ifdef COMPILE_PCRE16
1493     pcre16_free_study(extra);
1494 #endif
1495     extra = NULL;
1496     }
1497 #endif
1498   }
1499
1500 return extra;
1501 }
1502
1503
1504 /*************************************************
1505 *          Free the study data                   *
1506 *************************************************/
1507
1508 /* This function frees the memory that was obtained by pcre_study().
1509
1510 Argument:   a pointer to the pcre[16]_extra block
1511 Returns:    nothing
1512 */
1513
1514 #ifdef COMPILE_PCRE8
1515 PCRE_EXP_DEFN void
1516 pcre_free_study(pcre_extra *extra)
1517 #else
1518 PCRE_EXP_DEFN void
1519 pcre16_free_study(pcre16_extra *extra)
1520 #endif
1521 {
1522 if (extra == NULL)
1523   return;
1524 #ifdef SUPPORT_JIT
1525 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1526      extra->executable_jit != NULL)
1527   PRIV(jit_free)(extra->executable_jit);
1528 #endif
1529 PUBL(free)(extra);
1530 }
1531
1532 /* End of pcre_study.c */