third_party/re2/ucs2.diff

   1 This is a dump from Google's source control system of the change
   2 that removed UCS-2 support from RE2.  As the explanation below
   3 says, UCS-2 mode is fundamentally at odds with things like ^ and $,
   4 so it never really worked very well.  It did, however, work for patterns
   5 that avoid those operators.  It assumed that the
   6 UCS-2 data was in the native host byte order.
   7
   8 If you are interested in adding UCS-2 mode back, this patch might
   9 be a good starting point.
  10
  11
  12 Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15
  13
  14         Retire UCS-2 mode.
  15
  16         I added it as an experiment for V8, but it
  17         requires 2-byte lookahead to do completely,
  18         and RE2 has 1-byte lookahead (enough for UTF-8)
  19         as a fairly deep fundamental assumption,
  20         so it did not support ^ or $.
  21
  22 ==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ====
  23 re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319
  24       cap_[0] = p;
  25       if (TrySearch(prog_->start(), p))  // Match must be leftmost; done.
  26         return true;
  27 -     if (prog_->flags() & Regexp::UCS2)
  28 -       p++;
  29     }
  30     return false;
  31   }
  32 ==== re2/compile.cc#17 - re2/compile.cc#18 ====
  33 re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100
  34   // Input encodings.
  35   enum Encoding {
  36     kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
  37 -   kEncodingUCS2,     // UCS-2 (0-FFFF), native byte order
  38     kEncodingLatin1,    // Latin1 (0-FF)
  39   };
  40
  41 re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172
  42     void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
  43     void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
  44     void Add_80_10ffff();
  45 -   void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase);
  46 -   void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
  47 -                    uint8 lo2, uint8 hi2, bool fold2);
  48
  49     // New suffix that matches the byte range lo-hi, then goes to next.
  50     Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next);
  51 re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477
  52
  53   // Converts rune range lo-hi into a fragment that recognizes
  54   // the bytes that would make up those runes in the current
  55 - // encoding (Latin 1, UTF-8, or UCS-2).
  56 + // encoding (Latin 1 or UTF-8).
  57   // This lets the machine work byte-by-byte even when
  58   // using multibyte encodings.
  59
  60 re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489
  61       case kEncodingLatin1:
  62         AddRuneRangeLatin1(lo, hi, foldcase);
  63         break;
  64 -     case kEncodingUCS2:
  65 -       AddRuneRangeUCS2(lo, hi, foldcase);
  66 -       break;
  67     }
  68   }
  69
  70 re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501
  71     AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL));
  72   }
  73
  74 - // Test whether 16-bit values are big or little endian.
  75 - static bool BigEndian() {
  76 -   union {
  77 -     char byte[2];
  78 -     int16 endian;
  79 -   } u;
  80 -
  81 -   u.byte[0] = 1;
  82 -   u.byte[1] = 2;
  83 -   return u.endian == 0x0102;
  84 - }
  85 -
  86 - void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
  87 -                            uint8 lo2, uint8 hi2, bool fold2) {
  88 -   Inst* ip;
  89 -   if (reversed_) {
  90 -     ip = RuneByteSuffix(lo1, hi1, fold1, NULL);
  91 -     ip = RuneByteSuffix(lo2, hi2, fold2, ip);
  92 -   } else {
  93 -     ip = RuneByteSuffix(lo2, hi2, fold2, NULL);
  94 -     ip = RuneByteSuffix(lo1, hi1, fold1, ip);
  95 -   }
  96 -   AddSuffix(ip);
  97 - }
  98 -
  99 - void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) {
 100 -   if (lo > hi || lo > 0xFFFF)
 101 -     return;
 102 -   if (hi > 0xFFFF)
 103 -     hi = 0xFFFF;
 104 -
 105 -   // We'll assemble a pattern assuming big endian.
 106 -   // If the machine isn't, tell Cat to reverse its arguments.
 107 -   bool oldreversed = reversed_;
 108 -   if (!BigEndian()) {
 109 -     reversed_ = !oldreversed;
 110 -   }
 111 -
 112 -   // Split into bytes.
 113 -   int lo1 = lo >> 8;
 114 -   int lo2 = lo & 0xFF;
 115 -   int hi1 = hi >> 8;
 116 -   int hi2 = hi & 0xFF;
 117 -
 118 -   if (lo1 == hi1) {
 119 -     // Easy case: high bits are same in both.
 120 -     // Only do ASCII case folding on the second byte if the top byte is 00.
 121 -     AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase);
 122 -   } else {
 123 -     // Harder case: different second byte ranges depending on first byte.
 124 -
 125 -     // Initial fragment.
 126 -     if (lo2 > 0) {
 127 -       AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase);
 128 -       lo1++;
 129 -     }
 130 -
 131 -     // Trailing fragment.
 132 -     if (hi2 < 0xFF) {
 133 -       AddUCS2Pair(hi1, hi1, false, 0, hi2, false);
 134 -       hi1--;
 135 -     }
 136 -
 137 -     // Inner ranges.
 138 -     if (lo1 <= hi1) {
 139 -       AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false);
 140 -     }
 141 -   }
 142 -
 143 -   // Restore reverse setting.
 144 -   reversed_ = oldreversed;
 145 - }
 146 -
 147   // Table describing how to make a UTF-8 matching machine
 148   // for the rune range 80-10FFFF (Runeself-Runemax).
 149   // This range happens frequently enough (for example /./ and /[^a-z]/)
 150 re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634
 151
 152   Frag Compiler::Literal(Rune r, bool foldcase) {
 153     switch (encoding_) {
 154 -     default:  // UCS-2 or something new
 155 -       BeginRange();
 156 -       AddRuneRange(r, r, foldcase);
 157 -       return EndRange();
 158 +     default:
 159 +       return kNullFrag;
 160
 161       case kEncodingLatin1:
 162         return ByteRange(r, r, foldcase);
 163 re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850
 164
 165     if (re->parse_flags() & Regexp::Latin1)
 166       c.encoding_ = kEncodingLatin1;
 167 -   else if (re->parse_flags() & Regexp::UCS2)
 168 -     c.encoding_ = kEncodingUCS2;
 169     c.reversed_ = reversed;
 170     if (max_mem <= 0) {
 171       c.max_inst_ = 100000;  // more than enough
 172 re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905
 173       c.prog_->set_start_unanchored(c.prog_->start());
 174     } else {
 175       Frag dot;
 176 -     if (c.encoding_ == kEncodingUCS2) {
 177 -       dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false));
 178 -     } else {
 179 -       dot = c.ByteRange(0x00, 0xFF, false);
 180 -     }
 181 +     dot = c.ByteRange(0x00, 0xFF, false);
 182       Frag dotloop = c.Star(dot, true);
 183       Frag unanchored = c.Cat(dotloop, all);
 184       c.prog_->set_start_unanchored(unanchored.begin);
 185 ==== re2/nfa.cc#8 - re2/nfa.cc#9 ====
 186 re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431
 187     const char* bp = context.begin();
 188     int c = -1;
 189     int wasword = 0;
 190 -   bool ucs2 = prog_->flags() & Regexp::UCS2;
 191
 192     if (text.begin() > context.begin()) {
 193       c = text.begin()[-1] & 0xFF;
 194 re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497
 195         // If there's a required first byte for an unanchored search
 196         // and we're not in the middle of any possible matches,
 197         // use memchr to search for the byte quickly.
 198 -       if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 &&
 199 +       if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
 200             p < text.end() && (p[0] & 0xFF) != first_byte_) {
 201           p = reinterpret_cast<const char*>(memchr(p, first_byte_,
 202                                                    text.end() - p));
 203 re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514
 204           flag = Prog::EmptyFlags(context, p);
 205         }
 206
 207 -       // In UCS-2 mode, if we need to start a new thread,
 208 -       // make sure to do it on an even boundary.
 209 -       if(ucs2 && runq->size() == 0 &&
 210 -           (p - context.begin()) % 2 && p < text.end()) {
 211 -         p++;
 212 -         flag = Prog::EmptyFlags(context, p);
 213 -       }
 214 -
 215         // Steal match storage (cleared but unused as of yet)
 216         // temporarily to hold match boundaries for new thread.
 217 -       // In UCS-2 mode, only start the thread on a 2-byte boundary.
 218 -       if(!ucs2 || (p - context.begin()) % 2 == 0) {
 219 -         match_[0] = p;
 220 -         AddToThreadq(runq, start_, flag, p, match_);
 221 -         match_[0] = NULL;
 222 -       }
 223 +       match_[0] = p;
 224 +       AddToThreadq(runq, start_, flag, p, match_);
 225 +       match_[0] = NULL;
 226       }
 227
 228       // If all the threads have died, stop early.
 229 ==== re2/parse.cc#22 - re2/parse.cc#23 ====
 230 re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165
 231       status_(status), stacktop_(NULL), ncap_(0) {
 232     if (flags_ & Latin1)
 233       rune_max_ = 0xFF;
 234 -   else if (flags & UCS2)
 235 -     rune_max_ = 0xFFFF;
 236     else
 237       rune_max_ = Runemax;
 238   }
 239 re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374
 240   bool Regexp::ParseState::PushCarat() {
 241     if (flags_ & OneLine) {
 242       return PushSimpleOp(kRegexpBeginText);
 243 -   } else {
 244 -     if (flags_ & UCS2) {
 245 -       status_->set_code(kRegexpUnsupported);
 246 -       status_->set_error_arg("multiline ^ in UCS-2 mode");
 247 -       return false;
 248 -     }
 249 -     return PushSimpleOp(kRegexpBeginLine);
 250     }
 251 +   return PushSimpleOp(kRegexpBeginLine);
 252   }
 253
 254   // Pushes a \b or \B onto the stack.
 255   bool Regexp::ParseState::PushWordBoundary(bool word) {
 256 -   if (flags_ & UCS2) {
 257 -     status_->set_code(kRegexpUnsupported);
 258 -     status_->set_error_arg("\\b or \\B in UCS-2 mode");
 259 -     return false;
 260 -   }
 261     if (word)
 262       return PushSimpleOp(kRegexpWordBoundary);
 263     return PushSimpleOp(kRegexpNoWordBoundary);
 264 re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389
 265       bool ret = PushSimpleOp(kRegexpEndText);
 266       flags_ = oflags;
 267       return ret;
 268 -   }
 269 -   if (flags_ & UCS2) {
 270 -     status_->set_code(kRegexpUnsupported);
 271 -     status_->set_error_arg("multiline $ in UCS-2 mode");
 272 -     return false;
 273     }
 274     return PushSimpleOp(kRegexpEndLine);
 275   }
 276 ==== re2/re2.cc#34 - re2/re2.cc#35 ====
 277 re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84
 278         return RE2::ErrorBadUTF8;
 279       case re2::kRegexpBadNamedCapture:
 280         return RE2::ErrorBadNamedCapture;
 281 -     case re2::kRegexpUnsupported:
 282 -       return RE2::ErrorUnsupported;
 283     }
 284     return RE2::ErrorInternal;
 285   }
 286 re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125
 287         break;
 288       case RE2::Options::EncodingLatin1:
 289         flags |= Regexp::Latin1;
 290 -       break;
 291 -     case RE2::Options::EncodingUCS2:
 292 -       flags |= Regexp::UCS2;
 293         break;
 294     }
 295
 296 ==== re2/re2.h#36 - re2/re2.h#37 ====
 297 re2/re2.h#36:246,252 - re2/re2.h#37:246,251
 298       ErrorBadUTF8,            // invalid UTF-8 in regexp
 299       ErrorBadNamedCapture,    // bad named capture group
 300       ErrorPatternTooLarge,    // pattern too large (compile failed)
 301 -     ErrorUnsupported,        // unsupported feature (in UCS-2 mode)
 302     };
 303
 304     // Predefined common options.
 305 re2/re2.h#36:570,576 - re2/re2.h#37:569,574
 306
 307       enum Encoding {
 308         EncodingUTF8 = 1,
 309 -       EncodingUCS2,      // 16-bit Unicode 0-FFFF only
 310         EncodingLatin1
 311       };
 312
 313 ==== re2/regexp.cc#15 - re2/regexp.cc#16 ====
 314 re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329
 315   // the regexp that remains after the prefix.  The prefix might
 316   // be ASCII case-insensitive.
 317   bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
 318 -   // Don't even bother for UCS-2; it's time to throw that code away.
 319 -   if (parse_flags_ & UCS2)
 320 -     return false;
 321 -
 322     // No need for a walker: the regexp must be of the form
 323     // 1. some number of ^ anchors
 324     // 2. a literal char or string
 325 ==== re2/regexp.h#20 - re2/regexp.h#21 ====
 326 re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192
 327     kRegexpBadPerlOp,          // bad perl operator
 328     kRegexpBadUTF8,            // invalid UTF-8 in regexp
 329     kRegexpBadNamedCapture,    // bad named capture
 330 -   kRegexpUnsupported,        // unsupported operator
 331   };
 332
 333   // Error status for certain operations.
 334 re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314
 335                              //   \Q and \E to disable/enable metacharacters
 336                              //   (?P<name>expr) for named captures
 337                              //   \C to match any single byte
 338 -     UCS2         = 1<<10,  // Text is in UCS-2, regexp is in UTF-8.
 339 -     UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group
 340 +     UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
 341                              //   and \P{Han} for its negation.
 342 -     NeverNL      = 1<<12,  // Never match NL, even if the regexp mentions
 343 +     NeverNL      = 1<<11,  // Never match NL, even if the regexp mentions
 344                              //   it explicitly.
 345
 346       // As close to Perl as we can get.
 347 ==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ====
 348 re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139
 349       cap_[0] = p;
 350       if (Visit(prog_->start(), p))  // Match must be leftmost; done.
 351         return true;
 352 -     if (prog_->flags() & Regexp::UCS2)
 353 -       p++;
 354     }
 355     return false;
 356   }
 357 ==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ====
 358 re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152
 359   static ParseMode parse_modes[] = {
 360     { single_line,                   "single-line"          },
 361     { single_line|Regexp::Latin1,    "single-line, latin1"  },
 362 -   { single_line|Regexp::UCS2,     "single-line, ucs2"   },
 363     { multi_line,                    "multiline"            },
 364     { multi_line|Regexp::NonGreedy,  "multiline, nongreedy" },
 365     { multi_line|Regexp::Latin1,     "multiline, latin1"    },
 366 -   { multi_line|Regexp::UCS2,      "multiline, ucs2"     },
 367   };
 368
 369   static string FormatMode(Regexp::ParseFlags flags) {
 370 re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185
 371     RegexpStatus status;
 372     regexp_ = Regexp::Parse(regexp_str, flags, &status);
 373     if (regexp_ == NULL) {
 374 -     if (status.code() != kRegexpUnsupported) {
 375 -       LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
 376 -                 << " mode: " << FormatMode(flags);
 377 -       error_ = true;
 378 -     }
 379 +     LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
 380 +               << " mode: " << FormatMode(flags);
 381 +     error_ = true;
 382       return;
 383     }
 384     prog_ = regexp_->CompileToProg(0);
 385 re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231
 386       RE2::Options options;
 387       if (flags & Regexp::Latin1)
 388         options.set_encoding(RE2::Options::EncodingLatin1);
 389 -     else if (flags & Regexp::UCS2)
 390 -       options.set_encoding(RE2::Options::EncodingUCS2);
 391       if (kind_ == Prog::kLongestMatch)
 392         options.set_longest_match(true);
 393       re2_ = new RE2(re, options);
 394 re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280
 395       delete re2_;
 396   }
 397
 398 - // Converts UTF-8 string in text into UCS-2 string in new_text.
 399 - static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) {
 400 -   const char* p = text.begin();
 401 -   const char* ep = text.end();
 402 -   uint16* q = new uint16[ep - p];
 403 -   uint16* q0 = q;
 404 -
 405 -   int n;
 406 -   Rune r;
 407 -   for (; p < ep; p += n) {
 408 -     if (!fullrune(p, ep - p)) {
 409 -       delete[] q0;
 410 -       return false;
 411 -     }
 412 -     n = chartorune(&r, p);
 413 -     if (r > 0xFFFF) {
 414 -       delete[] q0;
 415 -       return false;
 416 -     }
 417 -     *q++ = r;
 418 -   }
 419 -   *new_text = StringPiece(reinterpret_cast<char*>(q0), 2*(q - q0));
 420 -   return true;
 421 - }
 422 -
 423 - // Rewrites *sp from being a pointer into text8 (UTF-8)
 424 - // to being a pointer into text16 (equivalent text but in UCS-2).
 425 - static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16,
 426 -                               StringPiece *sp) {
 427 -   if (sp->begin() == NULL && text8.begin() != NULL)
 428 -     return;
 429 -
 430 -   int nrune = 0;
 431 -   int n;
 432 -   Rune r;
 433 -   const char* p = text8.begin();
 434 -   const char* ep = text8.end();
 435 -   const char* spbegin = NULL;
 436 -   const char* spend = NULL;
 437 -   for (;;) {
 438 -     if (p == sp->begin())
 439 -       spbegin = text16.begin() + sizeof(uint16)*nrune;
 440 -     if (p == sp->end())
 441 -       spend = text16.begin() + sizeof(uint16)*nrune;
 442 -     if (p >= ep)
 443 -       break;
 444 -     n = chartorune(&r, p);
 445 -     p += n;
 446 -     nrune++;
 447 -   }
 448 -   if (spbegin == NULL || spend == NULL) {
 449 -     LOG(FATAL) << "Error in AdjustUTF8ToUCS2 "
 450 -                << CEscape(text8) << " "
 451 -                << (int)(sp->begin() - text8.begin()) << " "
 452 -                << (int)(sp->end() - text8.begin());
 453 -   }
 454 -   *sp = StringPiece(spbegin, spend - spbegin);
 455 - }
 456 -
 457 - // Rewrites *sp from begin a pointer into text16 (UCS-2)
 458 - // to being a pointer into text8 (equivalent text but in UTF-8).
 459 - static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8,
 460 -                               StringPiece* sp) {
 461 -   if (sp->begin() == NULL)
 462 -     return;
 463 -
 464 -   int nrune = 0;
 465 -   int n;
 466 -   Rune r;
 467 -   const char* p = text8.begin();
 468 -   const char* ep = text8.end();
 469 -   const char* spbegin = NULL;
 470 -   const char* spend = NULL;
 471 -   for (;;) {
 472 -     if (nrune == (sp->begin() - text16.begin())/2)
 473 -       spbegin = p;
 474 -     if (nrune == (sp->end() - text16.begin())/2)
 475 -       spend = p;
 476 -     if (p >= ep)
 477 -       break;
 478 -     n = chartorune(&r, p);
 479 -     p += n;
 480 -     nrune++;
 481 -   }
 482 -   if (text8.begin() != NULL && (spbegin == NULL || spend == NULL)) {
 483 -     LOG(FATAL) << "Error in AdjustUCS2ToUTF8 "
 484 -                << CEscape(text16) << " "
 485 -                << (int)(sp->begin() - text16.begin()) << " "
 486 -                << (int)(sp->end() - text16.begin());
 487 -   }
 488 -   *sp = StringPiece(spbegin, spend - spbegin);
 489 - }
 490 -
 491   // Runs a single search using the named engine type.
 492   // This interface hides all the irregularities of the various
 493   // engine interfaces from the rest of this file.
 494 re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300
 495
 496     StringPiece text = orig_text;
 497     StringPiece context = orig_context;
 498 -   bool ucs2 = false;
 499
 500 -   if ((flags() & Regexp::UCS2) && type != kEnginePCRE) {
 501 -     if (!ConvertUTF8ToUCS2(orig_context, &context)) {
 502 -       result->skipped = true;
 503 -       return;
 504 -     }
 505 -
 506 -     // Rewrite context to refer to new text.
 507 -     AdjustUTF8ToUCS2(orig_context, context, &text);
 508 -     ucs2 = true;
 509 -   }
 510 -
 511     switch (type) {
 512       default:
 513         LOG(FATAL) << "Bad RunSearch type: " << (int)type;
 514 re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451
 515       }
 516     }
 517
 518 -   // If we did UCS-2 matching, rewrite the matches to refer
 519 -   // to the original UTF-8 text.
 520 -   if (ucs2) {
 521 -     if (result->matched) {
 522 -       if (result->have_submatch0) {
 523 -         AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]);
 524 -       } else if (result->have_submatch) {
 525 -         for (int i = 0; i < nsubmatch; i++) {
 526 -           AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]);
 527 -         }
 528 -       }
 529 -     }
 530 -     delete[] context.begin();
 531 -   }
 532 -
 533     if (!result->matched)
 534       memset(result->submatch, 0, sizeof result->submatch);
 535   }
 536 re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475
 537     return true;
 538   }
 539
 540 - // Check whether text uses only Unicode points <= 0xFFFF
 541 - // (in the BMP).
 542 - static bool IsBMP(const StringPiece& text) {
 543 -   const char* p = text.begin();
 544 -   const char* ep = text.end();
 545 -   while (p < ep) {
 546 -     if (!fullrune(p, ep - p))
 547 -       return false;
 548 -     Rune r;
 549 -     p += chartorune(&r, p);
 550 -     if (r > 0xFFFF)
 551 -       return false;
 552 -   }
 553 -   return true;
 554 - }
 555 -
 556   // Runs a single test.
 557   bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
 558                              Prog::Anchor anchor) {
 559 re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483
 560     Result correct;
 561     RunSearch(kEngineBacktrack, text, context, anchor, &correct);
 562     if (correct.skipped) {
 563 -     if (regexp_ == NULL || !IsBMP(context))  // okay to skip in UCS-2 mode
 564 +     if (regexp_ == NULL)
 565         return true;
 566       LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
 567                  << " " << FormatMode(flags_);