lib/Support/StringRef.cpp

   1 //===-- StringRef.cpp - Lightweight String References ---------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9
  10 #include "llvm/ADT/StringRef.h"
  11 #include "llvm/ADT/APFloat.h"
  12 #include "llvm/ADT/APInt.h"
  13 #include "llvm/ADT/Hashing.h"
  14 #include "llvm/ADT/StringExtras.h"
  15 #include "llvm/ADT/edit_distance.h"
  16 #include <bitset>
  17
  18 using namespace llvm;
  19
  20 // MSVC emits references to this into the translation units which reference it.
  21 #ifndef _MSC_VER
  22 const size_t StringRef::npos;
  23 #endif
  24
  25 // strncasecmp() is not available on non-POSIX systems, so define an
  26 // alternative function here.
  27 static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) {
  28   for (size_t I = 0; I < Length; ++I) {
  29     unsigned char LHC = toLower(LHS[I]);
  30     unsigned char RHC = toLower(RHS[I]);
  31     if (LHC != RHC)
  32       return LHC < RHC ? -1 : 1;
  33   }
  34   return 0;
  35 }
  36
  37 /// compare_lower - Compare strings, ignoring case.
  38 int StringRef::compare_lower(StringRef RHS) const {
  39   if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length)))
  40     return Res;
  41   if (Length == RHS.Length)
  42     return 0;
  43   return Length < RHS.Length ? -1 : 1;
  44 }
  45
  46 /// Check if this string starts with the given \p Prefix, ignoring case.
  47 bool StringRef::startswith_lower(StringRef Prefix) const {
  48   return Length >= Prefix.Length &&
  49       ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0;
  50 }
  51
  52 /// Check if this string ends with the given \p Suffix, ignoring case.
  53 bool StringRef::endswith_lower(StringRef Suffix) const {
  54   return Length >= Suffix.Length &&
  55       ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
  56 }
  57
  58 size_t StringRef::find_lower(char C, size_t From) const {
  59   char L = toLower(C);
  60   return find_if([L](char D) { return toLower(D) == L; }, From);
  61 }
  62
  63 /// compare_numeric - Compare strings, handle embedded numbers.
  64 int StringRef::compare_numeric(StringRef RHS) const {
  65   for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) {
  66     // Check for sequences of digits.
  67     if (isDigit(Data[I]) && isDigit(RHS.Data[I])) {
  68       // The longer sequence of numbers is considered larger.
  69       // This doesn't really handle prefixed zeros well.
  70       size_t J;
  71       for (J = I + 1; J != E + 1; ++J) {
  72         bool ld = J < Length && isDigit(Data[J]);
  73         bool rd = J < RHS.Length && isDigit(RHS.Data[J]);
  74         if (ld != rd)
  75           return rd ? -1 : 1;
  76         if (!rd)
  77           break;
  78       }
  79       // The two number sequences have the same length (J-I), just memcmp them.
  80       if (int Res = compareMemory(Data + I, RHS.Data + I, J - I))
  81         return Res < 0 ? -1 : 1;
  82       // Identical number sequences, continue search after the numbers.
  83       I = J - 1;
  84       continue;
  85     }
  86     if (Data[I] != RHS.Data[I])
  87       return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1;
  88   }
  89   if (Length == RHS.Length)
  90     return 0;
  91   return Length < RHS.Length ? -1 : 1;
  92 }
  93
  94 // Compute the edit distance between the two given strings.
  95 unsigned StringRef::edit_distance(llvm::StringRef Other,
  96                                   bool AllowReplacements,
  97                                   unsigned MaxEditDistance) const {
  98   return llvm::ComputeEditDistance(
  99       makeArrayRef(data(), size()),
 100       makeArrayRef(Other.data(), Other.size()),
 101       AllowReplacements, MaxEditDistance);
 102 }
 103
 104 //===----------------------------------------------------------------------===//
 105 // String Operations
 106 //===----------------------------------------------------------------------===//
 107
 108 std::string StringRef::lower() const {
 109   std::string Result(size(), char());
 110   for (size_type i = 0, e = size(); i != e; ++i) {
 111     Result[i] = toLower(Data[i]);
 112   }
 113   return Result;
 114 }
 115
 116 std::string StringRef::upper() const {
 117   std::string Result(size(), char());
 118   for (size_type i = 0, e = size(); i != e; ++i) {
 119     Result[i] = toUpper(Data[i]);
 120   }
 121   return Result;
 122 }
 123
 124 //===----------------------------------------------------------------------===//
 125 // String Searching
 126 //===----------------------------------------------------------------------===//
 127
 128
 129 /// find - Search for the first string \arg Str in the string.
 130 ///
 131 /// \return - The index of the first occurrence of \arg Str, or npos if not
 132 /// found.
 133 size_t StringRef::find(StringRef Str, size_t From) const {
 134   if (From > Length)
 135     return npos;
 136
 137   const char *Start = Data + From;
 138   size_t Size = Length - From;
 139
 140   const char *Needle = Str.data();
 141   size_t N = Str.size();
 142   if (N == 0)
 143     return From;
 144   if (Size < N)
 145     return npos;
 146   if (N == 1) {
 147     const char *Ptr = (const char *)::memchr(Start, Needle[0], Size);
 148     return Ptr == nullptr ? npos : Ptr - Data;
 149   }
 150
 151   const char *Stop = Start + (Size - N + 1);
 152
 153   // For short haystacks or unsupported needles fall back to the naive algorithm
 154   if (Size < 16 || N > 255) {
 155     do {
 156       if (std::memcmp(Start, Needle, N) == 0)
 157         return Start - Data;
 158       ++Start;
 159     } while (Start < Stop);
 160     return npos;
 161   }
 162
 163   // Build the bad char heuristic table, with uint8_t to reduce cache thrashing.
 164   uint8_t BadCharSkip[256];
 165   std::memset(BadCharSkip, N, 256);
 166   for (unsigned i = 0; i != N-1; ++i)
 167     BadCharSkip[(uint8_t)Str[i]] = N-1-i;
 168
 169   do {
 170     uint8_t Last = Start[N - 1];
 171     if (LLVM_UNLIKELY(Last == (uint8_t)Needle[N - 1]))
 172       if (std::memcmp(Start, Needle, N - 1) == 0)
 173         return Start - Data;
 174
 175     // Otherwise skip the appropriate number of bytes.
 176     Start += BadCharSkip[Last];
 177   } while (Start < Stop);
 178
 179   return npos;
 180 }
 181
 182 size_t StringRef::find_lower(StringRef Str, size_t From) const {
 183   StringRef This = substr(From);
 184   while (This.size() >= Str.size()) {
 185     if (This.startswith_lower(Str))
 186       return From;
 187     This = This.drop_front();
 188     ++From;
 189   }
 190   return npos;
 191 }
 192
 193 size_t StringRef::rfind_lower(char C, size_t From) const {
 194   From = std::min(From, Length);
 195   size_t i = From;
 196   while (i != 0) {
 197     --i;
 198     if (toLower(Data[i]) == toLower(C))
 199       return i;
 200   }
 201   return npos;
 202 }
 203
 204 /// rfind - Search for the last string \arg Str in the string.
 205 ///
 206 /// \return - The index of the last occurrence of \arg Str, or npos if not
 207 /// found.
 208 size_t StringRef::rfind(StringRef Str) const {
 209   size_t N = Str.size();
 210   if (N > Length)
 211     return npos;
 212   for (size_t i = Length - N + 1, e = 0; i != e;) {
 213     --i;
 214     if (substr(i, N).equals(Str))
 215       return i;
 216   }
 217   return npos;
 218 }
 219
 220 size_t StringRef::rfind_lower(StringRef Str) const {
 221   size_t N = Str.size();
 222   if (N > Length)
 223     return npos;
 224   for (size_t i = Length - N + 1, e = 0; i != e;) {
 225     --i;
 226     if (substr(i, N).equals_lower(Str))
 227       return i;
 228   }
 229   return npos;
 230 }
 231
 232 /// find_first_of - Find the first character in the string that is in \arg
 233 /// Chars, or npos if not found.
 234 ///
 235 /// Note: O(size() + Chars.size())
 236 StringRef::size_type StringRef::find_first_of(StringRef Chars,
 237                                               size_t From) const {
 238   std::bitset<1 << CHAR_BIT> CharBits;
 239   for (size_type i = 0; i != Chars.size(); ++i)
 240     CharBits.set((unsigned char)Chars[i]);
 241
 242   for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
 243     if (CharBits.test((unsigned char)Data[i]))
 244       return i;
 245   return npos;
 246 }
 247
 248 /// find_first_not_of - Find the first character in the string that is not
 249 /// \arg C or npos if not found.
 250 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
 251   for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
 252     if (Data[i] != C)
 253       return i;
 254   return npos;
 255 }
 256
 257 /// find_first_not_of - Find the first character in the string that is not
 258 /// in the string \arg Chars, or npos if not found.
 259 ///
 260 /// Note: O(size() + Chars.size())
 261 StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
 262                                                   size_t From) const {
 263   std::bitset<1 << CHAR_BIT> CharBits;
 264   for (size_type i = 0; i != Chars.size(); ++i)
 265     CharBits.set((unsigned char)Chars[i]);
 266
 267   for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
 268     if (!CharBits.test((unsigned char)Data[i]))
 269       return i;
 270   return npos;
 271 }
 272
 273 /// find_last_of - Find the last character in the string that is in \arg C,
 274 /// or npos if not found.
 275 ///
 276 /// Note: O(size() + Chars.size())
 277 StringRef::size_type StringRef::find_last_of(StringRef Chars,
 278                                              size_t From) const {
 279   std::bitset<1 << CHAR_BIT> CharBits;
 280   for (size_type i = 0; i != Chars.size(); ++i)
 281     CharBits.set((unsigned char)Chars[i]);
 282
 283   for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
 284     if (CharBits.test((unsigned char)Data[i]))
 285       return i;
 286   return npos;
 287 }
 288
 289 /// find_last_not_of - Find the last character in the string that is not
 290 /// \arg C, or npos if not found.
 291 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const {
 292   for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
 293     if (Data[i] != C)
 294       return i;
 295   return npos;
 296 }
 297
 298 /// find_last_not_of - Find the last character in the string that is not in
 299 /// \arg Chars, or npos if not found.
 300 ///
 301 /// Note: O(size() + Chars.size())
 302 StringRef::size_type StringRef::find_last_not_of(StringRef Chars,
 303                                                  size_t From) const {
 304   std::bitset<1 << CHAR_BIT> CharBits;
 305   for (size_type i = 0, e = Chars.size(); i != e; ++i)
 306     CharBits.set((unsigned char)Chars[i]);
 307
 308   for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
 309     if (!CharBits.test((unsigned char)Data[i]))
 310       return i;
 311   return npos;
 312 }
 313
 314 void StringRef::split(SmallVectorImpl<StringRef> &A,
 315                       StringRef Separator, int MaxSplit,
 316                       bool KeepEmpty) const {
 317   StringRef S = *this;
 318
 319   // Count down from MaxSplit. When MaxSplit is -1, this will just split
 320   // "forever". This doesn't support splitting more than 2^31 times
 321   // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
 322   // but that seems unlikely to be useful.
 323   while (MaxSplit-- != 0) {
 324     size_t Idx = S.find(Separator);
 325     if (Idx == npos)
 326       break;
 327
 328     // Push this split.
 329     if (KeepEmpty || Idx > 0)
 330       A.push_back(S.slice(0, Idx));
 331
 332     // Jump forward.
 333     S = S.slice(Idx + Separator.size(), npos);
 334   }
 335
 336   // Push the tail.
 337   if (KeepEmpty || !S.empty())
 338     A.push_back(S);
 339 }
 340
 341 void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator,
 342                       int MaxSplit, bool KeepEmpty) const {
 343   StringRef S = *this;
 344
 345   // Count down from MaxSplit. When MaxSplit is -1, this will just split
 346   // "forever". This doesn't support splitting more than 2^31 times
 347   // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
 348   // but that seems unlikely to be useful.
 349   while (MaxSplit-- != 0) {
 350     size_t Idx = S.find(Separator);
 351     if (Idx == npos)
 352       break;
 353
 354     // Push this split.
 355     if (KeepEmpty || Idx > 0)
 356       A.push_back(S.slice(0, Idx));
 357
 358     // Jump forward.
 359     S = S.slice(Idx + 1, npos);
 360   }
 361
 362   // Push the tail.
 363   if (KeepEmpty || !S.empty())
 364     A.push_back(S);
 365 }
 366
 367 //===----------------------------------------------------------------------===//
 368 // Helpful Algorithms
 369 //===----------------------------------------------------------------------===//
 370
 371 /// count - Return the number of non-overlapped occurrences of \arg Str in
 372 /// the string.
 373 size_t StringRef::count(StringRef Str) const {
 374   size_t Count = 0;
 375   size_t N = Str.size();
 376   if (N > Length)
 377     return 0;
 378   for (size_t i = 0, e = Length - N + 1; i != e; ++i)
 379     if (substr(i, N).equals(Str))
 380       ++Count;
 381   return Count;
 382 }
 383
 384 static unsigned GetAutoSenseRadix(StringRef &Str) {
 385   if (Str.empty())
 386     return 10;
 387
 388   if (Str.startswith("0x") || Str.startswith("0X")) {
 389     Str = Str.substr(2);
 390     return 16;
 391   }
 392
 393   if (Str.startswith("0b") || Str.startswith("0B")) {
 394     Str = Str.substr(2);
 395     return 2;
 396   }
 397
 398   if (Str.startswith("0o")) {
 399     Str = Str.substr(2);
 400     return 8;
 401   }
 402
 403   if (Str[0] == '0' && Str.size() > 1 && isDigit(Str[1])) {
 404     Str = Str.substr(1);
 405     return 8;
 406   }
 407
 408   return 10;
 409 }
 410
 411 bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix,
 412                                   unsigned long long &Result) {
 413   // Autosense radix if not specified.
 414   if (Radix == 0)
 415     Radix = GetAutoSenseRadix(Str);
 416
 417   // Empty strings (after the radix autosense) are invalid.
 418   if (Str.empty()) return true;
 419
 420   // Parse all the bytes of the string given this radix.  Watch for overflow.
 421   StringRef Str2 = Str;
 422   Result = 0;
 423   while (!Str2.empty()) {
 424     unsigned CharVal;
 425     if (Str2[0] >= '0' && Str2[0] <= '9')
 426       CharVal = Str2[0] - '0';
 427     else if (Str2[0] >= 'a' && Str2[0] <= 'z')
 428       CharVal = Str2[0] - 'a' + 10;
 429     else if (Str2[0] >= 'A' && Str2[0] <= 'Z')
 430       CharVal = Str2[0] - 'A' + 10;
 431     else
 432       break;
 433
 434     // If the parsed value is larger than the integer radix, we cannot
 435     // consume any more characters.
 436     if (CharVal >= Radix)
 437       break;
 438
 439     // Add in this character.
 440     unsigned long long PrevResult = Result;
 441     Result = Result * Radix + CharVal;
 442
 443     // Check for overflow by shifting back and seeing if bits were lost.
 444     if (Result / Radix < PrevResult)
 445       return true;
 446
 447     Str2 = Str2.substr(1);
 448   }
 449
 450   // We consider the operation a failure if no characters were consumed
 451   // successfully.
 452   if (Str.size() == Str2.size())
 453     return true;
 454
 455   Str = Str2;
 456   return false;
 457 }
 458
 459 bool llvm::consumeSignedInteger(StringRef &Str, unsigned Radix,
 460                                 long long &Result) {
 461   unsigned long long ULLVal;
 462
 463   // Handle positive strings first.
 464   if (Str.empty() || Str.front() != '-') {
 465     if (consumeUnsignedInteger(Str, Radix, ULLVal) ||
 466         // Check for value so large it overflows a signed value.
 467         (long long)ULLVal < 0)
 468       return true;
 469     Result = ULLVal;
 470     return false;
 471   }
 472
 473   // Get the positive part of the value.
 474   StringRef Str2 = Str.drop_front(1);
 475   if (consumeUnsignedInteger(Str2, Radix, ULLVal) ||
 476       // Reject values so large they'd overflow as negative signed, but allow
 477       // "-0".  This negates the unsigned so that the negative isn't undefined
 478       // on signed overflow.
 479       (long long)-ULLVal > 0)
 480     return true;
 481
 482   Str = Str2;
 483   Result = -ULLVal;
 484   return false;
 485 }
 486
 487 /// GetAsUnsignedInteger - Workhorse method that converts a integer character
 488 /// sequence of radix up to 36 to an unsigned long long value.
 489 bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix,
 490                                 unsigned long long &Result) {
 491   if (consumeUnsignedInteger(Str, Radix, Result))
 492     return true;
 493
 494   // For getAsUnsignedInteger, we require the whole string to be consumed or
 495   // else we consider it a failure.
 496   return !Str.empty();
 497 }
 498
 499 bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix,
 500                               long long &Result) {
 501   if (consumeSignedInteger(Str, Radix, Result))
 502     return true;
 503
 504   // For getAsSignedInteger, we require the whole string to be consumed or else
 505   // we consider it a failure.
 506   return !Str.empty();
 507 }
 508
 509 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
 510   StringRef Str = *this;
 511
 512   // Autosense radix if not specified.
 513   if (Radix == 0)
 514     Radix = GetAutoSenseRadix(Str);
 515
 516   assert(Radix > 1 && Radix <= 36);
 517
 518   // Empty strings (after the radix autosense) are invalid.
 519   if (Str.empty()) return true;
 520
 521   // Skip leading zeroes.  This can be a significant improvement if
 522   // it means we don't need > 64 bits.
 523   while (!Str.empty() && Str.front() == '0')
 524     Str = Str.substr(1);
 525
 526   // If it was nothing but zeroes....
 527   if (Str.empty()) {
 528     Result = APInt(64, 0);
 529     return false;
 530   }
 531
 532   // (Over-)estimate the required number of bits.
 533   unsigned Log2Radix = 0;
 534   while ((1U << Log2Radix) < Radix) Log2Radix++;
 535   bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix);
 536
 537   unsigned BitWidth = Log2Radix * Str.size();
 538   if (BitWidth < Result.getBitWidth())
 539     BitWidth = Result.getBitWidth(); // don't shrink the result
 540   else if (BitWidth > Result.getBitWidth())
 541     Result = Result.zext(BitWidth);
 542
 543   APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix
 544   if (!IsPowerOf2Radix) {
 545     // These must have the same bit-width as Result.
 546     RadixAP = APInt(BitWidth, Radix);
 547     CharAP = APInt(BitWidth, 0);
 548   }
 549
 550   // Parse all the bytes of the string given this radix.
 551   Result = 0;
 552   while (!Str.empty()) {
 553     unsigned CharVal;
 554     if (Str[0] >= '0' && Str[0] <= '9')
 555       CharVal = Str[0]-'0';
 556     else if (Str[0] >= 'a' && Str[0] <= 'z')
 557       CharVal = Str[0]-'a'+10;
 558     else if (Str[0] >= 'A' && Str[0] <= 'Z')
 559       CharVal = Str[0]-'A'+10;
 560     else
 561       return true;
 562
 563     // If the parsed value is larger than the integer radix, the string is
 564     // invalid.
 565     if (CharVal >= Radix)
 566       return true;
 567
 568     // Add in this character.
 569     if (IsPowerOf2Radix) {
 570       Result <<= Log2Radix;
 571       Result |= CharVal;
 572     } else {
 573       Result *= RadixAP;
 574       CharAP = CharVal;
 575       Result += CharAP;
 576     }
 577
 578     Str = Str.substr(1);
 579   }
 580
 581   return false;
 582 }
 583
 584 bool StringRef::getAsDouble(double &Result, bool AllowInexact) const {
 585   APFloat F(0.0);
 586   APFloat::opStatus Status =
 587       F.convertFromString(*this, APFloat::rmNearestTiesToEven);
 588   if (Status != APFloat::opOK) {
 589     if (!AllowInexact || !(Status & APFloat::opInexact))
 590       return true;
 591   }
 592
 593   Result = F.convertToDouble();
 594   return false;
 595 }
 596
 597 // Implementation of StringRef hashing.
 598 hash_code llvm::hash_value(StringRef S) {
 599   return hash_combine_range(S.begin(), S.end());
 600 }