llvm/lib/Support/StringRef.cpp

   1 //===-- StringRef.cpp - Lightweight String References ---------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #include "llvm/ADT/StringRef.h"
  10 #include "llvm/ADT/APFloat.h"
  11 #include "llvm/ADT/APInt.h"
  12 #include "llvm/ADT/Hashing.h"
  13 #include "llvm/ADT/StringExtras.h"
  14 #include "llvm/ADT/edit_distance.h"
  15 #include "llvm/Support/Error.h"
  16 #include <bitset>
  17
  18 using namespace llvm;
  19
  20 // MSVC emits references to this into the translation units which reference it.
  21 #ifndef _MSC_VER
  22 constexpr size_t StringRef::npos;
  23 #endif
  24
  25 // strncasecmp() is not available on non-POSIX systems, so define an
  26 // alternative function here.
  27 static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) {
  28   for (size_t I = 0; I < Length; ++I) {
  29     unsigned char LHC = toLower(LHS[I]);
  30     unsigned char RHC = toLower(RHS[I]);
  31     if (LHC != RHC)
  32       return LHC < RHC ? -1 : 1;
  33   }
  34   return 0;
  35 }
  36
  37 int StringRef::compare_insensitive(StringRef RHS) const {
  38   if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length)))
  39     return Res;
  40   if (Length == RHS.Length)
  41     return 0;
  42   return Length < RHS.Length ? -1 : 1;
  43 }
  44
  45 bool StringRef::startswith_insensitive(StringRef Prefix) const {
  46   return Length >= Prefix.Length &&
  47       ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0;
  48 }
  49
  50 bool StringRef::endswith_insensitive(StringRef Suffix) const {
  51   return Length >= Suffix.Length &&
  52       ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
  53 }
  54
  55 size_t StringRef::find_insensitive(char C, size_t From) const {
  56   char L = toLower(C);
  57   return find_if([L](char D) { return toLower(D) == L; }, From);
  58 }
  59
  60 /// compare_numeric - Compare strings, handle embedded numbers.
  61 int StringRef::compare_numeric(StringRef RHS) const {
  62   for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) {
  63     // Check for sequences of digits.
  64     if (isDigit(Data[I]) && isDigit(RHS.Data[I])) {
  65       // The longer sequence of numbers is considered larger.
  66       // This doesn't really handle prefixed zeros well.
  67       size_t J;
  68       for (J = I + 1; J != E + 1; ++J) {
  69         bool ld = J < Length && isDigit(Data[J]);
  70         bool rd = J < RHS.Length && isDigit(RHS.Data[J]);
  71         if (ld != rd)
  72           return rd ? -1 : 1;
  73         if (!rd)
  74           break;
  75       }
  76       // The two number sequences have the same length (J-I), just memcmp them.
  77       if (int Res = compareMemory(Data + I, RHS.Data + I, J - I))
  78         return Res < 0 ? -1 : 1;
  79       // Identical number sequences, continue search after the numbers.
  80       I = J - 1;
  81       continue;
  82     }
  83     if (Data[I] != RHS.Data[I])
  84       return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1;
  85   }
  86   if (Length == RHS.Length)
  87     return 0;
  88   return Length < RHS.Length ? -1 : 1;
  89 }
  90
  91 // Compute the edit distance between the two given strings.
  92 unsigned StringRef::edit_distance(llvm::StringRef Other,
  93                                   bool AllowReplacements,
  94                                   unsigned MaxEditDistance) const {
  95   return llvm::ComputeEditDistance(
  96       makeArrayRef(data(), size()),
  97       makeArrayRef(Other.data(), Other.size()),
  98       AllowReplacements, MaxEditDistance);
  99 }
 100
 101 //===----------------------------------------------------------------------===//
 102 // String Operations
 103 //===----------------------------------------------------------------------===//
 104
 105 std::string StringRef::lower() const {
 106   return std::string(map_iterator(begin(), toLower),
 107                      map_iterator(end(), toLower));
 108 }
 109
 110 std::string StringRef::upper() const {
 111   return std::string(map_iterator(begin(), toUpper),
 112                      map_iterator(end(), toUpper));
 113 }
 114
 115 //===----------------------------------------------------------------------===//
 116 // String Searching
 117 //===----------------------------------------------------------------------===//
 118
 119
 120 /// find - Search for the first string \arg Str in the string.
 121 ///
 122 /// \return - The index of the first occurrence of \arg Str, or npos if not
 123 /// found.
 124 size_t StringRef::find(StringRef Str, size_t From) const {
 125   if (From > Length)
 126     return npos;
 127
 128   const char *Start = Data + From;
 129   size_t Size = Length - From;
 130
 131   const char *Needle = Str.data();
 132   size_t N = Str.size();
 133   if (N == 0)
 134     return From;
 135   if (Size < N)
 136     return npos;
 137   if (N == 1) {
 138     const char *Ptr = (const char *)::memchr(Start, Needle[0], Size);
 139     return Ptr == nullptr ? npos : Ptr - Data;
 140   }
 141
 142   const char *Stop = Start + (Size - N + 1);
 143
 144   // For short haystacks or unsupported needles fall back to the naive algorithm
 145   if (Size < 16 || N > 255) {
 146     do {
 147       if (std::memcmp(Start, Needle, N) == 0)
 148         return Start - Data;
 149       ++Start;
 150     } while (Start < Stop);
 151     return npos;
 152   }
 153
 154   // Build the bad char heuristic table, with uint8_t to reduce cache thrashing.
 155   uint8_t BadCharSkip[256];
 156   std::memset(BadCharSkip, N, 256);
 157   for (unsigned i = 0; i != N-1; ++i)
 158     BadCharSkip[(uint8_t)Str[i]] = N-1-i;
 159
 160   do {
 161     uint8_t Last = Start[N - 1];
 162     if (LLVM_UNLIKELY(Last == (uint8_t)Needle[N - 1]))
 163       if (std::memcmp(Start, Needle, N - 1) == 0)
 164         return Start - Data;
 165
 166     // Otherwise skip the appropriate number of bytes.
 167     Start += BadCharSkip[Last];
 168   } while (Start < Stop);
 169
 170   return npos;
 171 }
 172
 173 size_t StringRef::find_insensitive(StringRef Str, size_t From) const {
 174   StringRef This = substr(From);
 175   while (This.size() >= Str.size()) {
 176     if (This.startswith_insensitive(Str))
 177       return From;
 178     This = This.drop_front();
 179     ++From;
 180   }
 181   return npos;
 182 }
 183
 184 size_t StringRef::rfind_insensitive(char C, size_t From) const {
 185   From = std::min(From, Length);
 186   size_t i = From;
 187   while (i != 0) {
 188     --i;
 189     if (toLower(Data[i]) == toLower(C))
 190       return i;
 191   }
 192   return npos;
 193 }
 194
 195 /// rfind - Search for the last string \arg Str in the string.
 196 ///
 197 /// \return - The index of the last occurrence of \arg Str, or npos if not
 198 /// found.
 199 size_t StringRef::rfind(StringRef Str) const {
 200   size_t N = Str.size();
 201   if (N > Length)
 202     return npos;
 203   for (size_t i = Length - N + 1, e = 0; i != e;) {
 204     --i;
 205     if (substr(i, N).equals(Str))
 206       return i;
 207   }
 208   return npos;
 209 }
 210
 211 size_t StringRef::rfind_insensitive(StringRef Str) const {
 212   size_t N = Str.size();
 213   if (N > Length)
 214     return npos;
 215   for (size_t i = Length - N + 1, e = 0; i != e;) {
 216     --i;
 217     if (substr(i, N).equals_insensitive(Str))
 218       return i;
 219   }
 220   return npos;
 221 }
 222
 223 /// find_first_of - Find the first character in the string that is in \arg
 224 /// Chars, or npos if not found.
 225 ///
 226 /// Note: O(size() + Chars.size())
 227 StringRef::size_type StringRef::find_first_of(StringRef Chars,
 228                                               size_t From) const {
 229   std::bitset<1 << CHAR_BIT> CharBits;
 230   for (size_type i = 0; i != Chars.size(); ++i)
 231     CharBits.set((unsigned char)Chars[i]);
 232
 233   for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
 234     if (CharBits.test((unsigned char)Data[i]))
 235       return i;
 236   return npos;
 237 }
 238
 239 /// find_first_not_of - Find the first character in the string that is not
 240 /// \arg C or npos if not found.
 241 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
 242   for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
 243     if (Data[i] != C)
 244       return i;
 245   return npos;
 246 }
 247
 248 /// find_first_not_of - Find the first character in the string that is not
 249 /// in the string \arg Chars, or npos if not found.
 250 ///
 251 /// Note: O(size() + Chars.size())
 252 StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
 253                                                   size_t From) const {
 254   std::bitset<1 << CHAR_BIT> CharBits;
 255   for (size_type i = 0; i != Chars.size(); ++i)
 256     CharBits.set((unsigned char)Chars[i]);
 257
 258   for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
 259     if (!CharBits.test((unsigned char)Data[i]))
 260       return i;
 261   return npos;
 262 }
 263
 264 /// find_last_of - Find the last character in the string that is in \arg C,
 265 /// or npos if not found.
 266 ///
 267 /// Note: O(size() + Chars.size())
 268 StringRef::size_type StringRef::find_last_of(StringRef Chars,
 269                                              size_t From) const {
 270   std::bitset<1 << CHAR_BIT> CharBits;
 271   for (size_type i = 0; i != Chars.size(); ++i)
 272     CharBits.set((unsigned char)Chars[i]);
 273
 274   for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
 275     if (CharBits.test((unsigned char)Data[i]))
 276       return i;
 277   return npos;
 278 }
 279
 280 /// find_last_not_of - Find the last character in the string that is not
 281 /// \arg C, or npos if not found.
 282 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const {
 283   for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
 284     if (Data[i] != C)
 285       return i;
 286   return npos;
 287 }
 288
 289 /// find_last_not_of - Find the last character in the string that is not in
 290 /// \arg Chars, or npos if not found.
 291 ///
 292 /// Note: O(size() + Chars.size())
 293 StringRef::size_type StringRef::find_last_not_of(StringRef Chars,
 294                                                  size_t From) const {
 295   std::bitset<1 << CHAR_BIT> CharBits;
 296   for (size_type i = 0, e = Chars.size(); i != e; ++i)
 297     CharBits.set((unsigned char)Chars[i]);
 298
 299   for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
 300     if (!CharBits.test((unsigned char)Data[i]))
 301       return i;
 302   return npos;
 303 }
 304
 305 void StringRef::split(SmallVectorImpl<StringRef> &A,
 306                       StringRef Separator, int MaxSplit,
 307                       bool KeepEmpty) const {
 308   StringRef S = *this;
 309
 310   // Count down from MaxSplit. When MaxSplit is -1, this will just split
 311   // "forever". This doesn't support splitting more than 2^31 times
 312   // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
 313   // but that seems unlikely to be useful.
 314   while (MaxSplit-- != 0) {
 315     size_t Idx = S.find(Separator);
 316     if (Idx == npos)
 317       break;
 318
 319     // Push this split.
 320     if (KeepEmpty || Idx > 0)
 321       A.push_back(S.slice(0, Idx));
 322
 323     // Jump forward.
 324     S = S.slice(Idx + Separator.size(), npos);
 325   }
 326
 327   // Push the tail.
 328   if (KeepEmpty || !S.empty())
 329     A.push_back(S);
 330 }
 331
 332 void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator,
 333                       int MaxSplit, bool KeepEmpty) const {
 334   StringRef S = *this;
 335
 336   // Count down from MaxSplit. When MaxSplit is -1, this will just split
 337   // "forever". This doesn't support splitting more than 2^31 times
 338   // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
 339   // but that seems unlikely to be useful.
 340   while (MaxSplit-- != 0) {
 341     size_t Idx = S.find(Separator);
 342     if (Idx == npos)
 343       break;
 344
 345     // Push this split.
 346     if (KeepEmpty || Idx > 0)
 347       A.push_back(S.slice(0, Idx));
 348
 349     // Jump forward.
 350     S = S.slice(Idx + 1, npos);
 351   }
 352
 353   // Push the tail.
 354   if (KeepEmpty || !S.empty())
 355     A.push_back(S);
 356 }
 357
 358 //===----------------------------------------------------------------------===//
 359 // Helpful Algorithms
 360 //===----------------------------------------------------------------------===//
 361
 362 /// count - Return the number of non-overlapped occurrences of \arg Str in
 363 /// the string.
 364 size_t StringRef::count(StringRef Str) const {
 365   size_t Count = 0;
 366   size_t N = Str.size();
 367   if (!N || N > Length)
 368     return 0;
 369   for (size_t i = 0, e = Length - N + 1; i < e;) {
 370     if (substr(i, N).equals(Str)) {
 371       ++Count;
 372       i += N;
 373     }
 374     else
 375       ++i;
 376   }
 377   return Count;
 378 }
 379
 380 static unsigned GetAutoSenseRadix(StringRef &Str) {
 381   if (Str.empty())
 382     return 10;
 383
 384   if (Str.startswith("0x") || Str.startswith("0X")) {
 385     Str = Str.substr(2);
 386     return 16;
 387   }
 388
 389   if (Str.startswith("0b") || Str.startswith("0B")) {
 390     Str = Str.substr(2);
 391     return 2;
 392   }
 393
 394   if (Str.startswith("0o")) {
 395     Str = Str.substr(2);
 396     return 8;
 397   }
 398
 399   if (Str[0] == '0' && Str.size() > 1 && isDigit(Str[1])) {
 400     Str = Str.substr(1);
 401     return 8;
 402   }
 403
 404   return 10;
 405 }
 406
 407 bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix,
 408                                   unsigned long long &Result) {
 409   // Autosense radix if not specified.
 410   if (Radix == 0)
 411     Radix = GetAutoSenseRadix(Str);
 412
 413   // Empty strings (after the radix autosense) are invalid.
 414   if (Str.empty()) return true;
 415
 416   // Parse all the bytes of the string given this radix.  Watch for overflow.
 417   StringRef Str2 = Str;
 418   Result = 0;
 419   while (!Str2.empty()) {
 420     unsigned CharVal;
 421     if (Str2[0] >= '0' && Str2[0] <= '9')
 422       CharVal = Str2[0] - '0';
 423     else if (Str2[0] >= 'a' && Str2[0] <= 'z')
 424       CharVal = Str2[0] - 'a' + 10;
 425     else if (Str2[0] >= 'A' && Str2[0] <= 'Z')
 426       CharVal = Str2[0] - 'A' + 10;
 427     else
 428       break;
 429
 430     // If the parsed value is larger than the integer radix, we cannot
 431     // consume any more characters.
 432     if (CharVal >= Radix)
 433       break;
 434
 435     // Add in this character.
 436     unsigned long long PrevResult = Result;
 437     Result = Result * Radix + CharVal;
 438
 439     // Check for overflow by shifting back and seeing if bits were lost.
 440     if (Result / Radix < PrevResult)
 441       return true;
 442
 443     Str2 = Str2.substr(1);
 444   }
 445
 446   // We consider the operation a failure if no characters were consumed
 447   // successfully.
 448   if (Str.size() == Str2.size())
 449     return true;
 450
 451   Str = Str2;
 452   return false;
 453 }
 454
 455 bool llvm::consumeSignedInteger(StringRef &Str, unsigned Radix,
 456                                 long long &Result) {
 457   unsigned long long ULLVal;
 458
 459   // Handle positive strings first.
 460   if (Str.empty() || Str.front() != '-') {
 461     if (consumeUnsignedInteger(Str, Radix, ULLVal) ||
 462         // Check for value so large it overflows a signed value.
 463         (long long)ULLVal < 0)
 464       return true;
 465     Result = ULLVal;
 466     return false;
 467   }
 468
 469   // Get the positive part of the value.
 470   StringRef Str2 = Str.drop_front(1);
 471   if (consumeUnsignedInteger(Str2, Radix, ULLVal) ||
 472       // Reject values so large they'd overflow as negative signed, but allow
 473       // "-0".  This negates the unsigned so that the negative isn't undefined
 474       // on signed overflow.
 475       (long long)-ULLVal > 0)
 476     return true;
 477
 478   Str = Str2;
 479   Result = -ULLVal;
 480   return false;
 481 }
 482
 483 /// GetAsUnsignedInteger - Workhorse method that converts a integer character
 484 /// sequence of radix up to 36 to an unsigned long long value.
 485 bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix,
 486                                 unsigned long long &Result) {
 487   if (consumeUnsignedInteger(Str, Radix, Result))
 488     return true;
 489
 490   // For getAsUnsignedInteger, we require the whole string to be consumed or
 491   // else we consider it a failure.
 492   return !Str.empty();
 493 }
 494
 495 bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix,
 496                               long long &Result) {
 497   if (consumeSignedInteger(Str, Radix, Result))
 498     return true;
 499
 500   // For getAsSignedInteger, we require the whole string to be consumed or else
 501   // we consider it a failure.
 502   return !Str.empty();
 503 }
 504
 505 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
 506   StringRef Str = *this;
 507
 508   // Autosense radix if not specified.
 509   if (Radix == 0)
 510     Radix = GetAutoSenseRadix(Str);
 511
 512   assert(Radix > 1 && Radix <= 36);
 513
 514   // Empty strings (after the radix autosense) are invalid.
 515   if (Str.empty()) return true;
 516
 517   // Skip leading zeroes.  This can be a significant improvement if
 518   // it means we don't need > 64 bits.
 519   while (!Str.empty() && Str.front() == '0')
 520     Str = Str.substr(1);
 521
 522   // If it was nothing but zeroes....
 523   if (Str.empty()) {
 524     Result = APInt(64, 0);
 525     return false;
 526   }
 527
 528   // (Over-)estimate the required number of bits.
 529   unsigned Log2Radix = 0;
 530   while ((1U << Log2Radix) < Radix) Log2Radix++;
 531   bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix);
 532
 533   unsigned BitWidth = Log2Radix * Str.size();
 534   if (BitWidth < Result.getBitWidth())
 535     BitWidth = Result.getBitWidth(); // don't shrink the result
 536   else if (BitWidth > Result.getBitWidth())
 537     Result = Result.zext(BitWidth);
 538
 539   APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix
 540   if (!IsPowerOf2Radix) {
 541     // These must have the same bit-width as Result.
 542     RadixAP = APInt(BitWidth, Radix);
 543     CharAP = APInt(BitWidth, 0);
 544   }
 545
 546   // Parse all the bytes of the string given this radix.
 547   Result = 0;
 548   while (!Str.empty()) {
 549     unsigned CharVal;
 550     if (Str[0] >= '0' && Str[0] <= '9')
 551       CharVal = Str[0]-'0';
 552     else if (Str[0] >= 'a' && Str[0] <= 'z')
 553       CharVal = Str[0]-'a'+10;
 554     else if (Str[0] >= 'A' && Str[0] <= 'Z')
 555       CharVal = Str[0]-'A'+10;
 556     else
 557       return true;
 558
 559     // If the parsed value is larger than the integer radix, the string is
 560     // invalid.
 561     if (CharVal >= Radix)
 562       return true;
 563
 564     // Add in this character.
 565     if (IsPowerOf2Radix) {
 566       Result <<= Log2Radix;
 567       Result |= CharVal;
 568     } else {
 569       Result *= RadixAP;
 570       CharAP = CharVal;
 571       Result += CharAP;
 572     }
 573
 574     Str = Str.substr(1);
 575   }
 576
 577   return false;
 578 }
 579
 580 bool StringRef::getAsDouble(double &Result, bool AllowInexact) const {
 581   APFloat F(0.0);
 582   auto StatusOrErr = F.convertFromString(*this, APFloat::rmNearestTiesToEven);
 583   if (errorToBool(StatusOrErr.takeError()))
 584     return true;
 585
 586   APFloat::opStatus Status = *StatusOrErr;
 587   if (Status != APFloat::opOK) {
 588     if (!AllowInexact || !(Status & APFloat::opInexact))
 589       return true;
 590   }
 591
 592   Result = F.convertToDouble();
 593   return false;
 594 }
 595
 596 // Implementation of StringRef hashing.
 597 hash_code llvm::hash_value(StringRef S) {
 598   return hash_combine_range(S.begin(), S.end());
 599 }