1 //===-- StringRef.cpp - Lightweight String References ---------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "llvm/ADT/StringRef.h"
10 #include "llvm/ADT/APFloat.h"
11 #include "llvm/ADT/APInt.h"
12 #include "llvm/ADT/Hashing.h"
13 #include "llvm/ADT/StringExtras.h"
14 #include "llvm/ADT/edit_distance.h"
15 #include "llvm/Support/Error.h"
20 // MSVC emits references to this into the translation units which reference it.
22 constexpr size_t StringRef::npos
;
25 // strncasecmp() is not available on non-POSIX systems, so define an
26 // alternative function here.
27 static int ascii_strncasecmp(const char *LHS
, const char *RHS
, size_t Length
) {
28 for (size_t I
= 0; I
< Length
; ++I
) {
29 unsigned char LHC
= toLower(LHS
[I
]);
30 unsigned char RHC
= toLower(RHS
[I
]);
32 return LHC
< RHC
? -1 : 1;
37 int StringRef::compare_insensitive(StringRef RHS
) const {
38 if (int Res
= ascii_strncasecmp(Data
, RHS
.Data
, std::min(Length
, RHS
.Length
)))
40 if (Length
== RHS
.Length
)
42 return Length
< RHS
.Length
? -1 : 1;
45 bool StringRef::starts_with_insensitive(StringRef Prefix
) const {
46 return Length
>= Prefix
.Length
&&
47 ascii_strncasecmp(Data
, Prefix
.Data
, Prefix
.Length
) == 0;
50 bool StringRef::ends_with_insensitive(StringRef Suffix
) const {
51 return Length
>= Suffix
.Length
&&
52 ascii_strncasecmp(end() - Suffix
.Length
, Suffix
.Data
, Suffix
.Length
) == 0;
55 size_t StringRef::find_insensitive(char C
, size_t From
) const {
57 return find_if([L
](char D
) { return toLower(D
) == L
; }, From
);
60 /// compare_numeric - Compare strings, handle embedded numbers.
61 int StringRef::compare_numeric(StringRef RHS
) const {
62 for (size_t I
= 0, E
= std::min(Length
, RHS
.Length
); I
!= E
; ++I
) {
63 // Check for sequences of digits.
64 if (isDigit(Data
[I
]) && isDigit(RHS
.Data
[I
])) {
65 // The longer sequence of numbers is considered larger.
66 // This doesn't really handle prefixed zeros well.
68 for (J
= I
+ 1; J
!= E
+ 1; ++J
) {
69 bool ld
= J
< Length
&& isDigit(Data
[J
]);
70 bool rd
= J
< RHS
.Length
&& isDigit(RHS
.Data
[J
]);
76 // The two number sequences have the same length (J-I), just memcmp them.
77 if (int Res
= compareMemory(Data
+ I
, RHS
.Data
+ I
, J
- I
))
78 return Res
< 0 ? -1 : 1;
79 // Identical number sequences, continue search after the numbers.
83 if (Data
[I
] != RHS
.Data
[I
])
84 return (unsigned char)Data
[I
] < (unsigned char)RHS
.Data
[I
] ? -1 : 1;
86 if (Length
== RHS
.Length
)
88 return Length
< RHS
.Length
? -1 : 1;
91 // Compute the edit distance between the two given strings.
92 unsigned StringRef::edit_distance(llvm::StringRef Other
,
93 bool AllowReplacements
,
94 unsigned MaxEditDistance
) const {
95 return llvm::ComputeEditDistance(ArrayRef(data(), size()),
96 ArrayRef(Other
.data(), Other
.size()),
97 AllowReplacements
, MaxEditDistance
);
100 unsigned llvm::StringRef::edit_distance_insensitive(
101 StringRef Other
, bool AllowReplacements
, unsigned MaxEditDistance
) const {
102 return llvm::ComputeMappedEditDistance(
103 ArrayRef(data(), size()), ArrayRef(Other
.data(), Other
.size()),
104 llvm::toLower
, AllowReplacements
, MaxEditDistance
);
107 //===----------------------------------------------------------------------===//
109 //===----------------------------------------------------------------------===//
111 std::string
StringRef::lower() const {
112 return std::string(map_iterator(begin(), toLower
),
113 map_iterator(end(), toLower
));
116 std::string
StringRef::upper() const {
117 return std::string(map_iterator(begin(), toUpper
),
118 map_iterator(end(), toUpper
));
121 //===----------------------------------------------------------------------===//
123 //===----------------------------------------------------------------------===//
126 /// find - Search for the first string \arg Str in the string.
128 /// \return - The index of the first occurrence of \arg Str, or npos if not
130 size_t StringRef::find(StringRef Str
, size_t From
) const {
134 const char *Start
= Data
+ From
;
135 size_t Size
= Length
- From
;
137 const char *Needle
= Str
.data();
138 size_t N
= Str
.size();
144 const char *Ptr
= (const char *)::memchr(Start
, Needle
[0], Size
);
145 return Ptr
== nullptr ? npos
: Ptr
- Data
;
148 const char *Stop
= Start
+ (Size
- N
+ 1);
151 // Provide a fast path for newline finding (CRLF case) in InclusionRewriter.
152 // Not the most optimized strategy, but getting memcmp inlined should be
155 if (std::memcmp(Start
, Needle
, 2) == 0)
158 } while (Start
< Stop
);
162 // For short haystacks or unsupported needles fall back to the naive algorithm
163 if (Size
< 16 || N
> 255) {
165 if (std::memcmp(Start
, Needle
, N
) == 0)
168 } while (Start
< Stop
);
172 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing.
173 uint8_t BadCharSkip
[256];
174 std::memset(BadCharSkip
, N
, 256);
175 for (unsigned i
= 0; i
!= N
-1; ++i
)
176 BadCharSkip
[(uint8_t)Str
[i
]] = N
-1-i
;
179 uint8_t Last
= Start
[N
- 1];
180 if (LLVM_UNLIKELY(Last
== (uint8_t)Needle
[N
- 1]))
181 if (std::memcmp(Start
, Needle
, N
- 1) == 0)
184 // Otherwise skip the appropriate number of bytes.
185 Start
+= BadCharSkip
[Last
];
186 } while (Start
< Stop
);
191 size_t StringRef::find_insensitive(StringRef Str
, size_t From
) const {
192 StringRef This
= substr(From
);
193 while (This
.size() >= Str
.size()) {
194 if (This
.starts_with_insensitive(Str
))
196 This
= This
.drop_front();
202 size_t StringRef::rfind_insensitive(char C
, size_t From
) const {
203 From
= std::min(From
, Length
);
207 if (toLower(Data
[i
]) == toLower(C
))
213 /// rfind - Search for the last string \arg Str in the string.
215 /// \return - The index of the last occurrence of \arg Str, or npos if not
217 size_t StringRef::rfind(StringRef Str
) const {
218 return std::string_view(*this).rfind(Str
);
221 size_t StringRef::rfind_insensitive(StringRef Str
) const {
222 size_t N
= Str
.size();
225 for (size_t i
= Length
- N
+ 1, e
= 0; i
!= e
;) {
227 if (substr(i
, N
).equals_insensitive(Str
))
233 /// find_first_of - Find the first character in the string that is in \arg
234 /// Chars, or npos if not found.
236 /// Note: O(size() + Chars.size())
237 StringRef::size_type
StringRef::find_first_of(StringRef Chars
,
239 std::bitset
<1 << CHAR_BIT
> CharBits
;
241 CharBits
.set((unsigned char)C
);
243 for (size_type i
= std::min(From
, Length
), e
= Length
; i
!= e
; ++i
)
244 if (CharBits
.test((unsigned char)Data
[i
]))
249 /// find_first_not_of - Find the first character in the string that is not
250 /// \arg C or npos if not found.
251 StringRef::size_type
StringRef::find_first_not_of(char C
, size_t From
) const {
252 return std::string_view(*this).find_first_not_of(C
, From
);
255 /// find_first_not_of - Find the first character in the string that is not
256 /// in the string \arg Chars, or npos if not found.
258 /// Note: O(size() + Chars.size())
259 StringRef::size_type
StringRef::find_first_not_of(StringRef Chars
,
261 std::bitset
<1 << CHAR_BIT
> CharBits
;
263 CharBits
.set((unsigned char)C
);
265 for (size_type i
= std::min(From
, Length
), e
= Length
; i
!= e
; ++i
)
266 if (!CharBits
.test((unsigned char)Data
[i
]))
271 /// find_last_of - Find the last character in the string that is in \arg C,
272 /// or npos if not found.
274 /// Note: O(size() + Chars.size())
275 StringRef::size_type
StringRef::find_last_of(StringRef Chars
,
277 std::bitset
<1 << CHAR_BIT
> CharBits
;
279 CharBits
.set((unsigned char)C
);
281 for (size_type i
= std::min(From
, Length
) - 1, e
= -1; i
!= e
; --i
)
282 if (CharBits
.test((unsigned char)Data
[i
]))
287 /// find_last_not_of - Find the last character in the string that is not
288 /// \arg C, or npos if not found.
289 StringRef::size_type
StringRef::find_last_not_of(char C
, size_t From
) const {
290 for (size_type i
= std::min(From
, Length
) - 1, e
= -1; i
!= e
; --i
)
296 /// find_last_not_of - Find the last character in the string that is not in
297 /// \arg Chars, or npos if not found.
299 /// Note: O(size() + Chars.size())
300 StringRef::size_type
StringRef::find_last_not_of(StringRef Chars
,
302 std::bitset
<1 << CHAR_BIT
> CharBits
;
304 CharBits
.set((unsigned char)C
);
306 for (size_type i
= std::min(From
, Length
) - 1, e
= -1; i
!= e
; --i
)
307 if (!CharBits
.test((unsigned char)Data
[i
]))
312 void StringRef::split(SmallVectorImpl
<StringRef
> &A
,
313 StringRef Separator
, int MaxSplit
,
314 bool KeepEmpty
) const {
317 // Count down from MaxSplit. When MaxSplit is -1, this will just split
318 // "forever". This doesn't support splitting more than 2^31 times
319 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
320 // but that seems unlikely to be useful.
321 while (MaxSplit
-- != 0) {
322 size_t Idx
= S
.find(Separator
);
327 if (KeepEmpty
|| Idx
> 0)
328 A
.push_back(S
.slice(0, Idx
));
331 S
= S
.slice(Idx
+ Separator
.size(), npos
);
335 if (KeepEmpty
|| !S
.empty())
339 void StringRef::split(SmallVectorImpl
<StringRef
> &A
, char Separator
,
340 int MaxSplit
, bool KeepEmpty
) const {
343 // Count down from MaxSplit. When MaxSplit is -1, this will just split
344 // "forever". This doesn't support splitting more than 2^31 times
345 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
346 // but that seems unlikely to be useful.
347 while (MaxSplit
-- != 0) {
348 size_t Idx
= S
.find(Separator
);
353 if (KeepEmpty
|| Idx
> 0)
354 A
.push_back(S
.slice(0, Idx
));
357 S
= S
.slice(Idx
+ 1, npos
);
361 if (KeepEmpty
|| !S
.empty())
365 //===----------------------------------------------------------------------===//
366 // Helpful Algorithms
367 //===----------------------------------------------------------------------===//
369 /// count - Return the number of non-overlapped occurrences of \arg Str in
371 size_t StringRef::count(StringRef Str
) const {
374 size_t N
= Str
.size();
375 // TODO: For an empty `Str` we return 0 for legacy reasons. Consider changing
376 // this to `Length + 1` which is more in-line with the function
380 while ((Pos
= find(Str
, Pos
)) != npos
) {
387 static unsigned GetAutoSenseRadix(StringRef
&Str
) {
391 if (Str
.startswith("0x") || Str
.startswith("0X")) {
396 if (Str
.startswith("0b") || Str
.startswith("0B")) {
401 if (Str
.startswith("0o")) {
406 if (Str
[0] == '0' && Str
.size() > 1 && isDigit(Str
[1])) {
414 bool llvm::consumeUnsignedInteger(StringRef
&Str
, unsigned Radix
,
415 unsigned long long &Result
) {
416 // Autosense radix if not specified.
418 Radix
= GetAutoSenseRadix(Str
);
420 // Empty strings (after the radix autosense) are invalid.
421 if (Str
.empty()) return true;
423 // Parse all the bytes of the string given this radix. Watch for overflow.
424 StringRef Str2
= Str
;
426 while (!Str2
.empty()) {
428 if (Str2
[0] >= '0' && Str2
[0] <= '9')
429 CharVal
= Str2
[0] - '0';
430 else if (Str2
[0] >= 'a' && Str2
[0] <= 'z')
431 CharVal
= Str2
[0] - 'a' + 10;
432 else if (Str2
[0] >= 'A' && Str2
[0] <= 'Z')
433 CharVal
= Str2
[0] - 'A' + 10;
437 // If the parsed value is larger than the integer radix, we cannot
438 // consume any more characters.
439 if (CharVal
>= Radix
)
442 // Add in this character.
443 unsigned long long PrevResult
= Result
;
444 Result
= Result
* Radix
+ CharVal
;
446 // Check for overflow by shifting back and seeing if bits were lost.
447 if (Result
/ Radix
< PrevResult
)
450 Str2
= Str2
.substr(1);
453 // We consider the operation a failure if no characters were consumed
455 if (Str
.size() == Str2
.size())
462 bool llvm::consumeSignedInteger(StringRef
&Str
, unsigned Radix
,
464 unsigned long long ULLVal
;
466 // Handle positive strings first.
467 if (Str
.empty() || Str
.front() != '-') {
468 if (consumeUnsignedInteger(Str
, Radix
, ULLVal
) ||
469 // Check for value so large it overflows a signed value.
470 (long long)ULLVal
< 0)
476 // Get the positive part of the value.
477 StringRef Str2
= Str
.drop_front(1);
478 if (consumeUnsignedInteger(Str2
, Radix
, ULLVal
) ||
479 // Reject values so large they'd overflow as negative signed, but allow
480 // "-0". This negates the unsigned so that the negative isn't undefined
481 // on signed overflow.
482 (long long)-ULLVal
> 0)
490 /// GetAsUnsignedInteger - Workhorse method that converts a integer character
491 /// sequence of radix up to 36 to an unsigned long long value.
492 bool llvm::getAsUnsignedInteger(StringRef Str
, unsigned Radix
,
493 unsigned long long &Result
) {
494 if (consumeUnsignedInteger(Str
, Radix
, Result
))
497 // For getAsUnsignedInteger, we require the whole string to be consumed or
498 // else we consider it a failure.
502 bool llvm::getAsSignedInteger(StringRef Str
, unsigned Radix
,
504 if (consumeSignedInteger(Str
, Radix
, Result
))
507 // For getAsSignedInteger, we require the whole string to be consumed or else
508 // we consider it a failure.
512 bool StringRef::consumeInteger(unsigned Radix
, APInt
&Result
) {
513 StringRef Str
= *this;
515 // Autosense radix if not specified.
517 Radix
= GetAutoSenseRadix(Str
);
519 assert(Radix
> 1 && Radix
<= 36);
521 // Empty strings (after the radix autosense) are invalid.
522 if (Str
.empty()) return true;
524 // Skip leading zeroes. This can be a significant improvement if
525 // it means we don't need > 64 bits.
526 while (!Str
.empty() && Str
.front() == '0')
529 // If it was nothing but zeroes....
531 Result
= APInt(64, 0);
536 // (Over-)estimate the required number of bits.
537 unsigned Log2Radix
= 0;
538 while ((1U << Log2Radix
) < Radix
) Log2Radix
++;
539 bool IsPowerOf2Radix
= ((1U << Log2Radix
) == Radix
);
541 unsigned BitWidth
= Log2Radix
* Str
.size();
542 if (BitWidth
< Result
.getBitWidth())
543 BitWidth
= Result
.getBitWidth(); // don't shrink the result
544 else if (BitWidth
> Result
.getBitWidth())
545 Result
= Result
.zext(BitWidth
);
547 APInt RadixAP
, CharAP
; // unused unless !IsPowerOf2Radix
548 if (!IsPowerOf2Radix
) {
549 // These must have the same bit-width as Result.
550 RadixAP
= APInt(BitWidth
, Radix
);
551 CharAP
= APInt(BitWidth
, 0);
554 // Parse all the bytes of the string given this radix.
556 while (!Str
.empty()) {
558 if (Str
[0] >= '0' && Str
[0] <= '9')
559 CharVal
= Str
[0]-'0';
560 else if (Str
[0] >= 'a' && Str
[0] <= 'z')
561 CharVal
= Str
[0]-'a'+10;
562 else if (Str
[0] >= 'A' && Str
[0] <= 'Z')
563 CharVal
= Str
[0]-'A'+10;
567 // If the parsed value is larger than the integer radix, the string is
569 if (CharVal
>= Radix
)
572 // Add in this character.
573 if (IsPowerOf2Radix
) {
574 Result
<<= Log2Radix
;
585 // We consider the operation a failure if no characters were consumed
587 if (size() == Str
.size())
594 bool StringRef::getAsInteger(unsigned Radix
, APInt
&Result
) const {
595 StringRef Str
= *this;
596 if (Str
.consumeInteger(Radix
, Result
))
599 // For getAsInteger, we require the whole string to be consumed or else we
600 // consider it a failure.
604 bool StringRef::getAsDouble(double &Result
, bool AllowInexact
) const {
606 auto StatusOrErr
= F
.convertFromString(*this, APFloat::rmNearestTiesToEven
);
607 if (errorToBool(StatusOrErr
.takeError()))
610 APFloat::opStatus Status
= *StatusOrErr
;
611 if (Status
!= APFloat::opOK
) {
612 if (!AllowInexact
|| !(Status
& APFloat::opInexact
))
616 Result
= F
.convertToDouble();
620 // Implementation of StringRef hashing.
621 hash_code
llvm::hash_value(StringRef S
) {
622 return hash_combine_range(S
.begin(), S
.end());
625 unsigned DenseMapInfo
<StringRef
, void>::getHashValue(StringRef Val
) {
626 assert(Val
.data() != getEmptyKey().data() &&
627 "Cannot hash the empty key!");
628 assert(Val
.data() != getTombstoneKey().data() &&
629 "Cannot hash the tombstone key!");
630 return (unsigned)(hash_value(Val
));