1 //===-- StringRef.cpp - Lightweight String References ---------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "llvm/ADT/StringRef.h"
10 #include "llvm/ADT/APFloat.h"
11 #include "llvm/ADT/APInt.h"
12 #include "llvm/ADT/Hashing.h"
13 #include "llvm/ADT/StringExtras.h"
14 #include "llvm/ADT/edit_distance.h"
15 #include "llvm/Support/Error.h"
20 // MSVC emits references to this into the translation units which reference it.
22 constexpr size_t StringRef::npos
;
25 // strncasecmp() is not available on non-POSIX systems, so define an
26 // alternative function here.
27 static int ascii_strncasecmp(const char *LHS
, const char *RHS
, size_t Length
) {
28 for (size_t I
= 0; I
< Length
; ++I
) {
29 unsigned char LHC
= toLower(LHS
[I
]);
30 unsigned char RHC
= toLower(RHS
[I
]);
32 return LHC
< RHC
? -1 : 1;
37 int StringRef::compare_insensitive(StringRef RHS
) const {
38 if (int Res
= ascii_strncasecmp(Data
, RHS
.Data
, std::min(Length
, RHS
.Length
)))
40 if (Length
== RHS
.Length
)
42 return Length
< RHS
.Length
? -1 : 1;
45 bool StringRef::starts_with_insensitive(StringRef Prefix
) const {
46 return Length
>= Prefix
.Length
&&
47 ascii_strncasecmp(Data
, Prefix
.Data
, Prefix
.Length
) == 0;
50 bool StringRef::ends_with_insensitive(StringRef Suffix
) const {
51 return Length
>= Suffix
.Length
&&
52 ascii_strncasecmp(end() - Suffix
.Length
, Suffix
.Data
, Suffix
.Length
) == 0;
55 size_t StringRef::find_insensitive(char C
, size_t From
) const {
57 return find_if([L
](char D
) { return toLower(D
) == L
; }, From
);
60 /// compare_numeric - Compare strings, handle embedded numbers.
61 int StringRef::compare_numeric(StringRef RHS
) const {
62 for (size_t I
= 0, E
= std::min(Length
, RHS
.Length
); I
!= E
; ++I
) {
63 // Check for sequences of digits.
64 if (isDigit(Data
[I
]) && isDigit(RHS
.Data
[I
])) {
65 // The longer sequence of numbers is considered larger.
66 // This doesn't really handle prefixed zeros well.
68 for (J
= I
+ 1; J
!= E
+ 1; ++J
) {
69 bool ld
= J
< Length
&& isDigit(Data
[J
]);
70 bool rd
= J
< RHS
.Length
&& isDigit(RHS
.Data
[J
]);
76 // The two number sequences have the same length (J-I), just memcmp them.
77 if (int Res
= compareMemory(Data
+ I
, RHS
.Data
+ I
, J
- I
))
78 return Res
< 0 ? -1 : 1;
79 // Identical number sequences, continue search after the numbers.
83 if (Data
[I
] != RHS
.Data
[I
])
84 return (unsigned char)Data
[I
] < (unsigned char)RHS
.Data
[I
] ? -1 : 1;
86 if (Length
== RHS
.Length
)
88 return Length
< RHS
.Length
? -1 : 1;
91 // Compute the edit distance between the two given strings.
92 unsigned StringRef::edit_distance(llvm::StringRef Other
,
93 bool AllowReplacements
,
94 unsigned MaxEditDistance
) const {
95 return llvm::ComputeEditDistance(ArrayRef(data(), size()),
96 ArrayRef(Other
.data(), Other
.size()),
97 AllowReplacements
, MaxEditDistance
);
100 unsigned llvm::StringRef::edit_distance_insensitive(
101 StringRef Other
, bool AllowReplacements
, unsigned MaxEditDistance
) const {
102 return llvm::ComputeMappedEditDistance(
103 ArrayRef(data(), size()), ArrayRef(Other
.data(), Other
.size()),
104 llvm::toLower
, AllowReplacements
, MaxEditDistance
);
107 //===----------------------------------------------------------------------===//
109 //===----------------------------------------------------------------------===//
111 std::string
StringRef::lower() const {
112 return std::string(map_iterator(begin(), toLower
),
113 map_iterator(end(), toLower
));
116 std::string
StringRef::upper() const {
117 return std::string(map_iterator(begin(), toUpper
),
118 map_iterator(end(), toUpper
));
121 //===----------------------------------------------------------------------===//
123 //===----------------------------------------------------------------------===//
126 /// find - Search for the first string \arg Str in the string.
128 /// \return - The index of the first occurrence of \arg Str, or npos if not
130 size_t StringRef::find(StringRef Str
, size_t From
) const {
134 const char *Start
= Data
+ From
;
135 size_t Size
= Length
- From
;
137 const char *Needle
= Str
.data();
138 size_t N
= Str
.size();
144 const char *Ptr
= (const char *)::memchr(Start
, Needle
[0], Size
);
145 return Ptr
== nullptr ? npos
: Ptr
- Data
;
148 const char *Stop
= Start
+ (Size
- N
+ 1);
151 // Provide a fast path for newline finding (CRLF case) in InclusionRewriter.
152 // Not the most optimized strategy, but getting memcmp inlined should be
155 if (std::memcmp(Start
, Needle
, 2) == 0)
158 } while (Start
< Stop
);
162 // For short haystacks or unsupported needles fall back to the naive algorithm
163 if (Size
< 16 || N
> 255) {
165 if (std::memcmp(Start
, Needle
, N
) == 0)
168 } while (Start
< Stop
);
172 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing.
173 uint8_t BadCharSkip
[256];
174 std::memset(BadCharSkip
, N
, 256);
175 for (unsigned i
= 0; i
!= N
-1; ++i
)
176 BadCharSkip
[(uint8_t)Str
[i
]] = N
-1-i
;
179 uint8_t Last
= Start
[N
- 1];
180 if (LLVM_UNLIKELY(Last
== (uint8_t)Needle
[N
- 1]))
181 if (std::memcmp(Start
, Needle
, N
- 1) == 0)
184 // Otherwise skip the appropriate number of bytes.
185 Start
+= BadCharSkip
[Last
];
186 } while (Start
< Stop
);
191 size_t StringRef::find_insensitive(StringRef Str
, size_t From
) const {
192 StringRef This
= substr(From
);
193 while (This
.size() >= Str
.size()) {
194 if (This
.starts_with_insensitive(Str
))
196 This
= This
.drop_front();
202 size_t StringRef::rfind_insensitive(char C
, size_t From
) const {
203 From
= std::min(From
, Length
);
207 if (toLower(Data
[i
]) == toLower(C
))
213 /// rfind - Search for the last string \arg Str in the string.
215 /// \return - The index of the last occurrence of \arg Str, or npos if not
217 size_t StringRef::rfind(StringRef Str
) const {
218 return std::string_view(*this).rfind(Str
);
221 size_t StringRef::rfind_insensitive(StringRef Str
) const {
222 size_t N
= Str
.size();
225 for (size_t i
= Length
- N
+ 1, e
= 0; i
!= e
;) {
227 if (substr(i
, N
).equals_insensitive(Str
))
233 /// find_first_of - Find the first character in the string that is in \arg
234 /// Chars, or npos if not found.
236 /// Note: O(size() + Chars.size())
237 StringRef::size_type
StringRef::find_first_of(StringRef Chars
,
239 std::bitset
<1 << CHAR_BIT
> CharBits
;
241 CharBits
.set((unsigned char)C
);
243 for (size_type i
= std::min(From
, Length
), e
= Length
; i
!= e
; ++i
)
244 if (CharBits
.test((unsigned char)Data
[i
]))
249 /// find_first_not_of - Find the first character in the string that is not
250 /// \arg C or npos if not found.
251 StringRef::size_type
StringRef::find_first_not_of(char C
, size_t From
) const {
252 return std::string_view(*this).find_first_not_of(C
, From
);
255 /// find_first_not_of - Find the first character in the string that is not
256 /// in the string \arg Chars, or npos if not found.
258 /// Note: O(size() + Chars.size())
259 StringRef::size_type
StringRef::find_first_not_of(StringRef Chars
,
261 std::bitset
<1 << CHAR_BIT
> CharBits
;
263 CharBits
.set((unsigned char)C
);
265 for (size_type i
= std::min(From
, Length
), e
= Length
; i
!= e
; ++i
)
266 if (!CharBits
.test((unsigned char)Data
[i
]))
271 /// find_last_of - Find the last character in the string that is in \arg C,
272 /// or npos if not found.
274 /// Note: O(size() + Chars.size())
275 StringRef::size_type
StringRef::find_last_of(StringRef Chars
,
277 std::bitset
<1 << CHAR_BIT
> CharBits
;
279 CharBits
.set((unsigned char)C
);
281 for (size_type i
= std::min(From
, Length
) - 1, e
= -1; i
!= e
; --i
)
282 if (CharBits
.test((unsigned char)Data
[i
]))
287 /// find_last_not_of - Find the last character in the string that is not
288 /// \arg C, or npos if not found.
289 StringRef::size_type
StringRef::find_last_not_of(char C
, size_t From
) const {
290 for (size_type i
= std::min(From
, Length
) - 1, e
= -1; i
!= e
; --i
)
296 /// find_last_not_of - Find the last character in the string that is not in
297 /// \arg Chars, or npos if not found.
299 /// Note: O(size() + Chars.size())
300 StringRef::size_type
StringRef::find_last_not_of(StringRef Chars
,
302 std::bitset
<1 << CHAR_BIT
> CharBits
;
304 CharBits
.set((unsigned char)C
);
306 for (size_type i
= std::min(From
, Length
) - 1, e
= -1; i
!= e
; --i
)
307 if (!CharBits
.test((unsigned char)Data
[i
]))
312 void StringRef::split(SmallVectorImpl
<StringRef
> &A
,
313 StringRef Separator
, int MaxSplit
,
314 bool KeepEmpty
) const {
317 // Count down from MaxSplit. When MaxSplit is -1, this will just split
318 // "forever". This doesn't support splitting more than 2^31 times
319 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
320 // but that seems unlikely to be useful.
321 while (MaxSplit
-- != 0) {
322 size_t Idx
= S
.find(Separator
);
327 if (KeepEmpty
|| Idx
> 0)
328 A
.push_back(S
.slice(0, Idx
));
331 S
= S
.slice(Idx
+ Separator
.size(), npos
);
335 if (KeepEmpty
|| !S
.empty())
339 void StringRef::split(SmallVectorImpl
<StringRef
> &A
, char Separator
,
340 int MaxSplit
, bool KeepEmpty
) const {
343 // Count down from MaxSplit. When MaxSplit is -1, this will just split
344 // "forever". This doesn't support splitting more than 2^31 times
345 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
346 // but that seems unlikely to be useful.
347 while (MaxSplit
-- != 0) {
348 size_t Idx
= S
.find(Separator
);
353 if (KeepEmpty
|| Idx
> 0)
354 A
.push_back(S
.slice(0, Idx
));
357 S
= S
.slice(Idx
+ 1, npos
);
361 if (KeepEmpty
|| !S
.empty())
365 //===----------------------------------------------------------------------===//
366 // Helpful Algorithms
367 //===----------------------------------------------------------------------===//
369 /// count - Return the number of non-overlapped occurrences of \arg Str in
371 size_t StringRef::count(StringRef Str
) const {
374 size_t N
= Str
.size();
375 // TODO: For an empty `Str` we return 0 for legacy reasons. Consider changing
376 // this to `Length + 1` which is more in-line with the function
380 while ((Pos
= find(Str
, Pos
)) != npos
) {
387 static unsigned GetAutoSenseRadix(StringRef
&Str
) {
391 if (Str
.consume_front_insensitive("0x"))
394 if (Str
.consume_front_insensitive("0b"))
397 if (Str
.consume_front("0o"))
400 if (Str
[0] == '0' && Str
.size() > 1 && isDigit(Str
[1])) {
408 bool llvm::consumeUnsignedInteger(StringRef
&Str
, unsigned Radix
,
409 unsigned long long &Result
) {
410 // Autosense radix if not specified.
412 Radix
= GetAutoSenseRadix(Str
);
414 // Empty strings (after the radix autosense) are invalid.
415 if (Str
.empty()) return true;
417 // Parse all the bytes of the string given this radix. Watch for overflow.
418 StringRef Str2
= Str
;
420 while (!Str2
.empty()) {
422 if (Str2
[0] >= '0' && Str2
[0] <= '9')
423 CharVal
= Str2
[0] - '0';
424 else if (Str2
[0] >= 'a' && Str2
[0] <= 'z')
425 CharVal
= Str2
[0] - 'a' + 10;
426 else if (Str2
[0] >= 'A' && Str2
[0] <= 'Z')
427 CharVal
= Str2
[0] - 'A' + 10;
431 // If the parsed value is larger than the integer radix, we cannot
432 // consume any more characters.
433 if (CharVal
>= Radix
)
436 // Add in this character.
437 unsigned long long PrevResult
= Result
;
438 Result
= Result
* Radix
+ CharVal
;
440 // Check for overflow by shifting back and seeing if bits were lost.
441 if (Result
/ Radix
< PrevResult
)
444 Str2
= Str2
.substr(1);
447 // We consider the operation a failure if no characters were consumed
449 if (Str
.size() == Str2
.size())
456 bool llvm::consumeSignedInteger(StringRef
&Str
, unsigned Radix
,
458 unsigned long long ULLVal
;
460 // Handle positive strings first.
461 if (Str
.empty() || Str
.front() != '-') {
462 if (consumeUnsignedInteger(Str
, Radix
, ULLVal
) ||
463 // Check for value so large it overflows a signed value.
464 (long long)ULLVal
< 0)
470 // Get the positive part of the value.
471 StringRef Str2
= Str
.drop_front(1);
472 if (consumeUnsignedInteger(Str2
, Radix
, ULLVal
) ||
473 // Reject values so large they'd overflow as negative signed, but allow
474 // "-0". This negates the unsigned so that the negative isn't undefined
475 // on signed overflow.
476 (long long)-ULLVal
> 0)
484 /// GetAsUnsignedInteger - Workhorse method that converts a integer character
485 /// sequence of radix up to 36 to an unsigned long long value.
486 bool llvm::getAsUnsignedInteger(StringRef Str
, unsigned Radix
,
487 unsigned long long &Result
) {
488 if (consumeUnsignedInteger(Str
, Radix
, Result
))
491 // For getAsUnsignedInteger, we require the whole string to be consumed or
492 // else we consider it a failure.
496 bool llvm::getAsSignedInteger(StringRef Str
, unsigned Radix
,
498 if (consumeSignedInteger(Str
, Radix
, Result
))
501 // For getAsSignedInteger, we require the whole string to be consumed or else
502 // we consider it a failure.
506 bool StringRef::consumeInteger(unsigned Radix
, APInt
&Result
) {
507 StringRef Str
= *this;
509 // Autosense radix if not specified.
511 Radix
= GetAutoSenseRadix(Str
);
513 assert(Radix
> 1 && Radix
<= 36);
515 // Empty strings (after the radix autosense) are invalid.
516 if (Str
.empty()) return true;
518 // Skip leading zeroes. This can be a significant improvement if
519 // it means we don't need > 64 bits.
520 Str
= Str
.ltrim('0');
522 // If it was nothing but zeroes....
524 Result
= APInt(64, 0);
529 // (Over-)estimate the required number of bits.
530 unsigned Log2Radix
= 0;
531 while ((1U << Log2Radix
) < Radix
) Log2Radix
++;
532 bool IsPowerOf2Radix
= ((1U << Log2Radix
) == Radix
);
534 unsigned BitWidth
= Log2Radix
* Str
.size();
535 if (BitWidth
< Result
.getBitWidth())
536 BitWidth
= Result
.getBitWidth(); // don't shrink the result
537 else if (BitWidth
> Result
.getBitWidth())
538 Result
= Result
.zext(BitWidth
);
540 APInt RadixAP
, CharAP
; // unused unless !IsPowerOf2Radix
541 if (!IsPowerOf2Radix
) {
542 // These must have the same bit-width as Result.
543 RadixAP
= APInt(BitWidth
, Radix
);
544 CharAP
= APInt(BitWidth
, 0);
547 // Parse all the bytes of the string given this radix.
549 while (!Str
.empty()) {
551 if (Str
[0] >= '0' && Str
[0] <= '9')
552 CharVal
= Str
[0]-'0';
553 else if (Str
[0] >= 'a' && Str
[0] <= 'z')
554 CharVal
= Str
[0]-'a'+10;
555 else if (Str
[0] >= 'A' && Str
[0] <= 'Z')
556 CharVal
= Str
[0]-'A'+10;
560 // If the parsed value is larger than the integer radix, the string is
562 if (CharVal
>= Radix
)
565 // Add in this character.
566 if (IsPowerOf2Radix
) {
567 Result
<<= Log2Radix
;
578 // We consider the operation a failure if no characters were consumed
580 if (size() == Str
.size())
587 bool StringRef::getAsInteger(unsigned Radix
, APInt
&Result
) const {
588 StringRef Str
= *this;
589 if (Str
.consumeInteger(Radix
, Result
))
592 // For getAsInteger, we require the whole string to be consumed or else we
593 // consider it a failure.
597 bool StringRef::getAsDouble(double &Result
, bool AllowInexact
) const {
599 auto StatusOrErr
= F
.convertFromString(*this, APFloat::rmNearestTiesToEven
);
600 if (errorToBool(StatusOrErr
.takeError()))
603 APFloat::opStatus Status
= *StatusOrErr
;
604 if (Status
!= APFloat::opOK
) {
605 if (!AllowInexact
|| !(Status
& APFloat::opInexact
))
609 Result
= F
.convertToDouble();
613 // Implementation of StringRef hashing.
614 hash_code
llvm::hash_value(StringRef S
) {
615 return hash_combine_range(S
.begin(), S
.end());
618 unsigned DenseMapInfo
<StringRef
, void>::getHashValue(StringRef Val
) {
619 assert(Val
.data() != getEmptyKey().data() &&
620 "Cannot hash the empty key!");
621 assert(Val
.data() != getTombstoneKey().data() &&
622 "Cannot hash the tombstone key!");
623 return (unsigned)(hash_value(Val
));