1 //===-- StringRef.cpp - Lightweight String References ---------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "llvm/ADT/StringRef.h"
10 #include "llvm/ADT/APFloat.h"
11 #include "llvm/ADT/APInt.h"
12 #include "llvm/ADT/Hashing.h"
13 #include "llvm/ADT/StringExtras.h"
14 #include "llvm/ADT/edit_distance.h"
19 // MSVC emits references to this into the translation units which reference it.
21 const size_t StringRef::npos
;
24 // strncasecmp() is not available on non-POSIX systems, so define an
25 // alternative function here.
26 static int ascii_strncasecmp(const char *LHS
, const char *RHS
, size_t Length
) {
27 for (size_t I
= 0; I
< Length
; ++I
) {
28 unsigned char LHC
= toLower(LHS
[I
]);
29 unsigned char RHC
= toLower(RHS
[I
]);
31 return LHC
< RHC
? -1 : 1;
36 /// compare_lower - Compare strings, ignoring case.
37 int StringRef::compare_lower(StringRef RHS
) const {
38 if (int Res
= ascii_strncasecmp(Data
, RHS
.Data
, std::min(Length
, RHS
.Length
)))
40 if (Length
== RHS
.Length
)
42 return Length
< RHS
.Length
? -1 : 1;
45 /// Check if this string starts with the given \p Prefix, ignoring case.
46 bool StringRef::startswith_lower(StringRef Prefix
) const {
47 return Length
>= Prefix
.Length
&&
48 ascii_strncasecmp(Data
, Prefix
.Data
, Prefix
.Length
) == 0;
51 /// Check if this string ends with the given \p Suffix, ignoring case.
52 bool StringRef::endswith_lower(StringRef Suffix
) const {
53 return Length
>= Suffix
.Length
&&
54 ascii_strncasecmp(end() - Suffix
.Length
, Suffix
.Data
, Suffix
.Length
) == 0;
57 size_t StringRef::find_lower(char C
, size_t From
) const {
59 return find_if([L
](char D
) { return toLower(D
) == L
; }, From
);
62 /// compare_numeric - Compare strings, handle embedded numbers.
63 int StringRef::compare_numeric(StringRef RHS
) const {
64 for (size_t I
= 0, E
= std::min(Length
, RHS
.Length
); I
!= E
; ++I
) {
65 // Check for sequences of digits.
66 if (isDigit(Data
[I
]) && isDigit(RHS
.Data
[I
])) {
67 // The longer sequence of numbers is considered larger.
68 // This doesn't really handle prefixed zeros well.
70 for (J
= I
+ 1; J
!= E
+ 1; ++J
) {
71 bool ld
= J
< Length
&& isDigit(Data
[J
]);
72 bool rd
= J
< RHS
.Length
&& isDigit(RHS
.Data
[J
]);
78 // The two number sequences have the same length (J-I), just memcmp them.
79 if (int Res
= compareMemory(Data
+ I
, RHS
.Data
+ I
, J
- I
))
80 return Res
< 0 ? -1 : 1;
81 // Identical number sequences, continue search after the numbers.
85 if (Data
[I
] != RHS
.Data
[I
])
86 return (unsigned char)Data
[I
] < (unsigned char)RHS
.Data
[I
] ? -1 : 1;
88 if (Length
== RHS
.Length
)
90 return Length
< RHS
.Length
? -1 : 1;
93 // Compute the edit distance between the two given strings.
94 unsigned StringRef::edit_distance(llvm::StringRef Other
,
95 bool AllowReplacements
,
96 unsigned MaxEditDistance
) const {
97 return llvm::ComputeEditDistance(
98 makeArrayRef(data(), size()),
99 makeArrayRef(Other
.data(), Other
.size()),
100 AllowReplacements
, MaxEditDistance
);
103 //===----------------------------------------------------------------------===//
105 //===----------------------------------------------------------------------===//
107 std::string
StringRef::lower() const {
108 std::string
Result(size(), char());
109 for (size_type i
= 0, e
= size(); i
!= e
; ++i
) {
110 Result
[i
] = toLower(Data
[i
]);
115 std::string
StringRef::upper() const {
116 std::string
Result(size(), char());
117 for (size_type i
= 0, e
= size(); i
!= e
; ++i
) {
118 Result
[i
] = toUpper(Data
[i
]);
123 //===----------------------------------------------------------------------===//
125 //===----------------------------------------------------------------------===//
128 /// find - Search for the first string \arg Str in the string.
130 /// \return - The index of the first occurrence of \arg Str, or npos if not
132 size_t StringRef::find(StringRef Str
, size_t From
) const {
136 const char *Start
= Data
+ From
;
137 size_t Size
= Length
- From
;
139 const char *Needle
= Str
.data();
140 size_t N
= Str
.size();
146 const char *Ptr
= (const char *)::memchr(Start
, Needle
[0], Size
);
147 return Ptr
== nullptr ? npos
: Ptr
- Data
;
150 const char *Stop
= Start
+ (Size
- N
+ 1);
152 // For short haystacks or unsupported needles fall back to the naive algorithm
153 if (Size
< 16 || N
> 255) {
155 if (std::memcmp(Start
, Needle
, N
) == 0)
158 } while (Start
< Stop
);
162 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing.
163 uint8_t BadCharSkip
[256];
164 std::memset(BadCharSkip
, N
, 256);
165 for (unsigned i
= 0; i
!= N
-1; ++i
)
166 BadCharSkip
[(uint8_t)Str
[i
]] = N
-1-i
;
169 uint8_t Last
= Start
[N
- 1];
170 if (LLVM_UNLIKELY(Last
== (uint8_t)Needle
[N
- 1]))
171 if (std::memcmp(Start
, Needle
, N
- 1) == 0)
174 // Otherwise skip the appropriate number of bytes.
175 Start
+= BadCharSkip
[Last
];
176 } while (Start
< Stop
);
181 size_t StringRef::find_lower(StringRef Str
, size_t From
) const {
182 StringRef This
= substr(From
);
183 while (This
.size() >= Str
.size()) {
184 if (This
.startswith_lower(Str
))
186 This
= This
.drop_front();
192 size_t StringRef::rfind_lower(char C
, size_t From
) const {
193 From
= std::min(From
, Length
);
197 if (toLower(Data
[i
]) == toLower(C
))
203 /// rfind - Search for the last string \arg Str in the string.
205 /// \return - The index of the last occurrence of \arg Str, or npos if not
207 size_t StringRef::rfind(StringRef Str
) const {
208 size_t N
= Str
.size();
211 for (size_t i
= Length
- N
+ 1, e
= 0; i
!= e
;) {
213 if (substr(i
, N
).equals(Str
))
219 size_t StringRef::rfind_lower(StringRef Str
) const {
220 size_t N
= Str
.size();
223 for (size_t i
= Length
- N
+ 1, e
= 0; i
!= e
;) {
225 if (substr(i
, N
).equals_lower(Str
))
231 /// find_first_of - Find the first character in the string that is in \arg
232 /// Chars, or npos if not found.
234 /// Note: O(size() + Chars.size())
235 StringRef::size_type
StringRef::find_first_of(StringRef Chars
,
237 std::bitset
<1 << CHAR_BIT
> CharBits
;
238 for (size_type i
= 0; i
!= Chars
.size(); ++i
)
239 CharBits
.set((unsigned char)Chars
[i
]);
241 for (size_type i
= std::min(From
, Length
), e
= Length
; i
!= e
; ++i
)
242 if (CharBits
.test((unsigned char)Data
[i
]))
247 /// find_first_not_of - Find the first character in the string that is not
248 /// \arg C or npos if not found.
249 StringRef::size_type
StringRef::find_first_not_of(char C
, size_t From
) const {
250 for (size_type i
= std::min(From
, Length
), e
= Length
; i
!= e
; ++i
)
256 /// find_first_not_of - Find the first character in the string that is not
257 /// in the string \arg Chars, or npos if not found.
259 /// Note: O(size() + Chars.size())
260 StringRef::size_type
StringRef::find_first_not_of(StringRef Chars
,
262 std::bitset
<1 << CHAR_BIT
> CharBits
;
263 for (size_type i
= 0; i
!= Chars
.size(); ++i
)
264 CharBits
.set((unsigned char)Chars
[i
]);
266 for (size_type i
= std::min(From
, Length
), e
= Length
; i
!= e
; ++i
)
267 if (!CharBits
.test((unsigned char)Data
[i
]))
272 /// find_last_of - Find the last character in the string that is in \arg C,
273 /// or npos if not found.
275 /// Note: O(size() + Chars.size())
276 StringRef::size_type
StringRef::find_last_of(StringRef Chars
,
278 std::bitset
<1 << CHAR_BIT
> CharBits
;
279 for (size_type i
= 0; i
!= Chars
.size(); ++i
)
280 CharBits
.set((unsigned char)Chars
[i
]);
282 for (size_type i
= std::min(From
, Length
) - 1, e
= -1; i
!= e
; --i
)
283 if (CharBits
.test((unsigned char)Data
[i
]))
288 /// find_last_not_of - Find the last character in the string that is not
289 /// \arg C, or npos if not found.
290 StringRef::size_type
StringRef::find_last_not_of(char C
, size_t From
) const {
291 for (size_type i
= std::min(From
, Length
) - 1, e
= -1; i
!= e
; --i
)
297 /// find_last_not_of - Find the last character in the string that is not in
298 /// \arg Chars, or npos if not found.
300 /// Note: O(size() + Chars.size())
301 StringRef::size_type
StringRef::find_last_not_of(StringRef Chars
,
303 std::bitset
<1 << CHAR_BIT
> CharBits
;
304 for (size_type i
= 0, e
= Chars
.size(); i
!= e
; ++i
)
305 CharBits
.set((unsigned char)Chars
[i
]);
307 for (size_type i
= std::min(From
, Length
) - 1, e
= -1; i
!= e
; --i
)
308 if (!CharBits
.test((unsigned char)Data
[i
]))
313 void StringRef::split(SmallVectorImpl
<StringRef
> &A
,
314 StringRef Separator
, int MaxSplit
,
315 bool KeepEmpty
) const {
318 // Count down from MaxSplit. When MaxSplit is -1, this will just split
319 // "forever". This doesn't support splitting more than 2^31 times
320 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
321 // but that seems unlikely to be useful.
322 while (MaxSplit
-- != 0) {
323 size_t Idx
= S
.find(Separator
);
328 if (KeepEmpty
|| Idx
> 0)
329 A
.push_back(S
.slice(0, Idx
));
332 S
= S
.slice(Idx
+ Separator
.size(), npos
);
336 if (KeepEmpty
|| !S
.empty())
340 void StringRef::split(SmallVectorImpl
<StringRef
> &A
, char Separator
,
341 int MaxSplit
, bool KeepEmpty
) const {
344 // Count down from MaxSplit. When MaxSplit is -1, this will just split
345 // "forever". This doesn't support splitting more than 2^31 times
346 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
347 // but that seems unlikely to be useful.
348 while (MaxSplit
-- != 0) {
349 size_t Idx
= S
.find(Separator
);
354 if (KeepEmpty
|| Idx
> 0)
355 A
.push_back(S
.slice(0, Idx
));
358 S
= S
.slice(Idx
+ 1, npos
);
362 if (KeepEmpty
|| !S
.empty())
366 //===----------------------------------------------------------------------===//
367 // Helpful Algorithms
368 //===----------------------------------------------------------------------===//
370 /// count - Return the number of non-overlapped occurrences of \arg Str in
372 size_t StringRef::count(StringRef Str
) const {
374 size_t N
= Str
.size();
377 for (size_t i
= 0, e
= Length
- N
+ 1; i
!= e
; ++i
)
378 if (substr(i
, N
).equals(Str
))
383 static unsigned GetAutoSenseRadix(StringRef
&Str
) {
387 if (Str
.startswith("0x") || Str
.startswith("0X")) {
392 if (Str
.startswith("0b") || Str
.startswith("0B")) {
397 if (Str
.startswith("0o")) {
402 if (Str
[0] == '0' && Str
.size() > 1 && isDigit(Str
[1])) {
410 bool llvm::consumeUnsignedInteger(StringRef
&Str
, unsigned Radix
,
411 unsigned long long &Result
) {
412 // Autosense radix if not specified.
414 Radix
= GetAutoSenseRadix(Str
);
416 // Empty strings (after the radix autosense) are invalid.
417 if (Str
.empty()) return true;
419 // Parse all the bytes of the string given this radix. Watch for overflow.
420 StringRef Str2
= Str
;
422 while (!Str2
.empty()) {
424 if (Str2
[0] >= '0' && Str2
[0] <= '9')
425 CharVal
= Str2
[0] - '0';
426 else if (Str2
[0] >= 'a' && Str2
[0] <= 'z')
427 CharVal
= Str2
[0] - 'a' + 10;
428 else if (Str2
[0] >= 'A' && Str2
[0] <= 'Z')
429 CharVal
= Str2
[0] - 'A' + 10;
433 // If the parsed value is larger than the integer radix, we cannot
434 // consume any more characters.
435 if (CharVal
>= Radix
)
438 // Add in this character.
439 unsigned long long PrevResult
= Result
;
440 Result
= Result
* Radix
+ CharVal
;
442 // Check for overflow by shifting back and seeing if bits were lost.
443 if (Result
/ Radix
< PrevResult
)
446 Str2
= Str2
.substr(1);
449 // We consider the operation a failure if no characters were consumed
451 if (Str
.size() == Str2
.size())
458 bool llvm::consumeSignedInteger(StringRef
&Str
, unsigned Radix
,
460 unsigned long long ULLVal
;
462 // Handle positive strings first.
463 if (Str
.empty() || Str
.front() != '-') {
464 if (consumeUnsignedInteger(Str
, Radix
, ULLVal
) ||
465 // Check for value so large it overflows a signed value.
466 (long long)ULLVal
< 0)
472 // Get the positive part of the value.
473 StringRef Str2
= Str
.drop_front(1);
474 if (consumeUnsignedInteger(Str2
, Radix
, ULLVal
) ||
475 // Reject values so large they'd overflow as negative signed, but allow
476 // "-0". This negates the unsigned so that the negative isn't undefined
477 // on signed overflow.
478 (long long)-ULLVal
> 0)
486 /// GetAsUnsignedInteger - Workhorse method that converts a integer character
487 /// sequence of radix up to 36 to an unsigned long long value.
488 bool llvm::getAsUnsignedInteger(StringRef Str
, unsigned Radix
,
489 unsigned long long &Result
) {
490 if (consumeUnsignedInteger(Str
, Radix
, Result
))
493 // For getAsUnsignedInteger, we require the whole string to be consumed or
494 // else we consider it a failure.
498 bool llvm::getAsSignedInteger(StringRef Str
, unsigned Radix
,
500 if (consumeSignedInteger(Str
, Radix
, Result
))
503 // For getAsSignedInteger, we require the whole string to be consumed or else
504 // we consider it a failure.
508 bool StringRef::getAsInteger(unsigned Radix
, APInt
&Result
) const {
509 StringRef Str
= *this;
511 // Autosense radix if not specified.
513 Radix
= GetAutoSenseRadix(Str
);
515 assert(Radix
> 1 && Radix
<= 36);
517 // Empty strings (after the radix autosense) are invalid.
518 if (Str
.empty()) return true;
520 // Skip leading zeroes. This can be a significant improvement if
521 // it means we don't need > 64 bits.
522 while (!Str
.empty() && Str
.front() == '0')
525 // If it was nothing but zeroes....
527 Result
= APInt(64, 0);
531 // (Over-)estimate the required number of bits.
532 unsigned Log2Radix
= 0;
533 while ((1U << Log2Radix
) < Radix
) Log2Radix
++;
534 bool IsPowerOf2Radix
= ((1U << Log2Radix
) == Radix
);
536 unsigned BitWidth
= Log2Radix
* Str
.size();
537 if (BitWidth
< Result
.getBitWidth())
538 BitWidth
= Result
.getBitWidth(); // don't shrink the result
539 else if (BitWidth
> Result
.getBitWidth())
540 Result
= Result
.zext(BitWidth
);
542 APInt RadixAP
, CharAP
; // unused unless !IsPowerOf2Radix
543 if (!IsPowerOf2Radix
) {
544 // These must have the same bit-width as Result.
545 RadixAP
= APInt(BitWidth
, Radix
);
546 CharAP
= APInt(BitWidth
, 0);
549 // Parse all the bytes of the string given this radix.
551 while (!Str
.empty()) {
553 if (Str
[0] >= '0' && Str
[0] <= '9')
554 CharVal
= Str
[0]-'0';
555 else if (Str
[0] >= 'a' && Str
[0] <= 'z')
556 CharVal
= Str
[0]-'a'+10;
557 else if (Str
[0] >= 'A' && Str
[0] <= 'Z')
558 CharVal
= Str
[0]-'A'+10;
562 // If the parsed value is larger than the integer radix, the string is
564 if (CharVal
>= Radix
)
567 // Add in this character.
568 if (IsPowerOf2Radix
) {
569 Result
<<= Log2Radix
;
583 bool StringRef::getAsDouble(double &Result
, bool AllowInexact
) const {
585 APFloat::opStatus Status
=
586 F
.convertFromString(*this, APFloat::rmNearestTiesToEven
);
587 if (Status
!= APFloat::opOK
) {
588 if (!AllowInexact
|| !(Status
& APFloat::opInexact
))
592 Result
= F
.convertToDouble();
596 // Implementation of StringRef hashing.
597 hash_code
llvm::hash_value(StringRef S
) {
598 return hash_combine_range(S
.begin(), S
.end());