1 //===-- StringRef.cpp - Lightweight String References ---------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "llvm/ADT/StringRef.h"
10 #include "llvm/ADT/APFloat.h"
11 #include "llvm/ADT/APInt.h"
12 #include "llvm/ADT/Hashing.h"
13 #include "llvm/ADT/StringExtras.h"
14 #include "llvm/ADT/edit_distance.h"
15 #include "llvm/Support/Error.h"
20 // MSVC emits references to this into the translation units which reference it.
22 constexpr size_t StringRef::npos
;
25 // strncasecmp() is not available on non-POSIX systems, so define an
26 // alternative function here.
27 static int ascii_strncasecmp(const char *LHS
, const char *RHS
, size_t Length
) {
28 for (size_t I
= 0; I
< Length
; ++I
) {
29 unsigned char LHC
= toLower(LHS
[I
]);
30 unsigned char RHC
= toLower(RHS
[I
]);
32 return LHC
< RHC
? -1 : 1;
37 int StringRef::compare_insensitive(StringRef RHS
) const {
38 if (int Res
= ascii_strncasecmp(Data
, RHS
.Data
, std::min(Length
, RHS
.Length
)))
40 if (Length
== RHS
.Length
)
42 return Length
< RHS
.Length
? -1 : 1;
45 bool StringRef::startswith_insensitive(StringRef Prefix
) const {
46 return Length
>= Prefix
.Length
&&
47 ascii_strncasecmp(Data
, Prefix
.Data
, Prefix
.Length
) == 0;
50 bool StringRef::endswith_insensitive(StringRef Suffix
) const {
51 return Length
>= Suffix
.Length
&&
52 ascii_strncasecmp(end() - Suffix
.Length
, Suffix
.Data
, Suffix
.Length
) == 0;
55 size_t StringRef::find_insensitive(char C
, size_t From
) const {
57 return find_if([L
](char D
) { return toLower(D
) == L
; }, From
);
60 /// compare_numeric - Compare strings, handle embedded numbers.
61 int StringRef::compare_numeric(StringRef RHS
) const {
62 for (size_t I
= 0, E
= std::min(Length
, RHS
.Length
); I
!= E
; ++I
) {
63 // Check for sequences of digits.
64 if (isDigit(Data
[I
]) && isDigit(RHS
.Data
[I
])) {
65 // The longer sequence of numbers is considered larger.
66 // This doesn't really handle prefixed zeros well.
68 for (J
= I
+ 1; J
!= E
+ 1; ++J
) {
69 bool ld
= J
< Length
&& isDigit(Data
[J
]);
70 bool rd
= J
< RHS
.Length
&& isDigit(RHS
.Data
[J
]);
76 // The two number sequences have the same length (J-I), just memcmp them.
77 if (int Res
= compareMemory(Data
+ I
, RHS
.Data
+ I
, J
- I
))
78 return Res
< 0 ? -1 : 1;
79 // Identical number sequences, continue search after the numbers.
83 if (Data
[I
] != RHS
.Data
[I
])
84 return (unsigned char)Data
[I
] < (unsigned char)RHS
.Data
[I
] ? -1 : 1;
86 if (Length
== RHS
.Length
)
88 return Length
< RHS
.Length
? -1 : 1;
91 // Compute the edit distance between the two given strings.
92 unsigned StringRef::edit_distance(llvm::StringRef Other
,
93 bool AllowReplacements
,
94 unsigned MaxEditDistance
) const {
95 return llvm::ComputeEditDistance(
96 makeArrayRef(data(), size()),
97 makeArrayRef(Other
.data(), Other
.size()),
98 AllowReplacements
, MaxEditDistance
);
101 //===----------------------------------------------------------------------===//
103 //===----------------------------------------------------------------------===//
105 std::string
StringRef::lower() const {
106 return std::string(map_iterator(begin(), toLower
),
107 map_iterator(end(), toLower
));
110 std::string
StringRef::upper() const {
111 return std::string(map_iterator(begin(), toUpper
),
112 map_iterator(end(), toUpper
));
115 //===----------------------------------------------------------------------===//
117 //===----------------------------------------------------------------------===//
120 /// find - Search for the first string \arg Str in the string.
122 /// \return - The index of the first occurrence of \arg Str, or npos if not
124 size_t StringRef::find(StringRef Str
, size_t From
) const {
128 const char *Start
= Data
+ From
;
129 size_t Size
= Length
- From
;
131 const char *Needle
= Str
.data();
132 size_t N
= Str
.size();
138 const char *Ptr
= (const char *)::memchr(Start
, Needle
[0], Size
);
139 return Ptr
== nullptr ? npos
: Ptr
- Data
;
142 const char *Stop
= Start
+ (Size
- N
+ 1);
144 // For short haystacks or unsupported needles fall back to the naive algorithm
145 if (Size
< 16 || N
> 255) {
147 if (std::memcmp(Start
, Needle
, N
) == 0)
150 } while (Start
< Stop
);
154 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing.
155 uint8_t BadCharSkip
[256];
156 std::memset(BadCharSkip
, N
, 256);
157 for (unsigned i
= 0; i
!= N
-1; ++i
)
158 BadCharSkip
[(uint8_t)Str
[i
]] = N
-1-i
;
161 uint8_t Last
= Start
[N
- 1];
162 if (LLVM_UNLIKELY(Last
== (uint8_t)Needle
[N
- 1]))
163 if (std::memcmp(Start
, Needle
, N
- 1) == 0)
166 // Otherwise skip the appropriate number of bytes.
167 Start
+= BadCharSkip
[Last
];
168 } while (Start
< Stop
);
173 size_t StringRef::find_insensitive(StringRef Str
, size_t From
) const {
174 StringRef This
= substr(From
);
175 while (This
.size() >= Str
.size()) {
176 if (This
.startswith_insensitive(Str
))
178 This
= This
.drop_front();
184 size_t StringRef::rfind_insensitive(char C
, size_t From
) const {
185 From
= std::min(From
, Length
);
189 if (toLower(Data
[i
]) == toLower(C
))
195 /// rfind - Search for the last string \arg Str in the string.
197 /// \return - The index of the last occurrence of \arg Str, or npos if not
199 size_t StringRef::rfind(StringRef Str
) const {
200 size_t N
= Str
.size();
203 for (size_t i
= Length
- N
+ 1, e
= 0; i
!= e
;) {
205 if (substr(i
, N
).equals(Str
))
211 size_t StringRef::rfind_insensitive(StringRef Str
) const {
212 size_t N
= Str
.size();
215 for (size_t i
= Length
- N
+ 1, e
= 0; i
!= e
;) {
217 if (substr(i
, N
).equals_insensitive(Str
))
223 /// find_first_of - Find the first character in the string that is in \arg
224 /// Chars, or npos if not found.
226 /// Note: O(size() + Chars.size())
227 StringRef::size_type
StringRef::find_first_of(StringRef Chars
,
229 std::bitset
<1 << CHAR_BIT
> CharBits
;
230 for (size_type i
= 0; i
!= Chars
.size(); ++i
)
231 CharBits
.set((unsigned char)Chars
[i
]);
233 for (size_type i
= std::min(From
, Length
), e
= Length
; i
!= e
; ++i
)
234 if (CharBits
.test((unsigned char)Data
[i
]))
239 /// find_first_not_of - Find the first character in the string that is not
240 /// \arg C or npos if not found.
241 StringRef::size_type
StringRef::find_first_not_of(char C
, size_t From
) const {
242 for (size_type i
= std::min(From
, Length
), e
= Length
; i
!= e
; ++i
)
248 /// find_first_not_of - Find the first character in the string that is not
249 /// in the string \arg Chars, or npos if not found.
251 /// Note: O(size() + Chars.size())
252 StringRef::size_type
StringRef::find_first_not_of(StringRef Chars
,
254 std::bitset
<1 << CHAR_BIT
> CharBits
;
255 for (size_type i
= 0; i
!= Chars
.size(); ++i
)
256 CharBits
.set((unsigned char)Chars
[i
]);
258 for (size_type i
= std::min(From
, Length
), e
= Length
; i
!= e
; ++i
)
259 if (!CharBits
.test((unsigned char)Data
[i
]))
264 /// find_last_of - Find the last character in the string that is in \arg C,
265 /// or npos if not found.
267 /// Note: O(size() + Chars.size())
268 StringRef::size_type
StringRef::find_last_of(StringRef Chars
,
270 std::bitset
<1 << CHAR_BIT
> CharBits
;
271 for (size_type i
= 0; i
!= Chars
.size(); ++i
)
272 CharBits
.set((unsigned char)Chars
[i
]);
274 for (size_type i
= std::min(From
, Length
) - 1, e
= -1; i
!= e
; --i
)
275 if (CharBits
.test((unsigned char)Data
[i
]))
280 /// find_last_not_of - Find the last character in the string that is not
281 /// \arg C, or npos if not found.
282 StringRef::size_type
StringRef::find_last_not_of(char C
, size_t From
) const {
283 for (size_type i
= std::min(From
, Length
) - 1, e
= -1; i
!= e
; --i
)
289 /// find_last_not_of - Find the last character in the string that is not in
290 /// \arg Chars, or npos if not found.
292 /// Note: O(size() + Chars.size())
293 StringRef::size_type
StringRef::find_last_not_of(StringRef Chars
,
295 std::bitset
<1 << CHAR_BIT
> CharBits
;
296 for (size_type i
= 0, e
= Chars
.size(); i
!= e
; ++i
)
297 CharBits
.set((unsigned char)Chars
[i
]);
299 for (size_type i
= std::min(From
, Length
) - 1, e
= -1; i
!= e
; --i
)
300 if (!CharBits
.test((unsigned char)Data
[i
]))
305 void StringRef::split(SmallVectorImpl
<StringRef
> &A
,
306 StringRef Separator
, int MaxSplit
,
307 bool KeepEmpty
) const {
310 // Count down from MaxSplit. When MaxSplit is -1, this will just split
311 // "forever". This doesn't support splitting more than 2^31 times
312 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
313 // but that seems unlikely to be useful.
314 while (MaxSplit
-- != 0) {
315 size_t Idx
= S
.find(Separator
);
320 if (KeepEmpty
|| Idx
> 0)
321 A
.push_back(S
.slice(0, Idx
));
324 S
= S
.slice(Idx
+ Separator
.size(), npos
);
328 if (KeepEmpty
|| !S
.empty())
332 void StringRef::split(SmallVectorImpl
<StringRef
> &A
, char Separator
,
333 int MaxSplit
, bool KeepEmpty
) const {
336 // Count down from MaxSplit. When MaxSplit is -1, this will just split
337 // "forever". This doesn't support splitting more than 2^31 times
338 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
339 // but that seems unlikely to be useful.
340 while (MaxSplit
-- != 0) {
341 size_t Idx
= S
.find(Separator
);
346 if (KeepEmpty
|| Idx
> 0)
347 A
.push_back(S
.slice(0, Idx
));
350 S
= S
.slice(Idx
+ 1, npos
);
354 if (KeepEmpty
|| !S
.empty())
358 //===----------------------------------------------------------------------===//
359 // Helpful Algorithms
360 //===----------------------------------------------------------------------===//
362 /// count - Return the number of non-overlapped occurrences of \arg Str in
364 size_t StringRef::count(StringRef Str
) const {
366 size_t N
= Str
.size();
367 if (!N
|| N
> Length
)
369 for (size_t i
= 0, e
= Length
- N
+ 1; i
< e
;) {
370 if (substr(i
, N
).equals(Str
)) {
380 static unsigned GetAutoSenseRadix(StringRef
&Str
) {
384 if (Str
.startswith("0x") || Str
.startswith("0X")) {
389 if (Str
.startswith("0b") || Str
.startswith("0B")) {
394 if (Str
.startswith("0o")) {
399 if (Str
[0] == '0' && Str
.size() > 1 && isDigit(Str
[1])) {
407 bool llvm::consumeUnsignedInteger(StringRef
&Str
, unsigned Radix
,
408 unsigned long long &Result
) {
409 // Autosense radix if not specified.
411 Radix
= GetAutoSenseRadix(Str
);
413 // Empty strings (after the radix autosense) are invalid.
414 if (Str
.empty()) return true;
416 // Parse all the bytes of the string given this radix. Watch for overflow.
417 StringRef Str2
= Str
;
419 while (!Str2
.empty()) {
421 if (Str2
[0] >= '0' && Str2
[0] <= '9')
422 CharVal
= Str2
[0] - '0';
423 else if (Str2
[0] >= 'a' && Str2
[0] <= 'z')
424 CharVal
= Str2
[0] - 'a' + 10;
425 else if (Str2
[0] >= 'A' && Str2
[0] <= 'Z')
426 CharVal
= Str2
[0] - 'A' + 10;
430 // If the parsed value is larger than the integer radix, we cannot
431 // consume any more characters.
432 if (CharVal
>= Radix
)
435 // Add in this character.
436 unsigned long long PrevResult
= Result
;
437 Result
= Result
* Radix
+ CharVal
;
439 // Check for overflow by shifting back and seeing if bits were lost.
440 if (Result
/ Radix
< PrevResult
)
443 Str2
= Str2
.substr(1);
446 // We consider the operation a failure if no characters were consumed
448 if (Str
.size() == Str2
.size())
455 bool llvm::consumeSignedInteger(StringRef
&Str
, unsigned Radix
,
457 unsigned long long ULLVal
;
459 // Handle positive strings first.
460 if (Str
.empty() || Str
.front() != '-') {
461 if (consumeUnsignedInteger(Str
, Radix
, ULLVal
) ||
462 // Check for value so large it overflows a signed value.
463 (long long)ULLVal
< 0)
469 // Get the positive part of the value.
470 StringRef Str2
= Str
.drop_front(1);
471 if (consumeUnsignedInteger(Str2
, Radix
, ULLVal
) ||
472 // Reject values so large they'd overflow as negative signed, but allow
473 // "-0". This negates the unsigned so that the negative isn't undefined
474 // on signed overflow.
475 (long long)-ULLVal
> 0)
483 /// GetAsUnsignedInteger - Workhorse method that converts a integer character
484 /// sequence of radix up to 36 to an unsigned long long value.
485 bool llvm::getAsUnsignedInteger(StringRef Str
, unsigned Radix
,
486 unsigned long long &Result
) {
487 if (consumeUnsignedInteger(Str
, Radix
, Result
))
490 // For getAsUnsignedInteger, we require the whole string to be consumed or
491 // else we consider it a failure.
495 bool llvm::getAsSignedInteger(StringRef Str
, unsigned Radix
,
497 if (consumeSignedInteger(Str
, Radix
, Result
))
500 // For getAsSignedInteger, we require the whole string to be consumed or else
501 // we consider it a failure.
505 bool StringRef::getAsInteger(unsigned Radix
, APInt
&Result
) const {
506 StringRef Str
= *this;
508 // Autosense radix if not specified.
510 Radix
= GetAutoSenseRadix(Str
);
512 assert(Radix
> 1 && Radix
<= 36);
514 // Empty strings (after the radix autosense) are invalid.
515 if (Str
.empty()) return true;
517 // Skip leading zeroes. This can be a significant improvement if
518 // it means we don't need > 64 bits.
519 while (!Str
.empty() && Str
.front() == '0')
522 // If it was nothing but zeroes....
524 Result
= APInt(64, 0);
528 // (Over-)estimate the required number of bits.
529 unsigned Log2Radix
= 0;
530 while ((1U << Log2Radix
) < Radix
) Log2Radix
++;
531 bool IsPowerOf2Radix
= ((1U << Log2Radix
) == Radix
);
533 unsigned BitWidth
= Log2Radix
* Str
.size();
534 if (BitWidth
< Result
.getBitWidth())
535 BitWidth
= Result
.getBitWidth(); // don't shrink the result
536 else if (BitWidth
> Result
.getBitWidth())
537 Result
= Result
.zext(BitWidth
);
539 APInt RadixAP
, CharAP
; // unused unless !IsPowerOf2Radix
540 if (!IsPowerOf2Radix
) {
541 // These must have the same bit-width as Result.
542 RadixAP
= APInt(BitWidth
, Radix
);
543 CharAP
= APInt(BitWidth
, 0);
546 // Parse all the bytes of the string given this radix.
548 while (!Str
.empty()) {
550 if (Str
[0] >= '0' && Str
[0] <= '9')
551 CharVal
= Str
[0]-'0';
552 else if (Str
[0] >= 'a' && Str
[0] <= 'z')
553 CharVal
= Str
[0]-'a'+10;
554 else if (Str
[0] >= 'A' && Str
[0] <= 'Z')
555 CharVal
= Str
[0]-'A'+10;
559 // If the parsed value is larger than the integer radix, the string is
561 if (CharVal
>= Radix
)
564 // Add in this character.
565 if (IsPowerOf2Radix
) {
566 Result
<<= Log2Radix
;
580 bool StringRef::getAsDouble(double &Result
, bool AllowInexact
) const {
582 auto StatusOrErr
= F
.convertFromString(*this, APFloat::rmNearestTiesToEven
);
583 if (errorToBool(StatusOrErr
.takeError()))
586 APFloat::opStatus Status
= *StatusOrErr
;
587 if (Status
!= APFloat::opOK
) {
588 if (!AllowInexact
|| !(Status
& APFloat::opInexact
))
592 Result
= F
.convertToDouble();
596 // Implementation of StringRef hashing.
597 hash_code
llvm::hash_value(StringRef S
) {
598 return hash_combine_range(S
.begin(), S
.end());