1 //===-- String utils --------------------------------------------*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // Standalone string utility functions. Utilities requiring memory allocations
10 // should be placed in allocating_string_utils.h intead.
12 //===----------------------------------------------------------------------===//
14 #ifndef LLVM_LIBC_SRC_STRING_STRING_UTILS_H
15 #define LLVM_LIBC_SRC_STRING_STRING_UTILS_H
17 #include "src/__support/CPP/bitset.h"
18 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
19 #include "src/string/memory_utils/inline_bzero.h"
20 #include "src/string/memory_utils/inline_memcpy.h"
21 #include <stddef.h> // For size_t
23 namespace LIBC_NAMESPACE
{
26 template <typename Word
> LIBC_INLINE
constexpr Word
repeat_byte(Word byte
) {
27 constexpr size_t BITS_IN_BYTE
= 8;
28 constexpr size_t BYTE_MASK
= 0xff;
30 byte
= byte
& BYTE_MASK
;
31 for (size_t i
= 0; i
< sizeof(Word
); ++i
)
32 result
= (result
<< BITS_IN_BYTE
) | byte
;
36 // The goal of this function is to take in a block of arbitrary size and return
37 // if it has any bytes equal to zero without branching. This is done by
38 // transforming the block such that zero bytes become non-zero and non-zero
40 // The first transformation relies on the properties of carrying in arithmetic
41 // subtraction. Specifically, if 0x01 is subtracted from a byte that is 0x00,
42 // then the result for that byte must be equal to 0xff (or 0xfe if the next byte
43 // needs a carry as well).
44 // The next transformation is a simple mask. All zero bytes will have the high
45 // bit set after the subtraction, so each byte is masked with 0x80. This narrows
46 // the set of bytes that result in a non-zero value to only zero bytes and bytes
47 // with the high bit and any other bit set.
48 // The final transformation masks the result of the previous transformations
49 // with the inverse of the original byte. This means that any byte that had the
50 // high bit set will no longer have it set, narrowing the list of bytes which
51 // result in non-zero values to just the zero byte.
52 template <typename Word
> LIBC_INLINE
constexpr bool has_zeroes(Word block
) {
53 constexpr Word LOW_BITS
= repeat_byte
<Word
>(0x01);
54 constexpr Word HIGH_BITS
= repeat_byte
<Word
>(0x80);
55 Word subtracted
= block
- LOW_BITS
;
56 Word inverted
= ~block
;
57 return (subtracted
& inverted
& HIGH_BITS
) != 0;
60 template <typename Word
>
61 LIBC_INLINE
size_t string_length_wide_read(const char *src
) {
62 const char *char_ptr
= src
;
63 // Step 1: read 1 byte at a time to align to block size
64 for (; reinterpret_cast<uintptr_t>(char_ptr
) % sizeof(Word
) != 0;
66 if (*char_ptr
== '\0')
67 return char_ptr
- src
;
69 // Step 2: read blocks
70 for (const Word
*block_ptr
= reinterpret_cast<const Word
*>(char_ptr
);
71 !has_zeroes
<Word
>(*block_ptr
); ++block_ptr
) {
72 char_ptr
= reinterpret_cast<const char *>(block_ptr
);
74 // Step 3: find the zero in the block
75 for (; *char_ptr
!= '\0'; ++char_ptr
) {
78 return char_ptr
- src
;
81 LIBC_INLINE
size_t string_length_byte_read(const char *src
) {
83 for (length
= 0; *src
; ++src
, ++length
)
88 // Returns the length of a string, denoted by the first occurrence
89 // of a null terminator.
90 LIBC_INLINE
size_t string_length(const char *src
) {
91 #ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
92 // Unsigned int is the default size for most processors, and on x86-64 it
93 // performs better than larger sizes when the src pointer can't be assumed to
94 // be aligned to a word boundary, so it's the size we use for reading the
95 // string a block at a time.
96 return string_length_wide_read
<unsigned int>(src
);
98 return string_length_byte_read(src
);
102 template <typename Word
>
103 LIBC_INLINE
void *find_first_character_wide_read(const unsigned char *src
,
104 unsigned char ch
, size_t n
) {
105 const unsigned char *char_ptr
= src
;
108 // Step 1: read 1 byte at a time to align to block size
109 for (; reinterpret_cast<uintptr_t>(char_ptr
) % sizeof(Word
) != 0 && cur
< n
;
112 return const_cast<unsigned char *>(char_ptr
);
115 const Word ch_mask
= repeat_byte
<Word
>(ch
);
117 // Step 2: read blocks
118 for (const Word
*block_ptr
= reinterpret_cast<const Word
*>(char_ptr
);
119 !has_zeroes
<Word
>((*block_ptr
) ^ ch_mask
) && cur
< n
;
120 ++block_ptr
, cur
+= sizeof(Word
)) {
121 char_ptr
= reinterpret_cast<const unsigned char *>(block_ptr
);
124 // Step 3: find the match in the block
125 for (; *char_ptr
!= ch
&& cur
< n
; ++char_ptr
, ++cur
) {
129 if (*char_ptr
!= ch
|| cur
>= n
)
130 return static_cast<void *>(nullptr);
132 return const_cast<unsigned char *>(char_ptr
);
135 LIBC_INLINE
void *find_first_character_byte_read(const unsigned char *src
,
136 unsigned char ch
, size_t n
) {
137 for (; n
&& *src
!= ch
; --n
, ++src
)
139 return n
? const_cast<unsigned char *>(src
) : nullptr;
142 // Returns the first occurrence of 'ch' within the first 'n' characters of
143 // 'src'. If 'ch' is not found, returns nullptr.
144 LIBC_INLINE
void *find_first_character(const unsigned char *src
,
145 unsigned char ch
, size_t max_strlen
) {
146 #ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
147 // If the maximum size of the string is small, the overhead of aligning to a
148 // word boundary and generating a bitmask of the appropriate size may be
149 // greater than the gains from reading larger chunks. Based on some testing,
150 // the crossover point between when it's faster to just read bytewise and read
151 // blocks is somewhere between 16 and 32, so 4 times the size of the block
152 // should be in that range.
153 // Unsigned int is used for the same reason as in strlen.
154 using BlockType
= unsigned int;
155 if (max_strlen
> (sizeof(BlockType
) * 4)) {
156 return find_first_character_wide_read
<BlockType
>(src
, ch
, max_strlen
);
159 return find_first_character_byte_read(src
, ch
, max_strlen
);
162 // Returns the maximum length span that contains only characters not found in
163 // 'segment'. If no characters are found, returns the length of 'src'.
164 LIBC_INLINE
size_t complementary_span(const char *src
, const char *segment
) {
165 const char *initial
= src
;
166 cpp::bitset
<256> bitset
;
168 for (; *segment
; ++segment
)
169 bitset
.set(*reinterpret_cast<const unsigned char *>(segment
));
170 for (; *src
&& !bitset
.test(*reinterpret_cast<const unsigned char *>(src
));
173 return src
- initial
;
176 // Given the similarities between strtok and strtok_r, we can implement both
177 // using a utility function. On the first call, 'src' is scanned for the
178 // first character not found in 'delimiter_string'. Once found, it scans until
179 // the first character in the 'delimiter_string' or the null terminator is
180 // found. We define this span as a token. The end of the token is appended with
181 // a null terminator, and the token is returned. The point where the last token
182 // is found is then stored within 'context' for subsequent calls. Subsequent
183 // calls will use 'context' when a nullptr is passed in for 'src'. Once the null
184 // terminating character is reached, returns a nullptr.
185 template <bool SkipDelim
= true>
186 LIBC_INLINE
char *string_token(char *__restrict src
,
187 const char *__restrict delimiter_string
,
188 char **__restrict saveptr
) {
189 // Return nullptr immediately if both src AND saveptr are nullptr
190 if (LIBC_UNLIKELY(src
== nullptr && ((src
= *saveptr
) == nullptr)))
193 cpp::bitset
<256> delimiter_set
;
194 for (; *delimiter_string
!= '\0'; ++delimiter_string
)
195 delimiter_set
.set(*delimiter_string
);
197 if constexpr (SkipDelim
)
198 for (; *src
!= '\0' && delimiter_set
.test(*src
); ++src
)
205 for (; *src
!= '\0'; ++src
) {
206 if (delimiter_set
.test(*src
)) {
216 LIBC_INLINE
size_t strlcpy(char *__restrict dst
, const char *__restrict src
,
218 size_t len
= internal::string_length(src
);
221 size_t n
= len
< size
- 1 ? len
: size
- 1;
222 inline_memcpy(dst
, src
, n
);
223 inline_bzero(dst
+ n
, size
- n
);
227 template <bool ReturnNull
= true>
228 LIBC_INLINE
constexpr static char *strchr_implementation(const char *src
,
230 char ch
= static_cast<char>(c
);
231 for (; *src
&& *src
!= ch
; ++src
)
233 char *ret
= ReturnNull
? nullptr : const_cast<char *>(src
);
234 return *src
== ch
? const_cast<char *>(src
) : ret
;
237 LIBC_INLINE
constexpr static char *strrchr_implementation(const char *src
,
239 char ch
= static_cast<char>(c
);
240 char *last_occurrence
= nullptr;
241 for (; *src
; ++src
) {
243 last_occurrence
= const_cast<char *>(src
);
245 return last_occurrence
;
248 } // namespace internal
249 } // namespace LIBC_NAMESPACE
251 #endif // LLVM_LIBC_SRC_STRING_STRING_UTILS_H