[NFC][Py Reformat] Added more commits to .git-blame-ignore-revs
[llvm-project.git] / libc / src / string / string_utils.h
blob572d47d639565f2226b19d3230e2712066739332
1 //===-- String utils --------------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Standalone string utility functions. Utilities requiring memory allocations
10 // should be placed in allocating_string_utils.h intead.
12 //===----------------------------------------------------------------------===//
14 #ifndef LIBC_SRC_STRING_STRING_UTILS_H
15 #define LIBC_SRC_STRING_STRING_UTILS_H
17 #include "src/__support/CPP/bitset.h"
18 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
19 #include "src/string/memory_utils/bzero_implementations.h"
20 #include "src/string/memory_utils/memcpy_implementations.h"
21 #include <stddef.h> // For size_t
23 namespace __llvm_libc {
24 namespace internal {
26 template <typename Word> constexpr Word repeat_byte(Word byte) {
27 constexpr size_t BITS_IN_BYTE = 8;
28 constexpr size_t BYTE_MASK = 0xff;
29 Word result = 0;
30 byte = byte & BYTE_MASK;
31 for (size_t i = 0; i < sizeof(Word); ++i)
32 result = (result << BITS_IN_BYTE) | byte;
33 return result;
36 // The goal of this function is to take in a block of arbitrary size and return
37 // if it has any bytes equal to zero without branching. This is done by
38 // transforming the block such that zero bytes become non-zero and non-zero
39 // bytes become zero.
40 // The first transformation relies on the properties of carrying in arithmetic
41 // subtraction. Specifically, if 0x01 is subtracted from a byte that is 0x00,
42 // then the result for that byte must be equal to 0xff (or 0xfe if the next byte
43 // needs a carry as well).
44 // The next transformation is a simple mask. All zero bytes will have the high
45 // bit set after the subtraction, so each byte is masked with 0x80. This narrows
46 // the set of bytes that result in a non-zero value to only zero bytes and bytes
47 // with the high bit and any other bit set.
48 // The final transformation masks the result of the previous transformations
49 // with the inverse of the original byte. This means that any byte that had the
50 // high bit set will no longer have it set, narrowing the list of bytes which
51 // result in non-zero values to just the zero byte.
52 template <typename Word> constexpr bool has_zeroes(Word block) {
53 constexpr Word LOW_BITS = repeat_byte<Word>(0x01);
54 constexpr Word HIGH_BITS = repeat_byte<Word>(0x80);
55 Word subtracted = block - LOW_BITS;
56 Word inverted = ~block;
57 return (subtracted & inverted & HIGH_BITS) != 0;
60 template <typename Word>
61 LIBC_INLINE size_t string_length_wide_read(const char *src) {
62 const char *char_ptr = src;
63 // Step 1: read 1 byte at a time to align to block size
64 for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0;
65 ++char_ptr) {
66 if (*char_ptr == '\0')
67 return char_ptr - src;
69 // Step 2: read blocks
70 for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr);
71 !has_zeroes<Word>(*block_ptr); ++block_ptr) {
72 char_ptr = reinterpret_cast<const char *>(block_ptr);
74 // Step 3: find the zero in the block
75 for (; *char_ptr != '\0'; ++char_ptr) {
78 return char_ptr - src;
81 LIBC_INLINE size_t string_length_byte_read(const char *src) {
82 size_t length;
83 for (length = 0; *src; ++src, ++length)
85 return length;
88 // Returns the length of a string, denoted by the first occurrence
89 // of a null terminator.
90 LIBC_INLINE size_t string_length(const char *src) {
91 #ifdef LIBC_COPT_UNSAFE_STRING_WIDE_READ
92 // Unsigned int is the default size for most processors, and on x86-64 it
93 // performs better than larger sizes when the src pointer can't be assumed to
94 // be aligned to a word boundary, so it's the size we use for reading the
95 // string a block at a time.
96 return string_length_wide_read<unsigned int>(src);
97 #else
98 return string_length_byte_read(src);
99 #endif
102 template <typename Word>
103 LIBC_INLINE void *find_first_character_wide_read(const unsigned char *src,
104 unsigned char ch, size_t n) {
105 const unsigned char *char_ptr = src;
106 size_t cur = 0;
108 // Step 1: read 1 byte at a time to align to block size
109 for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0 && cur < n;
110 ++char_ptr, ++cur) {
111 if (*char_ptr == ch)
112 return const_cast<unsigned char *>(char_ptr);
115 const Word ch_mask = repeat_byte<Word>(ch);
117 // Step 2: read blocks
118 for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr);
119 !has_zeroes<Word>((*block_ptr) ^ ch_mask) && cur < n;
120 ++block_ptr, cur += sizeof(Word)) {
121 char_ptr = reinterpret_cast<const unsigned char *>(block_ptr);
124 // Step 3: find the match in the block
125 for (; *char_ptr != ch && cur < n; ++char_ptr, ++cur) {
129 if (*char_ptr != ch || cur >= n)
130 return static_cast<void *>(nullptr);
132 return const_cast<unsigned char *>(char_ptr);
135 LIBC_INLINE void *find_first_character_byte_read(const unsigned char *src,
136 unsigned char ch, size_t n) {
137 for (; n && *src != ch; --n, ++src)
139 return n ? const_cast<unsigned char *>(src) : nullptr;
142 // Returns the first occurrence of 'ch' within the first 'n' characters of
143 // 'src'. If 'ch' is not found, returns nullptr.
144 LIBC_INLINE void *find_first_character(const unsigned char *src,
145 unsigned char ch, size_t max_strlen) {
146 #ifdef LIBC_COPT_UNSAFE_STRING_WIDE_READ
147 // If the maximum size of the string is small, the overhead of aligning to a
148 // word boundary and generating a bitmask of the appropriate size may be
149 // greater than the gains from reading larger chunks. Based on some testing,
150 // the crossover point between when it's faster to just read bytewise and read
151 // blocks is somewhere between 16 and 32, so 4 times the size of the block
152 // should be in that range.
153 // Unsigned int is used for the same reason as in strlen.
154 using BlockType = unsigned int;
155 if (max_strlen > (sizeof(BlockType) * 4)) {
156 return find_first_character_wide_read<BlockType>(src, ch, max_strlen);
158 #endif
159 return find_first_character_byte_read(src, ch, max_strlen);
162 // Returns the maximum length span that contains only characters not found in
163 // 'segment'. If no characters are found, returns the length of 'src'.
164 LIBC_INLINE size_t complementary_span(const char *src, const char *segment) {
165 const char *initial = src;
166 cpp::bitset<256> bitset;
168 for (; *segment; ++segment)
169 bitset.set(*reinterpret_cast<const unsigned char *>(segment));
170 for (; *src && !bitset.test(*reinterpret_cast<const unsigned char *>(src));
171 ++src)
173 return src - initial;
176 // Given the similarities between strtok and strtok_r, we can implement both
177 // using a utility function. On the first call, 'src' is scanned for the
178 // first character not found in 'delimiter_string'. Once found, it scans until
179 // the first character in the 'delimiter_string' or the null terminator is
180 // found. We define this span as a token. The end of the token is appended with
181 // a null terminator, and the token is returned. The point where the last token
182 // is found is then stored within 'context' for subsequent calls. Subsequent
183 // calls will use 'context' when a nullptr is passed in for 'src'. Once the null
184 // terminating character is reached, returns a nullptr.
185 template <bool SkipDelim = true>
186 LIBC_INLINE char *string_token(char *__restrict src,
187 const char *__restrict delimiter_string,
188 char **__restrict saveptr) {
189 // Return nullptr immediately if both src AND saveptr are nullptr
190 if (LIBC_UNLIKELY(src == nullptr && ((src = *saveptr) == nullptr)))
191 return nullptr;
193 cpp::bitset<256> delimiter_set;
194 for (; *delimiter_string != '\0'; ++delimiter_string)
195 delimiter_set.set(*delimiter_string);
197 if constexpr (SkipDelim)
198 for (; *src != '\0' && delimiter_set.test(*src); ++src)
200 if (*src == '\0') {
201 *saveptr = src;
202 return nullptr;
204 char *token = src;
205 for (; *src != '\0'; ++src) {
206 if (delimiter_set.test(*src)) {
207 *src = '\0';
208 ++src;
209 break;
212 *saveptr = src;
213 return token;
216 LIBC_INLINE size_t strlcpy(char *__restrict dst, const char *__restrict src,
217 size_t size) {
218 size_t len = internal::string_length(src);
219 if (!size)
220 return len;
221 size_t n = len < size - 1 ? len : size - 1;
222 inline_memcpy(dst, src, n);
223 inline_bzero(dst + n, size - n);
224 return len;
227 template <bool ReturnNull = true>
228 constexpr static char *strchr_implementation(const char *src, int c) {
229 char ch = static_cast<char>(c);
230 for (; *src && *src != ch; ++src)
232 char *ret = ReturnNull ? nullptr : const_cast<char *>(src);
233 return *src == ch ? const_cast<char *>(src) : ret;
236 constexpr static char *strrchr_implementation(const char *src, int c) {
237 char ch = static_cast<char>(c);
238 char *last_occurrence = nullptr;
239 for (; *src; ++src) {
240 if (*src == ch)
241 last_occurrence = const_cast<char *>(src);
243 return last_occurrence;
246 } // namespace internal
247 } // namespace __llvm_libc
249 #endif // LIBC_SRC_STRING_STRING_UTILS_H