libc/src/string/string_utils.h

   1 //===-- String utils --------------------------------------------*- C++ -*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // Standalone string utility functions. Utilities requiring memory allocations
  10 // should be placed in allocating_string_utils.h intead.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #ifndef LIBC_SRC_STRING_STRING_UTILS_H
  15 #define LIBC_SRC_STRING_STRING_UTILS_H
  16
  17 #include "src/__support/CPP/bitset.h"
  18 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
  19 #include "src/string/memory_utils/bzero_implementations.h"
  20 #include "src/string/memory_utils/memcpy_implementations.h"
  21 #include <stddef.h> // For size_t
  22
  23 namespace __llvm_libc {
  24 namespace internal {
  25
  26 template <typename Word> constexpr Word repeat_byte(Word byte) {
  27   constexpr size_t BITS_IN_BYTE = 8;
  28   constexpr size_t BYTE_MASK = 0xff;
  29   Word result = 0;
  30   byte = byte & BYTE_MASK;
  31   for (size_t i = 0; i < sizeof(Word); ++i)
  32     result = (result << BITS_IN_BYTE) | byte;
  33   return result;
  34 }
  35
  36 // The goal of this function is to take in a block of arbitrary size and return
  37 // if it has any bytes equal to zero without branching. This is done by
  38 // transforming the block such that zero bytes become non-zero and non-zero
  39 // bytes become zero.
  40 // The first transformation relies on the properties of carrying in arithmetic
  41 // subtraction. Specifically, if 0x01 is subtracted from a byte that is 0x00,
  42 // then the result for that byte must be equal to 0xff (or 0xfe if the next byte
  43 // needs a carry as well).
  44 // The next transformation is a simple mask. All zero bytes will have the high
  45 // bit set after the subtraction, so each byte is masked with 0x80. This narrows
  46 // the set of bytes that result in a non-zero value to only zero bytes and bytes
  47 // with the high bit and any other bit set.
  48 // The final transformation masks the result of the previous transformations
  49 // with the inverse of the original byte. This means that any byte that had the
  50 // high bit set will no longer have it set, narrowing the list of bytes which
  51 // result in non-zero values to just the zero byte.
  52 template <typename Word> constexpr bool has_zeroes(Word block) {
  53   constexpr Word LOW_BITS = repeat_byte<Word>(0x01);
  54   constexpr Word HIGH_BITS = repeat_byte<Word>(0x80);
  55   Word subtracted = block - LOW_BITS;
  56   Word inverted = ~block;
  57   return (subtracted & inverted & HIGH_BITS) != 0;
  58 }
  59
  60 template <typename Word>
  61 LIBC_INLINE size_t string_length_wide_read(const char *src) {
  62   const char *char_ptr = src;
  63   // Step 1: read 1 byte at a time to align to block size
  64   for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0;
  65        ++char_ptr) {
  66     if (*char_ptr == '\0')
  67       return char_ptr - src;
  68   }
  69   // Step 2: read blocks
  70   for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr);
  71        !has_zeroes<Word>(*block_ptr); ++block_ptr) {
  72     char_ptr = reinterpret_cast<const char *>(block_ptr);
  73   }
  74   // Step 3: find the zero in the block
  75   for (; *char_ptr != '\0'; ++char_ptr) {
  76     ;
  77   }
  78   return char_ptr - src;
  79 }
  80
  81 LIBC_INLINE size_t string_length_byte_read(const char *src) {
  82   size_t length;
  83   for (length = 0; *src; ++src, ++length)
  84     ;
  85   return length;
  86 }
  87
  88 // Returns the length of a string, denoted by the first occurrence
  89 // of a null terminator.
  90 LIBC_INLINE size_t string_length(const char *src) {
  91 #ifdef LIBC_COPT_UNSAFE_STRING_WIDE_READ
  92   // Unsigned int is the default size for most processors, and on x86-64 it
  93   // performs better than larger sizes when the src pointer can't be assumed to
  94   // be aligned to a word boundary, so it's the size we use for reading the
  95   // string a block at a time.
  96   return string_length_wide_read<unsigned int>(src);
  97 #else
  98   return string_length_byte_read(src);
  99 #endif
 100 }
 101
 102 template <typename Word>
 103 LIBC_INLINE void *find_first_character_wide_read(const unsigned char *src,
 104                                                  unsigned char ch, size_t n) {
 105   const unsigned char *char_ptr = src;
 106   size_t cur = 0;
 107
 108   // Step 1: read 1 byte at a time to align to block size
 109   for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0 && cur < n;
 110        ++char_ptr, ++cur) {
 111     if (*char_ptr == ch)
 112       return const_cast<unsigned char *>(char_ptr);
 113   }
 114
 115   const Word ch_mask = repeat_byte<Word>(ch);
 116
 117   // Step 2: read blocks
 118   for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr);
 119        !has_zeroes<Word>((*block_ptr) ^ ch_mask) && cur < n;
 120        ++block_ptr, cur += sizeof(Word)) {
 121     char_ptr = reinterpret_cast<const unsigned char *>(block_ptr);
 122   }
 123
 124   // Step 3: find the match in the block
 125   for (; *char_ptr != ch && cur < n; ++char_ptr, ++cur) {
 126     ;
 127   }
 128
 129   if (*char_ptr != ch || cur >= n)
 130     return static_cast<void *>(nullptr);
 131
 132   return const_cast<unsigned char *>(char_ptr);
 133 }
 134
 135 LIBC_INLINE void *find_first_character_byte_read(const unsigned char *src,
 136                                                  unsigned char ch, size_t n) {
 137   for (; n && *src != ch; --n, ++src)
 138     ;
 139   return n ? const_cast<unsigned char *>(src) : nullptr;
 140 }
 141
 142 // Returns the first occurrence of 'ch' within the first 'n' characters of
 143 // 'src'. If 'ch' is not found, returns nullptr.
 144 LIBC_INLINE void *find_first_character(const unsigned char *src,
 145                                        unsigned char ch, size_t max_strlen) {
 146 #ifdef LIBC_COPT_UNSAFE_STRING_WIDE_READ
 147   // If the maximum size of the string is small, the overhead of aligning to a
 148   // word boundary and generating a bitmask of the appropriate size may be
 149   // greater than the gains from reading larger chunks. Based on some testing,
 150   // the crossover point between when it's faster to just read bytewise and read
 151   // blocks is somewhere between 16 and 32, so 4 times the size of the block
 152   // should be in that range.
 153   // Unsigned int is used for the same reason as in strlen.
 154   using BlockType = unsigned int;
 155   if (max_strlen > (sizeof(BlockType) * 4)) {
 156     return find_first_character_wide_read<BlockType>(src, ch, max_strlen);
 157   }
 158 #endif
 159   return find_first_character_byte_read(src, ch, max_strlen);
 160 }
 161
 162 // Returns the maximum length span that contains only characters not found in
 163 // 'segment'. If no characters are found, returns the length of 'src'.
 164 LIBC_INLINE size_t complementary_span(const char *src, const char *segment) {
 165   const char *initial = src;
 166   cpp::bitset<256> bitset;
 167
 168   for (; *segment; ++segment)
 169     bitset.set(*reinterpret_cast<const unsigned char *>(segment));
 170   for (; *src && !bitset.test(*reinterpret_cast<const unsigned char *>(src));
 171        ++src)
 172     ;
 173   return src - initial;
 174 }
 175
 176 // Given the similarities between strtok and strtok_r, we can implement both
 177 // using a utility function. On the first call, 'src' is scanned for the
 178 // first character not found in 'delimiter_string'. Once found, it scans until
 179 // the first character in the 'delimiter_string' or the null terminator is
 180 // found. We define this span as a token. The end of the token is appended with
 181 // a null terminator, and the token is returned. The point where the last token
 182 // is found is then stored within 'context' for subsequent calls. Subsequent
 183 // calls will use 'context' when a nullptr is passed in for 'src'. Once the null
 184 // terminating character is reached, returns a nullptr.
 185 template <bool SkipDelim = true>
 186 LIBC_INLINE char *string_token(char *__restrict src,
 187                                const char *__restrict delimiter_string,
 188                                char **__restrict saveptr) {
 189   // Return nullptr immediately if both src AND saveptr are nullptr
 190   if (LIBC_UNLIKELY(src == nullptr && ((src = *saveptr) == nullptr)))
 191     return nullptr;
 192
 193   cpp::bitset<256> delimiter_set;
 194   for (; *delimiter_string != '\0'; ++delimiter_string)
 195     delimiter_set.set(*delimiter_string);
 196
 197   if constexpr (SkipDelim)
 198     for (; *src != '\0' && delimiter_set.test(*src); ++src)
 199       ;
 200   if (*src == '\0') {
 201     *saveptr = src;
 202     return nullptr;
 203   }
 204   char *token = src;
 205   for (; *src != '\0'; ++src) {
 206     if (delimiter_set.test(*src)) {
 207       *src = '\0';
 208       ++src;
 209       break;
 210     }
 211   }
 212   *saveptr = src;
 213   return token;
 214 }
 215
 216 LIBC_INLINE size_t strlcpy(char *__restrict dst, const char *__restrict src,
 217                            size_t size) {
 218   size_t len = internal::string_length(src);
 219   if (!size)
 220     return len;
 221   size_t n = len < size - 1 ? len : size - 1;
 222   inline_memcpy(dst, src, n);
 223   inline_bzero(dst + n, size - n);
 224   return len;
 225 }
 226
 227 template <bool ReturnNull = true>
 228 constexpr static char *strchr_implementation(const char *src, int c) {
 229   char ch = static_cast<char>(c);
 230   for (; *src && *src != ch; ++src)
 231     ;
 232   char *ret = ReturnNull ? nullptr : const_cast<char *>(src);
 233   return *src == ch ? const_cast<char *>(src) : ret;
 234 }
 235
 236 constexpr static char *strrchr_implementation(const char *src, int c) {
 237   char ch = static_cast<char>(c);
 238   char *last_occurrence = nullptr;
 239   for (; *src; ++src) {
 240     if (*src == ch)
 241       last_occurrence = const_cast<char *>(src);
 242   }
 243   return last_occurrence;
 244 }
 245
 246 } // namespace internal
 247 } // namespace __llvm_libc
 248
 249 #endif //  LIBC_SRC_STRING_STRING_UTILS_H