[NFC][Py Reformat] Added more commits to .git-blame-ignore-revs
[llvm-project.git] / libc / src / string / memory_utils / op_x86.h
blobdcf7405240c7367d998e02acd755f4a157548907
1 //===-- x86 implementation of memory function building blocks -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file provides x86 specific building blocks to compose memory functions.
11 //===----------------------------------------------------------------------===//
12 #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
13 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
15 #include "src/__support/macros/properties/architectures.h"
17 #if defined(LIBC_TARGET_ARCH_IS_X86_64)
19 #include "src/__support/common.h"
20 #include "src/string/memory_utils/op_builtin.h"
21 #include "src/string/memory_utils/op_generic.h"
23 #if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__AVX2__) || \
24 defined(__SSE2__)
25 #include <immintrin.h>
26 #endif
28 // Define fake functions to prevent the compiler from failing on undefined
29 // functions in case the CPU extension is not present.
30 #if !defined(__AVX512BW__) && (defined(_MSC_VER) || defined(__SCE__))
31 #define _mm512_cmpneq_epi8_mask(A, B) 0
32 #endif
33 #if !defined(__AVX2__) && (defined(_MSC_VER) || defined(__SCE__))
34 #define _mm256_movemask_epi8(A) 0
35 #endif
36 #if !defined(__SSE2__) && (defined(_MSC_VER) || defined(__SCE__))
37 #define _mm_movemask_epi8(A) 0
38 #endif
40 namespace __llvm_libc::x86 {
42 // A set of constants to check compile time features.
43 static inline constexpr bool kSse2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
44 static inline constexpr bool kAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
45 static inline constexpr bool kAvx2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
46 static inline constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
47 static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
///////////////////////////////////////////////////////////////////////////////
// Memcpy repmovsb implementation
struct Memcpy {
  // Copies 'count' bytes from 'src' to 'dst' using the x86 'rep movsb'
  // string instruction. The "+D"/"+S"/"+c" constraints pin dst/src/count to
  // RDI/RSI/RCX as the instruction requires; the "memory" clobber tells the
  // compiler the pointed-to bytes are read and written.
  static void repmovsb(void *dst, const void *src, size_t count) {
    asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
  }
};
57 ///////////////////////////////////////////////////////////////////////////////
58 // Bcmp
60 // Base implementation for the Bcmp specializations.
61 // - BlockSize is either 16, 32 or 64 depending on the available compile time
62 // features, it is used to switch between "single native operation" or a
63 // "sequence of native operations".
64 // - BlockBcmp is the function that implements the bcmp logic.
65 template <size_t Size, size_t BlockSize, auto BlockBcmp> struct BcmpImpl {
66 static constexpr size_t SIZE = Size;
67 LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
68 if constexpr (Size == BlockSize) {
69 return BlockBcmp(p1, p2);
70 } else if constexpr (Size % BlockSize == 0) {
71 for (size_t offset = 0; offset < Size; offset += BlockSize)
72 if (auto value = BlockBcmp(p1 + offset, p2 + offset))
73 return value;
74 } else {
75 deferred_static_assert("SIZE not implemented");
77 return BcmpReturnType::ZERO();
80 LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
81 return block(p1 + count - Size, p2 + count - Size);
84 LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
85 return block(p1, p2) | tail(p1, p2, count);
88 LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
89 size_t count) {
90 static_assert(Size > 1, "a loop of size 1 does not need tail");
91 size_t offset = 0;
92 do {
93 if (auto value = block(p1 + offset, p2 + offset))
94 return value;
95 offset += Size;
96 } while (offset < count - Size);
97 return tail(p1, p2, count);
101 namespace sse2 {
102 LIBC_INLINE BcmpReturnType bcmp16(CPtr p1, CPtr p2) {
103 #if defined(__SSE2__)
104 using T = char __attribute__((__vector_size__(16)));
105 // A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
106 const int mask =
107 _mm_movemask_epi8(cpp::bit_cast<__m128i>(load<T>(p1) != load<T>(p2)));
108 return static_cast<uint32_t>(mask);
109 #else
110 (void)p1;
111 (void)p2;
112 return BcmpReturnType::ZERO();
113 #endif // defined(__SSE2__)
115 template <size_t Size> using Bcmp = BcmpImpl<Size, 16, bcmp16>;
116 } // namespace sse2
118 namespace avx2 {
119 LIBC_INLINE BcmpReturnType bcmp32(CPtr p1, CPtr p2) {
120 #if defined(__AVX2__)
121 using T = char __attribute__((__vector_size__(32)));
122 // A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
123 const int mask =
124 _mm256_movemask_epi8(cpp::bit_cast<__m256i>(load<T>(p1) != load<T>(p2)));
125 // _mm256_movemask_epi8 returns an int but it is to be interpreted as a 32-bit
126 // mask.
127 return static_cast<uint32_t>(mask);
128 #else
129 (void)p1;
130 (void)p2;
131 return BcmpReturnType::ZERO();
132 #endif // defined(__AVX2__)
134 template <size_t Size> using Bcmp = BcmpImpl<Size, 32, bcmp32>;
135 } // namespace avx2
137 namespace avx512bw {
138 LIBC_INLINE BcmpReturnType bcmp64(CPtr p1, CPtr p2) {
139 #if defined(__AVX512BW__)
140 using T = char __attribute__((__vector_size__(64)));
141 // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
142 const uint64_t mask = _mm512_cmpneq_epi8_mask(
143 cpp::bit_cast<__m512i>(load<T>(p1)), cpp::bit_cast<__m512i>(load<T>(p2)));
144 const bool mask_is_set = mask != 0;
145 return static_cast<uint32_t>(mask_is_set);
146 #else
147 (void)p1;
148 (void)p2;
149 return BcmpReturnType::ZERO();
150 #endif // defined(__AVX512BW__)
152 template <size_t Size> using Bcmp = BcmpImpl<Size, 64, bcmp64>;
153 } // namespace avx512bw
155 // Assuming that the mask is non zero, the index of the first mismatching byte
156 // is the number of trailing zeros in the mask. Trailing zeros and not leading
157 // zeros because the x86 architecture is little endian.
158 LIBC_INLINE MemcmpReturnType char_diff_no_zero(CPtr p1, CPtr p2,
159 uint64_t mask) {
160 const size_t diff_index = __builtin_ctzll(mask);
161 const int16_t ca = cpp::to_integer<uint8_t>(p1[diff_index]);
162 const int16_t cb = cpp::to_integer<uint8_t>(p2[diff_index]);
163 return ca - cb;
166 ///////////////////////////////////////////////////////////////////////////////
167 // Memcmp
169 // Base implementation for the Memcmp specializations.
170 // - BlockSize is either 16, 32 or 64 depending on the available compile time
171 // features, it is used to switch between "single native operation" or a
172 // "sequence of native operations".
173 // - BlockMemcmp is the function that implements the memcmp logic.
174 // - BlockBcmp is the function that implements the bcmp logic.
175 template <size_t Size, size_t BlockSize, auto BlockMemcmp, auto BlockBcmp>
176 struct MemcmpImpl {
177 static constexpr size_t SIZE = Size;
178 LIBC_INLINE static MemcmpReturnType block(CPtr p1, CPtr p2) {
179 if constexpr (Size == BlockSize) {
180 return BlockMemcmp(p1, p2);
181 } else if constexpr (Size % BlockSize == 0) {
182 for (size_t offset = 0; offset < Size; offset += BlockSize)
183 if (auto value = BlockBcmp(p1 + offset, p2 + offset))
184 return BlockMemcmp(p1 + offset, p2 + offset);
185 } else {
186 deferred_static_assert("SIZE not implemented");
188 return MemcmpReturnType::ZERO();
191 LIBC_INLINE static MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
192 return block(p1 + count - Size, p2 + count - Size);
195 LIBC_INLINE static MemcmpReturnType head_tail(CPtr p1, CPtr p2,
196 size_t count) {
197 if (auto value = block(p1, p2))
198 return value;
199 return tail(p1, p2, count);
202 LIBC_INLINE static MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
203 size_t count) {
204 static_assert(Size > 1, "a loop of size 1 does not need tail");
205 size_t offset = 0;
206 do {
207 if (auto value = block(p1 + offset, p2 + offset))
208 return value;
209 offset += Size;
210 } while (offset < count - Size);
211 return tail(p1, p2, count);
215 namespace sse2 {
216 LIBC_INLINE MemcmpReturnType memcmp16(CPtr p1, CPtr p2) {
217 #if defined(__SSE2__)
218 using T = char __attribute__((__vector_size__(16)));
219 // A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
220 if (int mask =
221 _mm_movemask_epi8(cpp::bit_cast<__m128i>(load<T>(p1) != load<T>(p2))))
222 return char_diff_no_zero(p1, p2, mask);
223 return MemcmpReturnType::ZERO();
224 #else
225 (void)p1;
226 (void)p2;
227 return MemcmpReturnType::ZERO();
228 #endif // defined(__SSE2__)
230 template <size_t Size> using Memcmp = MemcmpImpl<Size, 16, memcmp16, bcmp16>;
231 } // namespace sse2
233 namespace avx2 {
234 LIBC_INLINE MemcmpReturnType memcmp32(CPtr p1, CPtr p2) {
235 #if defined(__AVX2__)
236 using T = char __attribute__((__vector_size__(32)));
237 // A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
238 if (int mask = _mm256_movemask_epi8(
239 cpp::bit_cast<__m256i>(load<T>(p1) != load<T>(p2))))
240 return char_diff_no_zero(p1, p2, mask);
241 return MemcmpReturnType::ZERO();
242 #else
243 (void)p1;
244 (void)p2;
245 return MemcmpReturnType::ZERO();
246 #endif // defined(__AVX2__)
248 template <size_t Size> using Memcmp = MemcmpImpl<Size, 32, memcmp32, bcmp32>;
249 } // namespace avx2
251 namespace avx512bw {
252 LIBC_INLINE MemcmpReturnType memcmp64(CPtr p1, CPtr p2) {
253 #if defined(__AVX512BW__)
254 using T = char __attribute__((__vector_size__(64)));
255 // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
256 if (uint64_t mask =
257 _mm512_cmpneq_epi8_mask(cpp::bit_cast<__m512i>(load<T>(p1)),
258 cpp::bit_cast<__m512i>(load<T>(p2))))
259 return char_diff_no_zero(p1, p2, mask);
260 return MemcmpReturnType::ZERO();
261 #else
262 (void)p1;
263 (void)p2;
264 return MemcmpReturnType::ZERO();
265 #endif // defined(__AVX512BW__)
267 template <size_t Size> using Memcmp = MemcmpImpl<Size, 64, memcmp64, bcmp64>;
268 } // namespace avx512bw
270 } // namespace __llvm_libc::x86
272 #endif // LIBC_TARGET_ARCH_IS_X86_64
274 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H