//===-- x86 implementation of memory function building blocks -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides x86 specific building blocks to compose memory functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H

#include "src/__support/macros/properties/architectures.h"

#if defined(LIBC_TARGET_ARCH_IS_X86_64)

#include "src/__support/common.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"

#if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__AVX2__) ||      \
    defined(__SSE2__)
#include <immintrin.h>
#endif

// Define fake functions to prevent the compiler from failing on undefined
// functions in case the CPU extension is not present.
#if !defined(__AVX512BW__) && (defined(_MSC_VER) || defined(__SCE__))
#define _mm512_cmpneq_epi8_mask(A, B) 0
#endif
#if !defined(__AVX2__) && (defined(_MSC_VER) || defined(__SCE__))
#define _mm256_movemask_epi8(A) 0
#endif
#if !defined(__SSE2__) && (defined(_MSC_VER) || defined(__SCE__))
#define _mm_movemask_epi8(A) 0
#endif

namespace __llvm_libc::x86 {

// A set of constants to check compile time features.
static inline constexpr bool kSse2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
static inline constexpr bool kAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
static inline constexpr bool kAvx2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
static inline constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
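
// Illustrative sketch only (hypothetical caller, not defined in this header):
// a higher-level routine can branch on these constants at compile time, e.g.
//   if constexpr (kAvx512BW)
//     return avx512bw::Bcmp<64>::block(p1, p2);
//   else if constexpr (kAvx2)
//     return avx2::Bcmp<32>::block(p1, p2);
//   else
//     return sse2::Bcmp<16>::block(p1, p2);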

///////////////////////////////////////////////////////////////////////////////
// Memcpy repmovsb implementation
struct Memcpy {
  static void repmovsb(void *dst, const void *src, size_t count) {
    asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
  }
};
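// Note on the constraints above: "rep movsb" copies rcx bytes from [rsi] to
// [rdi]. The "+D", "+S" and "+c" constraints pin dst, src and count to rdi,
// rsi and rcx and mark them read-write, since the instruction advances the
// pointers and decrements the count as it runs. The "memory" clobber tells
// the compiler not to cache memory contents across the copy.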

///////////////////////////////////////////////////////////////////////////////
// Bcmp

// Base implementation for the Bcmp specializations.
//  - BlockSize is either 16, 32 or 64 depending on the available compile-time
//    features; it selects between a "single native operation" and a
//    "sequence of native operations".
//  - BlockBcmp is the function that implements the bcmp logic.
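// For instance, BcmpImpl<16, 16, bcmp16> lowers to a single bcmp16 call,
// while BcmpImpl<64, 16, bcmp16> lowers to a sequence of four bcmp16 calls.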
template <size_t Size, size_t BlockSize, auto BlockBcmp> struct BcmpImpl {
  static constexpr size_t SIZE = Size;
  LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == BlockSize) {
      return BlockBcmp(p1, p2);
    } else if constexpr (Size % BlockSize == 0) {
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = BlockBcmp(p1 + offset, p2 + offset))
          return value;
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return BcmpReturnType::ZERO();
  }

  LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - Size, p2 + count - Size);
  }

  LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1, p2) | tail(p1, p2, count);
  }

  LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
                                                  size_t count) {
    static_assert(Size > 1, "a loop of size 1 does not need tail");
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += Size;
    } while (offset < count - Size);
    return tail(p1, p2, count);
  }
};
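// Note: tail() compares the last Size bytes, so head_tail() handles any count
// in [Size, 2 * Size] with two (possibly overlapping) block comparisons.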

namespace sse2 {
LIBC_INLINE BcmpReturnType bcmp16(CPtr p1, CPtr p2) {
#if defined(__SSE2__)
  using T = char __attribute__((__vector_size__(16)));
  // A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
  const int mask =
      _mm_movemask_epi8(cpp::bit_cast<__m128i>(load<T>(p1) != load<T>(p2)));
  return static_cast<uint32_t>(mask);
#else
  (void)p1;
  (void)p2;
  return BcmpReturnType::ZERO();
#endif // defined(__SSE2__)
}
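// How bcmp16 detects a difference: `load<T>(p1) != load<T>(p2)` produces a
// byte vector holding 0xFF where the inputs differ and 0x00 elsewhere;
// _mm_movemask_epi8 gathers the top bit of each byte into the low 16 bits of
// an int, so the mask is non-zero iff the two 16-byte blocks differ.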
template <size_t Size> using Bcmp = BcmpImpl<Size, 16, bcmp16>;
} // namespace sse2

namespace avx2 {
LIBC_INLINE BcmpReturnType bcmp32(CPtr p1, CPtr p2) {
#if defined(__AVX2__)
  using T = char __attribute__((__vector_size__(32)));
  // A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
  const int mask =
      _mm256_movemask_epi8(cpp::bit_cast<__m256i>(load<T>(p1) != load<T>(p2)));
  // _mm256_movemask_epi8 returns an int but it is to be interpreted as a 32-bit
  // mask.
  return static_cast<uint32_t>(mask);
#else
  (void)p1;
  (void)p2;
  return BcmpReturnType::ZERO();
#endif // defined(__AVX2__)
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 32, bcmp32>;
} // namespace avx2

namespace avx512bw {
LIBC_INLINE BcmpReturnType bcmp64(CPtr p1, CPtr p2) {
#if defined(__AVX512BW__)
  using T = char __attribute__((__vector_size__(64)));
  // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
  const uint64_t mask = _mm512_cmpneq_epi8_mask(
      cpp::bit_cast<__m512i>(load<T>(p1)), cpp::bit_cast<__m512i>(load<T>(p2)));
  const bool mask_is_set = mask != 0;
  return static_cast<uint32_t>(mask_is_set);
#else
  (void)p1;
  (void)p2;
  return BcmpReturnType::ZERO();
#endif // defined(__AVX512BW__)
}
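// Note: unlike the 16 and 32 byte variants, the 64-bit mask cannot be
// truncated to the 32-bit return type without dropping differences found in
// the upper 32 bytes, so bcmp64 returns the boolean "some byte differs"
// instead of the mask itself.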
template <size_t Size> using Bcmp = BcmpImpl<Size, 64, bcmp64>;
} // namespace avx512bw

// Assuming that the mask is non-zero, the index of the first mismatching byte
// is the number of trailing zeros in the mask. Trailing zeros and not leading
// zeros because the x86 architecture is little endian.
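// Worked example: if the buffers first differ at byte 5, bit 5 is the lowest
// set bit of the mask (mask = 0b...100000, possibly with higher bits set),
// and __builtin_ctzll(mask) yields diff_index == 5.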
LIBC_INLINE MemcmpReturnType char_diff_no_zero(CPtr p1, CPtr p2,
                                               uint64_t mask) {
  const size_t diff_index = __builtin_ctzll(mask);
  const int16_t ca = cpp::to_integer<uint8_t>(p1[diff_index]);
  const int16_t cb = cpp::to_integer<uint8_t>(p2[diff_index]);
  return ca - cb;
}

///////////////////////////////////////////////////////////////////////////////
// Memcmp

// Base implementation for the Memcmp specializations.
//  - BlockSize is either 16, 32 or 64 depending on the available compile-time
//    features; it selects between a "single native operation" and a
//    "sequence of native operations".
//  - BlockMemcmp is the function that implements the memcmp logic.
//  - BlockBcmp is the function that implements the bcmp logic.
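// For instance, MemcmpImpl<64, 16, memcmp16, bcmp16> scans four 16-byte
// blocks: the cheaper bcmp16 locates the first block that differs, then
// memcmp16 runs once on that block to compute the ordered result.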
template <size_t Size, size_t BlockSize, auto BlockMemcmp, auto BlockBcmp>
struct MemcmpImpl {
  static constexpr size_t SIZE = Size;
  LIBC_INLINE static MemcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == BlockSize) {
      return BlockMemcmp(p1, p2);
    } else if constexpr (Size % BlockSize == 0) {
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = BlockBcmp(p1 + offset, p2 + offset))
          return BlockMemcmp(p1 + offset, p2 + offset);
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return MemcmpReturnType::ZERO();
  }

  LIBC_INLINE static MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - Size, p2 + count - Size);
  }

  LIBC_INLINE static MemcmpReturnType head_tail(CPtr p1, CPtr p2,
                                                size_t count) {
    if (auto value = block(p1, p2))
      return value;
    return tail(p1, p2, count);
  }

  LIBC_INLINE static MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
                                                    size_t count) {
    static_assert(Size > 1, "a loop of size 1 does not need tail");
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += Size;
    } while (offset < count - Size);
    return tail(p1, p2, count);
  }
};
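// Note: loop_and_tail() expects count >= Size (count - Size would otherwise
// wrap around); the do/while visits full blocks and the final tail() covers
// the remaining bytes with a compare that may overlap the last iteration.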

namespace sse2 {
LIBC_INLINE MemcmpReturnType memcmp16(CPtr p1, CPtr p2) {
#if defined(__SSE2__)
  using T = char __attribute__((__vector_size__(16)));
  // A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
  if (int mask =
          _mm_movemask_epi8(cpp::bit_cast<__m128i>(load<T>(p1) != load<T>(p2))))
    return char_diff_no_zero(p1, p2, mask);
  return MemcmpReturnType::ZERO();
#else
  (void)p1;
  (void)p2;
  return MemcmpReturnType::ZERO();
#endif // defined(__SSE2__)
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 16, memcmp16, bcmp16>;
} // namespace sse2

namespace avx2 {
LIBC_INLINE MemcmpReturnType memcmp32(CPtr p1, CPtr p2) {
#if defined(__AVX2__)
  using T = char __attribute__((__vector_size__(32)));
  // A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
  if (int mask = _mm256_movemask_epi8(
          cpp::bit_cast<__m256i>(load<T>(p1) != load<T>(p2))))
    return char_diff_no_zero(p1, p2, mask);
  return MemcmpReturnType::ZERO();
#else
  (void)p1;
  (void)p2;
  return MemcmpReturnType::ZERO();
#endif // defined(__AVX2__)
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 32, memcmp32, bcmp32>;
} // namespace avx2

namespace avx512bw {
LIBC_INLINE MemcmpReturnType memcmp64(CPtr p1, CPtr p2) {
#if defined(__AVX512BW__)
  using T = char __attribute__((__vector_size__(64)));
  // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
  if (uint64_t mask =
          _mm512_cmpneq_epi8_mask(cpp::bit_cast<__m512i>(load<T>(p1)),
                                  cpp::bit_cast<__m512i>(load<T>(p2))))
    return char_diff_no_zero(p1, p2, mask);
  return MemcmpReturnType::ZERO();
#else
  (void)p1;
  (void)p2;
  return MemcmpReturnType::ZERO();
#endif // defined(__AVX512BW__)
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 64, memcmp64, bcmp64>;
} // namespace avx512bw

} // namespace __llvm_libc::x86

#endif // LIBC_TARGET_ARCH_IS_X86_64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H