//===-- aarch64 implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides aarch64-specific building blocks to compose memory
// functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H

#include "src/__support/macros/config.h"
#include "src/__support/macros/properties/architectures.h"

#if defined(LIBC_TARGET_ARCH_IS_AARCH64)

#include "src/__support/CPP/type_traits.h" // cpp::always_false
#include "src/__support/common.h"
#include "src/string/memory_utils/op_generic.h"

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace LIBC_NAMESPACE_DECL {
namespace aarch64 {

LIBC_INLINE_VAR constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);

namespace neon {

struct BzeroCacheLine {
  static constexpr size_t SIZE = 64;
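
  // 'dc zva' zeroes the naturally aligned zeroing block containing the given
  // address (64 bytes on this path, per dczid_el0), so callers should pass a
  // 64-byte aligned 'dst'; otherwise bytes before 'dst' get zeroed as well.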
  LIBC_INLINE static void block(Ptr dst, uint8_t) {
#if __SIZEOF_POINTER__ == 4
    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
  }
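
  // Note: callers are expected to guarantee count >= SIZE; the unsigned
  // 'count - SIZE' in the loop condition below would wrap around otherwise.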
  LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    size_t offset = 0;
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    // Unaligned store, we can't use 'dc zva' here.
    generic::Memset<generic_v512>::tail(dst, value, count);
  }
};

LIBC_INLINE bool hasZva() {
  uint64_t zva_val;
  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
  // DC ZVA is permitted if DZP, bit [4], is zero.
  // BS, bits [3:0], is the log2 of the block size in words.
  // So the next line checks whether the instruction is permitted and the
  // block size is 16 words (i.e. 64 bytes).
  return (zva_val & 0b11111) == 0b00100;
}

} // namespace neon
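
// A minimal sketch of how these pieces might compose (hypothetical helper,
// not part of this header), assuming 'dst' is 64-byte aligned and
// count >= BzeroCacheLine::SIZE:
//
//   LIBC_INLINE void bzero_aligned(Ptr dst, size_t count) {
//     if (neon::hasZva()) // dczid_el0 reports a permitted 64-byte block.
//       return neon::BzeroCacheLine::loop_and_tail(dst, 0, count);
//     generic::Memset<generic_v512>::loop_and_tail(dst, 0, count);
//   }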

///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> struct Bcmp {
  static constexpr size_t SIZE = Size;
  static constexpr size_t BlockSize = 32;

  LIBC_INLINE static const unsigned char *as_u8(CPtr ptr) {
    return reinterpret_cast<const unsigned char *>(ptr);
  }

  LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t an = veorq_u8(a, n);
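      // an = a ^ n is nonzero iff the 16-byte blocks differ. vqmovn_u64
      // narrows 2x64b to 2x32b with saturation, so a nonzero half stays
      // nonzero; vmaxv_u32 then yields a single nonzero u32 on mismatch.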
      uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an));
      return vmaxv_u32(an_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is
      // a difference between the two buffers. We reduce this value down to 4
      // bytes in two steps. First, calculate the saturated move value when
      // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get
      // a single 32 bit nonzero value if a mismatch occurred.
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr ((Size % BlockSize) == 0) {
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
          return value;
    } else {
      static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented");
    }
    return BcmpReturnType::zero();
  }

  LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - SIZE, p2 + count - SIZE);
  }
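
  // Compares the first SIZE bytes and the last SIZE bytes of the buffers; for
  // SIZE <= count <= 2 * SIZE the two windows overlap and cover every byte.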
  LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + count - 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + count - 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o)
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t c = vld1q_u8(_p1 + count - 16);
      uint8x16_t d = vld1q_u8(_p1 + count - 32);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t p = vld1q_u8(_p2 + count - 16);
      uint8x16_t q = vld1q_u8(_p2 + count - 32);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      uint8x16_t cp = veorq_u8(c, p);
      uint8x16_t dq = veorq_u8(d, q);
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint8x16_t cpdq = vorrq_u8(cp, dq);
      // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to
      // a nonzero 32 bit value if a mismatch occurred.
      uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
      uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
      return vmaxv_u32(abnocpdq_reduced);
    } else {
      static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented");
    }
    return BcmpReturnType::zero();
  }
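
  // The loop below requires count > SIZE; the trailing tail() call re-reads
  // an overlapping window, so the remainder needs no scalar cleanup.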
  LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
                                                  size_t count) {
    static_assert(Size > 1, "a loop of size 1 does not need tail");
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += SIZE;
    } while (offset < count - SIZE);
    return tail(p1, p2, count);
  }
};
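
// A minimal sketch of how a bcmp dispatcher might drive this type
// (hypothetical, not part of this header), assuming count >= 16:
//
//   LIBC_INLINE BcmpReturnType bcmp_neon(CPtr p1, CPtr p2, size_t count) {
//     if (count <= 32)
//       return Bcmp<16>::head_tail(p1, p2, count);
//     if (count <= 64)
//       return Bcmp<32>::head_tail(p1, p2, count);
//     return Bcmp<32>::loop_and_tail(p1, p2, count);
//   }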

} // namespace aarch64
} // namespace LIBC_NAMESPACE_DECL

namespace LIBC_NAMESPACE_DECL {
namespace generic {

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
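// Loading big-endian makes the most significant byte of the word the first
// byte in memory, so subtracting the widened values yields memcmp's
// byte-wise lexicographic ordering.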
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint32_t
template <> struct cmp_is_expensive<uint32_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
}
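// Unlike the uint16_t case, the difference of two uint32_t values no longer
// fits in an int32_t, so compare explicitly instead of subtracting.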
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint32_t>(p1, offset);
  const auto b = load_be<uint32_t>(p2, offset);
  return a > b ? 1 : a < b ? -1 : 0;
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
template <> struct cmp_is_expensive<uint64_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint64_t>(p1, offset) != load<uint64_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint64_t>(p1, offset);
  const auto b = load_be<uint64_t>(p2, offset);
  if (a != b)
    return a > b ? 1 : -1;
  return MemcmpReturnType::zero();
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16_t
template <> struct is_vector<uint8x16_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16_t> : cpp::false_type {};
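// Both routines below read the 16 bytes as two scalar uint64_t words rather
// than with NEON; the likely rationale (an assumption, not stated here) is
// that moving a SIMD comparison result back to a scalar branch costs more
// than two GPR loads at this size.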
template <>
LIBC_INLINE uint32_t neq<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  for (size_t i = 0; i < 2; ++i) {
    auto a = load<uint64_t>(p1, offset);
    auto b = load<uint64_t>(p2, offset);
    uint32_t cond = a != b;
    if (cond)
      return cond;
    offset += sizeof(uint64_t);
  }
  return 0;
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  for (size_t i = 0; i < 2; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::zero();
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16x2_t
template <> struct is_vector<uint8x16x2_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16x2_t> : cpp::false_type {};
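// A 32-byte compare expressed as four big-endian uint64_t words; the first
// differing word decides the memcmp result.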
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16x2_t>(CPtr p1, CPtr p2,
                                               size_t offset) {
  for (size_t i = 0; i < 4; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::zero();
}
} // namespace generic
} // namespace LIBC_NAMESPACE_DECL

#endif // LIBC_TARGET_ARCH_IS_AARCH64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H