//===-- aarch64 implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides aarch64 specific building blocks to compose memory
// functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H

#include "src/__support/macros/properties/architectures.h"

#if defined(LIBC_TARGET_ARCH_IS_AARCH64)

#include "src/__support/common.h"
#include "src/string/memory_utils/op_generic.h"

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif //__ARM_NEON

namespace LIBC_NAMESPACE::aarch64 {

LIBC_INLINE_VAR constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);

namespace neon {

struct BzeroCacheLine {
  static constexpr size_t SIZE = 64;

  LIBC_INLINE static void block(Ptr dst, uint8_t) {
#if __SIZEOF_POINTER__ == 4
    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
  }

  LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    size_t offset = 0;
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    // Unaligned store, we can't use 'dc zva' here.
    generic::Memset<generic_v512>::tail(dst, value, count);
  }
};
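
// Usage note (illustrative): `dc zva` zeroes the entire cache line containing
// the given address and ignores the `value` argument, so this path is only
// usable for zeroing, and `dst` is expected to be 64-byte aligned with
// `count >= SIZE`. For example, with count == 200 the loop issues `dc zva` at
// offsets 0, 64 and 128, and the final unaligned 64-byte store covers bytes
// [136, 200), overlapping the last block but completing the buffer.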

LIBC_INLINE bool hasZva() {
  uint64_t zva_val;
  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
  // DC ZVA is permitted if DZP, bit [4], is zero.
  // BS, bits [3:0], is log2 of the block size in words.
  // So the next line checks whether the instruction is permitted and the block
  // size is 16 words (i.e. 64 bytes).
  return (zva_val & 0b11111) == 0b00100;
}
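
// Worked example: a dczid_el0 value of 0b00100 decodes as DZP (bit 4) == 0,
// meaning DC ZVA is permitted, and BS == 4, meaning a block of 2^4 = 16 words
// = 64 bytes, which matches BzeroCacheLine::SIZE above.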

} // namespace neon

///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> struct Bcmp {
  static constexpr size_t SIZE = Size;
  static constexpr size_t BlockSize = 32;

  LIBC_INLINE static const unsigned char *as_u8(CPtr ptr) {
    return reinterpret_cast<const unsigned char *>(ptr);
  }

  LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t an = veorq_u8(a, n);
      uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an));
      return vmaxv_u32(an_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is
      // a difference between the two buffers. We reduce this value down to 4
      // bytes in two steps. First, narrow the 2x64b lanes to 2x32b with a
      // saturating move so that a nonzero lane stays nonzero. Second, take the
      // max of the 2x32b lanes to get a single nonzero 32 bit value if a
      // mismatch occurred.
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr ((Size % BlockSize) == 0) {
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
          return value;
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return BcmpReturnType::ZERO();
  }
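
  // Illustrative walk-through: if the two buffers in the Size == 32 branch
  // above differ only at byte 20, then `bo` (and therefore `anbo`) contains a
  // nonzero byte. The saturating narrow keeps the corresponding 32-bit lane
  // nonzero, and vmaxv_u32 then returns a nonzero value, i.e. "buffers
  // differ". Only zero vs. nonzero is meaningful here; Bcmp does not order the
  // buffers.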

  LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - SIZE, p2 + count - SIZE);
  }
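
  // Note: `tail` compares the last SIZE bytes. When it runs after a loop it
  // may re-compare bytes that were already checked; this is harmless since
  // those bytes are known to be equal at that point.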

  LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + count - 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + count - 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o)
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t c = vld1q_u8(_p1 + count - 16);
      uint8x16_t d = vld1q_u8(_p1 + count - 32);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t p = vld1q_u8(_p2 + count - 16);
      uint8x16_t q = vld1q_u8(_p2 + count - 32);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      uint8x16_t cp = veorq_u8(c, p);
      uint8x16_t dq = veorq_u8(d, q);
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint8x16_t cpdq = vorrq_u8(cp, dq);
      // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to
      // a nonzero 32 bit value if a mismatch occurred.
      uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
      uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
      return vmaxv_u32(abnocpdq_reduced);
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return BcmpReturnType::ZERO();
  }
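
  // Note: head_tail is intended for SIZE <= count <= 2 * SIZE. It compares the
  // first and last SIZE bytes, which together cover the whole buffer, possibly
  // overlapping in the middle. For example, with Size == 16 and count == 20 it
  // compares bytes [0, 16) and [4, 20).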

  LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
                                                  size_t count) {
    static_assert(Size > 1, "a loop of size 1 does not need tail");
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += SIZE;
    } while (offset < count - SIZE);
    return tail(p1, p2, count);
  }
};

} // namespace LIBC_NAMESPACE::aarch64

namespace LIBC_NAMESPACE::generic {

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}
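
// Note: load_be reads the most significant byte first, so comparing the loaded
// values as integers matches memcmp's byte-wise (lexicographic) order. Both
// operands fit in 16 bits, so the int32_t subtraction cannot overflow and its
// sign is the memcmp result. For example, bytes {0x01, 0x02} vs {0x01, 0x03}
// load as 0x0102 and 0x0103 and yield -1.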

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint32_t
template <> struct cmp_is_expensive<uint32_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint32_t>(p1, offset);
  const auto b = load_be<uint32_t>(p2, offset);
  return a > b ? 1 : a < b ? -1 : 0;
}
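
// Note: unlike the uint16_t case, a plain subtraction is not used here because
// the difference of two 32-bit values does not necessarily fit in int32_t;
// explicit comparisons are used instead.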

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
template <> struct cmp_is_expensive<uint64_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint64_t>(p1, offset) != load<uint64_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint64_t>(p1, offset);
  const auto b = load_be<uint64_t>(p2, offset);
  if (a != b)
    return a > b ? 1 : -1;
  return MemcmpReturnType::ZERO();
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16_t
template <> struct is_vector<uint8x16_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  for (size_t i = 0; i < 2; ++i) {
    auto a = load<uint64_t>(p1, offset);
    auto b = load<uint64_t>(p2, offset);
    uint32_t cond = a != b;
    if (cond)
      return cond;
    offset += sizeof(uint64_t);
  }
  return 0;
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  for (size_t i = 0; i < 2; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::ZERO();
}
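
// Note: although this specialization is keyed on the NEON type uint8x16_t, the
// comparison itself is carried out as two scalar 8-byte big-endian compares;
// the chunks are processed in order, so the first differing chunk determines
// the memcmp result via cmp_neq_uint64_t.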

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16x2_t
template <> struct is_vector<uint8x16x2_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16x2_t> : cpp::false_type {};
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16x2_t>(CPtr p1, CPtr p2,
                                               size_t offset) {
  for (size_t i = 0; i < 4; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::ZERO();
}
} // namespace LIBC_NAMESPACE::generic

#endif // LIBC_TARGET_ARCH_IS_AARCH64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H