//===-- aarch64 implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides aarch64 specific building blocks to compose memory
// functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H

#include "src/__support/macros/config.h"
#include "src/__support/macros/properties/architectures.h"

#if defined(LIBC_TARGET_ARCH_IS_AARCH64)

#include "src/__support/CPP/type_traits.h" // cpp::always_false
#include "src/__support/common.h"
#include "src/string/memory_utils/op_generic.h"

#ifdef __ARM_NEON
#include <arm_neon.h> // uint8x16_t, vld1q_u8, veorq_u8, ...
#endif // __ARM_NEON

namespace LIBC_NAMESPACE_DECL {
namespace aarch64 {

LIBC_INLINE_VAR constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);

struct BzeroCacheLine {
  static constexpr size_t SIZE = 64;

  LIBC_INLINE static void block(Ptr dst, uint8_t) {
#if __SIZEOF_POINTER__ == 4
    // Pointers are 32-bit on ILP32 targets, hence the %w register modifier.
    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
  }

  LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    size_t offset = 0;
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    // Unaligned store, we can't use 'dc zva' here.
    generic::Memset<generic_v512>::tail(dst, value, count);
  }
};
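
// Note (added for clarity): 'dc zva' zeroes the entire 64-byte block
// containing the given address, so callers are expected to hand block() and
// loop_and_tail() a cache-line aligned dst; the generic tail above covers the
// final, possibly unaligned, bytes with ordinary stores.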

LIBC_INLINE bool hasZva() {
  uint64_t zva_val;
  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
  // DC ZVA is permitted if DZP, bit [4], is zero.
  // BS, bits [3:0], is log2 of the block size in words.
  // So the next line checks whether the instruction is permitted and the
  // block size is 16 words (i.e. 64 bytes).
  return (zva_val & 0b11111) == 0b00100;
}
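
// A minimal sketch (hypothetical helper, not part of the original header):
// DCZID_EL0 can also be decoded into a byte count instead of being compared
// against a fixed pattern; this shows how the 0b00100 check above maps to a
// 64-byte block.
LIBC_INLINE size_t zvaBlockSizeBytes() {
  uint64_t zva_val;
  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
  if (zva_val & (1 << 4)) // DZP, bit [4], set: DC ZVA is prohibited.
    return 0;
  return size_t(4) << (zva_val & 0b1111); // 2^BS words, 4 bytes per word.
}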

///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> struct Bcmp {
  static constexpr size_t SIZE = Size;
  static constexpr size_t BlockSize = 32;

  LIBC_INLINE static const unsigned char *as_u8(CPtr ptr) {
    return reinterpret_cast<const unsigned char *>(ptr);
  }

  LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t an = veorq_u8(a, n);
      uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an));
      return vmaxv_u32(an_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is
      // a difference between the two buffers. We reduce this value down to 4
      // bytes in two steps. First, calculate the saturated move value when
      // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get
      // a single 32 bit nonzero value if a mismatch occurred.
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr ((Size % BlockSize) == 0) {
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
          return value;
    } else {
      static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented");
    }
    return BcmpReturnType::zero();
  }

  LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - SIZE, p2 + count - SIZE);
  }

  LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      // The head and tail loads overlap when count < 32, which is fine:
      // overlapping bytes are simply compared twice.
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + count - 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + count - 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o)
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t c = vld1q_u8(_p1 + count - 16);
      uint8x16_t d = vld1q_u8(_p1 + count - 32);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t p = vld1q_u8(_p2 + count - 16);
      uint8x16_t q = vld1q_u8(_p2 + count - 32);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      uint8x16_t cp = veorq_u8(c, p);
      uint8x16_t dq = veorq_u8(d, q);
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint8x16_t cpdq = vorrq_u8(cp, dq);
      // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to
      // a nonzero 32 bit value if a mismatch occurred.
      uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
      uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
      return vmaxv_u32(abnocpdq_reduced);
    } else {
      static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented");
    }
    return BcmpReturnType::zero();
  }

  LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
                                                  size_t count) {
    static_assert(Size > 1, "a loop of size 1 does not need tail");
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += SIZE;
    } while (offset < count - SIZE);
    return tail(p1, p2, count);
  }
};
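
// A minimal usage sketch (hypothetical, not part of the original header):
// dispatching a bcmp over the blocks above for counts of at least 16 bytes.
// head_tail covers a range with two possibly overlapping compares, and
// loop_and_tail handles everything larger.
LIBC_INLINE BcmpReturnType bcmp_sketch(CPtr p1, CPtr p2, size_t count) {
  if (count <= 32)
    return Bcmp<16>::head_tail(p1, p2, count); // 16 <= count <= 32
  if (count <= 64)
    return Bcmp<32>::head_tail(p1, p2, count); // 32 < count <= 64
  return Bcmp<32>::loop_and_tail(p1, p2, count); // count > 64
}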
} // namespace aarch64
} // namespace LIBC_NAMESPACE_DECL

namespace LIBC_NAMESPACE_DECL {
namespace generic {

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}
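// Note (added for clarity): both operands fit in 16 bits, so the int32_t
// subtraction cannot overflow and its sign directly encodes the big-endian
// (i.e. lexicographic) order of the two loads; e.g. 0x0100 - 0xFFFF = -65279
// < 0, exactly as memcmp orders bytes {0x01,0x00} before {0xFF,0xFF}.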

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint32_t
template <> struct cmp_is_expensive<uint32_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint32_t>(p1, offset);
  const auto b = load_be<uint32_t>(p2, offset);
  return a > b ? 1 : a < b ? -1 : 0;
}
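// Note (added for clarity): load_be reads big-endian, so the first byte in
// memory becomes the most significant byte of the integer and unsigned
// comparison matches byte-wise memcmp order; e.g. bytes {0x01,0x00,0x00,0x00}
// load as 0x01000000 and rank above {0x00,0xFF,0xFF,0xFF} = 0x00FFFFFF.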

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
template <> struct cmp_is_expensive<uint64_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint64_t>(p1, offset) != load<uint64_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint64_t>(p1, offset);
  const auto b = load_be<uint64_t>(p2, offset);
  if (a != b)
    return a > b ? 1 : -1;
  return MemcmpReturnType::zero();
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16_t
template <> struct is_vector<uint8x16_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  // Compare the 16-byte chunk as two scalar 64-bit loads, returning a nonzero
  // value on the first differing half.
  for (size_t i = 0; i < 2; ++i) {
    auto a = load<uint64_t>(p1, offset);
    auto b = load<uint64_t>(p2, offset);
    uint32_t cond = a != b;
    if (cond)
      return cond;
    offset += sizeof(uint64_t);
  }
  return 0;
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  for (size_t i = 0; i < 2; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::zero();
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16x2_t
template <> struct is_vector<uint8x16x2_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16x2_t> : cpp::false_type {};
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16x2_t>(CPtr p1, CPtr p2,
                                               size_t offset) {
  for (size_t i = 0; i < 4; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::zero();
}
} // namespace generic
} // namespace LIBC_NAMESPACE_DECL

#endif // LIBC_TARGET_ARCH_IS_AARCH64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H