//===-- aarch64 implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides aarch64 specific building blocks to compose memory
// functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H

#include "src/__support/macros/properties/architectures.h"

#if defined(LIBC_TARGET_ARCH_IS_AARCH64)

#include "src/__support/common.h"
#include "src/string/memory_utils/op_generic.h"

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace LIBC_NAMESPACE::aarch64 {

LIBC_INLINE_VAR constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);

struct BzeroCacheLine {
  static constexpr size_t SIZE = 64;

  LIBC_INLINE static void block(Ptr dst, uint8_t) {
#if __SIZEOF_POINTER__ == 4
    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
  }

  LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    size_t offset = 0;
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    // Unaligned store, we can't use 'dc zva' here.
    generic::Memset<generic_v512>::tail(dst, value, count);
  }
};
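
// Note: 'dc zva' zeroes a whole, naturally aligned block (64 bytes here, as
// checked by hasZva() below), so BzeroCacheLine expects 'dst' to be
// cache-line aligned; the trailing, possibly misaligned bytes are written
// above with a regular vector store (generic::Memset<generic_v512>::tail)
// instead.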

LIBC_INLINE bool hasZva() {
  uint64_t zva_val;
  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
  // DC ZVA is permitted if DZP, bit [4], is zero.
  // BS, bits [3:0], is log2 of the block size in words.
  // So the next line checks whether the instruction is permitted and the
  // block size is 16 words (i.e. 64 bytes).
  return (zva_val & 0b11111) == 0b00100;
}
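
// A minimal usage sketch (hypothetical, for illustration only): a memset
// implementation could gate the cache-line zeroing path on hasZva(), e.g.
//
//   if (value == 0 && count >= kZvaThreshold && aarch64::hasZva()) {
//     // ... align 'dst' upward to a 64-byte boundary with a regular store ...
//     return aarch64::BzeroCacheLine::loop_and_tail(dst, 0, count);
//   }
//
// 'kZvaThreshold' is a made-up name; the actual dispatch logic lives in the
// memset implementation, not in this header.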

///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> struct Bcmp {
  static constexpr size_t SIZE = Size;
  static constexpr size_t BlockSize = 32;

  LIBC_INLINE static const unsigned char *as_u8(CPtr ptr) {
    return reinterpret_cast<const unsigned char *>(ptr);
  }

  LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t an = veorq_u8(a, n);
      uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an));
      return vmaxv_u32(an_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is
      // a difference between the two buffers. We reduce this value down to 4
      // bytes in two steps. First, calculate the saturated move value when
      // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get
      // a single 32 bit nonzero value if a mismatch occurred.
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr ((Size % BlockSize) == 0) {
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
          return value;
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return BcmpReturnType::ZERO();
  }
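
  // Worked example of the reduction above, for illustration: if the buffers
  // differ only in byte 3, 'an' holds a single nonzero byte in its low 64-bit
  // lane. vqmovn_u64 narrows each 64-bit lane to 32 bits with saturation, so a
  // nonzero lane stays nonzero, and vmaxv_u32 then folds the two 32-bit lanes
  // into one nonzero scalar, which is the "not equal" result Bcmp reports.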

  LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - SIZE, p2 + count - SIZE);
  }

  LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + count - 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + count - 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o)
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t c = vld1q_u8(_p1 + count - 16);
      uint8x16_t d = vld1q_u8(_p1 + count - 32);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t p = vld1q_u8(_p2 + count - 16);
      uint8x16_t q = vld1q_u8(_p2 + count - 32);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      uint8x16_t cp = veorq_u8(c, p);
      uint8x16_t dq = veorq_u8(d, q);
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint8x16_t cpdq = vorrq_u8(cp, dq);
      // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to
      // a nonzero 32 bit value if a mismatch occurred.
      uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
      uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
      return vmaxv_u32(abnocpdq_reduced);
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return BcmpReturnType::ZERO();
  }

  LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
                                                  size_t count) {
    static_assert(Size > 1, "a loop of size 1 does not need tail");
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += SIZE;
    } while (offset < count - SIZE);
    return tail(p1, p2, count);
  }
};
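
// A minimal dispatch sketch (hypothetical, for illustration only): a bcmp
// implementation could select one of these primitives by size, e.g.
//
//   if (count <= 32)
//     return aarch64::Bcmp<16>::head_tail(p1, p2, count); // 16 < count <= 32
//   if (count <= 64)
//     return aarch64::Bcmp<32>::head_tail(p1, p2, count);
//   return aarch64::Bcmp<32>::loop_and_tail(p1, p2, count);
//
// Counts of 16 or less would be handled with smaller blocks elsewhere; the
// actual selection logic lives in the bcmp implementation, not in this header.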

} // namespace LIBC_NAMESPACE::aarch64

namespace LIBC_NAMESPACE::generic {

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}
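
// Note on the uint16_t 'cmp' above: both operands are at most 0xFFFF, so the
// int32_t subtraction cannot overflow and its sign directly yields the memcmp
// ordering; loading big-endian makes integer order match byte order.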

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint32_t
template <> struct cmp_is_expensive<uint32_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint32_t>(p1, offset);
  const auto b = load_be<uint32_t>(p2, offset);
  return a > b ? 1 : a < b ? -1 : 0;
}
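
// Unlike the uint16_t case, a 32-bit (or 64-bit) difference may not fit in
// int32_t, so the wider specializations compare explicitly rather than
// subtracting.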

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
template <> struct cmp_is_expensive<uint64_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint64_t>(p1, offset) != load<uint64_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint64_t>(p1, offset);
  const auto b = load_be<uint64_t>(p2, offset);
  if (a != b)
    return a > b ? 1 : -1;
  return MemcmpReturnType::ZERO();
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16_t
template <> struct is_vector<uint8x16_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  for (size_t i = 0; i < 2; ++i) {
    auto a = load<uint64_t>(p1, offset);
    auto b = load<uint64_t>(p2, offset);
    uint32_t cond = a != b;
    if (cond)
      return cond;
    offset += sizeof(uint64_t);
  }
  return 0;
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  for (size_t i = 0; i < 2; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::ZERO();
}
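
// The two specializations above walk a 16-byte vector as two 64-bit words.
// Loading them big-endian makes unsigned integer ordering agree with memcmp's
// byte-wise ordering, and cmp_neq_uint64_t (a helper defined alongside the
// generic building blocks) turns the first differing word pair into the
// signed memcmp result.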

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16x2_t
template <> struct is_vector<uint8x16x2_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16x2_t> : cpp::false_type {};
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16x2_t>(CPtr p1, CPtr p2,
                                               size_t offset) {
  for (size_t i = 0; i < 4; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::ZERO();
}
} // namespace LIBC_NAMESPACE::generic

#endif // LIBC_TARGET_ARCH_IS_AARCH64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H