//===-- aarch64 implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides aarch64 specific building blocks to compose memory
// functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H

#include "src/__support/macros/properties/architectures.h"

#if defined(LIBC_TARGET_ARCH_IS_AARCH64)

#include "src/__support/common.h"
#include "src/string/memory_utils/op_generic.h"

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace __llvm_libc::aarch64 {

static inline constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);

namespace neon {

struct BzeroCacheLine {
  static constexpr size_t SIZE = 64;

  LIBC_INLINE static void block(Ptr dst, uint8_t) {
#if __SIZEOF_POINTER__ == 4
    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
  }

  LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    size_t offset = 0;
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    // Unaligned store, we can't use 'dc zva' here.
    generic::Memset<uint8x64_t>::tail(dst, value, count);
  }
};
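
// Illustrative usage (a sketch, not the dispatch code of this file): a memset
// or bzero implementation would typically guard this block on runtime support
// and size, along the lines of:
//
//   if (neon::hasZva() && count >= neon::BzeroCacheLine::SIZE)
//     return neon::BzeroCacheLine::loop_and_tail(dst, 0, count);
//
// after aligning 'dst' to the 64-byte ZVA granule; the actual dispatch lives
// in the *_implementations.h headers.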

LIBC_INLINE static bool hasZva() {
  uint64_t zva_val;
  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
  // DC ZVA is permitted if DZP, bit [4], is zero.
  // BS, bits [3:0], is log2 of the block size in words.
  // So the next line checks whether the instruction is permitted and the
  // block size is 16 words (i.e. 64 bytes).
  return (zva_val & 0b11111) == 0b00100;
}

} // namespace neon

///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> struct Bcmp {
  static constexpr size_t SIZE = Size;
  static constexpr size_t BlockSize = 32;

  LIBC_INLINE static const unsigned char *as_u8(CPtr ptr) {
    return reinterpret_cast<const unsigned char *>(ptr);
  }

  LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t an = veorq_u8(a, n);
      uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an));
      return vmaxv_u32(an_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is
      // a difference between the two buffers. We reduce this value down to 4
      // bytes in two steps. First, calculate the saturated move value when
      // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to
      // get a single 32 bit nonzero value if a mismatch occurred.
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr ((Size % BlockSize) == 0) {
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
          return value;
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return BcmpReturnType::ZERO();
  }
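
  // Why the saturating narrow above works: vqmovn_u64 saturates each 64-bit
  // lane to 32 bits, so a lane that is nonzero only in its upper half still
  // narrows to a nonzero value (0xFFFFFFFF) instead of being truncated to
  // zero. vmaxv_u32 then folds the two 32-bit lanes into a single scalar that
  // is nonzero iff any input byte differed.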

  LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - SIZE, p2 + count - SIZE);
  }
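
  // Note on head_tail below: it compares the first SIZE bytes and the last
  // SIZE bytes of both buffers, so the two windows cover all of [0, count)
  // exactly when SIZE <= count <= 2 * SIZE; overlapping middle bytes are
  // simply compared twice.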

  LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + count - 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + count - 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o)
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t c = vld1q_u8(_p1 + count - 16);
      uint8x16_t d = vld1q_u8(_p1 + count - 32);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t p = vld1q_u8(_p2 + count - 16);
      uint8x16_t q = vld1q_u8(_p2 + count - 32);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      uint8x16_t cp = veorq_u8(c, p);
      uint8x16_t dq = veorq_u8(d, q);
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint8x16_t cpdq = vorrq_u8(cp, dq);
      // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to
      // a nonzero 32 bit value if a mismatch occurred.
      uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
      uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
      return vmaxv_u32(abnocpdq_reduced);
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return BcmpReturnType::ZERO();
  }

  LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
                                                  size_t count) {
    static_assert(Size > 1, "a loop of size 1 does not need tail");
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += SIZE;
    } while (offset < count - SIZE);
    return tail(p1, p2, count);
  }
};
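
// Illustrative composition (a sketch; the size thresholds are chosen here for
// illustration and the real dispatch lives in the *_implementations.h
// headers): an inline bcmp could use these blocks roughly as
//
//   if (count <= 32) // assuming count >= 16
//     return aarch64::Bcmp<16>::head_tail(p1, p2, count);
//   if (count <= 64)
//     return aarch64::Bcmp<32>::head_tail(p1, p2, count);
//   return aarch64::Bcmp<32>::loop_and_tail(p1, p2, count);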

} // namespace __llvm_libc::aarch64

#endif // LIBC_TARGET_ARCH_IS_AARCH64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H