//===-- aarch64 implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides aarch64 specific building blocks to compose memory
// functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H

#include "src/__support/macros/properties/architectures.h"

#if defined(LIBC_TARGET_ARCH_IS_AARCH64)

#include "src/__support/common.h"
#include "src/string/memory_utils/op_generic.h"

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif //__ARM_NEON

namespace LIBC_NAMESPACE::aarch64 {

LIBC_INLINE_VAR constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);

namespace neon {

struct BzeroCacheLine {
  static constexpr size_t SIZE = 64;

  LIBC_INLINE static void block(Ptr dst, uint8_t) {
#if __SIZEOF_POINTER__ == 4
    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
  }

  LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    size_t offset = 0;
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    // Unaligned store, we can't use 'dc zva' here.
    generic::Memset<generic_v512>::tail(dst, value, count);
  }
};
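
// Usage note (illustrative): `dc zva` zeroes the entire cache line containing
// the given address and ignores the `value` argument, so this path is only
// usable for zeroing, and `dst` is expected to be 64-byte aligned with
// `count >= SIZE`. For example, with count == 200 the loop issues `dc zva` at
// offsets 0, 64 and 128, and the final unaligned 64-byte store covers bytes
// [136, 200), overlapping the last block but completing the buffer.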

LIBC_INLINE bool hasZva() {
  uint64_t zva_val;
  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
  // DC ZVA is permitted if DZP, bit [4], is zero.
  // BS, bits [3:0], is log2 of the block size in words.
  // So the next line checks whether the instruction is permitted and the block
  // size is 16 words (i.e. 64 bytes).
  return (zva_val & 0b11111) == 0b00100;
}
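
// Worked example: a dczid_el0 value of 0b00100 decodes as DZP (bit 4) == 0,
// meaning DC ZVA is permitted, and BS == 4, meaning a block of 2^4 = 16 words
// = 64 bytes, which matches BzeroCacheLine::SIZE above.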

} // namespace neon

///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> struct Bcmp {
  static constexpr size_t SIZE = Size;
  static constexpr size_t BlockSize = 32;

  LIBC_INLINE static const unsigned char *as_u8(CPtr ptr) {
    return reinterpret_cast<const unsigned char *>(ptr);
  }

  LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t an = veorq_u8(a, n);
      uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an));
      return vmaxv_u32(an_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is
      // a difference between the two buffers. We reduce this value down to 4
      // bytes in two steps. First, narrow the 2x64b lanes to 2x32b with a
      // saturating move so that a nonzero lane stays nonzero. Second, take the
      // max of the 2x32b lanes to get a single nonzero 32 bit value if a
      // mismatch occurred.
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr ((Size % BlockSize) == 0) {
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
          return value;
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return BcmpReturnType::ZERO();
  }
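
  // Illustrative walk-through: if the two buffers in the Size == 32 branch
  // above differ only at byte 20, then `bo` (and therefore `anbo`) contains a
  // nonzero byte. The saturating narrow keeps the corresponding 32-bit lane
  // nonzero, and vmaxv_u32 then returns a nonzero value, i.e. "buffers
  // differ". Only zero vs. nonzero is meaningful here; Bcmp does not order the
  // buffers.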

  LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - SIZE, p2 + count - SIZE);
  }
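
  // Note: `tail` compares the last SIZE bytes. When it runs after a loop it
  // may re-compare bytes that were already checked; this is harmless since
  // those bytes are known to be equal at that point.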

  LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + count - 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + count - 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o)
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t c = vld1q_u8(_p1 + count - 16);
      uint8x16_t d = vld1q_u8(_p1 + count - 32);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t p = vld1q_u8(_p2 + count - 16);
      uint8x16_t q = vld1q_u8(_p2 + count - 32);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      uint8x16_t cp = veorq_u8(c, p);
      uint8x16_t dq = veorq_u8(d, q);
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint8x16_t cpdq = vorrq_u8(cp, dq);
      // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to
      // a nonzero 32 bit value if a mismatch occurred.
      uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
      uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
      return vmaxv_u32(abnocpdq_reduced);
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return BcmpReturnType::ZERO();
  }
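
  // Note: head_tail is intended for SIZE <= count <= 2 * SIZE. It compares the
  // first and last SIZE bytes, which together cover the whole buffer, possibly
  // overlapping in the middle. For example, with Size == 16 and count == 20 it
  // compares bytes [0, 16) and [4, 20).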

  LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
                                                  size_t count) {
    static_assert(Size > 1, "a loop of size 1 does not need tail");
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += SIZE;
    } while (offset < count - SIZE);
    return tail(p1, p2, count);
  }
};

} // namespace LIBC_NAMESPACE::aarch64

namespace LIBC_NAMESPACE::generic {

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}
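
// Note: load_be reads the most significant byte first, so comparing the loaded
// values as integers matches memcmp's byte-wise (lexicographic) order. Both
// operands fit in 16 bits, so the int32_t subtraction cannot overflow and its
// sign is the memcmp result. For example, bytes {0x01, 0x02} vs {0x01, 0x03}
// load as 0x0102 and 0x0103 and yield -1.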

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint32_t
template <> struct cmp_is_expensive<uint32_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint32_t>(p1, offset);
  const auto b = load_be<uint32_t>(p2, offset);
  return a > b ? 1 : a < b ? -1 : 0;
}
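
// Note: unlike the uint16_t case, a plain subtraction is not used here because
// the difference of two 32-bit values does not necessarily fit in int32_t;
// explicit comparisons are used instead.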

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
template <> struct cmp_is_expensive<uint64_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint64_t>(p1, offset) != load<uint64_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint64_t>(p1, offset);
  const auto b = load_be<uint64_t>(p2, offset);
  if (a != b)
    return a > b ? 1 : -1;
  return MemcmpReturnType::ZERO();
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16_t
template <> struct is_vector<uint8x16_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  for (size_t i = 0; i < 2; ++i) {
    auto a = load<uint64_t>(p1, offset);
    auto b = load<uint64_t>(p2, offset);
    uint32_t cond = a != b;
    if (cond)
      return cond;
    offset += sizeof(uint64_t);
  }
  return 0;
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  for (size_t i = 0; i < 2; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::ZERO();
}
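
// Note: although this specialization is keyed on the NEON type uint8x16_t, the
// comparison itself is carried out as two scalar 8-byte big-endian compares;
// the chunks are processed in order, so the first differing chunk determines
// the memcmp result via cmp_neq_uint64_t.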

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16x2_t
template <> struct is_vector<uint8x16x2_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16x2_t> : cpp::false_type {};
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16x2_t>(CPtr p1, CPtr p2,
                                               size_t offset) {
  for (size_t i = 0; i < 4; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::ZERO();
}
} // namespace LIBC_NAMESPACE::generic

#endif // LIBC_TARGET_ARCH_IS_AARCH64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H