/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "mozilla/SIMD.h"

#include "mozilla/SSE.h"
#include "mozilla/Assertions.h"

// Restricting to x86_64 simplifies things, and we're not particularly
// worried about slightly degraded performance on 32 bit processors which
// support AVX2, as this should be quite a minority.
#if defined(MOZILLA_MAY_SUPPORT_AVX2) && defined(__x86_64__)

#  include <cstring>
#  include <immintrin.h>
#  include <stdint.h>
#  include <type_traits>

#  include "mozilla/EndianUtils.h"

namespace mozilla {

const __m256i* Cast256(uintptr_t ptr) {
  return reinterpret_cast<const __m256i*>(ptr);
}

template <typename T>
T GetAs(uintptr_t ptr) {
  return *reinterpret_cast<const T*>(ptr);
}

uintptr_t AlignDown32(uintptr_t ptr) { return ptr & ~0x1f; }

uintptr_t AlignUp32(uintptr_t ptr) { return AlignDown32(ptr + 0x1f); }
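// For example, AlignDown32(0x1003) == 0x1000 and AlignUp32(0x1003) == 0x1020;
// addresses that are already 32-byte aligned are left unchanged by both.
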
template <typename TValue>
__m128i CmpEq128(__m128i a, __m128i b) {
  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2);
  if (sizeof(TValue) == 1) {
    return _mm_cmpeq_epi8(a, b);
  }
  return _mm_cmpeq_epi16(a, b);
}

template <typename TValue>
__m256i CmpEq256(__m256i a, __m256i b) {
  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2 ||
                sizeof(TValue) == 4 || sizeof(TValue) == 8);
  if (sizeof(TValue) == 1) {
    return _mm256_cmpeq_epi8(a, b);
  }
  if (sizeof(TValue) == 2) {
    return _mm256_cmpeq_epi16(a, b);
  }
  if (sizeof(TValue) == 4) {
    return _mm256_cmpeq_epi32(a, b);
  }
  return _mm256_cmpeq_epi64(a, b);
}

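// NOTE: CmpEq128/CmpEq256 set every byte of a matching lane to 0xff, and
// _mm_movemask_epi8/_mm256_movemask_epi8 then produce one bit per byte of
// that comparison result, so __builtin_ctz of the mask is the byte offset of
// the first match. For example, with TValue = char16_t and a match in lane 3,
// bits 6 and 7 of the mask are set and ctz returns 6, the byte offset of that
// element. This is why the Check* helpers below add the ctz result directly
// to a byte address.
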
#  if defined(__GNUC__) && !defined(__clang__)

// See the comment in SIMD.cpp over Load32BitsIntoXMM. This is just adapted
// from that workaround. Testing this, it also yields the correct instructions
// across all tested compilers.
__m128i Load64BitsIntoXMM(uintptr_t ptr) {
  int64_t tmp;
  memcpy(&tmp, reinterpret_cast<const void*>(ptr), sizeof(tmp));
  return _mm_cvtsi64_si128(tmp);
}

#  else

__m128i Load64BitsIntoXMM(uintptr_t ptr) {
  return _mm_loadu_si64(reinterpret_cast<const __m128i*>(ptr));
}

#  endif

template <typename TValue>
const TValue* Check4x8Bytes(__m128i needle, uintptr_t a, uintptr_t b,
                            uintptr_t c, uintptr_t d) {
  __m128i haystackA = Load64BitsIntoXMM(a);
  __m128i cmpA = CmpEq128<TValue>(needle, haystackA);
  __m128i haystackB = Load64BitsIntoXMM(b);
  __m128i cmpB = CmpEq128<TValue>(needle, haystackB);
  __m128i haystackC = Load64BitsIntoXMM(c);
  __m128i cmpC = CmpEq128<TValue>(needle, haystackC);
  __m128i haystackD = Load64BitsIntoXMM(d);
  __m128i cmpD = CmpEq128<TValue>(needle, haystackD);
  __m128i or_ab = _mm_or_si128(cmpA, cmpB);
  __m128i or_cd = _mm_or_si128(cmpC, cmpD);
  __m128i or_abcd = _mm_or_si128(or_ab, or_cd);
  // Only the low 8 bytes of each XMM register were loaded, so mask the
  // movemask results down to the low 8 bits; the zeroed upper half could
  // otherwise report spurious matches when the needle is zero.
  int orMask = _mm_movemask_epi8(or_abcd);
  if (orMask & 0xff) {
    int cmpMask;
    cmpMask = _mm_movemask_epi8(cmpA);
    if (cmpMask & 0xff) {
      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpB);
    if (cmpMask & 0xff) {
      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpC);
    if (cmpMask & 0xff) {
      return reinterpret_cast<const TValue*>(c + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpD);
    if (cmpMask & 0xff) {
      return reinterpret_cast<const TValue*>(d + __builtin_ctz(cmpMask));
    }
  }

  return nullptr;
}

template <typename TValue>
const TValue* Check4x32Bytes(__m256i needle, uintptr_t a, uintptr_t b,
                             uintptr_t c, uintptr_t d) {
  __m256i haystackA = _mm256_loadu_si256(Cast256(a));
  __m256i cmpA = CmpEq256<TValue>(needle, haystackA);
  __m256i haystackB = _mm256_loadu_si256(Cast256(b));
  __m256i cmpB = CmpEq256<TValue>(needle, haystackB);
  __m256i haystackC = _mm256_loadu_si256(Cast256(c));
  __m256i cmpC = CmpEq256<TValue>(needle, haystackC);
  __m256i haystackD = _mm256_loadu_si256(Cast256(d));
  __m256i cmpD = CmpEq256<TValue>(needle, haystackD);
  __m256i or_ab = _mm256_or_si256(cmpA, cmpB);
  __m256i or_cd = _mm256_or_si256(cmpC, cmpD);
  __m256i or_abcd = _mm256_or_si256(or_ab, or_cd);
  int orMask = _mm256_movemask_epi8(or_abcd);
  if (orMask) {
    int cmpMask;
    cmpMask = _mm256_movemask_epi8(cmpA);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm256_movemask_epi8(cmpB);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm256_movemask_epi8(cmpC);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(c + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm256_movemask_epi8(cmpD);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(d + __builtin_ctz(cmpMask));
    }
  }

  return nullptr;
}

template <typename TValue>
const TValue* FindInBufferAVX2(const TValue* ptr, TValue value, size_t length) {
  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2 ||
                sizeof(TValue) == 4 || sizeof(TValue) == 8);
  static_assert(std::is_unsigned<TValue>::value);

  // Load our needle into a 32-byte register
  __m256i needle;
  if (sizeof(TValue) == 1) {
    needle = _mm256_set1_epi8(value);
  } else if (sizeof(TValue) == 2) {
    needle = _mm256_set1_epi16(value);
  } else if (sizeof(TValue) == 4) {
    needle = _mm256_set1_epi32(value);
  } else {
    needle = _mm256_set1_epi64x(value);
  }

  size_t numBytes = length * sizeof(TValue);
  uintptr_t cur = reinterpret_cast<uintptr_t>(ptr);
  uintptr_t end = cur + numBytes;

  // Handle trivially small buffers with a plain scalar loop.
  if (numBytes < 8 || (sizeof(TValue) >= 4 && numBytes < 32)) {
    while (cur < end) {
      if (GetAs<TValue>(cur) == value) {
        return reinterpret_cast<const TValue*>(cur);
      }
      cur += sizeof(TValue);
    }
    return nullptr;
  }

  if constexpr (sizeof(TValue) < 4) {
    // Handle buffers of 8 to 31 bytes with four (possibly overlapping)
    // 8-byte windows.
    if (numBytes < 32) {
      __m128i needle_narrow;
      if (sizeof(TValue) == 1) {
        needle_narrow = _mm_set1_epi8(value);
      } else {
        needle_narrow = _mm_set1_epi16(value);
      }
      uintptr_t a = cur;
      uintptr_t b = cur + ((numBytes & 16) >> 1);
      uintptr_t c = end - 8 - ((numBytes & 16) >> 1);
      uintptr_t d = end - 8;
      return Check4x8Bytes<TValue>(needle_narrow, a, b, c, d);
    }
  }

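  // A worked example of the window selection above (the values follow
  // directly from the arithmetic; illustration only):
  //   numBytes == 12: numBytes & 16 == 0, so a = b = cur and c = d = end - 8,
  //     giving two distinct 8-byte windows, [0,8) and [4,12).
  //   numBytes == 24: (numBytes & 16) >> 1 == 8, so the windows are
  //     [0,8), [8,16), [8,16) and [16,24).
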
  if (numBytes < 128) {
    // NOTE: here and below, we have some bit fiddling which could look a
    // little weird. The important thing to note though is it's just a trick
    // for getting the number 32 if numBytes is greater than or equal to 64,
    // and 0 otherwise. This lets us fully cover the range without any
    // branching for the case where numBytes is in [32,64) and [64,128). We
    // get four ranges from this - if numBytes >= 64, we get:
    //   [0,32), [32,64), [end - 64, end - 32), [end - 32, end)
    // and if numBytes < 64, we get:
    //   [0,32), [0,32), [end - 32, end), [end - 32, end)
    uintptr_t a = cur;
    uintptr_t b = cur + ((numBytes & 64) >> 1);
    uintptr_t c = end - 32 - ((numBytes & 64) >> 1);
    uintptr_t d = end - 32;
    return Check4x32Bytes<TValue>(needle, a, b, c, d);
  }

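  // A worked example of the selection above (illustration only):
  //   numBytes == 48: numBytes & 64 == 0, so a = b = cur and c = d = end - 32,
  //     giving two distinct 32-byte windows, [0,32) and [16,48).
  //   numBytes == 96: (numBytes & 64) >> 1 == 32, so the windows are
  //     [0,32), [32,64), [32,64) and [64,96).
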
  // Get the initial unaligned load out of the way. This will overlap with the
  // aligned stuff below, but the overlapped part should effectively be free
  // (relative to a mispredict from doing a byte-by-byte loop).
  __m256i haystack = _mm256_loadu_si256(Cast256(cur));
  __m256i cmp = CmpEq256<TValue>(needle, haystack);
  int cmpMask = _mm256_movemask_epi8(cmp);
  if (cmpMask) {
    return reinterpret_cast<const TValue*>(cur + __builtin_ctz(cmpMask));
  }

  // Now we're working with aligned memory. Hooray! \o/
  cur = AlignUp32(cur);

  uintptr_t tailStartPtr = AlignDown32(end - 96);
  uintptr_t tailEndPtr = end - 32;

  // Scan the aligned middle of the buffer 128 bytes at a time.
  while (cur < tailStartPtr) {
    uintptr_t a = cur;
    uintptr_t b = cur + 32;
    uintptr_t c = cur + 64;
    uintptr_t d = cur + 96;
    const TValue* result = Check4x32Bytes<TValue>(needle, a, b, c, d);
    if (result) {
      return result;
    }
    cur += 128;
  }

  // Check the tail of the buffer; the last window is anchored at end - 32 so
  // the scan covers the buffer exactly up to its end.
  uintptr_t a = tailStartPtr;
  uintptr_t b = tailStartPtr + 32;
  uintptr_t c = tailStartPtr + 64;
  uintptr_t d = tailEndPtr;
  return Check4x32Bytes<TValue>(needle, a, b, c, d);
}

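// For reference, FindInBufferAVX2 returns the same result as the following
// scalar scan would (a documentation-only sketch; FindInBufferScalar is a
// hypothetical name and is not defined in this file):
//
//   template <typename TValue>
//   const TValue* FindInBufferScalar(const TValue* ptr, TValue value,
//                                    size_t length) {
//     for (size_t i = 0; i < length; i++) {
//       if (ptr[i] == value) {
//         return ptr + i;
//       }
//     }
//     return nullptr;
//   }
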
const char* SIMD::memchr8AVX2(const char* ptr, char value, size_t length) {
  const unsigned char* uptr = reinterpret_cast<const unsigned char*>(ptr);
  unsigned char uvalue = static_cast<unsigned char>(value);
  const unsigned char* uresult =
      FindInBufferAVX2<unsigned char>(uptr, uvalue, length);
  return reinterpret_cast<const char*>(uresult);
}

const char16_t* SIMD::memchr16AVX2(const char16_t* ptr, char16_t value,
                                   size_t length) {
  return FindInBufferAVX2<char16_t>(ptr, value, length);
}

const uint32_t* SIMD::memchr32AVX2(const uint32_t* ptr, uint32_t value,
                                   size_t length) {
  return FindInBufferAVX2<uint32_t>(ptr, value, length);
}

const uint64_t* SIMD::memchr64AVX2(const uint64_t* ptr, uint64_t value,
                                   size_t length) {
  return FindInBufferAVX2<uint64_t>(ptr, value, length);
}

} // namespace mozilla

#else

namespace mozilla {

const char* SIMD::memchr8AVX2(const char* ptr, char value, size_t length) {
  MOZ_RELEASE_ASSERT(false, "AVX2 not supported in this binary.");
}

const char16_t* SIMD::memchr16AVX2(const char16_t* ptr, char16_t value,
                                   size_t length) {
  MOZ_RELEASE_ASSERT(false, "AVX2 not supported in this binary.");
}

const uint32_t* SIMD::memchr32AVX2(const uint32_t* ptr, uint32_t value,
                                   size_t length) {
  MOZ_RELEASE_ASSERT(false, "AVX2 not supported in this binary.");
}

const uint64_t* SIMD::memchr64AVX2(const uint64_t* ptr, uint64_t value,
                                   size_t length) {
  MOZ_RELEASE_ASSERT(false, "AVX2 not supported in this binary.");
}

} // namespace mozilla

#endif
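
// Usage note: the AVX2 entry points above assume the caller has already
// verified that AVX2 is available; the fallback definitions deliberately
// crash via MOZ_RELEASE_ASSERT. A minimal sketch of the expected call
// pattern (assuming the supports_avx2() helper from mozilla/SSE.h and a
// plain memchr fallback; FindByte is a hypothetical wrapper, illustration
// only):
//
//   const char* FindByte(const char* aBuf, char aNeedle, size_t aLen) {
//     if (mozilla::supports_avx2()) {
//       return mozilla::SIMD::memchr8AVX2(aBuf, aNeedle, aLen);
//     }
//     return static_cast<const char*>(memchr(aBuf, aNeedle, aLen));
//   }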