/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "mozilla/SIMD.h"

#include "mozilla/SSE.h"
#include "mozilla/Assertions.h"

// Restricting to x86_64 simplifies things, and we're not particularly
// worried about slightly degraded performance on 32 bit processors which
// support AVX2, as this should be quite a minority.
#if defined(MOZILLA_MAY_SUPPORT_AVX2) && defined(__x86_64__)

#  include <cstring>
#  include <immintrin.h>
#  include <stdint.h>
#  include <type_traits>

#  include "mozilla/EndianUtils.h"

namespace mozilla {

const __m256i* Cast256(uintptr_t ptr) {
  return reinterpret_cast<const __m256i*>(ptr);
}

template <typename T>
T GetAs(uintptr_t ptr) {
  return *reinterpret_cast<const T*>(ptr);
}

uintptr_t AlignDown32(uintptr_t ptr) { return ptr & ~0x1f; }

uintptr_t AlignUp32(uintptr_t ptr) { return AlignDown32(ptr + 0x1f); }

template <typename TValue>
__m128i CmpEq128(__m128i a, __m128i b) {
  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2);
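  // sizeof(TValue) is a compile-time constant, so the branch below resolves
  // statically; the static_assert above rules out any other sizes.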
  if (sizeof(TValue) == 1) {
    return _mm_cmpeq_epi8(a, b);
  }
  return _mm_cmpeq_epi16(a, b);
}

template <typename TValue>
__m256i CmpEq256(__m256i a, __m256i b) {
  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2 ||
                sizeof(TValue) == 4 || sizeof(TValue) == 8);
  if (sizeof(TValue) == 1) {
    return _mm256_cmpeq_epi8(a, b);
  }
  if (sizeof(TValue) == 2) {
    return _mm256_cmpeq_epi16(a, b);
  }
  if (sizeof(TValue) == 4) {
    return _mm256_cmpeq_epi32(a, b);
  }
  return _mm256_cmpeq_epi64(a, b);
}

#  if defined(__GNUC__) && !defined(__clang__)

// See the comment in SIMD.cpp over Load32BitsIntoXMM. This is just adapted
// from that workaround. Testing this, it also yields the correct instructions
// across all tested compilers.
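// The memcpy expresses an unaligned 8-byte load without undefined behavior,
// and _mm_cvtsi64_si128 then zero-extends that value into the low quadword
// of an XMM register; optimizing compilers fold the pair into a single movq.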
__m128i Load64BitsIntoXMM(uintptr_t ptr) {
  int64_t tmp;
  memcpy(&tmp, reinterpret_cast<const void*>(ptr), sizeof(tmp));
  return _mm_cvtsi64_si128(tmp);
}

#  else

__m128i Load64BitsIntoXMM(uintptr_t ptr) {
  return _mm_loadu_si64(reinterpret_cast<const __m128i*>(ptr));
}

#  endif

template <typename TValue>
const TValue* Check4x8Bytes(__m128i needle, uintptr_t a, uintptr_t b,
                            uintptr_t c, uintptr_t d) {
  __m128i haystackA = Load64BitsIntoXMM(a);
  __m128i cmpA = CmpEq128<TValue>(needle, haystackA);
  __m128i haystackB = Load64BitsIntoXMM(b);
  __m128i cmpB = CmpEq128<TValue>(needle, haystackB);
  __m128i haystackC = Load64BitsIntoXMM(c);
  __m128i cmpC = CmpEq128<TValue>(needle, haystackC);
  __m128i haystackD = Load64BitsIntoXMM(d);
  __m128i cmpD = CmpEq128<TValue>(needle, haystackD);
  __m128i or_ab = _mm_or_si128(cmpA, cmpB);
  __m128i or_cd = _mm_or_si128(cmpC, cmpD);
  __m128i or_abcd = _mm_or_si128(or_ab, or_cd);
  int orMask = _mm_movemask_epi8(or_abcd);
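  // Only the low 8 bytes of each XMM register hold haystack data: the 64-bit
  // loads zero the upper lanes, and a needle of zero would spuriously match
  // them. Masking with 0xff restricts the checks to the valid lanes.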
  if (orMask & 0xff) {
    int cmpMask;
    cmpMask = _mm_movemask_epi8(cmpA);
    if (cmpMask & 0xff) {
      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpB);
    if (cmpMask & 0xff) {
      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpC);
    if (cmpMask & 0xff) {
      return reinterpret_cast<const TValue*>(c + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpD);
    if (cmpMask & 0xff) {
      return reinterpret_cast<const TValue*>(d + __builtin_ctz(cmpMask));
    }
  }

  return nullptr;
}

template <typename TValue>
const TValue* Check4x32Bytes(__m256i needle, uintptr_t a, uintptr_t b,
                             uintptr_t c, uintptr_t d) {
  __m256i haystackA = _mm256_loadu_si256(Cast256(a));
  __m256i cmpA = CmpEq256<TValue>(needle, haystackA);
  __m256i haystackB = _mm256_loadu_si256(Cast256(b));
  __m256i cmpB = CmpEq256<TValue>(needle, haystackB);
  __m256i haystackC = _mm256_loadu_si256(Cast256(c));
  __m256i cmpC = CmpEq256<TValue>(needle, haystackC);
  __m256i haystackD = _mm256_loadu_si256(Cast256(d));
  __m256i cmpD = CmpEq256<TValue>(needle, haystackD);
  __m256i or_ab = _mm256_or_si256(cmpA, cmpB);
  __m256i or_cd = _mm256_or_si256(cmpC, cmpD);
  __m256i or_abcd = _mm256_or_si256(or_ab, or_cd);
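  // Unlike the 128-bit path above, all 32 lanes of each YMM register hold
  // loaded haystack data, so the movemask results need no masking here.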
  int orMask = _mm256_movemask_epi8(or_abcd);
  if (orMask) {
    int cmpMask;
    cmpMask = _mm256_movemask_epi8(cmpA);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm256_movemask_epi8(cmpB);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm256_movemask_epi8(cmpC);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(c + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm256_movemask_epi8(cmpD);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(d + __builtin_ctz(cmpMask));
    }
  }

  return nullptr;
}

template <typename TValue>
const TValue* FindInBufferAVX2(const TValue* ptr, TValue value,
                               size_t length) {
  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2 ||
                sizeof(TValue) == 4 || sizeof(TValue) == 8);
  static_assert(std::is_unsigned<TValue>::value);

  // Load our needle into a 32-byte register
  __m256i needle;
  if (sizeof(TValue) == 1) {
    needle = _mm256_set1_epi8(value);
  } else if (sizeof(TValue) == 2) {
    needle = _mm256_set1_epi16(value);
  } else if (sizeof(TValue) == 4) {
    needle = _mm256_set1_epi32(value);
  } else {
    needle = _mm256_set1_epi64x(value);
  }

  size_t numBytes = length * sizeof(TValue);
  uintptr_t cur = reinterpret_cast<uintptr_t>(ptr);
  uintptr_t end = cur + numBytes;

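  // Buffers too small for a single 8-byte load go through a scalar loop, as
  // do buffers of 4- or 8-byte elements smaller than 32 bytes, since those
  // element sizes have no 128-bit fallback path below.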
  if (numBytes < 8 || (sizeof(TValue) >= 4 && numBytes < 32)) {
    while (cur < end) {
      if (GetAs<TValue>(cur) == value) {
        return reinterpret_cast<const TValue*>(cur);
      }
      cur += sizeof(TValue);
    }
    return nullptr;
  }

  if constexpr (sizeof(TValue) < 4) {
    if (numBytes < 32) {
      __m128i needle_narrow;
      if (sizeof(TValue) == 1) {
        needle_narrow = _mm_set1_epi8(value);
      } else {
        needle_narrow = _mm_set1_epi16(value);
      }
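      // Same trick as in the NOTE below: (numBytes & 16) >> 1 is 8 when
      // numBytes >= 16 and 0 otherwise, so the four 8-byte windows cover
      // [cur, end) with overlap instead of branching on the exact size.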
      uintptr_t a = cur;
      uintptr_t b = cur + ((numBytes & 16) >> 1);
      uintptr_t c = end - 8 - ((numBytes & 16) >> 1);
      uintptr_t d = end - 8;
      return Check4x8Bytes<TValue>(needle_narrow, a, b, c, d);
    }
  }

  if (numBytes < 128) {
    // NOTE: here and below, we have some bit fiddling which could look a
    // little weird. The important thing to note though is it's just a trick
    // for getting the number 32 if numBytes is greater than or equal to 64,
    // and 0 otherwise. This lets us fully cover the range without any
    // branching for the cases where numBytes is in [32,64) and [64,128). We
    // get four ranges from this - if numBytes >= 64, we get:
    //   [0,32), [32,64), [end - 64, end - 32), [end - 32, end)
    // and if numBytes < 64, we get:
    //   [0,32), [0,32), [end - 32, end), [end - 32, end)
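    // For example, if numBytes == 80, the four 32-byte windows start at cur,
    // cur + 32, end - 64 (= cur + 16), and end - 32 (= cur + 48); together
    // they cover all 80 bytes.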
    uintptr_t a = cur;
    uintptr_t b = cur + ((numBytes & 64) >> 1);
    uintptr_t c = end - 32 - ((numBytes & 64) >> 1);
    uintptr_t d = end - 32;
    return Check4x32Bytes<TValue>(needle, a, b, c, d);
  }

  // Get the initial unaligned load out of the way. This will overlap with the
  // aligned stuff below, but the overlapped part should effectively be free
  // (relative to a mispredict from doing a byte-by-byte loop).
  __m256i haystack = _mm256_loadu_si256(Cast256(cur));
  __m256i cmp = CmpEq256<TValue>(needle, haystack);
  int cmpMask = _mm256_movemask_epi8(cmp);
  if (cmpMask) {
    return reinterpret_cast<const TValue*>(cur + __builtin_ctz(cmpMask));
  }

  // Now we're working with aligned memory. Hooray! \o/
  cur = AlignUp32(cur);

  uintptr_t tailStartPtr = AlignDown32(end - 96);
  uintptr_t tailEndPtr = end - 32;

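  // numBytes >= 128 here, so the loop below can safely process 128 bytes per
  // iteration while cur < tailStartPtr. The final Check4x32Bytes after the
  // loop covers whatever remains: three aligned 32-byte chunks starting at
  // tailStartPtr, plus one chunk ending exactly at end, overlapping earlier
  // work rather than branching on the remainder size.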
  while (cur < tailStartPtr) {
    uintptr_t a = cur;
    uintptr_t b = cur + 32;
    uintptr_t c = cur + 64;
    uintptr_t d = cur + 96;
    const TValue* result = Check4x32Bytes<TValue>(needle, a, b, c, d);
    if (result) {
      return result;
    }
    cur += 128;
  }

  uintptr_t a = tailStartPtr;
  uintptr_t b = tailStartPtr + 32;
  uintptr_t c = tailStartPtr + 64;
  uintptr_t d = tailEndPtr;
  return Check4x32Bytes<TValue>(needle, a, b, c, d);
}

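// memchr8AVX2 widens char to unsigned char because FindInBufferAVX2 requires
// an unsigned element type; the match position is unaffected by signedness.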
const char* SIMD::memchr8AVX2(const char* ptr, char value, size_t length) {
  const unsigned char* uptr = reinterpret_cast<const unsigned char*>(ptr);
  unsigned char uvalue = static_cast<unsigned char>(value);
  const unsigned char* uresult =
      FindInBufferAVX2<unsigned char>(uptr, uvalue, length);
  return reinterpret_cast<const char*>(uresult);
}

const char16_t* SIMD::memchr16AVX2(const char16_t* ptr, char16_t value,
                                   size_t length) {
  return FindInBufferAVX2<char16_t>(ptr, value, length);
}

const uint32_t* SIMD::memchr32AVX2(const uint32_t* ptr, uint32_t value,
                                   size_t length) {
  return FindInBufferAVX2<uint32_t>(ptr, value, length);
}

const uint64_t* SIMD::memchr64AVX2(const uint64_t* ptr, uint64_t value,
                                   size_t length) {
  return FindInBufferAVX2<uint64_t>(ptr, value, length);
}

}  // namespace mozilla

#else

namespace mozilla {

const char* SIMD::memchr8AVX2(const char* ptr, char value, size_t length) {
  MOZ_RELEASE_ASSERT(false, "AVX2 not supported in this binary.");
}

const char16_t* SIMD::memchr16AVX2(const char16_t* ptr, char16_t value,
                                   size_t length) {
  MOZ_RELEASE_ASSERT(false, "AVX2 not supported in this binary.");
}

const uint32_t* SIMD::memchr32AVX2(const uint32_t* ptr, uint32_t value,
                                   size_t length) {
  MOZ_RELEASE_ASSERT(false, "AVX2 not supported in this binary.");
}

const uint64_t* SIMD::memchr64AVX2(const uint64_t* ptr, uint64_t value,
                                   size_t length) {
  MOZ_RELEASE_ASSERT(false, "AVX2 not supported in this binary.");
}

}  // namespace mozilla

#endif