/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "mozilla/SIMD.h"

#include "mozilla/SSE.h"
#include "mozilla/Assertions.h"

// Restricting to x86_64 simplifies things, and we're not particularly
// worried about slightly degraded performance on 32 bit processors which
// support AVX2, as this should be quite a minority.
#if defined(MOZILLA_MAY_SUPPORT_AVX2) && defined(__x86_64__)

#  include <cstring>
#  include <immintrin.h>
#  include <stdint.h>
#  include <type_traits>

#  include "mozilla/EndianUtils.h"

namespace mozilla {

const __m256i* Cast256(uintptr_t ptr) {
  return reinterpret_cast<const __m256i*>(ptr);
}

template <typename T>
T GetAs(uintptr_t ptr) {
  return *reinterpret_cast<const T*>(ptr);
}

uintptr_t AlignDown32(uintptr_t ptr) { return ptr & ~0x1f; }

uintptr_t AlignUp32(uintptr_t ptr) { return AlignDown32(ptr + 0x1f); }
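// For example, AlignDown32(0x1003) == 0x1000 and AlignUp32(0x1003) == 0x1020;
// addresses that are already 32-byte aligned are left unchanged by both.
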
template <typename TValue>
__m128i CmpEq128(__m128i a, __m128i b) {
  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2);
  if (sizeof(TValue) == 1) {
    return _mm_cmpeq_epi8(a, b);
  }
  return _mm_cmpeq_epi16(a, b);
}

template <typename TValue>
__m256i CmpEq256(__m256i a, __m256i b) {
  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2 ||
                sizeof(TValue) == 4 || sizeof(TValue) == 8);
  if (sizeof(TValue) == 1) {
    return _mm256_cmpeq_epi8(a, b);
  }
  if (sizeof(TValue) == 2) {
    return _mm256_cmpeq_epi16(a, b);
  }
  if (sizeof(TValue) == 4) {
    return _mm256_cmpeq_epi32(a, b);
  }
  return _mm256_cmpeq_epi64(a, b);
}

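// NOTE: CmpEq128/CmpEq256 set every byte of a matching lane to 0xff, and
// _mm_movemask_epi8/_mm256_movemask_epi8 then produce one bit per byte of
// that comparison result, so __builtin_ctz of the mask is the byte offset of
// the first match. For example, with TValue = char16_t and a match in lane 3,
// bits 6 and 7 of the mask are set and ctz returns 6, the byte offset of that
// element. This is why the Check* helpers below add the ctz result directly
// to a byte address.
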
#  if defined(__GNUC__) && !defined(__clang__)

// See the comment in SIMD.cpp over Load32BitsIntoXMM. This is just adapted
// from that workaround. Testing this, it also yields the correct instructions
// across all tested compilers.
__m128i Load64BitsIntoXMM(uintptr_t ptr) {
  int64_t tmp;
  memcpy(&tmp, reinterpret_cast<const void*>(ptr), sizeof(tmp));
  return _mm_cvtsi64_si128(tmp);
}

#  else

__m128i Load64BitsIntoXMM(uintptr_t ptr) {
  return _mm_loadu_si64(reinterpret_cast<const __m128i*>(ptr));
}

#  endif

template <typename TValue>
const TValue* Check4x8Bytes(__m128i needle, uintptr_t a, uintptr_t b,
                            uintptr_t c, uintptr_t d) {
  __m128i haystackA = Load64BitsIntoXMM(a);
  __m128i cmpA = CmpEq128<TValue>(needle, haystackA);
  __m128i haystackB = Load64BitsIntoXMM(b);
  __m128i cmpB = CmpEq128<TValue>(needle, haystackB);
  __m128i haystackC = Load64BitsIntoXMM(c);
  __m128i cmpC = CmpEq128<TValue>(needle, haystackC);
  __m128i haystackD = Load64BitsIntoXMM(d);
  __m128i cmpD = CmpEq128<TValue>(needle, haystackD);
  __m128i or_ab = _mm_or_si128(cmpA, cmpB);
  __m128i or_cd = _mm_or_si128(cmpC, cmpD);
  __m128i or_abcd = _mm_or_si128(or_ab, or_cd);
  // Only the low 8 bytes of each XMM register were loaded, so mask the
  // movemask results down to the low 8 bits; the zeroed upper half could
  // otherwise report spurious matches when the needle is zero.
  int orMask = _mm_movemask_epi8(or_abcd);
  if (orMask & 0xff) {
    int cmpMask;
    cmpMask = _mm_movemask_epi8(cmpA);
    if (cmpMask & 0xff) {
      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpB);
    if (cmpMask & 0xff) {
      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpC);
    if (cmpMask & 0xff) {
      return reinterpret_cast<const TValue*>(c + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpD);
    if (cmpMask & 0xff) {
      return reinterpret_cast<const TValue*>(d + __builtin_ctz(cmpMask));
    }
  }

  return nullptr;
}

template <typename TValue>
const TValue* Check4x32Bytes(__m256i needle, uintptr_t a, uintptr_t b,
                             uintptr_t c, uintptr_t d) {
  __m256i haystackA = _mm256_loadu_si256(Cast256(a));
  __m256i cmpA = CmpEq256<TValue>(needle, haystackA);
  __m256i haystackB = _mm256_loadu_si256(Cast256(b));
  __m256i cmpB = CmpEq256<TValue>(needle, haystackB);
  __m256i haystackC = _mm256_loadu_si256(Cast256(c));
  __m256i cmpC = CmpEq256<TValue>(needle, haystackC);
  __m256i haystackD = _mm256_loadu_si256(Cast256(d));
  __m256i cmpD = CmpEq256<TValue>(needle, haystackD);
  __m256i or_ab = _mm256_or_si256(cmpA, cmpB);
  __m256i or_cd = _mm256_or_si256(cmpC, cmpD);
  __m256i or_abcd = _mm256_or_si256(or_ab, or_cd);
  int orMask = _mm256_movemask_epi8(or_abcd);
  if (orMask) {
    int cmpMask;
    cmpMask = _mm256_movemask_epi8(cmpA);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm256_movemask_epi8(cmpB);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm256_movemask_epi8(cmpC);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(c + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm256_movemask_epi8(cmpD);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(d + __builtin_ctz(cmpMask));
    }
  }

  return nullptr;
}

template <typename TValue>
const TValue* FindInBufferAVX2(const TValue* ptr, TValue value, size_t length) {
  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2 ||
                sizeof(TValue) == 4 || sizeof(TValue) == 8);
  static_assert(std::is_unsigned<TValue>::value);

  // Load our needle into a 32-byte register
  __m256i needle;
  if (sizeof(TValue) == 1) {
    needle = _mm256_set1_epi8(value);
  } else if (sizeof(TValue) == 2) {
    needle = _mm256_set1_epi16(value);
  } else if (sizeof(TValue) == 4) {
    needle = _mm256_set1_epi32(value);
  } else {
    needle = _mm256_set1_epi64x(value);
  }

  size_t numBytes = length * sizeof(TValue);
  uintptr_t cur = reinterpret_cast<uintptr_t>(ptr);
  uintptr_t end = cur + numBytes;

  // Handle trivially small buffers with a plain scalar loop.
  if (numBytes < 8 || (sizeof(TValue) >= 4 && numBytes < 32)) {
    while (cur < end) {
      if (GetAs<TValue>(cur) == value) {
        return reinterpret_cast<const TValue*>(cur);
      }
      cur += sizeof(TValue);
    }
    return nullptr;
  }

  if constexpr (sizeof(TValue) < 4) {
    // Handle buffers of 8 to 31 bytes with four (possibly overlapping)
    // 8-byte windows.
    if (numBytes < 32) {
      __m128i needle_narrow;
      if (sizeof(TValue) == 1) {
        needle_narrow = _mm_set1_epi8(value);
      } else {
        needle_narrow = _mm_set1_epi16(value);
      }
      uintptr_t a = cur;
      uintptr_t b = cur + ((numBytes & 16) >> 1);
      uintptr_t c = end - 8 - ((numBytes & 16) >> 1);
      uintptr_t d = end - 8;
      return Check4x8Bytes<TValue>(needle_narrow, a, b, c, d);
    }
  }

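  // A worked example of the window selection above (the values follow
  // directly from the arithmetic; illustration only):
  //   numBytes == 12: numBytes & 16 == 0, so a = b = cur and c = d = end - 8,
  //     giving two distinct 8-byte windows, [0,8) and [4,12).
  //   numBytes == 24: (numBytes & 16) >> 1 == 8, so the windows are
  //     [0,8), [8,16), [8,16) and [16,24).
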
  if (numBytes < 128) {
    // NOTE: here and below, we have some bit fiddling which could look a
    // little weird. The important thing to note though is it's just a trick
    // for getting the number 32 if numBytes is greater than or equal to 64,
    // and 0 otherwise. This lets us fully cover the range without any
    // branching for the case where numBytes is in [32,64) and [64,128). We
    // get four ranges from this - if numBytes >= 64, we get:
    //   [0,32), [32,64), [end - 64, end - 32), [end - 32, end)
    // and if numBytes < 64, we get:
    //   [0,32), [0,32), [end - 32, end), [end - 32, end)
    uintptr_t a = cur;
    uintptr_t b = cur + ((numBytes & 64) >> 1);
    uintptr_t c = end - 32 - ((numBytes & 64) >> 1);
    uintptr_t d = end - 32;
    return Check4x32Bytes<TValue>(needle, a, b, c, d);
  }

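  // A worked example of the selection above (illustration only):
  //   numBytes == 48: numBytes & 64 == 0, so a = b = cur and c = d = end - 32,
  //     giving two distinct 32-byte windows, [0,32) and [16,48).
  //   numBytes == 96: (numBytes & 64) >> 1 == 32, so the windows are
  //     [0,32), [32,64), [32,64) and [64,96).
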
  // Get the initial unaligned load out of the way. This will overlap with the
  // aligned stuff below, but the overlapped part should effectively be free
  // (relative to a mispredict from doing a byte-by-byte loop).
  __m256i haystack = _mm256_loadu_si256(Cast256(cur));
  __m256i cmp = CmpEq256<TValue>(needle, haystack);
  int cmpMask = _mm256_movemask_epi8(cmp);
  if (cmpMask) {
    return reinterpret_cast<const TValue*>(cur + __builtin_ctz(cmpMask));
  }

  // Now we're working with aligned memory. Hooray! \o/
  cur = AlignUp32(cur);

  uintptr_t tailStartPtr = AlignDown32(end - 96);
  uintptr_t tailEndPtr = end - 32;

  // Scan the aligned middle of the buffer 128 bytes at a time.
  while (cur < tailStartPtr) {
    uintptr_t a = cur;
    uintptr_t b = cur + 32;
    uintptr_t c = cur + 64;
    uintptr_t d = cur + 96;
    const TValue* result = Check4x32Bytes<TValue>(needle, a, b, c, d);
    if (result) {
      return result;
    }
    cur += 128;
  }

  // Check the tail of the buffer; the last window is anchored at end - 32 so
  // the scan covers the buffer exactly up to its end.
  uintptr_t a = tailStartPtr;
  uintptr_t b = tailStartPtr + 32;
  uintptr_t c = tailStartPtr + 64;
  uintptr_t d = tailEndPtr;
  return Check4x32Bytes<TValue>(needle, a, b, c, d);
}

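// For reference, FindInBufferAVX2 returns the same result as the following
// scalar scan would (a documentation-only sketch; FindInBufferScalar is a
// hypothetical name and is not defined in this file):
//
//   template <typename TValue>
//   const TValue* FindInBufferScalar(const TValue* ptr, TValue value,
//                                    size_t length) {
//     for (size_t i = 0; i < length; i++) {
//       if (ptr[i] == value) {
//         return ptr + i;
//       }
//     }
//     return nullptr;
//   }
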
const char* SIMD::memchr8AVX2(const char* ptr, char value, size_t length) {
  const unsigned char* uptr = reinterpret_cast<const unsigned char*>(ptr);
  unsigned char uvalue = static_cast<unsigned char>(value);
  const unsigned char* uresult =
      FindInBufferAVX2<unsigned char>(uptr, uvalue, length);
  return reinterpret_cast<const char*>(uresult);
}

const char16_t* SIMD::memchr16AVX2(const char16_t* ptr, char16_t value,
                                   size_t length) {
  return FindInBufferAVX2<char16_t>(ptr, value, length);
}

const uint32_t* SIMD::memchr32AVX2(const uint32_t* ptr, uint32_t value,
                                   size_t length) {
  return FindInBufferAVX2<uint32_t>(ptr, value, length);
}

const uint64_t* SIMD::memchr64AVX2(const uint64_t* ptr, uint64_t value,
                                   size_t length) {
  return FindInBufferAVX2<uint64_t>(ptr, value, length);
}

} // namespace mozilla

#else

namespace mozilla {

const char* SIMD::memchr8AVX2(const char* ptr, char value, size_t length) {
  MOZ_RELEASE_ASSERT(false, "AVX2 not supported in this binary.");
}

const char16_t* SIMD::memchr16AVX2(const char16_t* ptr, char16_t value,
                                   size_t length) {
  MOZ_RELEASE_ASSERT(false, "AVX2 not supported in this binary.");
}

const uint32_t* SIMD::memchr32AVX2(const uint32_t* ptr, uint32_t value,
                                   size_t length) {
  MOZ_RELEASE_ASSERT(false, "AVX2 not supported in this binary.");
}

const uint64_t* SIMD::memchr64AVX2(const uint64_t* ptr, uint64_t value,
                                   size_t length) {
  MOZ_RELEASE_ASSERT(false, "AVX2 not supported in this binary.");
}

} // namespace mozilla

#endif
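
// Usage note: the AVX2 entry points above assume the caller has already
// verified that AVX2 is available; the fallback definitions deliberately
// crash via MOZ_RELEASE_ASSERT. A minimal sketch of the expected call
// pattern (assuming the supports_avx2() helper from mozilla/SSE.h and a
// plain memchr fallback; FindByte is a hypothetical wrapper, illustration
// only):
//
//   const char* FindByte(const char* aBuf, char aNeedle, size_t aLen) {
//     if (mozilla::supports_avx2()) {
//       return mozilla::SIMD::memchr8AVX2(aBuf, aNeedle, aLen);
//     }
//     return static_cast<const char*>(memchr(aBuf, aNeedle, aLen));
//   }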