/*
  ==============================================================================

   This file is part of the JUCE library.
   Copyright (c) 2022 - Raw Material Software Limited

   JUCE is an open source library subject to commercial or open-source
   licensing.

   By using JUCE, you agree to the terms of both the JUCE 7 End-User License
   Agreement and JUCE Privacy Policy.

   End User License Agreement: www.juce.com/juce-7-licence
   Privacy Policy: www.juce.com/juce-privacy-policy

   Or: You may also use this code under the terms of the GPL v3 (see
   www.gnu.org/licenses).

   JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
   DISCLAIMED.

  ==============================================================================
*/
JUCE_BEGIN_IGNORE_WARNINGS_GCC_LIKE ("-Wignored-attributes")
#ifdef _MSC_VER
 #define DECLARE_AVX_SIMD_CONST(type, name) \
    static __declspec (align (32)) const type name[32 / sizeof (type)]

 #define DEFINE_AVX_SIMD_CONST(type, class_type, name) \
    __declspec (align (32)) const type SIMDNativeOps<class_type>:: name[32 / sizeof (type)]
#else
 #define DECLARE_AVX_SIMD_CONST(type, name) \
    static const type name[32 / sizeof (type)] __attribute__ ((aligned (32)))

 #define DEFINE_AVX_SIMD_CONST(type, class_type, name) \
    const type SIMDNativeOps<class_type>:: name[32 / sizeof (type)] __attribute__ ((aligned (32)))
#endif
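
// The DECLARE/DEFINE pair keeps each constant table 32-byte aligned, so the
// aligned load intrinsics used below (_mm256_load_si256 et al.) are legal on it.
// A minimal sketch of how a matching definition looks in a translation unit;
// the real definitions live in the module's .cpp, and these example values are
// assumptions for illustration, not the shipped tables:
//
//     DEFINE_AVX_SIMD_CONST (int32_t, float, kAllBitsSet) =
//         { -1, -1, -1, -1, -1, -1, -1, -1 };   // 0xffffffff in every lane
//     DEFINE_AVX_SIMD_CONST (float, float, kOne) =
//         { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };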
template <typename type>
struct SIMDNativeOps;
//==============================================================================
/** Single-precision floating point AVX intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<float>
{
    using vSIMDType = __m256;
    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet);
    DECLARE_AVX_SIMD_CONST (int32_t, kEvenHighBit);
    DECLARE_AVX_SIMD_CONST (float, kOne);

    //==============================================================================
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const float* a) noexcept                 { return load (a); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept               { return _mm256_castsi256_ps (_mm256_load_si256 (reinterpret_cast<const __m256i*> (a))); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE expand (float s) noexcept                        { return _mm256_broadcast_ss (&s); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept                   { return _mm256_load_ps (a); }
    static forcedinline void   JUCE_VECTOR_CALLTYPE store (__m256 value, float* dest) noexcept       { _mm256_store_ps (dest, value); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE add (__m256 a, __m256 b) noexcept                { return _mm256_add_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE sub (__m256 a, __m256 b) noexcept                { return _mm256_sub_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE mul (__m256 a, __m256 b) noexcept                { return _mm256_mul_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_and (__m256 a, __m256 b) noexcept            { return _mm256_and_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_or (__m256 a, __m256 b) noexcept             { return _mm256_or_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_xor (__m256 a, __m256 b) noexcept            { return _mm256_xor_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_notand (__m256 a, __m256 b) noexcept         { return _mm256_andnot_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_not (__m256 a) noexcept                      { return bit_notand (a, vconst (kAllBitsSet)); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE min (__m256 a, __m256 b) noexcept                { return _mm256_min_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE max (__m256 a, __m256 b) noexcept                { return _mm256_max_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE equal (__m256 a, __m256 b) noexcept              { return _mm256_cmp_ps (a, b, _CMP_EQ_OQ); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE notEqual (__m256 a, __m256 b) noexcept           { return _mm256_cmp_ps (a, b, _CMP_NEQ_OQ); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThan (__m256 a, __m256 b) noexcept        { return _mm256_cmp_ps (a, b, _CMP_GT_OQ); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_GE_OQ); }
    static forcedinline bool   JUCE_VECTOR_CALLTYPE allEqual (__m256 a, __m256 b) noexcept           { return (_mm256_movemask_ps (equal (a, b)) == 0xff); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupeven (__m256 a) noexcept                      { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupodd (__m256 a) noexcept                       { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE swapevenodd (__m256 a) noexcept                  { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); }
    static forcedinline float  JUCE_VECTOR_CALLTYPE get (__m256 v, size_t i) noexcept                { return SIMDFallbackOps<float, __m256>::get (v, i); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE set (__m256 v, size_t i, float s) noexcept       { return SIMDFallbackOps<float, __m256>::set (v, i, s); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE truncate (__m256 a) noexcept                     { return _mm256_cvtepi32_ps (_mm256_cvttps_epi32 (a)); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE multiplyAdd (__m256 a, __m256 b, __m256 c) noexcept
    {
       #if __FMA__
        return _mm256_fmadd_ps (b, c, a);
       #else
        return add (a, mul (b, c));
       #endif
    }
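
    // Note: with FMA available, b * c + a fuses into one instruction with a
    // single rounding step, so results can differ in the last ulp from the
    // separate multiply-then-add fallback above.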
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE oddevensum (__m256 a) noexcept
    {
        a = _mm256_add_ps (_mm256_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a);
        return add (_mm256_permute2f128_ps (a, a, 1), a);
    }
    //==============================================================================
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE cmplxmul (__m256 a, __m256 b) noexcept
    {
        __m256 rr_ir = mul (a, dupeven (b));
        __m256 ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
    }
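
    // For interleaved complex pairs (re, im), the above computes
    // (a.re*b.re - a.im*b.im) + i (a.im*b.re + a.re*b.im): rr_ir holds
    // (a.re*b.re, a.im*b.re), ii_ri holds (a.im*b.im, a.re*b.im), and xor-ing
    // with kEvenHighBit (which, as its name suggests, should set only the sign
    // bit of the even lanes) negates the even products so the final add yields
    // the subtraction in the real part.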
    static forcedinline float JUCE_VECTOR_CALLTYPE sum (__m256 a) noexcept
    {
        __m256 retval = _mm256_dp_ps (a, vconst (kOne), 0xff);
        __m256 tmp = _mm256_permute2f128_ps (retval, retval, 1);
        retval = _mm256_add_ps (retval, tmp);

       #if JUCE_GCC
        return retval[0];
       #else
        return _mm256_cvtss_f32 (retval);
       #endif
    }
};
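
// A minimal usage sketch (illustrative, not part of the library): these ops
// back juce::dsp::SIMDRegister, but they can also be called directly on
// suitably aligned data:
//
//     alignas (32) float data[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
//     auto v  = SIMDNativeOps<float>::load (data);
//     auto v2 = SIMDNativeOps<float>::mul (v, v);      // element-wise square
//     auto t  = SIMDNativeOps<float>::sum (v2);        // horizontal sum = 204
//     SIMDNativeOps<float>::store (v2, data);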
//==============================================================================
/** Double-precision floating point AVX intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<double>
{
    using vSIMDType = __m256d;
    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int64_t, kAllBitsSet);
    DECLARE_AVX_SIMD_CONST (int64_t, kEvenHighBit);
    DECLARE_AVX_SIMD_CONST (double, kOne);

    //==============================================================================
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept                 { return load (a); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept                { return _mm256_castsi256_pd (_mm256_load_si256 (reinterpret_cast<const __m256i*> (a))); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE expand (double s) noexcept                        { return _mm256_broadcast_sd (&s); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept                   { return _mm256_load_pd (a); }
    static forcedinline void    JUCE_VECTOR_CALLTYPE store (__m256d value, double* dest) noexcept      { _mm256_store_pd (dest, value); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE add (__m256d a, __m256d b) noexcept               { return _mm256_add_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE sub (__m256d a, __m256d b) noexcept               { return _mm256_sub_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE mul (__m256d a, __m256d b) noexcept               { return _mm256_mul_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_and (__m256d a, __m256d b) noexcept           { return _mm256_and_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_or (__m256d a, __m256d b) noexcept            { return _mm256_or_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_xor (__m256d a, __m256d b) noexcept           { return _mm256_xor_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_notand (__m256d a, __m256d b) noexcept        { return _mm256_andnot_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_not (__m256d a) noexcept                      { return bit_notand (a, vconst (kAllBitsSet)); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE min (__m256d a, __m256d b) noexcept               { return _mm256_min_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE max (__m256d a, __m256d b) noexcept               { return _mm256_max_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE equal (__m256d a, __m256d b) noexcept             { return _mm256_cmp_pd (a, b, _CMP_EQ_OQ); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE notEqual (__m256d a, __m256d b) noexcept          { return _mm256_cmp_pd (a, b, _CMP_NEQ_OQ); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThan (__m256d a, __m256d b) noexcept       { return _mm256_cmp_pd (a, b, _CMP_GT_OQ); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GE_OQ); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256d a, __m256d b) noexcept          { return (_mm256_movemask_pd (equal (a, b)) == 0xf); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE multiplyAdd (__m256d a, __m256d b, __m256d c) noexcept { return _mm256_add_pd (a, _mm256_mul_pd (b, c)); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupeven (__m256d a) noexcept                      { return _mm256_shuffle_pd (a, a, 0); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupodd (__m256d a) noexcept                       { return _mm256_shuffle_pd (a, a, (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE swapevenodd (__m256d a) noexcept                  { return _mm256_shuffle_pd (a, a, (1 << 0) | (0 << 1) | (1 << 2) | (0 << 3)); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE oddevensum (__m256d a) noexcept                   { return _mm256_add_pd (_mm256_permute2f128_pd (a, a, 1), a); }
    static forcedinline double  JUCE_VECTOR_CALLTYPE get (__m256d v, size_t i) noexcept                { return SIMDFallbackOps<double, __m256d>::get (v, i); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE set (__m256d v, size_t i, double s) noexcept      { return SIMDFallbackOps<double, __m256d>::set (v, i, s); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE truncate (__m256d a) noexcept                     { return _mm256_cvtepi32_pd (_mm256_cvttpd_epi32 (a)); }
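
    // Note: truncate() round-trips through 32-bit integers (_mm256_cvttpd_epi32
    // yields a __m128i), so values outside the int32 range will not survive it.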
    //==============================================================================
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE cmplxmul (__m256d a, __m256d b) noexcept
    {
        __m256d rr_ir = mul (a, dupeven (b));
        __m256d ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
    }
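
    // Same even/odd sign-flip trick as the float specialisation above, applied
    // to the two interleaved (re, im) pairs held in each register.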
    static forcedinline double JUCE_VECTOR_CALLTYPE sum (__m256d a) noexcept
    {
        __m256d retval = _mm256_hadd_pd (a, a);
        __m256d tmp = _mm256_permute2f128_pd (retval, retval, 1);
        retval = _mm256_add_pd (retval, tmp);

       #if JUCE_GCC
        return retval[0];
       #else
        return _mm256_cvtsd_f64 (retval);
       #endif
    }
};
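
// Horizontal sums in these AVX specialisations all follow the same two-step
// shape: a horizontal add (or dot product) reduces within each 128-bit lane,
// then permute2f128 swaps the lanes so one final add combines them. AVX has no
// single cross-lane reduction instruction, hence the pattern.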
//==============================================================================
/** Signed 8-bit integer AVX intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int8_t>
{
    using vSIMDType = __m256i;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int8_t, kAllBitsSet);
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept                        { return _mm256_set1_epi8 (s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int8_t* p) noexcept                   { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
    static forcedinline void    JUCE_VECTOR_CALLTYPE store (__m256i value, int8_t* dest) noexcept      { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept               { return _mm256_add_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept               { return _mm256_sub_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept           { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept            { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept           { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept        { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept                      { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept               { return _mm256_min_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept               { return _mm256_max_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept             { return _mm256_cmpeq_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept       { return _mm256_cmpgt_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept          { return _mm256_movemask_epi8 (equal (a, b)) == -1; }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept          { return bit_not (equal (a, b)); }
    static forcedinline int8_t  JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept                { return SIMDFallbackOps<int8_t, __m256i>::get (v, i); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int8_t s) noexcept      { return SIMDFallbackOps<int8_t, __m256i>::set (v, i, s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept                     { return a; }
    //==============================================================================
    static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
        __m256i hi = _mm256_unpackhi_epi8 (a, _mm256_setzero_si256());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm256_hadd_epi16 (lo, lo);
            hi = _mm256_hadd_epi16 (hi, hi);
        }

       #if JUCE_GCC
        return (int8_t) ((lo[0] & 0xff) +
                         (hi[0] & 0xff) +
                         (lo[2] & 0xff) +
                         (hi[2] & 0xff));
       #else
        constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);

        return (int8_t) ((_mm256_cvtsi256_si32 (lo) & 0xff) +
                         (_mm256_cvtsi256_si32 (hi) & 0xff) +
                         (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (lo, mask)) & 0xff) +
                         (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (hi, mask)) & 0xff));
       #endif
    }
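
    // sum() widens the 8-bit lanes to 16 bits first (unpacklo/hi against zero)
    // so the three hadd passes cannot overflow: each half ends up holding its
    // per-lane total in every 16-bit slot, and the returns above pick one slot
    // per 128-bit lane and add them mod 256.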
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b)
    {
        // unpack and multiply
        __m256i even = _mm256_mullo_epi16 (a, b);
        __m256i odd  = _mm256_mullo_epi16 (_mm256_srli_epi16 (a, 8), _mm256_srli_epi16 (b, 8));

        return _mm256_or_si256 (_mm256_slli_epi16 (odd, 8),
                                _mm256_srli_epi16 (_mm256_slli_epi16 (even, 8), 8));
    }
};
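
// There is no 8-bit multiply in AVX2, so mul() above runs two 16-bit multiplies:
// "even" produces products whose low bytes land in the even byte positions,
// while "odd" shifts both inputs right by 8 so the odd bytes multiply in the
// low half of each 16-bit lane; the final shift/or re-interleaves the low byte
// of every product, i.e. the result each lane would get mod 256.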
//==============================================================================
/** Unsigned 8-bit integer AVX intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint8_t>
{
    //==============================================================================
    using vSIMDType = __m256i;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (uint8_t, kHighBit);
    DECLARE_AVX_SIMD_CONST (uint8_t, kAllBitsSet);
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept                        { return _mm256_xor_si256 (a, load (kHighBit)); }
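    // ssign() maps unsigned values onto the signed number line by flipping the
    // top bit (x ^ 0x80 per byte), so the signed epi8 compares below order them
    // correctly; this is the standard workaround for AVX2's lack of unsigned
    // compare instructions.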
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept                       { return _mm256_set1_epi8 ((int8_t) s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint8_t* p) noexcept                  { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
    static forcedinline void    JUCE_VECTOR_CALLTYPE store (__m256i value, uint8_t* dest) noexcept     { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept               { return _mm256_add_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept               { return _mm256_sub_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept           { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept            { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept           { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept        { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept                      { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept               { return _mm256_min_epu8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept               { return _mm256_max_epu8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept             { return _mm256_cmpeq_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept       { return _mm256_cmpgt_epi8 (ssign (a), ssign (b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept          { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept          { return bit_not (equal (a, b)); }
    static forcedinline uint8_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept                { return SIMDFallbackOps<uint8_t, __m256i>::get (v, i); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint8_t s) noexcept     { return SIMDFallbackOps<uint8_t, __m256i>::set (v, i, s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept                     { return a; }
    //==============================================================================
    static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
        __m256i hi = _mm256_unpackhi_epi8 (a, _mm256_setzero_si256());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm256_hadd_epi16 (lo, lo);
            hi = _mm256_hadd_epi16 (hi, hi);
        }

       #if JUCE_GCC
        return (uint8_t) ((static_cast<uint32_t> (lo[0]) & 0xffu) +
                          (static_cast<uint32_t> (hi[0]) & 0xffu) +
                          (static_cast<uint32_t> (lo[2]) & 0xffu) +
                          (static_cast<uint32_t> (hi[2]) & 0xffu));
       #else
        constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);

        return (uint8_t) ((static_cast<uint32_t> (_mm256_cvtsi256_si32 (lo)) & 0xffu) +
                          (static_cast<uint32_t> (_mm256_cvtsi256_si32 (hi)) & 0xffu) +
                          (static_cast<uint32_t> (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (lo, mask))) & 0xffu) +
                          (static_cast<uint32_t> (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (hi, mask))) & 0xffu));
       #endif
    }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b)
    {
        // unpack and multiply
        __m256i even = _mm256_mullo_epi16 (a, b);
        __m256i odd  = _mm256_mullo_epi16 (_mm256_srli_epi16 (a, 8), _mm256_srli_epi16 (b, 8));

        return _mm256_or_si256 (_mm256_slli_epi16 (odd, 8),
                                _mm256_srli_epi16 (_mm256_slli_epi16 (even, 8), 8));
    }
};
//==============================================================================
/** Signed 16-bit integer AVX intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int16_t>
{
    //==============================================================================
    using vSIMDType = __m256i;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int16_t, kAllBitsSet);
    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept                       { return _mm256_set1_epi16 (s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int16_t* p) noexcept                  { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
    static forcedinline void    JUCE_VECTOR_CALLTYPE store (__m256i value, int16_t* dest) noexcept     { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept               { return _mm256_add_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept               { return _mm256_sub_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept               { return _mm256_mullo_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept           { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept            { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept           { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept        { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept                      { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept               { return _mm256_min_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept               { return _mm256_max_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept             { return _mm256_cmpeq_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept       { return _mm256_cmpgt_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept          { return bit_not (equal (a, b)); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept          { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
    static forcedinline int16_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept                { return SIMDFallbackOps<int16_t, __m256i>::get (v, i); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int16_t s) noexcept     { return SIMDFallbackOps<int16_t, __m256i>::set (v, i, s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept                     { return a; }
    //==============================================================================
    static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i tmp = _mm256_hadd_epi16 (a, a);
        tmp = _mm256_hadd_epi16 (tmp, tmp);
        tmp = _mm256_hadd_epi16 (tmp, tmp);

       #if JUCE_GCC
        return (int16_t) ((tmp[0] & 0xffff) + (tmp[2] & 0xffff));
       #else
        constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);

        return (int16_t) ((_mm256_cvtsi256_si32 (tmp) & 0xffff) +
                          (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask)) & 0xffff));
       #endif
    }
};
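
// The (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6) immediate used in the integer
// sum() overloads asks _mm256_permute4x64_epi64 for the 64-bit element order
// {2, 3, 0, 1}, i.e. it swaps the two 128-bit halves so the upper lane's
// partial sum becomes reachable with _mm256_cvtsi256_si32.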
//==============================================================================
/** Unsigned 16-bit integer AVX intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint16_t>
{
    //==============================================================================
    using vSIMDType = __m256i;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (uint16_t, kHighBit);
    DECLARE_AVX_SIMD_CONST (uint16_t, kAllBitsSet);
    //==============================================================================
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept                       { return _mm256_xor_si256 (a, load (kHighBit)); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept                     { return _mm256_set1_epi16 ((int16_t) s); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE load (const uint16_t* p) noexcept                { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
    static forcedinline void     JUCE_VECTOR_CALLTYPE store (__m256i value, uint16_t* dest) noexcept   { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept              { return _mm256_add_epi16 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept              { return _mm256_sub_epi16 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept              { return _mm256_mullo_epi16 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept          { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept           { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept          { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept       { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept                     { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept              { return _mm256_min_epu16 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept              { return _mm256_max_epu16 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept            { return _mm256_cmpeq_epi16 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept      { return _mm256_cmpgt_epi16 (ssign (a), ssign (b)); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept         { return bit_not (equal (a, b)); }
    static forcedinline bool     JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept         { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
    static forcedinline uint16_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept               { return SIMDFallbackOps<uint16_t, __m256i>::get (v, i); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint16_t s) noexcept   { return SIMDFallbackOps<uint16_t, __m256i>::set (v, i, s); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept                    { return a; }
    //==============================================================================
    static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i tmp = _mm256_hadd_epi16 (a, a);
        tmp = _mm256_hadd_epi16 (tmp, tmp);
        tmp = _mm256_hadd_epi16 (tmp, tmp);

       #if JUCE_GCC
        return (uint16_t) ((static_cast<uint32_t> (tmp[0]) & 0xffffu) +
                           (static_cast<uint32_t> (tmp[2]) & 0xffffu));
       #else
        constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);

        return (uint16_t) ((static_cast<uint32_t> (_mm256_cvtsi256_si32 (tmp)) & 0xffffu) +
                           (static_cast<uint32_t> (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask))) & 0xffffu));
       #endif
    }
};
//==============================================================================
/** Signed 32-bit integer AVX intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int32_t>
{
    //==============================================================================
    using vSIMDType = __m256i;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet);
    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept                       { return _mm256_set1_epi32 (s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int32_t* p) noexcept                  { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
    static forcedinline void    JUCE_VECTOR_CALLTYPE store (__m256i value, int32_t* dest) noexcept     { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept               { return _mm256_add_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept               { return _mm256_sub_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept               { return _mm256_mullo_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept           { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept            { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept           { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept        { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept                      { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept               { return _mm256_min_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept               { return _mm256_max_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept             { return _mm256_cmpeq_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept       { return _mm256_cmpgt_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept          { return bit_not (equal (a, b)); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept          { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
    static forcedinline int32_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept                { return SIMDFallbackOps<int32_t, __m256i>::get (v, i); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int32_t s) noexcept     { return SIMDFallbackOps<int32_t, __m256i>::set (v, i, s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept                     { return a; }
    //==============================================================================
    static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i tmp = _mm256_hadd_epi32 (a, a);
        tmp = _mm256_hadd_epi32 (tmp, tmp);

       #if JUCE_GCC
        return (int32_t) (tmp[0] + tmp[2]);
       #else
        constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);

        return _mm256_cvtsi256_si32 (tmp) + _mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask));
       #endif
    }
};
//==============================================================================
/** Unsigned 32-bit integer AVX intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint32_t>
{
    //==============================================================================
    using vSIMDType = __m256i;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (uint32_t, kAllBitsSet);
    DECLARE_AVX_SIMD_CONST (uint32_t, kHighBit);
    //==============================================================================
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept                       { return _mm256_xor_si256 (a, load (kHighBit)); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept                     { return _mm256_set1_epi32 ((int32_t) s); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE load (const uint32_t* p) noexcept                { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
    static forcedinline void     JUCE_VECTOR_CALLTYPE store (__m256i value, uint32_t* dest) noexcept   { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept              { return _mm256_add_epi32 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept              { return _mm256_sub_epi32 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept              { return _mm256_mullo_epi32 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept          { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept           { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept          { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept       { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept                     { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept              { return _mm256_min_epu32 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept              { return _mm256_max_epu32 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept            { return _mm256_cmpeq_epi32 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept      { return _mm256_cmpgt_epi32 (ssign (a), ssign (b)); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept         { return bit_not (equal (a, b)); }
    static forcedinline bool     JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept         { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
    static forcedinline uint32_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept               { return SIMDFallbackOps<uint32_t, __m256i>::get (v, i); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint32_t s) noexcept   { return SIMDFallbackOps<uint32_t, __m256i>::set (v, i, s); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept                    { return a; }
    //==============================================================================
    static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i tmp = _mm256_hadd_epi32 (a, a);
        tmp = _mm256_hadd_epi32 (tmp, tmp);

       #if JUCE_GCC
        return static_cast<uint32_t> (tmp[0]) + static_cast<uint32_t> (tmp[2]);
       #else
        constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);

        return static_cast<uint32_t> (_mm256_cvtsi256_si32 (tmp))
             + static_cast<uint32_t> (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask)));
       #endif
    }
};
//==============================================================================
/** Signed 64-bit integer AVX intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int64_t>
{
    //==============================================================================
    using vSIMDType = __m256i;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int64_t, kAllBitsSet);
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept                       { return _mm256_set1_epi64x ((int64_t) s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int64_t* p) noexcept                  { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
    static forcedinline void    JUCE_VECTOR_CALLTYPE store (__m256i value, int64_t* dest) noexcept     { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept               { return _mm256_add_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept               { return _mm256_sub_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept           { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept            { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept           { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept        { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept                      { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept               { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept               { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
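    // AVX2 has no 64-bit min/max instructions, so the two lines above
    // synthesise them with a mask-select: the compare yields all-ones lanes
    // where the first operand wins, and (mask & a) | (~mask & b) picks per lane.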
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept             { return _mm256_cmpeq_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept       { return _mm256_cmpgt_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept          { return bit_not (equal (a, b)); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept          { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
    static forcedinline int64_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept                { return SIMDFallbackOps<int64_t, __m256i>::get (v, i); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int64_t s) noexcept     { return SIMDFallbackOps<int64_t, __m256i>::set (v, i, s); }
    static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept                          { return SIMDFallbackOps<int64_t, __m256i>::sum (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept               { return SIMDFallbackOps<int64_t, __m256i>::mul (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept                     { return a; }
};
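
// For the 64-bit element types, sum() and mul() defer to SIMDFallbackOps,
// which operates element by element: AVX2 offers neither a 64-bit horizontal
// add nor a full 64x64 -> 64 lane multiply (that arrives with AVX-512), so a
// scalar loop is the straightforward choice here.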
//==============================================================================
/** Unsigned 64-bit integer AVX intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint64_t>
{
    //==============================================================================
    using vSIMDType = __m256i;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (uint64_t, kAllBitsSet);
    DECLARE_AVX_SIMD_CONST (uint64_t, kHighBit);
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept                     { return _mm256_set1_epi64x ((int64_t) s); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE load (const uint64_t* p) noexcept                { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
    static forcedinline void     JUCE_VECTOR_CALLTYPE store (__m256i value, uint64_t* dest) noexcept   { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept                       { return _mm256_xor_si256 (a, load (kHighBit)); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept              { return _mm256_add_epi64 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept              { return _mm256_sub_epi64 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept          { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept           { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept          { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept       { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept                     { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept              { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept              { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept            { return _mm256_cmpeq_epi64 (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept      { return _mm256_cmpgt_epi64 (ssign (a), ssign (b)); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept         { return bit_not (equal (a, b)); }
    static forcedinline bool     JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept         { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept               { return SIMDFallbackOps<uint64_t, __m256i>::get (v, i); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint64_t s) noexcept   { return SIMDFallbackOps<uint64_t, __m256i>::set (v, i, s); }
    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept                         { return SIMDFallbackOps<uint64_t, __m256i>::sum (a); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept              { return SIMDFallbackOps<uint64_t, __m256i>::mul (a, b); }
    static forcedinline __m256i  JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept                    { return a; }
};

JUCE_END_IGNORE_WARNINGS_GCC_LIKE