/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_X86_SSE2_UTIL_FLOAT_H
#define GMX_SIMD_IMPL_X86_SSE2_UTIL_FLOAT_H

#include "config.h"

#include <cassert>
#include <cstddef>
#include <cstdint>

#include <emmintrin.h>

#include "gromacs/utility/basedefinitions.h"

#include "impl_x86_sse2_simd_float.h"

namespace gmx
{
template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float *        base,
                    const std::int32_t   offset[],
                    SimdFloat *          v0,
                    SimdFloat *          v1,
                    SimdFloat *          v2,
                    SimdFloat *          v3)
{
    // Each gathered row must be 16-byte aligned for the aligned loads below
    assert(std::size_t(base + align * offset[0]) % 16 == 0);
    assert(std::size_t(base + align * offset[1]) % 16 == 0);
    assert(std::size_t(base + align * offset[2]) % 16 == 0);
    assert(std::size_t(base + align * offset[3]) % 16 == 0);

    v0->simdInternal_ = _mm_load_ps( base + align * offset[0] );
    v1->simdInternal_ = _mm_load_ps( base + align * offset[1] );
    v2->simdInternal_ = _mm_load_ps( base + align * offset[2] );
    v3->simdInternal_ = _mm_load_ps( base + align * offset[3] );

    _MM_TRANSPOSE4_PS(v0->simdInternal_, v1->simdInternal_, v2->simdInternal_, v3->simdInternal_);
}
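/* Usage sketch (illustrative only, not part of the implementation): with
 * particle data stored as aligned 4-float records, e.g. x/y/z/q per particle,
 * the four components of four particles can be gathered and transposed into
 * per-component SIMD registers in one call. The buffer and index names below
 * are assumptions made for the example.
 *
 *     SimdFloat x, y, z, q;
 *     gatherLoadTranspose<4>(xyzqBuffer, indices, &x, &y, &z, &q);
 */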
template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float *        base,
                    const std::int32_t   offset[],
                    SimdFloat *          v0,
                    SimdFloat *          v1)
{
    __m128 t1, t2;

    // Load two consecutive floats per row as a single 64-bit load, then deinterleave
    v0->simdInternal_ = _mm_castpd_ps(_mm_load_sd( reinterpret_cast<const double *>( base + align * offset[0] ) ));
    v1->simdInternal_ = _mm_castpd_ps(_mm_load_sd( reinterpret_cast<const double *>( base + align * offset[1] ) ));
    t1                = _mm_castpd_ps(_mm_load_sd( reinterpret_cast<const double *>( base + align * offset[2] ) ));
    t2                = _mm_castpd_ps(_mm_load_sd( reinterpret_cast<const double *>( base + align * offset[3] ) ));
    t1                = _mm_unpacklo_ps(v0->simdInternal_, t1);
    t2                = _mm_unpacklo_ps(v1->simdInternal_, t2);
    v0->simdInternal_ = _mm_unpacklo_ps(t1, t2);
    v1->simdInternal_ = _mm_unpackhi_ps(t1, t2);
}
static const int c_simdBestPairAlignmentFloat = 2;
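/* Illustrative sketch (assumption, not from this file): the pair overload is
 * typically used for tables storing two values per entry, with the table
 * stride chosen as c_simdBestPairAlignmentFloat. The names below are made up
 * for the example.
 *
 *     SimdFloat f, df;
 *     gatherLoadTranspose<c_simdBestPairAlignmentFloat>(tableData, indices, &f, &df);
 */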
template <int align>
static inline void gmx_simdcall
gatherLoadUTranspose(const float *        base,
                     const std::int32_t   offset[],
                     SimdFloat *          v0,
                     SimdFloat *          v1,
                     SimdFloat *          v2)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8;

    if (align % 4 != 0)
    {
        // general case: rows are not guaranteed to be 16-byte aligned, use unaligned loads
        t1 = _mm_loadu_ps( base + align * offset[0] );
        t2 = _mm_loadu_ps( base + align * offset[1] );
        t3 = _mm_loadu_ps( base + align * offset[2] );
        t4 = _mm_loadu_ps( base + align * offset[3] );
    }
    else
    {
        // align is a multiple of 4 floats, so aligned loads can be used
        t1 = _mm_load_ps( base + align * offset[0] );
        t2 = _mm_load_ps( base + align * offset[1] );
        t3 = _mm_load_ps( base + align * offset[2] );
        t4 = _mm_load_ps( base + align * offset[3] );
    }
    t5  = _mm_unpacklo_ps(t1, t2);
    t6  = _mm_unpacklo_ps(t3, t4);
    t7  = _mm_unpackhi_ps(t1, t2);
    t8  = _mm_unpackhi_ps(t3, t4);
    *v0 = _mm_movelh_ps(t5, t6);
    *v1 = _mm_movehl_ps(t6, t5);
    *v2 = _mm_movelh_ps(t7, t8);
}
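/* Illustrative sketch (assumption, not from this file): the unaligned variant
 * allows gathering plain 3-float (x,y,z) coordinates with align = 3, provided
 * the coordinate buffer is padded so that full-width 4-float loads past the
 * last triplet are safe. The names below are made up for the example.
 *
 *     SimdFloat x, y, z;
 *     gatherLoadUTranspose<3>(coordBuffer, indices, &x, &y, &z);
 */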
template <int align>
static inline void gmx_simdcall
transposeScatterStoreU(float *              base,
                       const std::int32_t   offset[],
                       SimdFloat            v0,
                       SimdFloat            v1,
                       SimdFloat            v2)
{
    __m128 t1, t2;

    // Write exactly three floats per row; no alignment assumptions are made
    t1 = _mm_unpacklo_ps(v0.simdInternal_, v1.simdInternal_);
    t2 = _mm_unpackhi_ps(v0.simdInternal_, v1.simdInternal_);
    _mm_storel_pi( reinterpret_cast< __m64 *>( base + align * offset[0] ), t1);
    _mm_store_ss(base + align * offset[0] + 2, v2.simdInternal_);
    _mm_storeh_pi( reinterpret_cast< __m64 *>( base + align * offset[1] ), t1);
    _mm_store_ss(base + align * offset[1] + 2, _mm_shuffle_ps(v2.simdInternal_, v2.simdInternal_, _MM_SHUFFLE(1, 1, 1, 1)));
    _mm_storel_pi( reinterpret_cast< __m64 *>( base + align * offset[2] ), t2);
    _mm_store_ss(base + align * offset[2] + 2, _mm_shuffle_ps(v2.simdInternal_, v2.simdInternal_, _MM_SHUFFLE(2, 2, 2, 2)));
    _mm_storeh_pi( reinterpret_cast< __m64 *>( base + align * offset[3] ), t2);
    _mm_store_ss(base + align * offset[3] + 2, _mm_shuffle_ps(v2.simdInternal_, v2.simdInternal_, _MM_SHUFFLE(3, 3, 3, 3)));
}
template <int align>
static inline void gmx_simdcall
transposeScatterIncrU(float *              base,
                      const std::int32_t   offset[],
                      SimdFloat            v0,
                      SimdFloat            v1,
                      SimdFloat            v2)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;

    if (align < 4)
    {
        // No padding element: read-modify-write exactly three floats per row
        t5  = _mm_unpacklo_ps(v1.simdInternal_, v2.simdInternal_);
        t6  = _mm_unpackhi_ps(v1.simdInternal_, v2.simdInternal_);
        t7  = _mm_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(1, 0, 0, 0));
        t8  = _mm_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(3, 2, 0, 1));
        t9  = _mm_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(1, 0, 0, 2));
        t10 = _mm_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(3, 2, 0, 3));

        t1 = _mm_load_ss(base + align * offset[0]);
        t1 = _mm_loadh_pi(t1, reinterpret_cast< __m64 *>(base + align * offset[0] + 1));
        t1 = _mm_add_ps(t1, t7);
        _mm_store_ss(base + align * offset[0], t1);
        _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[0] + 1), t1);

        t2 = _mm_load_ss(base + align * offset[1]);
        t2 = _mm_loadh_pi(t2, reinterpret_cast< __m64 *>(base + align * offset[1] + 1));
        t2 = _mm_add_ps(t2, t8);
        _mm_store_ss(base + align * offset[1], t2);
        _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[1] + 1), t2);

        t3 = _mm_load_ss(base + align * offset[2]);
        t3 = _mm_loadh_pi(t3, reinterpret_cast< __m64 *>(base + align * offset[2] + 1));
        t3 = _mm_add_ps(t3, t9);
        _mm_store_ss(base + align * offset[2], t3);
        _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[2] + 1), t3);

        t4 = _mm_load_ss(base + align * offset[3]);
        t4 = _mm_loadh_pi(t4, reinterpret_cast< __m64 *>(base + align * offset[3] + 1));
        t4 = _mm_add_ps(t4, t10);
        _mm_store_ss(base + align * offset[3], t4);
        _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[3] + 1), t4);
    }
    else
    {
        // Extra elements means we can use full width-4 load/store operations
        t1 = _mm_unpacklo_ps(v0.simdInternal_, v2.simdInternal_);  // x0 z0 x1 z1
        t2 = _mm_unpackhi_ps(v0.simdInternal_, v2.simdInternal_);  // x2 z2 x3 z3
        t3 = _mm_unpacklo_ps(v1.simdInternal_, _mm_setzero_ps());  // y0  0 y1  0
        t4 = _mm_unpackhi_ps(v1.simdInternal_, _mm_setzero_ps());  // y2  0 y3  0
        t5 = _mm_unpacklo_ps(t1, t3);                              // x0 y0 z0  0
        t6 = _mm_unpackhi_ps(t1, t3);                              // x1 y1 z1  0
        t7 = _mm_unpacklo_ps(t2, t4);                              // x2 y2 z2  0
        t8 = _mm_unpackhi_ps(t2, t4);                              // x3 y3 z3  0

        if (align % 4 == 0)
        {
            // alignment is a multiple of 4
            _mm_store_ps(base + align * offset[0], _mm_add_ps(_mm_load_ps(base + align * offset[0]), t5));
            _mm_store_ps(base + align * offset[1], _mm_add_ps(_mm_load_ps(base + align * offset[1]), t6));
            _mm_store_ps(base + align * offset[2], _mm_add_ps(_mm_load_ps(base + align * offset[2]), t7));
            _mm_store_ps(base + align * offset[3], _mm_add_ps(_mm_load_ps(base + align * offset[3]), t8));
        }
        else
        {
            // alignment >= 5, but not a multiple of 4
            _mm_storeu_ps(base + align * offset[0], _mm_add_ps(_mm_loadu_ps(base + align * offset[0]), t5));
            _mm_storeu_ps(base + align * offset[1], _mm_add_ps(_mm_loadu_ps(base + align * offset[1]), t6));
            _mm_storeu_ps(base + align * offset[2], _mm_add_ps(_mm_loadu_ps(base + align * offset[2]), t7));
            _mm_storeu_ps(base + align * offset[3], _mm_add_ps(_mm_loadu_ps(base + align * offset[3]), t8));
        }
    }
}
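/* Illustrative sketch (assumption, not from this file): a typical use is
 * accumulating per-particle forces stored as consecutive 3-float (x,y,z)
 * triplets, i.e. align = 3, after computing them in transposed SIMD form.
 * forceBuffer, indices and fx/fy/fz are names made up for the example.
 *
 *     transposeScatterIncrU<3>(forceBuffer, indices, fx, fy, fz);
 */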
template <int align>
static inline void gmx_simdcall
transposeScatterDecrU(float *              base,
                      const std::int32_t   offset[],
                      SimdFloat            v0,
                      SimdFloat            v1,
                      SimdFloat            v2)
{
    // This implementation is identical to the increment version, apart from using subtraction instead
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;

    if (align < 4)
    {
        // No padding element: read-modify-write exactly three floats per row
        t5  = _mm_unpacklo_ps(v1.simdInternal_, v2.simdInternal_);
        t6  = _mm_unpackhi_ps(v1.simdInternal_, v2.simdInternal_);
        t7  = _mm_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(1, 0, 0, 0));
        t8  = _mm_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(3, 2, 0, 1));
        t9  = _mm_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(1, 0, 0, 2));
        t10 = _mm_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(3, 2, 0, 3));

        t1 = _mm_load_ss(base + align * offset[0]);
        t1 = _mm_loadh_pi(t1, reinterpret_cast< __m64 *>(base + align * offset[0] + 1));
        t1 = _mm_sub_ps(t1, t7);
        _mm_store_ss(base + align * offset[0], t1);
        _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[0] + 1), t1);

        t2 = _mm_load_ss(base + align * offset[1]);
        t2 = _mm_loadh_pi(t2, reinterpret_cast< __m64 *>(base + align * offset[1] + 1));
        t2 = _mm_sub_ps(t2, t8);
        _mm_store_ss(base + align * offset[1], t2);
        _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[1] + 1), t2);

        t3 = _mm_load_ss(base + align * offset[2]);
        t3 = _mm_loadh_pi(t3, reinterpret_cast< __m64 *>(base + align * offset[2] + 1));
        t3 = _mm_sub_ps(t3, t9);
        _mm_store_ss(base + align * offset[2], t3);
        _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[2] + 1), t3);

        t4 = _mm_load_ss(base + align * offset[3]);
        t4 = _mm_loadh_pi(t4, reinterpret_cast< __m64 *>(base + align * offset[3] + 1));
        t4 = _mm_sub_ps(t4, t10);
        _mm_store_ss(base + align * offset[3], t4);
        _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[3] + 1), t4);
    }
    else
    {
        // Extra elements means we can use full width-4 load/store operations
        t1 = _mm_unpacklo_ps(v0.simdInternal_, v2.simdInternal_);  // x0 z0 x1 z1
        t2 = _mm_unpackhi_ps(v0.simdInternal_, v2.simdInternal_);  // x2 z2 x3 z3
        t3 = _mm_unpacklo_ps(v1.simdInternal_, _mm_setzero_ps());  // y0  0 y1  0
        t4 = _mm_unpackhi_ps(v1.simdInternal_, _mm_setzero_ps());  // y2  0 y3  0
        t5 = _mm_unpacklo_ps(t1, t3);                              // x0 y0 z0  0
        t6 = _mm_unpackhi_ps(t1, t3);                              // x1 y1 z1  0
        t7 = _mm_unpacklo_ps(t2, t4);                              // x2 y2 z2  0
        t8 = _mm_unpackhi_ps(t2, t4);                              // x3 y3 z3  0

        if (align % 4 == 0)
        {
            // alignment is a multiple of 4
            _mm_store_ps(base + align * offset[0], _mm_sub_ps(_mm_load_ps(base + align * offset[0]), t5));
            _mm_store_ps(base + align * offset[1], _mm_sub_ps(_mm_load_ps(base + align * offset[1]), t6));
            _mm_store_ps(base + align * offset[2], _mm_sub_ps(_mm_load_ps(base + align * offset[2]), t7));
            _mm_store_ps(base + align * offset[3], _mm_sub_ps(_mm_load_ps(base + align * offset[3]), t8));
        }
        else
        {
            // alignment >= 5, but not a multiple of 4
            _mm_storeu_ps(base + align * offset[0], _mm_sub_ps(_mm_loadu_ps(base + align * offset[0]), t5));
            _mm_storeu_ps(base + align * offset[1], _mm_sub_ps(_mm_loadu_ps(base + align * offset[1]), t6));
            _mm_storeu_ps(base + align * offset[2], _mm_sub_ps(_mm_loadu_ps(base + align * offset[2]), t7));
            _mm_storeu_ps(base + align * offset[3], _mm_sub_ps(_mm_loadu_ps(base + align * offset[3]), t8));
        }
    }
}
// Override for AVX-128-FMA and higher
#if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
static inline void gmx_simdcall
expandScalarsToTriplets(SimdFloat    scalar,
                        SimdFloat *  triplets0,
                        SimdFloat *  triplets1,
                        SimdFloat *  triplets2)
{
    triplets0->simdInternal_ = _mm_shuffle_ps(scalar.simdInternal_, scalar.simdInternal_, _MM_SHUFFLE(1, 0, 0, 0));
    triplets1->simdInternal_ = _mm_shuffle_ps(scalar.simdInternal_, scalar.simdInternal_, _MM_SHUFFLE(2, 2, 1, 1));
    triplets2->simdInternal_ = _mm_shuffle_ps(scalar.simdInternal_, scalar.simdInternal_, _MM_SHUFFLE(3, 3, 3, 2));
}
#endif
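/* Illustrative note (not from this file): the shuffles above expand scalars
 * {s0, s1, s2, s3} into {s0, s0, s0, s1}, {s1, s1, s2, s2}, {s2, s3, s3, s3},
 * i.e. each scalar repeated three times in sequence, matching x/y/z triplet
 * layout. The variable names in the call below are assumptions for the sketch.
 *
 *     SimdFloat c0, c1, c2;
 *     expandScalarsToTriplets(prefactors, &c0, &c1, &c2);
 */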
template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float *  base,
                             SimdFInt32     offset,
                             SimdFloat *    v0,
                             SimdFloat *    v1,
                             SimdFloat *    v2,
                             SimdFloat *    v3)
{
    // For present-generation x86 CPUs it appears to be faster to simply
    // store the SIMD integer to memory and then use the normal load operations.
    // This is likely because (a) the extract function is expensive, and (b)
    // the alignment scaling can often be done as part of the load instruction
    // (which is even cheaper than doing it in SIMD registers).
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
    _mm_store_si128(reinterpret_cast<__m128i *>(ioffset), offset.simdInternal_);
    gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
}
template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float *  base,
                             SimdFInt32     offset,
                             SimdFloat *    v0,
                             SimdFloat *    v1)
{
    // See the four-output overload above: storing the SIMD integer offsets to
    // memory and reusing the normal gather is faster on current x86 CPUs.
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
    _mm_store_si128(reinterpret_cast<__m128i *>(ioffset), offset.simdInternal_);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}
template <int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const float *  base,
                              SimdFInt32     offset,
                              SimdFloat *    v0,
                              SimdFloat *    v1)
{
    // See the overloads above: storing the SIMD integer offsets to memory and
    // reusing the normal gather is faster on current x86 CPUs. The pair gather
    // used here does not require aligned memory.
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
    _mm_store_si128(reinterpret_cast<__m128i *>(ioffset), offset.simdInternal_);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}
// Override for AVX-128-FMA and higher
#if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
static inline float gmx_simdcall
reduceIncr4ReturnSum(float *    m,
                     SimdFloat  v0,
                     SimdFloat  v1,
                     SimdFloat  v2,
                     SimdFloat  v3)
{
    // After the transpose, summing the four registers leaves the total of
    // input vi in lane i; this is then added to the four values already in m.
    _MM_TRANSPOSE4_PS(v0.simdInternal_, v1.simdInternal_, v2.simdInternal_, v3.simdInternal_);
    v0.simdInternal_ = _mm_add_ps(v0.simdInternal_, v1.simdInternal_);
    v2.simdInternal_ = _mm_add_ps(v2.simdInternal_, v3.simdInternal_);
    v0.simdInternal_ = _mm_add_ps(v0.simdInternal_, _mm_load_ps(m));
    v2.simdInternal_ = _mm_add_ps(v0.simdInternal_, v2.simdInternal_);

    assert(std::size_t(m) % 16 == 0);
    _mm_store_ps(m, v2.simdInternal_);

    __m128 b = _mm_add_ps(v0.simdInternal_, _mm_shuffle_ps(v0.simdInternal_, v0.simdInternal_, _MM_SHUFFLE(1, 0, 3, 2)));
    b        = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 3, 2, 1)));
    return *reinterpret_cast<float *>(&b);
}
#endif
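/* Illustrative sketch (assumption, not from this file): m must point to a
 * 16-byte aligned buffer of four floats; each element is incremented by the
 * sum of the corresponding input register, and the grand total of all four
 * inputs is returned. The accumulator names are made up for the example.
 *
 *     alignas(GMX_SIMD_ALIGNMENT) float sums[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
 *     float total = reduceIncr4ReturnSum(sums, acc0, acc1, acc2, acc3);
 */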
}      // namespace gmx

#endif // GMX_SIMD_IMPL_X86_SSE2_UTIL_FLOAT_H